Tanguy Pruvot
10 years ago
44 changed files with 1129 additions and 1619 deletions
@ -1,291 +1,292 @@
@@ -1,291 +1,292 @@
|
||||
|
||||
extern "C" |
||||
{ |
||||
#include "sph/sph_blake.h" |
||||
#include "sph/sph_bmw.h" |
||||
#include "sph/sph_groestl.h" |
||||
#include "sph/sph_skein.h" |
||||
#include "sph/sph_jh.h" |
||||
#include "sph/sph_keccak.h" |
||||
#include "miner.h" |
||||
} |
||||
|
||||
#include <stdint.h> |
||||
|
||||
// aus cpu-miner.c |
||||
extern int device_map[8]; |
||||
|
||||
// Speicher für Input/Output der verketteten Hashfunktionen |
||||
static uint32_t *d_hash[8]; |
||||
|
||||
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes |
||||
static uint32_t *d_animeNonces[8]; |
||||
static uint32_t *d_branch1Nonces[8]; |
||||
static uint32_t *d_branch2Nonces[8]; |
||||
static uint32_t *d_branch3Nonces[8]; |
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, int threads); |
||||
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_bmw512_cpu_init(int thr_id, int threads); |
||||
extern void quark_bmw512_cpu_setBlock_80(void *pdata); |
||||
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order); |
||||
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); |
||||
|
||||
extern void quark_groestl512_cpu_init(int thr_id, int threads); |
||||
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_skein512_cpu_init(int thr_id, int threads); |
||||
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_keccak512_cpu_init(int thr_id, int threads); |
||||
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_jh512_cpu_init(int thr_id, int threads); |
||||
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads); |
||||
extern void quark_check_cpu_setTarget(const void *ptarget); |
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); |
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads); |
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
uint32_t *d_nonces2, size_t *nrm2, |
||||
int order); |
||||
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
int order); |
||||
|
||||
// Original Quarkhash Funktion aus einem miner Quelltext |
||||
inline void animehash(void *state, const void *input) |
||||
{ |
||||
sph_blake512_context ctx_blake; |
||||
sph_bmw512_context ctx_bmw; |
||||
sph_groestl512_context ctx_groestl; |
||||
sph_jh512_context ctx_jh; |
||||
sph_keccak512_context ctx_keccak; |
||||
sph_skein512_context ctx_skein; |
||||
|
||||
unsigned char hash[64]; |
||||
|
||||
sph_bmw512_init(&ctx_bmw); |
||||
// ZBMW; |
||||
sph_bmw512 (&ctx_bmw, (const void*) input, 80); |
||||
sph_bmw512_close(&ctx_bmw, (void*) hash); |
||||
|
||||
sph_blake512_init(&ctx_blake); |
||||
// ZBLAKE; |
||||
sph_blake512 (&ctx_blake, hash, 64); |
||||
sph_blake512_close(&ctx_blake, (void*) hash); |
||||
|
||||
if (hash[0] & 0x8) |
||||
{ |
||||
sph_groestl512_init(&ctx_groestl); |
||||
// ZGROESTL; |
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64); |
||||
sph_groestl512_close(&ctx_groestl, (void*) hash); |
||||
} |
||||
else |
||||
{ |
||||
sph_skein512_init(&ctx_skein); |
||||
// ZSKEIN; |
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64); |
||||
sph_skein512_close(&ctx_skein, (void*) hash); |
||||
} |
||||
|
||||
sph_groestl512_init(&ctx_groestl); |
||||
// ZGROESTL; |
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64); |
||||
sph_groestl512_close(&ctx_groestl, (void*) hash); |
||||
|
||||
sph_jh512_init(&ctx_jh); |
||||
// ZJH; |
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64); |
||||
sph_jh512_close(&ctx_jh, (void*) hash); |
||||
|
||||
if (hash[0] & 0x8) |
||||
{ |
||||
sph_blake512_init(&ctx_blake); |
||||
// ZBLAKE; |
||||
sph_blake512 (&ctx_blake, (const void*) hash, 64); |
||||
sph_blake512_close(&ctx_blake, (void*) hash); |
||||
} |
||||
else |
||||
{ |
||||
sph_bmw512_init(&ctx_bmw); |
||||
// ZBMW; |
||||
sph_bmw512 (&ctx_bmw, (const void*) hash, 64); |
||||
sph_bmw512_close(&ctx_bmw, (void*) hash); |
||||
} |
||||
|
||||
sph_keccak512_init(&ctx_keccak); |
||||
// ZKECCAK; |
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64); |
||||
sph_keccak512_close(&ctx_keccak, (void*) hash); |
||||
|
||||
sph_skein512_init(&ctx_skein); |
||||
// SKEIN; |
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64); |
||||
sph_skein512_close(&ctx_skein, (void*) hash); |
||||
|
||||
if (hash[0] & 0x8) |
||||
{ |
||||
sph_keccak512_init(&ctx_keccak); |
||||
// ZKECCAK; |
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64); |
||||
sph_keccak512_close(&ctx_keccak, (void*) hash); |
||||
} |
||||
else |
||||
{ |
||||
sph_jh512_init(&ctx_jh); |
||||
// ZJH; |
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64); |
||||
sph_jh512_close(&ctx_jh, (void*) hash); |
||||
} |
||||
|
||||
memcpy(state, hash, 32); |
||||
} |
||||
|
||||
|
||||
struct HashPredicate |
||||
{ |
||||
HashPredicate(uint32_t *hashes, uint32_t startNonce) : |
||||
m_hashes(hashes), |
||||
m_startNonce(startNonce) |
||||
{ } |
||||
|
||||
__device__ |
||||
bool operator()(const uint32_t x) |
||||
{ |
||||
uint32_t *hash = &m_hashes[(x - m_startNonce)*16]; |
||||
return hash[0] & 0x8; |
||||
} |
||||
|
||||
uint32_t *m_hashes; |
||||
uint32_t m_startNonce; |
||||
}; |
||||
|
||||
extern bool opt_benchmark; |
||||
|
||||
extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, |
||||
const uint32_t *ptarget, uint32_t max_nonce, |
||||
unsigned long *hashes_done) |
||||
{ |
||||
const uint32_t first_nonce = pdata[19]; |
||||
|
||||
if (opt_benchmark) |
||||
((uint32_t*)ptarget)[7] = 0x00000f; |
||||
|
||||
const uint32_t Htarg = ptarget[7]; |
||||
|
||||
const int throughput = 256*2048; // 100; |
||||
|
||||
static bool init[8] = {0,0,0,0,0,0,0,0}; |
||||
if (!init[thr_id]) |
||||
{ |
||||
cudaSetDevice(device_map[thr_id]); |
||||
|
||||
// Konstanten kopieren, Speicher belegen |
||||
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); |
||||
quark_blake512_cpu_init(thr_id, throughput); |
||||
quark_groestl512_cpu_init(thr_id, throughput); |
||||
quark_skein512_cpu_init(thr_id, throughput); |
||||
quark_bmw512_cpu_init(thr_id, throughput); |
||||
quark_keccak512_cpu_init(thr_id, throughput); |
||||
quark_jh512_cpu_init(thr_id, throughput); |
||||
quark_check_cpu_init(thr_id, throughput); |
||||
quark_compactTest_cpu_init(thr_id, throughput); |
||||
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput); |
||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); |
||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); |
||||
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); |
||||
init[thr_id] = true; |
||||
} |
||||
|
||||
uint32_t endiandata[20]; |
||||
for (int k=0; k < 20; k++) |
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); |
||||
|
||||
quark_bmw512_cpu_setBlock_80((void*)endiandata); |
||||
quark_check_cpu_setTarget(ptarget); |
||||
|
||||
do { |
||||
int order = 0; |
||||
size_t nrm1=0, nrm2=0, nrm3=0; |
||||
|
||||
// erstes BMW512 Hash mit CUDA |
||||
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Blake512 |
||||
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); |
||||
|
||||
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, |
||||
d_branch3Nonces[thr_id], &nrm3, |
||||
order++); |
||||
|
||||
// nur den Skein Branch weiterverfolgen |
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Groestl512 |
||||
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für JH512 |
||||
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) |
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], |
||||
d_branch1Nonces[thr_id], &nrm1, |
||||
d_branch2Nonces[thr_id], &nrm2, |
||||
order++); |
||||
|
||||
// das ist der bedingte Branch für Blake512 |
||||
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der bedingte Branch für Bmw512 |
||||
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Keccak512 |
||||
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Skein512 |
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) |
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], |
||||
d_branch1Nonces[thr_id], &nrm1, |
||||
d_branch2Nonces[thr_id], &nrm2, |
||||
order++); |
||||
|
||||
// das ist der bedingte Branch für Keccak512 |
||||
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der bedingte Branch für JH512 |
||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU |
||||
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
if (foundNonce != 0xffffffff) |
||||
{ |
||||
uint32_t vhash64[8]; |
||||
be32enc(&endiandata[19], foundNonce); |
||||
animehash(vhash64, endiandata); |
||||
|
||||
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { |
||||
|
||||
pdata[19] = foundNonce; |
||||
*hashes_done = (foundNonce - first_nonce + 1)/2; |
||||
return 1; |
||||
} else { |
||||
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); |
||||
} |
||||
} |
||||
|
||||
pdata[19] += throughput; |
||||
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = (pdata[19] - first_nonce + 1)/2; |
||||
return 0; |
||||
} |
||||
extern "C" |
||||
{ |
||||
#include "sph/sph_blake.h" |
||||
#include "sph/sph_bmw.h" |
||||
#include "sph/sph_groestl.h" |
||||
#include "sph/sph_skein.h" |
||||
#include "sph/sph_jh.h" |
||||
#include "sph/sph_keccak.h" |
||||
#include "miner.h" |
||||
#include "cuda_helper.h" |
||||
} |
||||
|
||||
// aus cpu-miner.c |
||||
extern int device_map[8]; |
||||
|
||||
// Speicher für Input/Output der verketteten Hashfunktionen |
||||
static uint32_t *d_hash[8]; |
||||
|
||||
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes |
||||
static uint32_t *d_animeNonces[8]; |
||||
static uint32_t *d_branch1Nonces[8]; |
||||
static uint32_t *d_branch2Nonces[8]; |
||||
static uint32_t *d_branch3Nonces[8]; |
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, int threads); |
||||
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_bmw512_cpu_init(int thr_id, int threads); |
||||
extern void quark_bmw512_cpu_setBlock_80(void *pdata); |
||||
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order); |
||||
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); |
||||
|
||||
extern void quark_groestl512_cpu_init(int thr_id, int threads); |
||||
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_skein512_cpu_init(int thr_id, int threads); |
||||
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_keccak512_cpu_init(int thr_id, int threads); |
||||
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void quark_jh512_cpu_init(int thr_id, int threads); |
||||
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); |
||||
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads); |
||||
extern void cuda_check_cpu_setTarget(const void *ptarget); |
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); |
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads); |
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
uint32_t *d_nonces2, size_t *nrm2, |
||||
int order); |
||||
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
int order); |
||||
|
||||
// Original Quarkhash Funktion aus einem miner Quelltext |
||||
inline void animehash(void *state, const void *input) |
||||
{ |
||||
sph_blake512_context ctx_blake; |
||||
sph_bmw512_context ctx_bmw; |
||||
sph_groestl512_context ctx_groestl; |
||||
sph_jh512_context ctx_jh; |
||||
sph_keccak512_context ctx_keccak; |
||||
sph_skein512_context ctx_skein; |
||||
|
||||
unsigned char hash[64]; |
||||
|
||||
sph_bmw512_init(&ctx_bmw); |
||||
// ZBMW; |
||||
sph_bmw512 (&ctx_bmw, (const void*) input, 80); |
||||
sph_bmw512_close(&ctx_bmw, (void*) hash); |
||||
|
||||
sph_blake512_init(&ctx_blake); |
||||
// ZBLAKE; |
||||
sph_blake512 (&ctx_blake, hash, 64); |
||||
sph_blake512_close(&ctx_blake, (void*) hash); |
||||
|
||||
if (hash[0] & 0x8) |
||||
{ |
||||
sph_groestl512_init(&ctx_groestl); |
||||
// ZGROESTL; |
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64); |
||||
sph_groestl512_close(&ctx_groestl, (void*) hash); |
||||
} |
||||
else |
||||
{ |
||||
sph_skein512_init(&ctx_skein); |
||||
// ZSKEIN; |
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64); |
||||
sph_skein512_close(&ctx_skein, (void*) hash); |
||||
} |
||||
|
||||
sph_groestl512_init(&ctx_groestl); |
||||
// ZGROESTL; |
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64); |
||||
sph_groestl512_close(&ctx_groestl, (void*) hash); |
||||
|
||||
sph_jh512_init(&ctx_jh); |
||||
// ZJH; |
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64); |
||||
sph_jh512_close(&ctx_jh, (void*) hash); |
||||
|
||||
if (hash[0] & 0x8) |
||||
{ |
||||
sph_blake512_init(&ctx_blake); |
||||
// ZBLAKE; |
||||
sph_blake512 (&ctx_blake, (const void*) hash, 64); |
||||
sph_blake512_close(&ctx_blake, (void*) hash); |
||||
} |
||||
else |
||||
{ |
||||
sph_bmw512_init(&ctx_bmw); |
||||
// ZBMW; |
||||
sph_bmw512 (&ctx_bmw, (const void*) hash, 64); |
||||
sph_bmw512_close(&ctx_bmw, (void*) hash); |
||||
} |
||||
|
||||
sph_keccak512_init(&ctx_keccak); |
||||
// ZKECCAK; |
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64); |
||||
sph_keccak512_close(&ctx_keccak, (void*) hash); |
||||
|
||||
sph_skein512_init(&ctx_skein); |
||||
// SKEIN; |
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64); |
||||
sph_skein512_close(&ctx_skein, (void*) hash); |
||||
|
||||
if (hash[0] & 0x8) |
||||
{ |
||||
sph_keccak512_init(&ctx_keccak); |
||||
// ZKECCAK; |
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64); |
||||
sph_keccak512_close(&ctx_keccak, (void*) hash); |
||||
} |
||||
else |
||||
{ |
||||
sph_jh512_init(&ctx_jh); |
||||
// ZJH; |
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64); |
||||
sph_jh512_close(&ctx_jh, (void*) hash); |
||||
} |
||||
|
||||
memcpy(state, hash, 32); |
||||
} |
||||
|
||||
|
||||
struct HashPredicate |
||||
{ |
||||
HashPredicate(uint32_t *hashes, uint32_t startNonce) : |
||||
m_hashes(hashes), |
||||
m_startNonce(startNonce) |
||||
{ } |
||||
|
||||
__device__ |
||||
bool operator()(const uint32_t x) |
||||
{ |
||||
uint32_t *hash = &m_hashes[(x - m_startNonce)*16]; |
||||
return hash[0] & 0x8; |
||||
} |
||||
|
||||
uint32_t *m_hashes; |
||||
uint32_t m_startNonce; |
||||
}; |
||||
|
||||
extern bool opt_benchmark; |
||||
|
||||
extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, |
||||
const uint32_t *ptarget, uint32_t max_nonce, |
||||
unsigned long *hashes_done) |
||||
{ |
||||
const uint32_t first_nonce = pdata[19]; |
||||
|
||||
if (opt_benchmark) |
||||
((uint32_t*)ptarget)[7] = 0x00000f; |
||||
|
||||
const uint32_t Htarg = ptarget[7]; |
||||
|
||||
const int throughput = 256*2048; // 100; |
||||
|
||||
static bool init[8] = {0,0,0,0,0,0,0,0}; |
||||
if (!init[thr_id]) |
||||
{ |
||||
cudaSetDevice(device_map[thr_id]); |
||||
|
||||
// Konstanten kopieren, Speicher belegen |
||||
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); |
||||
|
||||
quark_blake512_cpu_init(thr_id, throughput); |
||||
quark_groestl512_cpu_init(thr_id, throughput); |
||||
quark_skein512_cpu_init(thr_id, throughput); |
||||
quark_bmw512_cpu_init(thr_id, throughput); |
||||
quark_keccak512_cpu_init(thr_id, throughput); |
||||
quark_jh512_cpu_init(thr_id, throughput); |
||||
cuda_check_cpu_init(thr_id, throughput); |
||||
quark_compactTest_cpu_init(thr_id, throughput); |
||||
|
||||
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput); |
||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); |
||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); |
||||
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); |
||||
|
||||
init[thr_id] = true; |
||||
} |
||||
|
||||
uint32_t endiandata[20]; |
||||
for (int k=0; k < 20; k++) |
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); |
||||
|
||||
quark_bmw512_cpu_setBlock_80((void*)endiandata); |
||||
cuda_check_cpu_setTarget(ptarget); |
||||
|
||||
do { |
||||
int order = 0; |
||||
size_t nrm1=0, nrm2=0, nrm3=0; |
||||
|
||||
// erstes BMW512 Hash mit CUDA |
||||
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Blake512 |
||||
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); |
||||
|
||||
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, |
||||
d_branch3Nonces[thr_id], &nrm3, |
||||
order++); |
||||
|
||||
// nur den Skein Branch weiterverfolgen |
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Groestl512 |
||||
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für JH512 |
||||
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) |
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], |
||||
d_branch1Nonces[thr_id], &nrm1, |
||||
d_branch2Nonces[thr_id], &nrm2, |
||||
order++); |
||||
|
||||
// das ist der bedingte Branch für Blake512 |
||||
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der bedingte Branch für Bmw512 |
||||
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Keccak512 |
||||
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der unbedingte Branch für Skein512 |
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) |
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], |
||||
d_branch1Nonces[thr_id], &nrm1, |
||||
d_branch2Nonces[thr_id], &nrm2, |
||||
order++); |
||||
|
||||
// das ist der bedingte Branch für Keccak512 |
||||
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// das ist der bedingte Branch für JH512 |
||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); |
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU |
||||
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); |
||||
if (foundNonce != 0xffffffff) |
||||
{ |
||||
uint32_t vhash64[8]; |
||||
be32enc(&endiandata[19], foundNonce); |
||||
animehash(vhash64, endiandata); |
||||
|
||||
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { |
||||
|
||||
pdata[19] = foundNonce; |
||||
*hashes_done = (foundNonce - first_nonce + 1)/2; |
||||
return 1; |
||||
} else { |
||||
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); |
||||
} |
||||
} |
||||
|
||||
pdata[19] += throughput; |
||||
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = (pdata[19] - first_nonce + 1)/2; |
||||
return 0; |
||||
} |
||||
|
@ -1,371 +1,368 @@
@@ -1,371 +1,368 @@
|
||||
#include <cuda.h> |
||||
#include "cuda_runtime.h" |
||||
#include "device_launch_parameters.h" |
||||
#include "sm_30_intrinsics.h" |
||||
|
||||
#include <stdio.h> |
||||
#include <memory.h> |
||||
#include <stdint.h> |
||||
|
||||
// aus cpu-miner.c |
||||
extern int device_map[8]; |
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert |
||||
static cudaDeviceProp props[8]; |
||||
|
||||
static uint32_t *d_tempBranch1Nonces[8]; |
||||
static uint32_t *d_numValid[8]; |
||||
static uint32_t *h_numValid[8]; |
||||
|
||||
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen |
||||
|
||||
// aus heavy.cu |
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
||||
|
||||
// True/False tester |
||||
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); |
||||
|
||||
__device__ uint32_t QuarkTrueTest(uint32_t *inpHash) |
||||
{ |
||||
return ((inpHash[0] & 0x08) == 0x08); |
||||
} |
||||
|
||||
__device__ uint32_t QuarkFalseTest(uint32_t *inpHash) |
||||
{ |
||||
return ((inpHash[0] & 0x08) == 0); |
||||
} |
||||
|
||||
__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest; |
||||
cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8]; |
||||
|
||||
// Setup-Funktionen |
||||
__host__ void quark_compactTest_cpu_init(int thr_id, int threads) |
||||
{ |
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); |
||||
|
||||
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); |
||||
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); |
||||
|
||||
// wir brauchen auch Speicherplatz auf dem Device |
||||
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); |
||||
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t)); |
||||
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); |
||||
|
||||
uint32_t s1; |
||||
s1 = (threads / 256) * 2; |
||||
|
||||
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
||||
} |
||||
|
||||
#if __CUDA_ARCH__ < 300 |
||||
/** |
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 |
||||
*/ |
||||
#undef __shfl_up |
||||
#define __shfl_up(var, delta, width) (0) |
||||
#endif |
||||
|
||||
// Die Summenfunktion (vom NVIDIA SDK) |
||||
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
||||
{ |
||||
extern __shared__ uint32_t sums[]; |
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
//int lane_id = id % warpSize; |
||||
int lane_id = id % width; |
||||
// determine a warp_id within a block |
||||
//int warp_id = threadIdx.x / warpSize; |
||||
int warp_id = threadIdx.x / width; |
||||
|
||||
sums[lane_id] = 0; |
||||
|
||||
// Below is the basic structure of using a shfl instruction |
||||
// for a scan. |
||||
// Record "value" as a variable - we accumulate it along the way |
||||
uint32_t value; |
||||
if(testFunc != NULL) |
||||
{ |
||||
if (id < threads) |
||||
{ |
||||
uint32_t *inpHash; |
||||
if(d_validNonceTable == NULL) |
||||
{ |
||||
// keine Nonce-Liste |
||||
inpHash = &inpHashes[id<<4]; |
||||
}else |
||||
{ |
||||
// Nonce-Liste verfügbar |
||||
int nonce = d_validNonceTable[id] - startNounce; |
||||
inpHash = &inpHashes[nonce<<4]; |
||||
} |
||||
value = (*testFunc)(inpHash); |
||||
}else |
||||
{ |
||||
value = 0; |
||||
} |
||||
}else |
||||
{ |
||||
value = data[id]; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
// Now accumulate in log steps up the chain |
||||
// compute sums, with another thread's value who is |
||||
// distance delta away (i). Note |
||||
// those threads where the thread 'i' away would have |
||||
// been out of bounds of the warp are unaffected. This |
||||
// creates the scan sum. |
||||
#pragma unroll |
||||
|
||||
for (int i=1; i<=width; i*=2) |
||||
{ |
||||
uint32_t n = __shfl_up((int)value, i, width); |
||||
|
||||
if (lane_id >= i) value += n; |
||||
} |
||||
|
||||
// value now holds the scan value for the individual thread |
||||
// next sum the largest values for each warp |
||||
|
||||
// write the sum of the warp to smem |
||||
//if (threadIdx.x % warpSize == warpSize-1) |
||||
if (threadIdx.x % width == width-1) |
||||
{ |
||||
sums[warp_id] = value; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
// |
||||
// scan sum the warp sums |
||||
// the same shfl scan operation, but performed on warp sums |
||||
// |
||||
if (warp_id == 0) |
||||
{ |
||||
uint32_t warp_sum = sums[lane_id]; |
||||
|
||||
for (int i=1; i<=width; i*=2) |
||||
{ |
||||
uint32_t n = __shfl_up((int)warp_sum, i, width); |
||||
|
||||
if (lane_id >= i) warp_sum += n; |
||||
} |
||||
|
||||
sums[lane_id] = warp_sum; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
// perform a uniform add across warps in the block |
||||
// read neighbouring warp's sum and add it to threads value |
||||
uint32_t blockSum = 0; |
||||
|
||||
if (warp_id > 0) |
||||
{ |
||||
blockSum = sums[warp_id-1]; |
||||
} |
||||
|
||||
value += blockSum; |
||||
|
||||
// Now write out our result |
||||
data[id] = value; |
||||
|
||||
// last thread has sum, write write out the block's sum |
||||
if (partial_sums != NULL && threadIdx.x == blockDim.x-1) |
||||
{ |
||||
partial_sums[blockIdx.x] = value; |
||||
} |
||||
} |
||||
|
||||
// Uniform add: add partial sums array |
||||
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) |
||||
{ |
||||
__shared__ uint32_t buf; |
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
|
||||
if (id > len) return; |
||||
|
||||
if (threadIdx.x == 0) |
||||
{ |
||||
buf = partial_sums[blockIdx.x]; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
data[id] += buf; |
||||
} |
||||
|
||||
// Der Scatter |
||||
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
||||
{ |
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
uint32_t actNounce = id; |
||||
uint32_t value; |
||||
if (id < threads) |
||||
{ |
||||
// uint32_t nounce = startNounce + id; |
||||
uint32_t *inpHash; |
||||
if(d_validNonceTable == NULL) |
||||
{ |
||||
// keine Nonce-Liste |
||||
inpHash = &inpHashes[id<<4]; |
||||
}else |
||||
{ |
||||
// Nonce-Liste verfügbar |
||||
int nonce = d_validNonceTable[id] - startNounce; |
||||
actNounce = nonce; |
||||
inpHash = &inpHashes[nonce<<4]; |
||||
} |
||||
|
||||
value = (*testFunc)(inpHash); |
||||
}else |
||||
{ |
||||
value = 0; |
||||
} |
||||
|
||||
if( value ) |
||||
{ |
||||
int idx = sum[id]; |
||||
if(idx > 0) |
||||
outp[idx-1] = startNounce + actNounce; |
||||
} |
||||
} |
||||
|
||||
__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val) |
||||
{ |
||||
if(val == 0) |
||||
return 0; |
||||
|
||||
uint32_t mask = 0x80000000; |
||||
while( (val & mask) == 0 ) mask = mask >> 1; |
||||
|
||||
if( (val & (~mask)) != 0 ) |
||||
return mask << 1; |
||||
|
||||
return mask; |
||||
} |
||||
|
||||
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm, |
||||
uint32_t *d_nonces1, cuda_compactTestFunction_t function, |
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
||||
{ |
||||
int orgThreads = threads; |
||||
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); |
||||
// threadsPerBlock ausrechnen |
||||
int blockSize = 256; |
||||
int nSummen = threads / blockSize; |
||||
|
||||
int thr1 = (threads+blockSize-1) / blockSize; |
||||
int thr2 = threads / (blockSize*blockSize); |
||||
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize; |
||||
int thr3 = (nSummen + blockSize2-1) / blockSize2; |
||||
|
||||
bool callThrid = (thr2 > 0) ? true : false; |
||||
|
||||
// Erster Initialscan |
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>( |
||||
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
// weitere Scans |
||||
if(callThrid) |
||||
{ |
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); |
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); |
||||
}else |
||||
{ |
||||
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); |
||||
} |
||||
|
||||
// Sync + Anzahl merken |
||||
cudaStreamSynchronize(NULL); |
||||
|
||||
if(callThrid) |
||||
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
else |
||||
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
|
||||
|
||||
// Addieren |
||||
if(callThrid) |
||||
{ |
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); |
||||
} |
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); |
||||
|
||||
// Scatter |
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, |
||||
function, orgThreads, startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
// Sync |
||||
cudaStreamSynchronize(NULL); |
||||
} |
||||
|
||||
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) |
||||
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm, |
||||
uint32_t *d_nonces1, uint32_t *d_nonces2, |
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
||||
{ |
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
/* |
||||
// threadsPerBlock ausrechnen |
||||
int blockSize = 256; |
||||
int thr1 = threads / blockSize; |
||||
int thr2 = threads / (blockSize*blockSize); |
||||
|
||||
// 1 |
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); |
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
||||
cudaStreamSynchronize(NULL); |
||||
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); |
||||
|
||||
// 2 |
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); |
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
||||
cudaStreamSynchronize(NULL); |
||||
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); |
||||
|
||||
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten |
||||
// Schritt 3: Scatter |
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); |
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); |
||||
cudaStreamSynchronize(NULL); |
||||
*/ |
||||
} |
||||
|
||||
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
uint32_t *d_nonces2, size_t *nrm2, |
||||
int order) |
||||
{ |
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
||||
|
||||
quark_compactTest_cpu_dualCompaction(thr_id, threads, |
||||
h_numValid[thr_id], d_nonces1, d_nonces2, |
||||
startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser |
||||
*nrm1 = (size_t)h_numValid[thr_id][0]; |
||||
*nrm2 = (size_t)h_numValid[thr_id][1]; |
||||
} |
||||
|
||||
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
int order) |
||||
{ |
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
||||
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser |
||||
*nrm1 = (size_t)h_numValid[thr_id][0]; |
||||
} |
||||
#include <stdio.h> |
||||
#include <memory.h> |
||||
|
||||
#include "cuda_helper.h" |
||||
#include <sm_30_intrinsics.h> |
||||
|
||||
// aus cpu-miner.c |
||||
extern int device_map[8]; |
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert |
||||
static cudaDeviceProp props[8]; |
||||
|
||||
static uint32_t *d_tempBranch1Nonces[8]; |
||||
static uint32_t *d_numValid[8]; |
||||
static uint32_t *h_numValid[8]; |
||||
|
||||
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen |
||||
|
||||
// aus heavy.cu |
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
||||
|
||||
// True/False tester |
||||
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); |
||||
|
||||
__device__ uint32_t QuarkTrueTest(uint32_t *inpHash) |
||||
{ |
||||
return ((inpHash[0] & 0x08) == 0x08); |
||||
} |
||||
|
||||
__device__ uint32_t QuarkFalseTest(uint32_t *inpHash) |
||||
{ |
||||
return ((inpHash[0] & 0x08) == 0); |
||||
} |
||||
|
||||
__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest; |
||||
cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8]; |
||||
|
||||
// Setup-Funktionen |
||||
__host__ void quark_compactTest_cpu_init(int thr_id, int threads) |
||||
{ |
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); |
||||
|
||||
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); |
||||
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); |
||||
|
||||
// wir brauchen auch Speicherplatz auf dem Device |
||||
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); |
||||
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t)); |
||||
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); |
||||
|
||||
uint32_t s1; |
||||
s1 = (threads / 256) * 2; |
||||
|
||||
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
||||
} |
||||
|
||||
#if __CUDA_ARCH__ < 300 |
||||
/** |
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 |
||||
*/ |
||||
#undef __shfl_up |
||||
#define __shfl_up(var, delta, width) (0) |
||||
#endif |
||||
|
||||
// Die Summenfunktion (vom NVIDIA SDK) |
||||
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
||||
{ |
||||
extern __shared__ uint32_t sums[]; |
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
//int lane_id = id % warpSize; |
||||
int lane_id = id % width; |
||||
// determine a warp_id within a block |
||||
//int warp_id = threadIdx.x / warpSize; |
||||
int warp_id = threadIdx.x / width; |
||||
|
||||
sums[lane_id] = 0; |
||||
|
||||
// Below is the basic structure of using a shfl instruction |
||||
// for a scan. |
||||
// Record "value" as a variable - we accumulate it along the way |
||||
uint32_t value; |
||||
if(testFunc != NULL) |
||||
{ |
||||
if (id < threads) |
||||
{ |
||||
uint32_t *inpHash; |
||||
if(d_validNonceTable == NULL) |
||||
{ |
||||
// keine Nonce-Liste |
||||
inpHash = &inpHashes[id<<4]; |
||||
}else |
||||
{ |
||||
// Nonce-Liste verfügbar |
||||
int nonce = d_validNonceTable[id] - startNounce; |
||||
inpHash = &inpHashes[nonce<<4]; |
||||
} |
||||
value = (*testFunc)(inpHash); |
||||
}else |
||||
{ |
||||
value = 0; |
||||
} |
||||
}else |
||||
{ |
||||
value = data[id]; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
// Now accumulate in log steps up the chain |
||||
// compute sums, with another thread's value who is |
||||
// distance delta away (i). Note |
||||
// those threads where the thread 'i' away would have |
||||
// been out of bounds of the warp are unaffected. This |
||||
// creates the scan sum. |
||||
#pragma unroll |
||||
|
||||
for (int i=1; i<=width; i*=2) |
||||
{ |
||||
uint32_t n = __shfl_up((int)value, i, width); |
||||
|
||||
if (lane_id >= i) value += n; |
||||
} |
||||
|
||||
// value now holds the scan value for the individual thread |
||||
// next sum the largest values for each warp |
||||
|
||||
// write the sum of the warp to smem |
||||
//if (threadIdx.x % warpSize == warpSize-1) |
||||
if (threadIdx.x % width == width-1) |
||||
{ |
||||
sums[warp_id] = value; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
// |
||||
// scan sum the warp sums |
||||
// the same shfl scan operation, but performed on warp sums |
||||
// |
||||
if (warp_id == 0) |
||||
{ |
||||
uint32_t warp_sum = sums[lane_id]; |
||||
|
||||
for (int i=1; i<=width; i*=2) |
||||
{ |
||||
uint32_t n = __shfl_up((int)warp_sum, i, width); |
||||
|
||||
if (lane_id >= i) warp_sum += n; |
||||
} |
||||
|
||||
sums[lane_id] = warp_sum; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
|
||||
// perform a uniform add across warps in the block |
||||
// read neighbouring warp's sum and add it to threads value |
||||
uint32_t blockSum = 0; |
||||
|
||||
if (warp_id > 0) |
||||
{ |
||||
blockSum = sums[warp_id-1]; |
||||
} |
||||
|
||||
value += blockSum; |
||||
|
||||
// Now write out our result |
||||
data[id] = value; |
||||
|
||||
// last thread has sum, write write out the block's sum |
||||
if (partial_sums != NULL && threadIdx.x == blockDim.x-1) |
||||
{ |
||||
partial_sums[blockIdx.x] = value; |
||||
} |
||||
} |
||||
|
||||
// Uniform add: add partial sums array |
||||
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) |
||||
{ |
||||
__shared__ uint32_t buf; |
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
|
||||
if (id > len) return; |
||||
|
||||
if (threadIdx.x == 0) |
||||
{ |
||||
buf = partial_sums[blockIdx.x]; |
||||
} |
||||
|
||||
__syncthreads(); |
||||
data[id] += buf; |
||||
} |
||||
|
||||
// Der Scatter |
||||
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
||||
{ |
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
||||
uint32_t actNounce = id; |
||||
uint32_t value; |
||||
if (id < threads) |
||||
{ |
||||
// uint32_t nounce = startNounce + id; |
||||
uint32_t *inpHash; |
||||
if(d_validNonceTable == NULL) |
||||
{ |
||||
// keine Nonce-Liste |
||||
inpHash = &inpHashes[id<<4]; |
||||
}else |
||||
{ |
||||
// Nonce-Liste verfügbar |
||||
int nonce = d_validNonceTable[id] - startNounce; |
||||
actNounce = nonce; |
||||
inpHash = &inpHashes[nonce<<4]; |
||||
} |
||||
|
||||
value = (*testFunc)(inpHash); |
||||
}else |
||||
{ |
||||
value = 0; |
||||
} |
||||
|
||||
if( value ) |
||||
{ |
||||
int idx = sum[id]; |
||||
if(idx > 0) |
||||
outp[idx-1] = startNounce + actNounce; |
||||
} |
||||
} |
||||
|
||||
__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val) |
||||
{ |
||||
if(val == 0) |
||||
return 0; |
||||
|
||||
uint32_t mask = 0x80000000; |
||||
while( (val & mask) == 0 ) mask = mask >> 1; |
||||
|
||||
if( (val & (~mask)) != 0 ) |
||||
return mask << 1; |
||||
|
||||
return mask; |
||||
} |
||||
|
||||
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm, |
||||
uint32_t *d_nonces1, cuda_compactTestFunction_t function, |
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
||||
{ |
||||
int orgThreads = threads; |
||||
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); |
||||
// threadsPerBlock ausrechnen |
||||
int blockSize = 256; |
||||
int nSummen = threads / blockSize; |
||||
|
||||
int thr1 = (threads+blockSize-1) / blockSize; |
||||
int thr2 = threads / (blockSize*blockSize); |
||||
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize; |
||||
int thr3 = (nSummen + blockSize2-1) / blockSize2; |
||||
|
||||
bool callThrid = (thr2 > 0) ? true : false; |
||||
|
||||
// Erster Initialscan |
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>( |
||||
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
// weitere Scans |
||||
if(callThrid) |
||||
{ |
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); |
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); |
||||
}else |
||||
{ |
||||
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); |
||||
} |
||||
|
||||
// Sync + Anzahl merken |
||||
cudaStreamSynchronize(NULL); |
||||
|
||||
if(callThrid) |
||||
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
else |
||||
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
|
||||
|
||||
// Addieren |
||||
if(callThrid) |
||||
{ |
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); |
||||
} |
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); |
||||
|
||||
// Scatter |
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, |
||||
function, orgThreads, startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
// Sync |
||||
cudaStreamSynchronize(NULL); |
||||
} |
||||
|
||||
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) |
||||
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm, |
||||
uint32_t *d_nonces1, uint32_t *d_nonces2, |
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
||||
{ |
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
/* |
||||
// threadsPerBlock ausrechnen |
||||
int blockSize = 256; |
||||
int thr1 = threads / blockSize; |
||||
int thr2 = threads / (blockSize*blockSize); |
||||
|
||||
// 1 |
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); |
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
||||
cudaStreamSynchronize(NULL); |
||||
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); |
||||
|
||||
// 2 |
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); |
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
||||
cudaStreamSynchronize(NULL); |
||||
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); |
||||
|
||||
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten |
||||
// Schritt 3: Scatter |
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); |
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); |
||||
cudaStreamSynchronize(NULL); |
||||
*/ |
||||
} |
||||
|
||||
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
uint32_t *d_nonces2, size_t *nrm2, |
||||
int order) |
||||
{ |
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
||||
|
||||
quark_compactTest_cpu_dualCompaction(thr_id, threads, |
||||
h_numValid[thr_id], d_nonces1, d_nonces2, |
||||
startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser |
||||
*nrm1 = (size_t)h_numValid[thr_id][0]; |
||||
*nrm2 = (size_t)h_numValid[thr_id][1]; |
||||
} |
||||
|
||||
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
||||
uint32_t *d_nonces1, size_t *nrm1, |
||||
int order) |
||||
{ |
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
||||
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser |
||||
*nrm1 = (size_t)h_numValid[thr_id][0]; |
||||
} |
||||
|
Loading…
Reference in new issue