#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>
#include <cuda.h>

// include thrust if possible
#if defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ >= 2 && CUDA_VERSION < 7000
#warning "Heavy: incompatible GCC version!"
#define USE_THRUST 0
#else
#define USE_THRUST 1
#endif

#if USE_THRUST
#include <thrust/remove.h>
#include <thrust/device_ptr.h>
#endif

#include "miner.h"

extern "C" {
#include "sph/sph_keccak.h"
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
}

#include "hefty1.h"
#include "heavy/heavy.h"
#include "cuda_helper.h"

extern uint32_t *d_hash2output[MAX_GPUS];
extern uint32_t *d_hash3output[MAX_GPUS];
extern uint32_t *d_hash4output[MAX_GPUS];
extern uint32_t *d_hash5output[MAX_GPUS];

#define HEAVYCOIN_BLKHDR_SZ 84
#define MNR_BLKHDR_SZ 80

// nonce arrays for the threads
uint32_t *heavy_nonceVector[MAX_GPUS];

extern uint32_t *heavy_heftyHashes[MAX_GPUS];

/* Combines top 64-bits from each hash into a single hash */
static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
{
	const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 };
	int bits;
	unsigned int i;
	uint32_t mask;
	unsigned int k;

	/* Transpose first 64 bits of each hash into out */
	memset(out, 0, 32);
	bits = 0;
	for (i = 7; i >= 6; i--) {
		for (mask = 0x80000000; mask; mask >>= 1) {
			for (k = 0; k < 4; k++) {
				out[(255 - bits)/32] <<= 1;
				if ((hash[k][i] & mask) != 0)
					out[(255 - bits)/32] |= 1;
				bits++;
			}
		}
	}
}

#ifdef _MSC_VER
#include <intrin.h>
static uint32_t __inline bitsset( uint32_t x )
{
	DWORD r = 0;
	_BitScanReverse(&r, x);
	return r;
}
#else
static uint32_t bitsset( uint32_t x )
{
	return 31-__builtin_clz(x);
}
#endif

// Find the highest set bit in a multi-word integer.
static int findhighbit(const uint32_t *ptarget, int words)
{
	int i;
	int highbit = 0;
	for (i=words-1; i >= 0; --i)
	{
		if (ptarget[i] != 0) {
			highbit = i*32 + bitsset(ptarget[i])+1;
			break;
		}
	}
	return highbit;
}

// Generate a multi-word integer that represents the value
// (2 << highbit) - 1.
static void genmask(uint32_t *ptarget, int words, int highbit)
{
	int i;
	for (i=words-1; i >= 0; --i)
	{
		if ((i+1)*32 <= highbit)
			ptarget[i] = UINT32_MAX;
		else if (i*32 > highbit)
			ptarget[i] = 0x00000000;
		else
			ptarget[i] = (1 << (highbit-i*32)) - 1;
	}
}

struct check_nonce_for_remove
{
	check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) :
		m_target(target),
		m_hashes(hashes),
		m_hashlen(hashlen),
		m_startNonce(startNonce) { }

	uint64_t m_target;
	uint32_t *m_hashes;
	uint32_t m_hashlen;
	uint32_t m_startNonce;

	__device__ bool operator()(const uint32_t x)
	{
		// position in the hash buffer
		uint32_t hashIndex = x - m_startNonce;
		// read the hash value as a uint64_t; it sits in the 6th and 7th
		// word of the hash (each of these hashes is 512 bits long)
		uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
		bool res = (hashValue & m_target) != hashValue;
		//printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target);
		// check against the target: only bits that are set in the target may be set
		return res;
	}
};

static bool init[MAX_GPUS] = { 0 };

__host__
int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done,
	uint32_t maxvote, int blocklen)
{
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	// CUDA will process thousands of threads.
	uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256);
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	int rc = 0;
	uint32_t *hash = NULL;
	uint32_t *cpu_nonceVector = NULL;

	int nrmCalls[6];
	memset(nrmCalls, 0, sizeof(int) * 6);

	if (opt_benchmark)
		ptarget[7] = 0x000f;

	// build an individual target for each hash, based on the
	// highest bit that is set in ptarget.
	int highbit = findhighbit(ptarget, 8);
	uint32_t target2[2], target3[2], target4[2], target5[2];
	genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
	genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512
	genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
	genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512

	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads",
			throughput2intensity(throughput), throughput);

		hefty_cpu_init(thr_id, throughput);
		sha256_cpu_init(thr_id, throughput);
		keccak512_cpu_init(thr_id, throughput);
		groestl512_cpu_init(thr_id, throughput);
		blake512_cpu_init(thr_id, throughput);
		combine_cpu_init(thr_id, throughput);

		CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput));

		init[thr_id] = true;
	}

	// weird, but requires at least one cudaSetDevice first
	CUDA_SAFE_CALL(cudaMallocHost(&hash, (size_t) 32 * throughput));
	CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, sizeof(uint32_t) * throughput));

	if (blocklen == HEAVYCOIN_BLKHDR_SZ)
	{
		uint16_t *ext = (uint16_t*) &pdata[20];

		if (opt_vote > maxvote && !opt_benchmark) {
			applog(LOG_WARNING, "Your block reward vote (%hu) exceeds the maxvote reported by the pool (%hu).",
				opt_vote, maxvote);
		}

		if (opt_trust_pool && opt_vote > maxvote) {
			applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool.");
			ext[0] = maxvote;
		} else
			ext[0] = opt_vote;
	}

	// set the block data
	hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen);
	sha256_cpu_setBlock(pdata, blocklen);
	keccak512_cpu_setBlock(pdata, blocklen);
	groestl512_cpu_setBlock(pdata, blocklen);
	blake512_cpu_setBlock(pdata, blocklen);

	do {
		uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;

		////// Compaction init
		hefty_cpu_hash(thr_id, throughput, pdata[19]);
		sha256_cpu_hash(thr_id, throughput, pdata[19]);

		// This is the longest CPU wait phase, hence a strategic MyStreamSynchronize() here.
		MyStreamSynchronize(NULL, 1, thr_id);

#if USE_THRUST
		thrust::device_ptr<uint32_t> devNoncePtr(heavy_nonceVector[thr_id]);
		thrust::device_ptr<uint32_t> devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput);

		////// Compaction
		uint64_t *t = (uint64_t*) target2;
		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd,
			check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19]));
		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
		if(actualNumberOfValuesInNonceVectorGPU == 0)
			goto emptyNonceVector;

		keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);

		////// Compaction
		t = (uint64_t*) target3;
		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd,
			check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19]));
		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
		if(actualNumberOfValuesInNonceVectorGPU == 0)
			goto emptyNonceVector;

		blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);

		////// Compaction
		t = (uint64_t*) target5;
		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd,
			check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19]));
		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
		if(actualNumberOfValuesInNonceVectorGPU == 0)
			goto emptyNonceVector;

		groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);

		////// Compaction
		t = (uint64_t*) target4;
		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd,
			check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19]));
		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
#else
		// todo (nvlabs cub ?)
		actualNumberOfValuesInNonceVectorGPU = 0;
#endif
		if(actualNumberOfValuesInNonceVectorGPU == 0)
			goto emptyNonceVector;

		// combine
		combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);

		if (opt_tracegpu) {
			applog(LOG_BLUE, "heavy GPU hash:");
			applog_hash((uchar*)hash);
		}

		// copy the results back
		if(actualNumberOfValuesInNonceVectorGPU > 0)
		{
			size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
			cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost);

			for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++)
			{
				uint32_t nonce = cpu_nonceVector[i];
				uint32_t *foundhash = &hash[8*i];
				if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) {
					uint32_t vhash[8];
					pdata[19] += nonce - pdata[19];
					heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen);
					if (memcmp(vhash, foundhash, 32)) {
						gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce);
					} else {
						work_set_target_ratio(work, vhash);
						rc = 1;
						goto exit;
					}
				}
			}
		}

emptyNonceVector:
		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}
		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

exit:
	*hashes_done = pdata[19] - first_nonce;

	cudaFreeHost(cpu_nonceVector);
	cudaFreeHost(hash);
	CUDA_LOG_ERROR();
	return rc;
}

// cleanup
extern "C" void free_heavy(int thr_id)
{
	if (!init[thr_id])
		return;

	cudaThreadSynchronize();

	cudaFree(heavy_nonceVector[thr_id]);

	blake512_cpu_free(thr_id);
	groestl512_cpu_free(thr_id);
	hefty_cpu_free(thr_id);
	keccak512_cpu_free(thr_id);
	sha256_cpu_free(thr_id);
	combine_cpu_free(thr_id);

	init[thr_id] = false;

	cudaDeviceSynchronize();
}

__host__
void heavycoin_hash(uchar* output, const uchar* input, int len)
{
	unsigned char hash1[32];
	unsigned char hash2[32];
	uint32_t hash3[16];
	uint32_t hash4[16];
	uint32_t hash5[16];
	uint32_t *final;
	SHA256_CTX ctx;
	sph_keccak512_context keccakCtx;
	sph_groestl512_context groestlCtx;
	sph_blake512_context blakeCtx;

	HEFTY1(input, len, hash1);

	/* HEFTY1 is new, so take an extra security measure to eliminate
	 * the possibility of collisions:
	 *
	 *     Hash(x) = SHA256(x + HEFTY1(x))
	 *
	 * N.B. '+' is concatenation.
	 */
	SHA256_Init(&ctx);
	SHA256_Update(&ctx, input, len);
	SHA256_Update(&ctx, hash1, sizeof(hash1));
	SHA256_Final(hash2, &ctx);

	/* Additional security: Do not rely on a single cryptographic hash
	 * function. Instead, combine the outputs of 4 of the most secure
	 * cryptographic hash functions -- SHA256, KECCAK512, GROESTL512
	 * and BLAKE512.
	 */
	sph_keccak512_init(&keccakCtx);
	sph_keccak512(&keccakCtx, input, len);
	sph_keccak512(&keccakCtx, hash1, sizeof(hash1));
	sph_keccak512_close(&keccakCtx, (void *)&hash3);

	sph_groestl512_init(&groestlCtx);
	sph_groestl512(&groestlCtx, input, len);
	sph_groestl512(&groestlCtx, hash1, sizeof(hash1));
	sph_groestl512_close(&groestlCtx, (void *)&hash4);

	sph_blake512_init(&blakeCtx);
	sph_blake512(&blakeCtx, input, len);
	sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1));
	sph_blake512_close(&blakeCtx, (void *)&hash5);

	final = (uint32_t *)output;
	combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5);
}
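
#if 0
/* Hedged usage sketch (disabled, not part of the miner): a minimal CPU-side
 * illustration of how heavycoin_hash() above can be exercised on an 84-byte
 * Heavycoin block header. The helper name heavycoin_hash_example and the
 * all-zero header are hypothetical placeholders, not real block data. */
static void heavycoin_hash_example(void)
{
	uchar header[HEAVYCOIN_BLKHDR_SZ] = { 0 }; // hypothetical placeholder header
	uchar digest[32];                          // combined 256-bit output

	heavycoin_hash(digest, header, HEAVYCOIN_BLKHDR_SZ);

	// print the combined digest as hex
	for (int i = 0; i < 32; i++)
		printf("%02x", digest[i]);
	printf("\n");
}
#endif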