From 1b65cd05cc7e9bf5fa9a6ac10af3ea4c1d6285ba Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Thu, 27 Nov 2014 01:57:12 +0100
Subject: [PATCH] heavy: add error checks, fix strict aliasing and linux build

The core problem was the CUDA hefty threads-per-block count being set
too high; it took me several hours to find that.

Btw, +25% on heavy 12500 with 256 threads per block vs. 128 and 512,
when the max register count is capped at 80.

---
 Makefile.am              |   3 +
 blake32.cu               |  10 ++-
 ccminer.vcxproj          |  10 +--
 ccminer.vcxproj.filters  |  24 +------
 heavy/cuda_blake512.cu   |  46 ++++++------
 heavy/cuda_blake512.h    |   7 --
 heavy/cuda_combine.cu    |  34 ++++-----
 heavy/cuda_combine.h     |   7 --
 heavy/cuda_groestl512.cu |  96 ++++++++++++------------
 heavy/cuda_groestl512.h  |   9 ---
 heavy/cuda_hefty1.cu     | 139 ++++++++++++++++++-----------------
 heavy/cuda_hefty1.h      |   8 ---
 heavy/cuda_keccak512.cu  |  57 +++++++--------
 heavy/cuda_keccak512.h   |   9 ---
 heavy/cuda_sha256.cu     |  51 ++++++-------
 heavy/cuda_sha256.h      |   8 ---
 heavy/heavy.cu           | 152 ++++++++++++++++++---------------------
 heavy/heavy.h            |  30 ++++++++
 stats.cpp                |   2 +-
 util.cpp                 |  18 +++--
 20 files changed, 339 insertions(+), 381 deletions(-)
 delete mode 100644 heavy/cuda_blake512.h
 delete mode 100644 heavy/cuda_combine.h
 delete mode 100644 heavy/cuda_groestl512.h
 delete mode 100644 heavy/cuda_hefty1.h
 delete mode 100644 heavy/cuda_keccak512.h
 delete mode 100644 heavy/cuda_sha256.h
 create mode 100644 heavy/heavy.h

diff --git a/Makefile.am b/Makefile.am
index 816d835..d7c27b5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -80,6 +80,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
 blake32.o: blake32.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 
+heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
+
 keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
 
diff --git a/blake32.cu b/blake32.cu
index 75e656a..307d1ac 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -303,7 +303,11 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
 		ending[3] = nonce; /* our tested value */
 
 		blake256_compress(h, ending, 640, rounds);
-
+#if 0
+		if (trace) {
+			printf("blake hash[6][7]: %08x %08x\n", h[6], h[7]);
+		}
+#endif
 		//if (h[7] == 0 && high64 <= highTarget) {
 		if (h[7] == 0) {
 #if NBN == 2
@@ -318,14 +322,14 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
 #else
 			resNonce[0] = nonce;
 #endif
-			if (trace) {
 #ifdef _DEBUG
+			if (trace) {
 				uint64_t high64 = ((uint64_t*)h)[3];
 				printf("gpu: %16llx\n", high64);
 				printf("gpu: %08x.%08x\n", h[7], h[6]);
 				printf("tgt: %16llx\n", highTarget);
-#endif
 			}
+#endif
 		}
 	}
 }
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index e4ee1da..7392912 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -173,7 +173,7 @@
 		80
 		true
 		false
-		compute_30,sm_30;compute_50,sm_50
+		compute_50,sm_50
 		--ptxas-options="-O2" %(AdditionalOptions)
@@ -306,12 +306,7 @@
-
-
-
-
-
-
+
@@ -358,6 +353,7 @@
+		80
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index a50b1fa..c2bf060 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -43,9 +43,6 @@
 		{c3222908-22ba-4586-a637-6363f455b06d}
-
-		{3281db48-f394-49ea-a1ef-6ebd09828d50}
-
 		{f3ed23a2-8ce7-41a5-b051-6da56047dc35}
@@ -293,23 +290,8 @@
 		Header Files\sph
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
+
+		Header Files\CUDA
 		Header Files\CUDA
@@ -539,4 +521,4 @@
 		Source Files\CUDA\x11
-
+
\ No newline at end of file
diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu
index fe58bc0..b177514 100644
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash5output[8];
 
 // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
@@ -53,13 +53,13 @@
 __constant__ uint64_t c_u512[16];
 
 const uint64_t host_u512[16] =
{
-	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
 	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
-	0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+	0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
 	0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
-	0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+	0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
 	0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
-	0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+	0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
 	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
@@ -123,7 +123,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		//uint32_t nounce = startNounce + thread;
 		uint32_t nounce = nonceVector[thread];
@@ -141,10 +141,10 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		h[6] = 0x1f83d9abfb41bd6bULL;
 		h[7] = 0x5be0cd19137e2179ULL;
 
-		// 128 Byte für die Message
+		// 128 Byte für die Message
 		uint64_t buf[16];
 
-		// Message für die erste Runde in Register holen
+		// Message für die erste Runde in Register holen
 #pragma unroll 16
 		for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
@@ -154,7 +154,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		uint32_t *hefty = heftyHashes + 8 * hashPosition;
 		if (BLOCKSIZE == 84) {
 			// den thread-spezifischen Hefty1 hash einsetzen
-			// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
+			// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
 			buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
 			buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
 			buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
@@ -173,14 +173,14 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		// erste Runde
 		blake512_compress( h, buf, 0, c_sigma, c_u512 );
-		
-		
+
+
 		// zweite Runde
#pragma unroll 15
 		for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
 		buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
 		blake512_compress( h, buf, 1, c_sigma, c_u512 );
-		
+
 		// Hash rauslassen
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
#pragma unroll 8
@@ -210,8 +210,8 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
 		sizeof(host_SecondRound),
 		0, cudaMemcpyHostToDevice);
 
-	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
+	// Speicher für alle Ergebnisse belegen
+	CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
 }
 
 static int BLOCKSIZE = 84;
@@ -222,14 +222,14 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
 {
 	unsigned char PaddedMessage[128];
 	if (len == 84) {
-		// Message mit Padding für erste Runde bereitstellen
+		// Message mit Padding für erste Runde bereitstellen
 		memcpy(PaddedMessage, pdata, 84);
-		memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
 		memset(PaddedMessage+116, 0, 12);
 		PaddedMessage[116] = 0x80;
 	} else if (len == 80) {
 		memcpy(PaddedMessage, pdata, 80);
-		memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
 		memset(PaddedMessage+112, 0, 16);
 		PaddedMessage[112] = 0x80;
 	}
@@ -246,11 +246,11 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE == 80)
-		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 84)
-		blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 }
diff --git a/heavy/cuda_blake512.h b/heavy/cuda_blake512.h
deleted file mode 100644
index 7e24973..0000000
--- a/heavy/cuda_blake512.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _CUDA_BLAKE512_H
-#define _CUDA_BLAKE512_H
-
-void blake512_cpu_init(int thr_id, int threads);
-void blake512_cpu_setBlock(void *pdata, int len);
-void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-#endif
diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu
index 329c831..b0b2ead 100644
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@@ -1,16 +1,19 @@
-#include "cuda_helper.h"
+#include
 
-// globaler Speicher für unsere Ergebnisse
-uint32_t *d_hashoutput[8];
+#include "cuda_helper.h"
 
+// globaler Speicher für unsere Ergebnisse
+static uint32_t *d_hashoutput[8];
 extern uint32_t *d_hash2output[8];
 extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
-extern uint32_t *d_nonceVector[8];
+
+extern uint32_t *heavy_nonceVector[8];
 
 /* Combines top 64-bits from each hash into a single hash */
-static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
+__device__
+static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
 {
 	uint32_t lout[8]; // Combining in Registern machen
@@ -98,7 +101,8 @@ static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *
 		out[i] = lout[i];
 }
 
-__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
+__global__
+void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -116,13 +120,14 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou
 	}
 }
 
-// Setup-Funktionen
-__host__ void combine_cpu_init(int thr_id, int threads)
+__host__
+void combine_cpu_init(int thr_id, int threads)
 {
-	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+	// Speicher für alle Ergebnisse belegen
+	CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
 }
 
+__host__
 void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
 {
 	// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
@@ -133,11 +138,8 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
-	size_t shared_size = 0;
-
-	combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
+	combine_gpu_hash <<<grid, block>>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]);
 
-	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
-	cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
+	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
+	CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost));
 }
diff --git a/heavy/cuda_combine.h b/heavy/cuda_combine.h
deleted file mode 100644
index 5bb5832..0000000
--- a/heavy/cuda_combine.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _CUDA_COMBINE_H
-#define _CUDA_COMBINE_H
-
-void combine_cpu_init(int thr_id, int threads);
-void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
-
-#endif
diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu
index 764b072..889002a 100644
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash4output[8];
 
 __constant__ uint32_t groestl_gpu_state[32];
@@ -603,22 +603,22 @@ __device__ void groestl512_perm_P(uint32_t *a)
#pragma unroll 16
 	for(int k=0;k<32;k+=2)
 	{
-		t[k + 0] =	T0up( B32_0(a[k & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
-					T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
-					T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
+		t[k + 0] =	T0up( B32_0(a[k & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
+					T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
+					T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
 					T3dn( B32_3(a[(k + 23) & 0x1f]) );
-		t[k + 1] =	T0dn( B32_0(a[k & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
-					T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
-					T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
+		t[k + 1] =	T0dn( B32_0(a[k & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
+					T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
+					T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
 					T3up( B32_3(a[(k + 23) & 0x1f]) );
 	}
#pragma unroll 32
@@ -645,22 +645,22 @@ __device__ void groestl512_perm_Q(uint32_t *a)
#pragma unroll 16
 	for(int k=0;k<32;k+=2)
 	{
-		t[k + 0] =	T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
-					T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
-					T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
+		t[k + 0] =	T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
+					T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
+					T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
 					T3dn( B32_3(a[(k + 13) & 0x1f]) );
-		t[k + 1] =	T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
-					T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
-					T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
+		t[k + 1] =	T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
+					T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
+					T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
 					T3up( B32_3(a[(k + 13) & 0x1f]) );
 	}
#pragma unroll 32
@@ -677,7 +677,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
 		uint32_t message[32];
 		uint32_t state[32];
 
-		// lese message ein & verknüpfe diese mit dem hash1 von hefty1
+		// lese message ein & verknüpfe diese mit dem hash1 von hefty1
 		// lese den state ein
 
#pragma unroll 32
@@ -700,7 +700,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
#pragma unroll 8
 		for (int k=0; k<8; ++k)
 			message[BLOCKSIZE/4+k] = heftyHash[k];
-		
+
 		uint32_t g[32];
#pragma unroll 32
 		for(int u=0;u<32;u++)
@@ -709,7 +709,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
 		// Perm
 		groestl512_perm_P(g);
 		groestl512_perm_Q(message);
-		
+
#pragma unroll 32
 		for(int u=0;u<32;u++)
 		{
@@ -753,7 +753,7 @@ __host__ void groestl512_cpu_init(int thr_id, int threads)
 	texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 
-	// Speicher für alle Ergebnisse belegen
+	// Speicher für alle Ergebnisse belegen
 	cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
 }
@@ -778,31 +778,27 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
 		msgBlock[28] = 0x80;
 		msgBlock[31] = 0x01000000;
 	}
-	// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
-	// auf der GPU ausgeführt)
+	// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
+	// auf der GPU ausgeführt)
 
-	// setze register
+	// setze register
 	uint32_t groestl_state_init[32];
 	memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
 	groestl_state_init[31] = 0x20000;
 
 	// state speichern
-	cudaMemcpyToSymbol( groestl_gpu_state,
-						groestl_state_init,
-						128);
+	cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128);
 
 	// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
-	cudaMemcpyToSymbol( groestl_gpu_msg,
-						msgBlock,
-						128);
+	cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128);
 
 	BLOCKSIZE = len;
 }
 
 __host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
 {
 	// Hefty1 Hashes kopieren (eigentlich nur zum debuggen)
-	if (copy)
-		cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+	if (copy)
+		CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
 }
 
 __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
@@ -813,11 +809,11 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE == 84)
-		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 80)
-		groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 }
diff --git a/heavy/cuda_groestl512.h b/heavy/cuda_groestl512.h
deleted file mode 100644
index 0cdc13b..0000000
--- a/heavy/cuda_groestl512.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _CUDA_GROESTL512_H
-#define _CUDA_GROESTL512_H
-
-void groestl512_cpu_init(int thr_id, int threads);
-void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-void groestl512_cpu_setBlock(void *data, int len);
-void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-
-#endif
\ No newline at end of file
diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu
index 6d2b324..0ca3105 100644
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@@ -1,12 +1,14 @@
 #include
 #include
 
+#include "miner.h"
+
 #include "cuda_helper.h"
 
 #define USE_SHARED 1
 
-// globaler Speicher für alle HeftyHashes aller Threads
-uint32_t *d_heftyHashes[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+uint32_t *heavy_heftyHashes[8];
 
 /* Hash-Tabellen */
 __constant__ uint32_t hefty_gpu_constantTable[64];
@@ -30,7 +32,7 @@ uint32_t hefty_cpu_hashTable[] = {
 	0x9b05688cUL,
 	0x1f83d9abUL,
 	0x5be0cd19UL };
-	
+
 uint32_t hefty_cpu_constantTable[] = {
 	0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
 	0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
@@ -50,11 +52,16 @@ uint32_t hefty_cpu_constantTable[] = {
 	0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
 };
 
-//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
-static __host__ __device__ uint32_t S(uint32_t x, int n)
+#if 0
+#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+__host__ __device__
+static uint32_t S(uint32_t x, int n)
 {
 	return (((x) >> (n)) | ((x) << (32 - (n))));
 }
+#endif
+
 #define R(x, n)		((x) >> (n))
 #define Ch(x, y, z)	((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)	((x & (y | z)) | (y & z))
@@ -67,7 +74,9 @@ static uint32_t S(uint32_t x, int n)
 
 // uint8_t
 #define smoosh4(x)	( ((x)>>4) ^ ((x) & 0x0F) )
-__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
+
+__host__ __forceinline__ __device__
+uint8_t smoosh2(uint32_t x)
 {
 	uint16_t w = (x >> 16) ^ (x & 0xffff);
 	uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
@@ -77,13 +86,14 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
 #define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
 #define getByte(x,y) ( ((x) >> (y)) & 0xFF )
 
-__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
+__host__ __forceinline__ __device__
+void Mangle(uint32_t *inp)
 {
 	uint32_t r = smoosh4Quad(inp[0]);
 	uint32_t inp0org;
 	uint32_t tmp0Mask, tmp1Mask;
 	uint32_t in1, in2, isAddition;
-	uint32_t tmp;
+	int32_t tmp;
 	uint8_t b;
 
 	inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
@@ -92,24 +102,24 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
 	tmp = smoosh2(inp[1]);
 	b = getByte(r,tmp);
 	inp0org = S(inp[0], b);
-	tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
-	tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
-
-	in1 = (inp[2] & ~inp0org) |
+	tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
+	tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
+
+	in1 = (inp[2] & ~inp0org) |
 		(tmp1Mask & ~inp[2] & inp0org) |
 		(~tmp0Mask & ~inp[2] & inp0org);
 	in2 = inp[2] += ~inp0org;
 	isAddition = ~tmp0Mask & tmp1Mask;
 	inp[2] = isAddition ? in2 : in1;
-	
+
 	r += 0x01010101;
 	tmp = smoosh2(inp[1] ^ inp[2]);
 	b = getByte(r,tmp);
 	inp0org = S(inp[0], b);
-	tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
-	tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
+	tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
+	tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
 
-	in1 = (inp[3] & ~inp0org) |
+	in1 = (inp[3] & ~inp0org) |
 		(tmp1Mask & ~inp[3] & inp0org) |
 		(~tmp0Mask & ~inp[3] & inp0org);
 	in2 = inp[3] += ~inp0org;
@@ -119,20 +129,23 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
 	inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
 }
 
-__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
+__host__ __forceinline__ __device__
+void Absorb(uint32_t *inp, uint32_t x)
 {
 	inp[0] ^= x;
 	Mangle(inp);
 }
 
-__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
+__host__ __forceinline__ __device__
+uint32_t Squeeze(uint32_t *inp)
 {
 	uint32_t y = inp[0];
 	Mangle(inp);
 	return y;
 }
 
-__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
+__host__ __forceinline__ __device__
+uint32_t Br(uint32_t *sponge, uint32_t x)
 {
 	uint32_t r = Squeeze(sponge);
 	uint32_t t = ((r >> 8) & 0x1F);
@@ -146,11 +159,12 @@ __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
 	return retVal;
 }
 
-__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
+__device__ __forceinline__
+void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
 	uint32_t tmpBr;
 
-	uint32_t brG = Br(sponge, regs[6]); 
+	uint32_t brG = Br(sponge, regs[6]);
 	uint32_t brF = Br(sponge, regs[5]);
 	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
 	uint32_t brE = Br(sponge, regs[4]);
@@ -169,11 +183,12 @@ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 	regs[4] += tmpBr;
 }
 
-__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
+__host__
+void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
 	uint32_t tmpBr;
 
-	uint32_t brG = Br(sponge, regs[6]); 
+	uint32_t brG = Br(sponge, regs[6]);
 	uint32_t brF = Br(sponge, regs[5]);
 	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
 	uint32_t brE = Br(sponge, regs[4]);
@@ -191,11 +206,11 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *
 	regs[4] += tmpBr;
 }
 
-// Die Hash-Funktion
-__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
+__global__
+void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash)
 {
-	#if USE_SHARED
-	extern __shared__ char heftytab[];
+#if USE_SHARED
 	extern __shared__ unsigned char heftytab[];
 	if(threadIdx.x < 64)
 	{
 		*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
@@ -207,9 +222,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		uint32_t nounce = startNounce + thread;
-		
+
 		// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
 		// reduktion von 256 byte auf 128 byte
 		uint32_t W1[16];
@@ -219,7 +234,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 		uint32_t regs[8];
 		uint32_t hash[8];
 		uint32_t sponge[4];
-		
+
#pragma unroll 4
 		for(int k=0; k < 4; k++)
 			sponge[k] = hefty_gpu_sponge[k];
@@ -231,7 +246,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 			regs[k] = hefty_gpu_register[k];
 			hash[k] = regs[k];
 		}
-		
+
 		//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
#pragma unroll 16
 		for(int k=0;k<16;k++)
@@ -252,7 +267,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 		}
 
 		// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
-		
+
#pragma unroll 3
 		for(int k=0;k<3;k++)
 		{
@@ -279,7 +294,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 			for(int j=0;j<16;j++)
 				W1[j] = W2[j];
 		}
-		
+
#pragma unroll 8
 		for(int k=0;k<8;k++)
 			hash[k] += regs[k];
@@ -290,27 +305,28 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 	}
 }
 
-// Setup-Funktionen
-__host__ void hefty_cpu_init(int thr_id, int threads)
+__host__
+void hefty_cpu_init(int thr_id, int threads)
 {
 	cudaSetDevice(device_map[thr_id]);
 
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
-	cudaMemcpyToSymbol( hefty_gpu_constantTable,
+	cudaMemcpyToSymbol( hefty_gpu_constantTable,
 						hefty_cpu_constantTable,
 						sizeof(uint32_t) * 64 );
 
-	// Speicher für alle Hefty1 hashes belegen
-	cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
+	// Speicher für alle Hefty1 hashes belegen
+	CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));
 }
 
-__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
+__host__
+void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 	// data muss 80/84-Byte haben!
 {
 	// Nachricht expandieren und setzen
 	uint32_t msgBlock[32];
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
+	memset(msgBlock, 0, sizeof(msgBlock));
 	memcpy(&msgBlock[0], data, len);
 	if (len == 84) {
 		msgBlock[21] |= 0x80;
@@ -319,17 +335,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 		msgBlock[20] |= 0x80;
 		msgBlock[31] = 640; // bitlen
 	}
-	
+
 	for(int i=0;i<31;i++) // Byteorder drehen
 		msgBlock[i] = SWAB32(msgBlock[i]);
 
-	// die erste Runde wird auf der CPU durchgeführt, da diese für
+	// die erste Runde wird auf der CPU durchgeführt, da diese für
 	// alle Threads gleich ist. Der Hash wird dann an die Threads
-	// übergeben
+	// übergeben
 
 	// Erstelle expandierten Block W
-	uint32_t W[64];	
-	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);	
+	uint32_t W[64];
+	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
 	for(int j=16;j<64;j++)
 		W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
@@ -344,7 +360,7 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 	{
 		regs[k] = hefty_cpu_hashTable[k];
 		hash[k] = regs[k];
-	}	
+	}
 
 	// 1. Runde
 	for(int j=0;j<16;j++)
@@ -366,39 +382,30 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 		hash[k] += regs[k];
 
 	// sponge speichern
-
-	cudaMemcpyToSymbol( hefty_gpu_sponge,
-						sponge,
-						sizeof(uint32_t) * 4 );
+	cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16);
 
 	// hash speichern
-	cudaMemcpyToSymbol( hefty_gpu_register,
-						hash,
-						sizeof(uint32_t) * 8 );
-
+	cudaMemcpyToSymbol(hefty_gpu_register, hash, 32);
 
 	// Blockheader setzen (korrekte Nonce fehlt da drin noch)
-	cudaMemcpyToSymbol( hefty_gpu_blockHeader,
-						&msgBlock[16],
-						64);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64));
 }
 
-__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
+__host__
+void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
-	// alle anderen mit 512 Threads.
-	int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
+	int threadsperblock = 256;
 
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
-	#if USE_SHARED
-	size_t shared_size = 8 * 64 * sizeof(uint32_t);
+	// Größe des dynamischen Shared Memory Bereichs
+#if USE_SHARED
+	int shared_size = 8 * 64 * sizeof(uint32_t);
#else
-	size_t shared_size = 0;
+	int shared_size = 0;
#endif
 
-	hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
+	hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, 0, thr_id);
diff --git a/heavy/cuda_hefty1.h b/heavy/cuda_hefty1.h
deleted file mode 100644
index 17b196c..0000000
--- a/heavy/cuda_hefty1.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _CUDA_HEFTY1_H
-#define _CUDA_HEFTY1_H
-
-void hefty_cpu_hash(int thr_id, int threads, int startNounce);
-void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
-void hefty_cpu_init(int thr_id, int threads);
-
-#endif
\ No newline at end of file
diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu
index 94aadf9..8c96b66 100644
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
@@ -15,13 +15,11 @@ extern uint32_t *d_hash5output[8];
 // der Keccak512 State nach der ersten Runde (72 Bytes)
 __constant__ uint64_t c_State[25];
 
-// die Message (72 Bytes) für die zweite Runde auf der GPU
+// die Message (72 Bytes) für die zweite Runde auf der GPU
 __constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding
 
 // ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
 
-#include "cuda_helper.h"
-
 #define U32TO64_LE(p) \
 	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@@ -144,7 +142,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		//uint32_t nounce = startNounce + thread;
 		uint32_t nounce = nonceVector[thread];
@@ -156,7 +154,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
#pragma unroll 25
 		for (int i=0; i < 25; ++i)
 			keccak_gpu_state[i] = c_State[i];
-		
+
 		// Message2 in den Puffer holen
 		uint32_t msgBlock[18];
 		mycpy72(msgBlock, c_PaddedMessage2);
@@ -167,7 +165,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
 		// den individuellen Hefty1 Hash einsetzen
 		mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
 
-		// den Block einmal gut durchschütteln
+		// den Block einmal gut durchschütteln
 		keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
 
 		// das Hash erzeugen
@@ -187,8 +185,8 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
 
 // ---------------------------- END CUDA keccak512 functions ------------------------------------
 
-// Setup-Funktionen
-__host__ void keccak512_cpu_init(int thr_id, int threads)
+__host__
+void keccak512_cpu_init(int thr_id, int threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	cudaMemcpyToSymbol( c_keccak_round_constants,
@@ -196,7 +194,7 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
 						sizeof(host_keccak_round_constants),
 						0, cudaMemcpyHostToDevice);
 
-	// Speicher für alle Ergebnisse belegen
+	// Speicher für alle Ergebnisse belegen
 	cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
 }
@@ -212,23 +210,24 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
 
 static int BLOCKSIZE = 84;
 
-__host__ void keccak512_cpu_setBlock(void *data, int len)
+__host__
+void keccak512_cpu_setBlock(void *data, int len)
 	// data muss 80 oder 84-Byte haben!
 	// heftyHash hat 32-Byte
 {
 	// CH
-	// state init
+	// state init
 	uint64_t keccak_cpu_state[25];
 	memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state));
 
-	// erste Runde
+	// erste Runde
 	keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
 
 	// state kopieren
 	cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 
-	// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
-	// zu jeweils
+	// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
+	// zu jeweils
 	uint32_t msgBlock[18];
 	memset(msgBlock, 0, 18 * sizeof(uint32_t));
@@ -238,29 +237,31 @@ __host__ void keccak512_cpu_setBlock(void *data, int len)
 	else if (len == 80)
 		memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
 
-	// Nachricht abschließen
+	// Nachricht abschließen
 	if (len == 84)
 		msgBlock[11] = 0x01;
 	else if (len == 80)
 		msgBlock[10] = 0x01;
 	msgBlock[17] = 0x80000000;
-	
-	// Message 2 ins Constant Memory kopieren (die variable Nonce und
+
+	// Message 2 ins Constant Memory kopieren (die variable Nonce und
 	// der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden)
 	cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice );
 
 	BLOCKSIZE = len;
 }
-
-__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
+
+__host__
+void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
 {
 	// Hefty1 Hashes kopieren
-	if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+	if (copy)
+		CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
 	//else cudaThreadSynchronize();
 }
 
-__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
+__host__
+void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 {
 	const int threadsperblock = 128;
@@ -268,11 +269,11 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE==84)
-		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE==80)
-		keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 }
diff --git a/heavy/cuda_keccak512.h b/heavy/cuda_keccak512.h
deleted file mode 100644
index 1182447..0000000
--- a/heavy/cuda_keccak512.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _CUDA_KECCAK512_H
-#define _CUDA_KECCAK512_H
-
-void keccak512_cpu_init(int thr_id, int threads);
-void keccak512_cpu_setBlock(void *data, int len);
-void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-
-#endif
diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu
index 043422b..3b63b76 100644
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash2output[8];
@@ -47,10 +47,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		uint32_t nounce = startNounce + thread;
 		nonceVector[thread] = nounce;
-		
+
 		// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
 		uint32_t W1[16];
 		uint32_t W2[16];
@@ -66,10 +66,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 			regs[k] = sha256_gpu_register[k];
 			hash[k] = regs[k];
 		}
-		
+
 		// 2. Runde
-		//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
-		//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
+		//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
+		//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
#pragma unroll 16
 		for(int k=0;k<16;k++)
 			W1[k] = sha256_gpu_blockHeader[k];
@@ -90,7 +90,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 			uint32_t T1, T2;
 			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j];
 			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-			
+
#pragma unroll 7
 			for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
 			regs[0] = T1 + T2;
@@ -121,7 +121,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 				uint32_t T1, T2;
 				T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
 				T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-				
+
#pragma unroll 7
 				for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
 				regs[0] = T1 + T2;
@@ -136,14 +136,14 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
/*
 		for(int j=16;j<64;j++)
 			W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
-		
+
#pragma unroll 64
 		for(int j=0;j<64;j++)
 		{
 			uint32_t T1, T2;
 			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j];
 			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-			
+
#pragma unroll 7
 			for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
 			regs[0] = T1 + T2;
@@ -168,7 +168,7 @@ __host__ void sha256_cpu_init(int thr_id, int threads)
 						sha256_cpu_constantTable,
 						sizeof(uint32_t) * 64 );
 
-	// Speicher für alle Ergebnisse belegen
+	// Speicher für alle Ergebnisse belegen
 	cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
 }
@@ -184,25 +184,25 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
 	memset(msgBlock, 0, sizeof(uint32_t) * 32);
 	memcpy(&msgBlock[0], data, len);
 	if (len == 84) {
-		memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
+		memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
 		msgBlock[29] |= 0x80;
 		msgBlock[31] = 928; // bitlen
 	} else if (len == 80) {
-		memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
+		memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
 		msgBlock[28] |= 0x80;
 		msgBlock[31] = 896; // bitlen
 	}
-	
+
 	for(int i=0;i<31;i++) // Byteorder drehen
 		msgBlock[i] = SWAB32(msgBlock[i]);
 
-	// die erste Runde wird auf der CPU durchgeführt, da diese für
+	// die erste Runde wird auf der CPU durchgeführt, da diese für
 	// alle Threads gleich ist. Der Hash wird dann an die Threads
-	// übergeben
+	// übergeben
 	uint32_t W[64];
 
 	// Erstelle expandierten Block W
-	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);	
+	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
 	for(int j=16;j<64;j++)
 		W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
@@ -223,7 +223,7 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
 		uint32_t T1, T2;
 		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j];
 		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
+
 		//#pragma unroll 7
 		for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; // sollte mal noch durch memmov ersetzt werden!
@@ -251,7 +251,8 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
 __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
 {
 	// Hefty1 Hashes kopieren
-	if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+	if (copy)
+		CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
 	//else cudaThreadSynchronize();
 }
@@ -263,12 +264,12 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE == 84)
-		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 80) {
-		sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	}
 }
diff --git a/heavy/cuda_sha256.h b/heavy/cuda_sha256.h
deleted file mode 100644
index 03385d1..0000000
--- a/heavy/cuda_sha256.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _CUDA_SHA256_H
-#define _CUDA_SHA256_H
-
-void sha256_cpu_init(int thr_id, int threads);
-void sha256_cpu_setBlock(void *data, int len);
-void sha256_cpu_hash(int thr_id, int threads, int startNounce);
-void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-#endif
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index f3891e8..eb9a03e 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -1,35 +1,19 @@
 #include
-#include
-#include
-
-#include
-
 #include
-
-#ifndef _WIN32
-#include
-#endif
-
+
 #include
 // include thrust
-#include
 #include
 #include
-#include
 
 #include "miner.h"
 
-#include "hefty1.h"
+extern "C" {
 #include "sph/sph_keccak.h"
 #include "sph/sph_blake.h"
 #include "sph/sph_groestl.h"
-
-#include "heavy/cuda_hefty1.h"
-#include "heavy/cuda_sha256.h"
-#include "heavy/cuda_keccak512.h"
-#include "heavy/cuda_groestl512.h"
-#include "heavy/cuda_blake512.h"
-#include "heavy/cuda_combine.h"
-
+}
+#include "hefty1.h"
+#include "heavy/heavy.h"
 #include "cuda_helper.h"
 
 extern uint32_t *d_hash2output[8];
@@ -37,11 +21,13 @@ extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
 
-#define HEAVYCOIN_BLKHDR_SZ 84
-#define MNR_BLKHDR_SZ 80
+#define HEAVYCOIN_BLKHDR_SZ 84
+#define MNR_BLKHDR_SZ 80
+
+// nonce-array für die threads
+uint32_t *heavy_nonceVector[8];
 
-// nonce-array für die threads
-uint32_t *d_nonceVector[8];
+extern uint32_t *heavy_heftyHashes[8];
 
 /* Combines top 64-bits from each hash into a single hash */
 static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
@@ -71,9 +57,9 @@ static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t
 #include
 static uint32_t __inline bitsset( uint32_t x )
 {
-	DWORD r = 0;
-	_BitScanReverse(&r, x);
-	return r;
+	DWORD r = 0;
+	_BitScanReverse(&r, x);
+	return r;
 }
 #else
 static uint32_t bitsset( uint32_t x )
@@ -91,21 +77,21 @@ static int findhighbit(const uint32_t *ptarget, int words)
 	{
 		if (ptarget[i] != 0) {
 			highbit = i*32 + bitsset(ptarget[i])+1;
-			break; 
+			break;
 		}
 	}
 	return highbit;
 }
 
 // Generiere ein Multiword-Integer das die Zahl
-// (2 << highbit) - 1 repräsentiert.
+// (2 << highbit) - 1 repräsentiert.
 static void genmask(uint32_t *ptarget, int words, int highbit)
 {
 	int i;
 	for (i=words-1; i >= 0; --i)
 	{
 		if ((i+1)*32 <= highbit)
-			ptarget[i] = 0xffffffff;
+			ptarget[i] = UINT32_MAX;
 		else if (i*32 > highbit)
 			ptarget[i] = 0x00000000;
 		else
@@ -114,13 +100,18 @@ static void genmask(uint32_t *ptarget, int words, int highbit)
 }
 
 struct check_nonce_for_remove
-{ 
+{
 	check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) :
 		m_target(target),
 		m_hashes(hashes),
 		m_hashlen(hashlen),
 		m_startNonce(startNonce) { }
 
+	uint64_t m_target;
+	uint32_t *m_hashes;
+	uint32_t m_hashlen;
+	uint32_t m_startNonce;
+
 	__device__
 	bool operator()(const uint32_t x)
 	{
@@ -129,53 +120,39 @@ struct check_nonce_for_remove
 		// Wert des Hashes (als uint64_t) auslesen.
 		// Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits)
 		uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
-		// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
-		return (hashValue & m_target) != hashValue;
+		bool res = (hashValue & m_target) != hashValue;
+		//printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target);
+		// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
+		return res;
 	}
-
-	uint64_t m_target;
-	uint32_t *m_hashes;
-	uint32_t m_hashlen;
-	uint32_t m_startNonce;
 };
 
-int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done, uint32_t maxvote, int blocklen);
+static bool init[8] = {0,0,0,0,0,0,0,0};
 
-extern "C"
+__host__
 int scanhash_heavy(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done, uint32_t maxvote, int blocklen)
-{
-	return scanhash_heavy_cpp(thr_id, pdata,
-		ptarget, max_nonce, hashes_done, maxvote, blocklen);
-}
-
-
-int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done, uint32_t maxvote, int blocklen)
 {
 	const uint32_t first_nonce = pdata[19];
 
 	/* to check */
 	// CUDA will process thousands of threads.
-	int throughput = opt_work_size ? opt_work_size : (1 << 19); // 128*4096
-	throughput = min(throughput, (int)(max_nonce - first_nonce));
+	int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*2048
+	throughput = min(throughput, (int)(max_nonce - first_nonce));
 
 	int rc = 0;
 	uint32_t *hash = NULL;
-	cudaMallocHost(&hash, throughput*8*sizeof(uint32_t));
 	uint32_t *cpu_nonceVector = NULL;
-	cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t));
+	CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)));
 
 	int nrmCalls[6];
 	memset(nrmCalls, 0, sizeof(int) * 6);
 
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x000000ff;
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x00ff;
 
-	// für jeden Hash ein individuelles Target erstellen basierend
-	// auf dem höchsten Bit, das in ptarget gesetzt ist.
+	// für jeden Hash ein individuelles Target erstellen basierend
+	// auf dem höchsten Bit, das in ptarget gesetzt ist.
 	int highbit = findhighbit(ptarget, 8);
 	uint32_t target2[2], target3[2], target4[2], target5[2];
 	genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
@@ -183,7 +160,6 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 	genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
 	genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512
 
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		hefty_cpu_init(thr_id, throughput);
@@ -192,8 +168,10 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 		groestl512_cpu_init(thr_id, throughput);
 		blake512_cpu_init(thr_id, throughput);
 		combine_cpu_init(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput));
+
 		init[thr_id] = true;
-		cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput);
 	}
 
 	if (blocklen == HEAVYCOIN_BLKHDR_SZ)
 	{
 		uint16_t *ext = (uint16_t *)&pdata[20];
 
 		if (opt_vote > maxvote) {
-			printf("Warning: Your block reward vote (%hu) exceeds "
-				"the maxvote reported by the pool (%hu).\n",
+			applog(LOG_WARNING, "Your block reward vote (%hu) exceeds "
+				"the maxvote reported by the pool (%hu).",
 				opt_vote, maxvote);
 		}
 
 		if (opt_trust_pool && opt_vote > maxvote) {
-			printf("Warning: Capping block reward vote to maxvote reported by pool.\n");
+			applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool.");
 			ext[0] = maxvote;
 		}
 		else
@@ -222,32 +200,34 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 	blake512_cpu_setBlock(pdata, blocklen);
 
 	do {
-		uint32_t i;
 		////// Compaction init
-		thrust::device_ptr<uint32_t> devNoncePtr(d_nonceVector[thr_id]);
-		thrust::device_ptr<uint32_t> devNoncePtrEnd((d_nonceVector[thr_id]) + throughput);
+		thrust::device_ptr<uint32_t> devNoncePtr(heavy_nonceVector[thr_id]);
+		thrust::device_ptr<uint32_t> devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput);
 		uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;
+		uint64_t *t;
 
 		hefty_cpu_hash(thr_id, throughput, pdata[19]);
 		//cudaThreadSynchronize();
 		sha256_cpu_hash(thr_id, throughput, pdata[19]);
 		//cudaThreadSynchronize();
 
-		// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
+		// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
 		MyStreamSynchronize(NULL, 1, thr_id);
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19]));
+		t = (uint64_t*) target2;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
-		
+
 		keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
 		//cudaThreadSynchronize();
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19]));
+		t = (uint64_t*) target3;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
@@ -256,7 +236,8 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 		//cudaThreadSynchronize();
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19]));
+		t = (uint64_t*) target5;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
@@ -265,25 +246,31 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 		//cudaThreadSynchronize();
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19]));
+		t = (uint64_t*) target4;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
-		
+
 		// combine
 		combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);
 
+		if (opt_tracegpu) {
+			applog(LOG_BLUE, "heavy GPU hash:");
+			applog_hash((uchar*)hash);
+		}
+
 		// Ergebnisse kopieren
 		if(actualNumberOfValuesInNonceVectorGPU > 0)
 		{
-			cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost);
+			size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
+			CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost));
+			cudaDeviceSynchronize();
 
-		for (i=0; isockbuf, "\n")) {
 		bool ret = true;
-		time_t rstart;
-
-		time(&rstart);
+		time_t rstart = time(NULL);
 		if (!socket_full(sctx->sock, 60)) {
 			applog(LOG_ERR, "stratum_recv_line timed out");
 			goto out;
@@ -1578,16 +1574,18 @@ void do_gpu_tests(void)
 	uchar buf[128];
 	uint32_t tgt[8] = { 0 };
 
-	memset(buf, 0, sizeof buf);
-	buf[0] = 1; buf[64] = 2;
-
 	opt_tracegpu = true;
 	work_restart = (struct work_restart*) malloc(sizeof(struct work_restart));
 	work_restart[0].restart = 1;
-	tgt[6] = 0xffff;
+	tgt[7] = 0xffff;
 
+	memset(buf, 0, sizeof buf);
+	// buf[0] = 1; buf[64] = 2; // for endian tests
 	scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
 
+	memset(buf, 0, sizeof buf);
+	scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
+
 	free(work_restart);
 	work_restart = NULL;
 	opt_tracegpu = false;
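-- 

Note on the CUDA_SAFE_CALL wrappers added throughout this patch: the macro
comes from cuda_helper.h, and the sketch below only illustrates the general
pattern (the definition in the tree may differ in details, e.g. it may call
proper_exit() instead of exit()):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Check the cudaError_t returned by a runtime call and abort on failure,
// reporting the failing file and line instead of silently continuing.
#define CUDA_SAFE_CALL(call) do { \
	cudaError_t err = (call); \
	if (err != cudaSuccess) { \
		fprintf(stderr, "CUDA error '%s' at %s:%d\n", \
			cudaGetErrorString(err), __FILE__, __LINE__); \
		exit(EXIT_FAILURE); \
	} \
} while (0)

// usage, as in hefty_cpu_init() above:
//   CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));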
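Note on the strict-aliasing part of the fix: gcc's -Wstrict-aliasing flags
the old *((uint64_t*)target2), which reads a uint32_t[2] array through a
uint64_t lvalue; the patch routes the cast through the named pointer t,
which quiets the warning in practice. A fully defined alternative is to
copy the bytes out with memcpy, as in this sketch (load_u64 is a
hypothetical helper, not part of the patch):

#include <cstdint>
#include <cstring>

// Read two consecutive 32-bit words as one 64-bit value without violating
// the aliasing rules; compilers turn this memcpy into a single load.
static uint64_t load_u64(const uint32_t *words)
{
	uint64_t v;
	memcpy(&v, words, sizeof(v));
	return v;
}

// usage: check_nonce_for_remove(load_u64(target2), d_hash2output[thr_id], 8, pdata[19])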
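Note on the threads-per-block change from the commit message: the sketch
below is illustration only, with a hypothetical kernel, and the occupancy
arithmetic assumes a GPU with a 65536-register file per SM; the exact
numbers vary by architecture, which is why the +25% figure is an empirical
observation rather than a guarantee.

#include <cuda_runtime.h>

__global__ void dummy_kernel(int n) { (void) n; /* stand-in for hefty_gpu_hash */ }

void launch_sketch(int threads)
{
	// Compiled with nvcc --maxrregcount=80 (see the Makefile.am rule above),
	// each thread uses at most 80 registers. A 256-thread block then needs
	// 256 * 80 = 20480 registers, so three blocks fit in a 65536-register SM.
	// A 512-thread block needs 40960 registers, leaving room for only one
	// resident block, and 128-thread blocks give less latency hiding per
	// block - which may explain the +25% the commit message reports for 256.
	const int threadsperblock = 256;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	dummy_kernel<<<grid, block>>>(threads);
}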
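Note on the thrust compaction used in scanhash_heavy: after each hash stage,
nonces whose intermediate hash already misses the per-stage target are
removed from the device-side nonce vector, so later (more expensive) stages
run on fewer elements. This simplified sketch uses a trivial predicate in
place of check_nonce_for_remove; the shape of the call is the same:

#include <cstdint>
#include <thrust/device_ptr.h>
#include <thrust/remove.h>

struct is_odd {
	__device__ bool operator()(uint32_t x) const { return (x & 1) != 0; }
};

// Compact d_vec in place on the GPU and return how many entries survive.
uint32_t compact(uint32_t *d_vec, uint32_t count)
{
	thrust::device_ptr<uint32_t> first(d_vec);
	thrust::device_ptr<uint32_t> last = first + count;
	last = thrust::remove_if(first, last, is_odd());
	return (uint32_t)(last - first);
}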