From d9ea5f72ce465f1feec111f2bdffd08a0c071501 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 18 Aug 2014 03:45:48 +0200 Subject: [PATCH] Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof --- JHA/cuda_jha_compactionTest.cu | 11 +- JHA/cuda_jha_keccak512.cu | 33 +- JHA/jackpotcoin.cu | 19 +- bitslice_transformations_quad.cu | 3 +- ccminer.vcxproj | 9 +- ccminer.vcxproj.filters | 6 + cuda_fugue256.cu | 15 +- cuda_groestlcoin.cu | 18 +- cuda_helper.h | 146 ++++-- cuda_myriadgroestl.cu | 11 +- cuda_nist5.cu | 28 +- groestl_functions_quad.cu | 1 + heavy/cuda_blake512.cu | 28 +- heavy/cuda_combine.cu | 7 +- heavy/cuda_groestl512.cu | 10 +- heavy/cuda_hefty1.cu | 12 +- heavy/cuda_keccak512.cu | 13 +- heavy/cuda_sha256.cu | 7 +- heavy/heavy.cu | 6 +- miner.h | 1 + quark/animecoin.cu | 583 +++++++++++------------ quark/cuda_bmw512.cu | 151 +----- quark/cuda_checkhash.cu | 13 +- quark/cuda_jh512.cu | 2 +- quark/cuda_quark_blake512.cu | 230 ++------- quark/cuda_quark_compactionTest.cu | 739 ++++++++++++++--------------- quark/cuda_quark_groestl512.cu | 15 +- quark/cuda_quark_keccak512.cu | 21 +- quark/cuda_skein512.cu | 27 +- quark/quarkcoin.cu | 20 +- util.c | 4 + x11/cuda_x11_cubehash512.cu | 19 +- x11/cuda_x11_echo.cu | 28 +- x11/cuda_x11_luffa512.cu | 20 +- x11/cuda_x11_shavite512.cu | 59 +-- x11/cuda_x11_simd512.cu | 22 +- x11/x11.cu | 19 +- x13/cuda_x13_fugue512.cu | 217 ++++----- x13/cuda_x13_hamsi512.cu | 100 ++-- x13/x13.cu | 17 +- x15/cuda_x14_shabal512.cu | 18 +- x15/cuda_x15_whirlpool.cu | 6 +- x15/x14.cu | 17 +- x15/x15.cu | 17 +- 44 files changed, 1129 insertions(+), 1619 deletions(-) diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu index 22c0da4..fb91ee3 100644 --- a/JHA/cuda_jha_compactionTest.cu +++ b/JHA/cuda_jha_compactionTest.cu @@ -1,11 +1,8 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include "sm_30_intrinsics.h" - #include #include -#include + +#include "cuda_helper.h" +#include // aus cpu-miner.c extern int device_map[8]; @@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) } -#if __CUDA_ARCH__ < 300 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 /** * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 */ diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index be5b61a..f163ea5 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -1,16 +1,7 @@ - - -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); @@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t __constant__ uint64_t c_State[25]; __constant__ uint32_t c_PaddedMessage[18]; -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return __byte_perm(x, 0, 0x0123); -} - -// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 
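// [editor's sketch] The commit body adds a cudaDeviceReset() on Ctrl+C so that
// nvprof can flush its profile data; the util.c hunk is not shown in this
// excerpt, so the handler below is a hypothetical minimal version of that
// idea (only the cudaDeviceReset() call itself comes from the patch):
#include <signal.h>
#include <stdlib.h>
#include <cuda_runtime.h>

static void sigint_handler(int sig)   /* hypothetical name */
{
	(void) sig;
	cudaDeviceReset();    /* lets nvprof write out its collected data */
	exit(0);
}

/* during startup: signal(SIGINT, sigint_handler); */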
32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -#endif - #define U32TO64_LE(p) \ (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index bcca41f..c778f30 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -1,4 +1,3 @@ - extern "C" { #include "sph/sph_keccak.h" @@ -7,10 +6,9 @@ extern "C" #include "sph/sph_jh.h" #include "sph/sph_skein.h" #include "miner.h" +#include "cuda_helper.h" } -#include - // aus cpu-miner.c extern int device_map[8]; @@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); +extern void cuda_check_cpu_init(int thr_id, int threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern void jackpot_compactTest_cpu_init(int thr_id, int threads); extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, @@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, quark_groestl512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2); cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2); cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2); @@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); jackpot_keccak512_cpu_setBlock((void*)endiandata, 80); - quark_check_cpu_setTarget(ptarget); + cuda_check_cpu_setTarget(ptarget); do { int order = 0; @@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, } // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { + 
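// [editor's note] On why the arch guards in this patch gain a defined() test:
// nvcc compiles each .cu file once for the host and once per GPU arch; during
// the host pass __CUDA_ARCH__ is not defined, and an undefined identifier
// evaluates to 0 inside #if, so a bare "#if __CUDA_ARCH__ < 300" is also true
// on the host pass and would compile the __shfl()/__shfl_up() emulation there
// as well. The corrected guard keeps the fallback strictly in pre-sm_30
// device code:
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/* device-only fallback for the warp shuffle intrinsics goes here */
#endif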
unsigned int rounds; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); // diese jackpothash Funktion gibt die Zahl der Runden zurück - unsigned int rounds = jackpothash(vhash64, endiandata); + rounds = jackpothash(vhash64, endiandata); if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu index c47f73d..63ce6e2 100644 --- a/bitslice_transformations_quad.cu +++ b/bitslice_transformations_quad.cu @@ -1,5 +1,4 @@ - -#if __CUDA_ARCH__ < 300 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 /** * __shfl() returns the value of var held by the thread whose ID is given by srcLane. * If srcLane is outside the range 0..width-1, the thread's own value of var is returned. diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 2d0c68e..872f85d 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)" true false compute_50,sm_50 - + + false @@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" + + true + %(AdditionalOptions) %(AdditionalOptions) @@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" %(AdditionalOptions) %(AdditionalOptions) + + true + %(AdditionalOptions) %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index f1271e3..c4c20f4 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -391,5 +391,11 @@ Source Files\CUDA\x15 + + Source Files\CUDA + + + Source Files\CUDA + \ No newline at end of file diff --git a/cuda_fugue256.cu b/cuda_fugue256.cu index f5ddd4f..d48d4ad 100644 --- a/cuda_fugue256.cu +++ b/cuda_fugue256.cu @@ -1,12 +1,11 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include #include "sph/sph_fugue.h" +#include "cuda_helper.h" +#include + #define USE_SHARED 1 // aus cpu-miner.c @@ -15,14 +14,6 @@ extern int device_map[8]; // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// schon in sph_fugue.h definiert -//#define SPH_C32(x) ((uint32_t)(x ## U)) - uint32_t *d_fugue256_hashoutput[8]; uint32_t *d_resultNonce[8]; diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index 095d326..b716f50 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -1,23 +1,17 @@ // Auf Groestlcoin spezialisierte Version von Groestl inkl. 
Bitslice -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include +#include "cuda_helper.h" +#include + // aus cpu-miner.c extern int device_map[8]; // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; - // diese Struktur wird in der Init Funktion angefordert static cudaDeviceProp props[8]; @@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32]; #include "groestl_functions_quad.cu" #include "bitslice_transformations_quad.cu" -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) +#define SWAB32(x) cuda_swab32(x) -__global__ void __launch_bounds__(256, 4) - groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce) +__global__ __launch_bounds__(256, 4) +void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce) { // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; diff --git a/cuda_helper.h b/cuda_helper.h index 72f7b7e..a5da3b3 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -1,33 +1,78 @@ #ifndef CUDA_HELPER_H #define CUDA_HELPER_H +#include #include -static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) -{ -#if __CUDA_ARCH__ >= 130 - return __double_as_longlong(__hiloint2double(HI, LO)); +#if defined(_MSC_VER) +/* reduce warnings */ +#include +#include +#endif + +#include + +extern __device__ __device_builtin__ void __syncthreads(void); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +#ifndef SPH_C32 +#define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#if __CUDA_ARCH__ < 350 +// Kepler (Compute 3.0) +#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #else - return (unsigned long long)LO | (((unsigned long long)HI) << 32); +// Kepler (Compute 3.5, 5.0) +#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) #endif -} -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const uint64_t &x) { +__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +{ #if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); + return __double_as_longlong(__hiloint2double(HI, LO)); #else - return (uint32_t)(x >> 32); + return (unsigned long long)LO | (((unsigned long long)HI) << 32); #endif } // das Hi Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { +__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); } +// das Lo Word in einem 64 Bit Typen ersetzen +__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); +} + +// Endian Drehung für 32 Bit Typen +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 
0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + // das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { +__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) { #if __CUDA_ARCH__ >= 130 return (uint32_t)__double2loint(__longlong_as_double(x)); #else @@ -35,34 +80,51 @@ static __device__ uint32_t LOWORD(const uint64_t &x) { #endif } -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); +// das Hi Word aus einem 64 Bit Typen extrahieren +__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) { +#if __CUDA_ARCH__ >= 130 + return (uint32_t)__double2hiint(__longlong_as_double(x)); +#else + return (uint32_t)(x >> 32); +#endif } -// Endian Drehung für 32 Bit Typen -static __device__ uint32_t cuda_swab32(uint32_t x) +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) { - return __byte_perm(x, x, 0x0123); -} + // Input: 77665544 33221100 + // Output: 00112233 44556677 + uint64_t temp[2]; + temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123); + temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123); -// Endian Drehung für 64 Bit Typen -static __device__ uint64_t cuda_swab64(uint64_t x) { - return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x))); + return temp[0] | (temp[1]<<32); } +#else + /* host */ + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +#endif // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt #if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) { - uint2 result; - if(offset < 32) { - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); +__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), 
"r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); } #else #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) @@ -70,16 +132,16 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt #if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); +__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); } #else #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index 3a992e0..cbdf77f 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -1,23 +1,16 @@ // Auf Myriadcoin spezialisierte Version von Groestl inkl. 
Bitslice -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include +#include "cuda_helper.h" + // aus cpu-miner.c extern int device_map[8]; // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; - // diese Struktur wird in der Init Funktion angefordert static cudaDeviceProp props[8]; diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 4e37f69..0561bba 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -1,4 +1,3 @@ - extern "C" { #include "sph/sph_blake.h" @@ -7,10 +6,9 @@ extern "C" #include "sph/sph_jh.h" #include "sph/sph_keccak.h" #include "miner.h" +#include "cuda_helper.h" } -#include - // aus cpu-miner.c extern int device_map[8]; @@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); +extern void cuda_check_cpu_init(int thr_id, int threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); // Original nist5hash Funktion aus einem miner Quelltext -inline void nist5hash(void *state, const void *input) +extern "C" void nist5hash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_groestl512_context ctx_groestl; @@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, quark_jh512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; } @@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); quark_blake512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); + cuda_check_cpu_setTarget(ptarget); do { int order = 0; - // erstes Blake512 Hash mit CUDA + // Hash with CUDA quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Groestl512 quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für JH512 quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Keccak512 quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { 
uint32_t vhash64[8]; diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu index b7839ab..75f31c6 100644 --- a/groestl_functions_quad.cu +++ b/groestl_functions_quad.cu @@ -1,3 +1,4 @@ +#include "cuda_helper.h" __device__ __forceinline__ void G256_Mul2(uint32_t *regs) { diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu index dfa4240..ea4b82b 100644 --- a/heavy/cuda_blake512.cu +++ b/heavy/cuda_blake512.cu @@ -1,14 +1,7 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include "cuda_helper.h" // globaler Speicher für alle HeftyHashes aller Threads extern uint32_t *d_heftyHashes[8]; @@ -20,7 +13,6 @@ uint32_t *d_hash5output[8]; // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding) -#include "cuda_helper.h" // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ @@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] = { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } }; -// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam. -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam. -#define SWAP64(x) \ - ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ - (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ - (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ - (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ - (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ - (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ - (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ - (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +/* in cuda_helper */ +#define SWAP32(x) cuda_swab32(x) +#define SWAP64(x) cuda_swab64(x) __constant__ uint64_t c_SecondRound[15]; diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu index e2a8b72..329c831 100644 --- a/heavy/cuda_combine.cu +++ b/heavy/cuda_combine.cu @@ -1,9 +1,4 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; +#include "cuda_helper.h" // globaler Speicher für unsere Ergebnisse uint32_t *d_hashoutput[8]; diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu index 5b1b267..764b072 100644 --- a/heavy/cuda_groestl512.cu +++ b/heavy/cuda_groestl512.cu @@ -1,14 +1,7 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include "cuda_helper.h" // globaler Speicher für alle HeftyHashes aller Threads extern uint32_t *d_heftyHashes[8]; @@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len) cudaMemcpyToSymbol( groestl_gpu_msg, msgBlock, 128); - BLOCKSIZE = len; } diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu index 4b60818..dcbc4d4 100644 --- a/heavy/cuda_hefty1.cu +++ b/heavy/cuda_hefty1.cu @@ -1,10 +1,9 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include 
+#include "cuda_helper.h" +#include + #define USE_SHARED 1 // aus cpu-miner.c @@ -13,11 +12,6 @@ extern int device_map[8]; // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; - // diese Struktur wird in der Init Funktion angefordert static cudaDeviceProp props[8]; diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu index eb69e3b..b5ee321 100644 --- a/heavy/cuda_keccak512.cu +++ b/heavy/cuda_keccak512.cu @@ -1,14 +1,7 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include "cuda_helper.h" // globaler Speicher für alle HeftyHashes aller Threads extern uint32_t *d_heftyHashes[8]; @@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const uint64_t t[5], u[5], v, w; /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) + #pragma unroll 9 + for (i = 0; i < 9 /* 72/8 */; i++, in += 2) s[i] ^= U32TO64_LE(in); for (i = 0; i < 24; i++) { diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu index f520778..043422b 100644 --- a/heavy/cuda_sha256.cu +++ b/heavy/cuda_sha256.cu @@ -1,12 +1,7 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; +#include "cuda_helper.h" // globaler Speicher für alle HeftyHashes aller Threads extern uint32_t *d_heftyHashes[8]; diff --git a/heavy/heavy.cu b/heavy/heavy.cu index e9e7907..8abf303 100644 --- a/heavy/heavy.cu +++ b/heavy/heavy.cu @@ -1,7 +1,3 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include #include @@ -34,6 +30,8 @@ #include "heavy/cuda_blake512.h" #include "heavy/cuda_combine.h" +#include "cuda_helper.h" + extern uint32_t *d_hash2output[8]; extern uint32_t *d_hash3output[8]; extern uint32_t *d_hash4output[8]; diff --git a/miner.h b/miner.h index efdb81b..97bac0b 100644 --- a/miner.h +++ b/miner.h @@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void groestlhash(void *state, const void *input); void myriadhash(void *state, const void *input); +void nist5hash(void *state, const void *input); void quarkhash(void *state, const void *input); void x11hash(void *output, const void *input); void x13hash(void *output, const void *input); diff --git a/quark/animecoin.cu b/quark/animecoin.cu index 6d395be..c19275d 100644 --- a/quark/animecoin.cu +++ b/quark/animecoin.cu @@ -1,291 +1,292 @@ - -extern "C" -{ -#include "sph/sph_blake.h" -#include "sph/sph_bmw.h" -#include "sph/sph_groestl.h" -#include "sph/sph_skein.h" -#include "sph/sph_jh.h" -#include "sph/sph_keccak.h" -#include "miner.h" -} - -#include - -// aus cpu-miner.c -extern int device_map[8]; - -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; - -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_animeNonces[8]; -static uint32_t *d_branch1Nonces[8]; -static uint32_t *d_branch2Nonces[8]; -static uint32_t *d_branch3Nonces[8]; - -extern void 
quark_blake512_cpu_init(int thr_id, int threads); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_bmw512_cpu_init(int thr_id, int threads); -extern void quark_bmw512_cpu_setBlock_80(void *pdata); -extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order); -extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); - -extern void quark_groestl512_cpu_init(int thr_id, int threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_skein512_cpu_init(int thr_id, int threads); -extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_keccak512_cpu_init(int thr_id, int threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_jh512_cpu_init(int thr_id, int threads); -extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); - -extern void quark_compactTest_cpu_init(int thr_id, int threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - uint32_t *d_nonces2, size_t *nrm2, - int order); -extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - int order); - -// Original Quarkhash Funktion aus einem miner Quelltext -inline void animehash(void *state, const void *input) -{ - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - unsigned char hash[64]; - - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const void*) input, 80); - sph_bmw512_close(&ctx_bmw, (void*) hash); - - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, hash, 64); - sph_blake512_close(&ctx_blake, (void*) hash); - - if (hash[0] & 0x8) - { - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - } - else - { - sph_skein512_init(&ctx_skein); - // ZSKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - } - - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - 
sph_jh512_close(&ctx_jh, (void*) hash); - - if (hash[0] & 0x8) - { - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, (const void*) hash, 64); - sph_blake512_close(&ctx_blake, (void*) hash); - } - else - { - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const void*) hash, 64); - sph_bmw512_close(&ctx_bmw, (void*) hash); - } - - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - - sph_skein512_init(&ctx_skein); - // SKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - - if (hash[0] & 0x8) - { - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - } - else - { - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - } - - memcpy(state, hash, 32); -} - - -struct HashPredicate -{ - HashPredicate(uint32_t *hashes, uint32_t startNonce) : - m_hashes(hashes), - m_startNonce(startNonce) - { } - - __device__ - bool operator()(const uint32_t x) - { - uint32_t *hash = &m_hashes[(x - m_startNonce)*16]; - return hash[0] & 0x8; - } - - uint32_t *m_hashes; - uint32_t m_startNonce; -}; - -extern bool opt_benchmark; - -extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) -{ - const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00000f; - - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*2048; // 100; - - static bool init[8] = {0,0,0,0,0,0,0,0}; - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - quark_blake512_cpu_init(thr_id, throughput); - quark_groestl512_cpu_init(thr_id, throughput); - quark_skein512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); - quark_keccak512_cpu_init(thr_id, throughput); - quark_jh512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); - quark_compactTest_cpu_init(thr_id, throughput); - cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); - init[thr_id] = true; - } - - uint32_t endiandata[20]; - for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - quark_bmw512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); - - do { - int order = 0; - size_t nrm1=0, nrm2=0, nrm3=0; - - // erstes BMW512 Hash mit CUDA - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, - order++); - - // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], 
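// [editor's note] For orientation, the anime chain implemented by animehash()
// and mirrored by the GPU pipeline below, with branches keyed on
// (hash[0] & 0x8):
//   bmw512(80-byte header) -> blake512 -> (groestl512 | skein512)
//   -> groestl512 -> jh512 -> (blake512 | bmw512) -> keccak512
//   -> skein512 -> (keccak512 | jh512) -> first 32 bytes of the digest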
d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); - - // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der bedingte Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); - - // das ist der bedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der bedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) - { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - animehash(vhash64, endiandata); - - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { - - pdata[19] = foundNonce; - *hashes_done = (foundNonce - first_nonce + 1)/2; - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); - } - } - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = (pdata[19] - first_nonce + 1)/2; - return 0; -} +extern "C" +{ +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "miner.h" +#include "cuda_helper.h" +} + +// aus cpu-miner.c +extern int device_map[8]; + +// Speicher für Input/Output der verketteten Hashfunktionen +static uint32_t *d_hash[8]; + +// Speicher zur Generierung der Noncevektoren für die bedingten Hashes +static uint32_t *d_animeNonces[8]; +static uint32_t *d_branch1Nonces[8]; +static uint32_t *d_branch2Nonces[8]; +static uint32_t *d_branch3Nonces[8]; + +extern void quark_blake512_cpu_init(int thr_id, int threads); +extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_bmw512_cpu_init(int thr_id, int threads); +extern void quark_bmw512_cpu_setBlock_80(void *pdata); +extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int 
order); + +extern void quark_groestl512_cpu_init(int thr_id, int threads); +extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_skein512_cpu_init(int thr_id, int threads); +extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_keccak512_cpu_init(int thr_id, int threads); +extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_jh512_cpu_init(int thr_id, int threads); +extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void cuda_check_cpu_init(int thr_id, int threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + +extern void quark_compactTest_cpu_init(int thr_id, int threads); +extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, + uint32_t *d_nonces1, size_t *nrm1, + uint32_t *d_nonces2, size_t *nrm2, + int order); +extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, + uint32_t *d_nonces1, size_t *nrm1, + int order); + +// Original Quarkhash Funktion aus einem miner Quelltext +inline void animehash(void *state, const void *input) +{ + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + unsigned char hash[64]; + + sph_bmw512_init(&ctx_bmw); + // ZBMW; + sph_bmw512 (&ctx_bmw, (const void*) input, 80); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_blake512_init(&ctx_blake); + // ZBLAKE; + sph_blake512 (&ctx_blake, hash, 64); + sph_blake512_close(&ctx_blake, (void*) hash); + + if (hash[0] & 0x8) + { + sph_groestl512_init(&ctx_groestl); + // ZGROESTL; + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + } + else + { + sph_skein512_init(&ctx_skein); + // ZSKEIN; + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + } + + sph_groestl512_init(&ctx_groestl); + // ZGROESTL; + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_jh512_init(&ctx_jh); + // ZJH; + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + if (hash[0] & 0x8) + { + sph_blake512_init(&ctx_blake); + // ZBLAKE; + sph_blake512 (&ctx_blake, (const void*) hash, 64); + sph_blake512_close(&ctx_blake, (void*) hash); + } + else + { + sph_bmw512_init(&ctx_bmw); + // ZBMW; + sph_bmw512 (&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + } + + sph_keccak512_init(&ctx_keccak); + // ZKECCAK; + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_skein512_init(&ctx_skein); + // SKEIN; + 
sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + if (hash[0] & 0x8) + { + sph_keccak512_init(&ctx_keccak); + // ZKECCAK; + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + } + else + { + sph_jh512_init(&ctx_jh); + // ZJH; + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + } + + memcpy(state, hash, 32); +} + + +struct HashPredicate +{ + HashPredicate(uint32_t *hashes, uint32_t startNonce) : + m_hashes(hashes), + m_startNonce(startNonce) + { } + + __device__ + bool operator()(const uint32_t x) + { + uint32_t *hash = &m_hashes[(x - m_startNonce)*16]; + return hash[0] & 0x8; + } + + uint32_t *m_hashes; + uint32_t m_startNonce; +}; + +extern bool opt_benchmark; + +extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00000f; + + const uint32_t Htarg = ptarget[7]; + + const int throughput = 256*2048; // 100; + + static bool init[8] = {0,0,0,0,0,0,0,0}; + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + // Konstanten kopieren, Speicher belegen + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); + quark_compactTest_cpu_init(thr_id, throughput); + + cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput); + cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); + cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); + cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + quark_bmw512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + size_t nrm1=0, nrm2=0, nrm3=0; + + // erstes BMW512 Hash mit CUDA + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Blake512 + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, + d_branch3Nonces[thr_id], &nrm3, + order++); + + // nur den Skein Branch weiterverfolgen + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Groestl512 + quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für JH512 + quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], + d_branch1Nonces[thr_id], &nrm1, + d_branch2Nonces[thr_id], &nrm2, + order++); + + // das ist der bedingte Branch für Blake512 + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], 
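// [editor's note] Pattern used in this scanhash loop: rather than letting a
// kernel diverge on (hash[0] & 0x8), the nonce list is first compacted into
// two dense lists and each branch kernel then runs on one list only. Sketch
// with this file's names:
//
//   quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19],
//       d_hash[thr_id], d_branch3Nonces[thr_id],
//       d_branch1Nonces[thr_id], &nrm1,   /* predicate true  */
//       d_branch2Nonces[thr_id], &nrm2,   /* predicate false */
//       order++);
//
// so the conditional blake512 call below only processes the nrm1 nonces of
// branch 1, and the conditional bmw512 call the nrm2 nonces of branch 2.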
d_hash[thr_id], order++); + + // das ist der bedingte Branch für Bmw512 + quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Keccak512 + quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Skein512 + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], + d_branch1Nonces[thr_id], &nrm1, + d_branch2Nonces[thr_id], &nrm2, + order++); + + // das ist der bedingte Branch für Keccak512 + quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der bedingte Branch für JH512 + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + + // Scan nach Gewinner Hashes auf der GPU + uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + animehash(vhash64, endiandata); + + if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { + + pdata[19] = foundNonce; + *hashes_done = (foundNonce - first_nonce + 1)/2; + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = (pdata[19] - first_nonce + 1)/2; + return 0; +} diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index e5f2d6a..d5e3f4a 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -1,140 +1,9 @@ #if 1 -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; - -// Endian Drehung für 32 Bit Typen -/* -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} -*/ -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return __byte_perm(x, 0, 0x0123); -} - -// das Hi Word in einem 64 Bit Typen ersetzen -static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) { - return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL); -} - -#if 0 -// Endian Drehung für 64 Bit Typen -static __device__ unsigned long long cuda_swab64(unsigned long long x) { - uint32_t h = (x >> 32); - uint32_t l = (x & 0xFFFFFFFFULL); - return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h)); -} - -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const unsigned long long &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const unsigned long long &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} - -static __device__ unsigned long long 
MAKE_ULONGLONG(uint32_t LO, uint32_t HI) -{ -#if __CUDA_ARCH__ >= 130 - return __double_as_longlong(__hiloint2double(HI, LO)); -#else - return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL); -#endif -} - -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y); -} -#endif - -// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen... -#if 1 -typedef unsigned long long uint64_t; -#else -typedef class uint64 -{ -public: - __device__ uint64() - { - } - __device__ uint64(unsigned long long init) - { - val = make_uint2( LOWORD(init), HIWORD(init) ); - } - __device__ uint64(uint32_t lo, uint32_t hi) - { - val = make_uint2( lo, hi ); - } - __device__ const uint64 operator^(uint64 const& rhs) const - { - return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y); - } - __device__ const uint64 operator|(uint64 const& rhs) const - { - return uint64(val.x | rhs.val.x, val.y | rhs.val.y); - } - __device__ const uint64 operator+(unsigned long long const& rhs) const - { - return *this+uint64(rhs); - } - __device__ const uint64 operator+(uint64 const& rhs) const - { - uint64 res; - asm ("add.cc.u32 %0, %2, %4;\n\t" - "addc.cc.u32 %1, %3, %5;\n\t" - : "=r"(res.val.x), "=r"(res.val.y) - : "r"( val.x), "r"( val.y), - "r"(rhs.val.x), "r"(rhs.val.y)); - return res; - } - __device__ const uint64 operator-(uint64 const& rhs) const - { - uint64 res; - asm ("sub.cc.u32 %0, %2, %4;\n\t" - "subc.cc.u32 %1, %3, %5;\n\t" - : "=r"(res.val.x), "=r"(res.val.y) - : "r"( val.x), "r"( val.y), - "r"(rhs.val.x), "r"(rhs.val.y)); - return res; - } - __device__ const uint64 operator<<(int n) const - { - return uint64(unsigned long long(*this)<>(int n) const - { - return uint64(unsigned long long(*this)>>n); - } - __device__ operator unsigned long long() const - { - return MAKE_ULONGLONG(val.x, val.y); - } - uint2 val; -} uint64_t; -#endif +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); @@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t // die Message it Padding zur Berechnung auf der GPU __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) -#define SPH_C64(x) ((uint64_t)(x ## ULL)) - // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return 
__double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -#endif #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) diff --git a/quark/cuda_checkhash.cu b/quark/cuda_checkhash.cu index a3debca..3c41a02 100644 --- a/quark/cuda_checkhash.cu +++ b/quark/cuda_checkhash.cu @@ -1,11 +1,8 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include -#include #include +#include "cuda_helper.h" + // Hash Target gegen das wir testen sollen __constant__ uint32_t pTarget[8]; @@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32 } // Setup-Funktionen -__host__ void quark_check_cpu_init(int thr_id, int threads) +__host__ void cuda_check_cpu_init(int thr_id, int threads) { cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)); cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)); } // Target Difficulty setzen -__host__ void quark_check_cpu_setTarget(const void *ptarget) +__host__ void cuda_check_cpu_setTarget(const void *ptarget) { // die Message zur Berechnung auf der GPU cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); } -__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) { uint32_t result = 0xffffffff; cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index c55ac1a..25f4823 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -1,4 +1,4 @@ -#include +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 16980fa..6ab6a30 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -1,16 +1,11 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -#define USE_SHUFFLE 0 +#include "cuda_helper.h" -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#define ROTR(x,n) ROTR64(x,n) + +#define USE_SHUFFLE 0 // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); @@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] = { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } }; -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Hi Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} -#if 0 -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); -} 
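// [editor's note] Usage of the renamed check-hash API (quark_check_* ->
// cuda_check_*), as wired up in the scanhash functions of this patch:
//
//   cuda_check_cpu_init(thr_id, throughput);   /* once: result buffers   */
//   cuda_check_cpu_setTarget(ptarget);         /* copy target to the GPU */
//   uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput,
//       pdata[19], NULL, d_hash[thr_id], order++);
//   if (foundNonce != 0xffffffff)
//       /* candidate: recompute on the CPU and fulltest() before submit */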
-#endif
-
-__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
-{
-	// Input:  77665544 33221100
-	// Output: 00112233 44556677
-	uint64_t temp[2];
-	temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
-	temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
-
-	return temp[0] | (temp[1]<<32);
-}
-
-__constant__ uint64_t c_u512[16];
-
-const uint64_t host_u512[16] =
+__device__ __constant__
+const uint64_t c_u512[16] =
 {
 	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
 	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
@@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
 	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };
 
-// these 64-bit rotates are accelerated by the funnel shifter on Compute 3.5 (and better)
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset < 32) {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
-#endif
-
 #define G(a,b,c,d,e) \
 	v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
 	v[d] = ROTR( v[d] ^ v[a],32); \
@@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
 	v[b] = ROTR( v[b] ^ v[c],11);
 
 
-__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
+__device__ static
+void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
 {
 	uint64_t v[16], m[16], i;
 
 #pragma unroll 16
-	for( i = 0; i < 16; ++i )
-	{
-		m[i] = SWAP64(block[i]);
+	for( i = 0; i < 16; ++i ) {
+		m[i] = cuda_swab64(block[i]);
 	}
 
 #pragma unroll 8
@@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
 	for( i = 0; i < 16; ++i )  h[i % 8] ^= v[i];
 }
 
-// endianness swap for 32-bit types
-
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-/*
-// endianness swap for 64-bit types
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-	uint32_t h = (x >> 32);
-	uint32_t l = (x & 0xFFFFFFFFULL);
-	return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
-}
-*/
-
-static __constant__ uint64_t d_constMem[8];
-static const uint64_t h_constMem[8] = {
+__device__ __constant__
+static const uint64_t d_constMem[8] = {
 	0x6a09e667f3bcc908ULL,
 	0xbb67ae8584caa73bULL,
 	0x3c6ef372fe94f82bULL,
 	0xa54ff53a5f1d36f1ULL,
 	0x510e527fade682d1ULL,
 	0x9b05688c2b3e6c1fULL,
 	0x1f83d9abfb41bd6bULL,
 	0x5be0cd19137e2179ULL };
 
 // hash padding
-static __constant__ uint64_t d_constHashPadding[8];
-static const uint64_t h_constHashPadding[8] = {
+__device__ __constant__
+static const uint64_t d_constHashPadding[8] = {
 	0x0000000000000080ull,
 	0,
 	0,
 	0,
 	0,
 	0x0100000000000000ull,
 	0,
 	0x0002000000000000ull };
 
-__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
+__global__ __launch_bounds__(256, 4)
+void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
 	if (thread < threads)
 #endif
 	{
+		uint8_t i;
 		// determine the current counter
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
 		int hashPosition = nounce - startNounce;
-		//uint64_t *inpHash = &g_hash[8 * hashPosition];
-		uint64_t *inpHash = &g_hash[hashPosition<<3];
+		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
+
+		// 128 bytes for the message
+		uint64_t buf[16];
 
 		// prepare the state
 		uint64_t h[8];
-		/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-		*/
-#pragma unroll 8
-		for(int i=0;i<8;i++)
+		#pragma unroll 8
+		for (i=0;i<8;i++)
 			h[i] = d_constMem[i];
 
-		// 128 bytes for the message
-		uint64_t buf[16];
-
 		// fetch the message for the first round into registers
-#pragma unroll 8
-		for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
-
-		/*
-		buf[ 8] = 0x0000000000000080ull;
-		buf[ 9] = 0;
-		buf[10] = 0;
-		buf[11] = 0;
-		buf[12] = 0;
-		buf[13] = 0x0100000000000000ull;
-		buf[14] = 0;
-		buf[15] = 0x0002000000000000ull;
-		*/
-#pragma unroll 8
-		for(int i=0;i<8;i++)
+		#pragma unroll 8
+		for (i=0; i < 8; ++i)
+			buf[i] = inpHash[i];
+
+		#pragma unroll 8
+		for (i=0; i < 8; i++)
 			buf[i+8] = d_constHashPadding[i];
 
 		// the only hashing round
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
 
-		// emit the hash
 #if __CUDA_ARCH__ >= 130
 		// only 32-bit operations, provided the SM 1.3 double intrinsics are available
 		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
-#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		#pragma unroll 8
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
 		// this version also performs a few 64-bit shifts
 		uint64_t *outHash = &g_hash[8 * hashPosition];
-#pragma unroll 8
-		for (int i=0; i < 8; ++i)
+		#pragma unroll 8
+		for (i=0; i < 8; ++i)
 		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+			outHash[i] = cuda_swab64(h[i]);
 		}
 #endif
 	}
@@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// determine the current counter
-		uint32_t nounce = startNounce + thread;
-
 		// prepare the state
 		uint64_t h[8];
-		/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-		*/
-#pragma unroll 8
-		for(int i=0;i<8;i++)
-			h[i] = d_constMem[i];
 
 		// 128 bytes for the message
 		uint64_t buf[16];
+		uint8_t i;
+		// determine the current counter
+		uint32_t nounce = startNounce + thread;
+
+		#pragma unroll 8
+		for(i=0;i<8;i++)
+			h[i] = 
d_constMem[i];
 
 		// fetch the message for the first round into registers
-#pragma unroll 16
-		for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
+		#pragma unroll 16
+		for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
 
 		// replace the nonce with the thread-specific one
 		buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
@@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 #if __CUDA_ARCH__ >= 130
 		// only 32-bit operations, provided the SM 1.3 double intrinsics are available
 		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
-#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		#pragma unroll 8
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
 		// this version also performs a few 64-bit shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
-#pragma unroll 8
-		for (int i=0; i < 8; ++i)
-		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+		#pragma unroll 8
+		for (i=0; i < 8; ++i) {
+			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
 	}
@@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
 						host_sigma,
 						sizeof(host_sigma),
 						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( c_u512,
-						host_u512,
-						sizeof(host_u512),
-						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constMem,
-						h_constMem,
-						sizeof(h_constMem),
-						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constHashPadding,
-						h_constHashPadding,
-						sizeof(h_constHashPadding),
-						0, cudaMemcpyHostToDevice);
 }
 
 // Blake512 for 80-byte input data
diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu
index 2eb2852..0a771db 100644
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@@ -1,371 +1,368 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "sm_30_intrinsics.h"
-
-#include <stdio.h>
-#include <memory.h>
-#include <string.h>
-
-// from cpu-miner.c
-extern int device_map[8];
-
-// this structure is requested in the init function
-static cudaDeviceProp props[8];
-
-static uint32_t *d_tempBranch1Nonces[8];
-static uint32_t *d_numValid[8];
-static uint32_t *h_numValid[8];
-
-static uint32_t *d_partSum[2][8]; // for up to four partial sums
-
-// from heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
-// True/False tester
-typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
-
-__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
-{
-	return ((inpHash[0] & 0x08) == 0x08);
-}
-
-__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
-{
-	return ((inpHash[0] & 0x08) == 0);
-}
-
-__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
-cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
-
-// setup functions
-__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
-{
-	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
-	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
-	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
-
-	// we also need memory on the device
-	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
-	cudaMalloc(&d_numValid[thr_id], 
2*sizeof(uint32_t));
-	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
-
-	uint32_t s1;
-	s1 = (threads / 256) * 2;
-
-	cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
-	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
-}
-
-#if __CUDA_ARCH__ < 300
-/**
- * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
- */
-#undef __shfl_up
-#define __shfl_up(var, delta, width) (0)
-#endif
-
-// the sum (scan) function (from the NVIDIA SDK)
-__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
-{
-	extern __shared__ uint32_t sums[];
-	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-	//int lane_id = id % warpSize;
-	int lane_id = id % width;
-	// determine a warp_id within a block
-	//int warp_id = threadIdx.x / warpSize;
-	int warp_id = threadIdx.x / width;
-
-	sums[lane_id] = 0;
-
-	// Below is the basic structure of using a shfl instruction
-	// for a scan.
-	// Record "value" as a variable - we accumulate it along the way
-	uint32_t value;
-	if(testFunc != NULL)
-	{
-		if (id < threads)
-		{
-			uint32_t *inpHash;
-			if(d_validNonceTable == NULL)
-			{
-				// no nonce list
-				inpHash = &inpHashes[id<<4];
-			}else
-			{
-				// nonce list available
-				int nonce = d_validNonceTable[id] - startNounce;
-				inpHash = &inpHashes[nonce<<4];
-			}
-			value = (*testFunc)(inpHash);
-		}else
-		{
-			value = 0;
-		}
-	}else
-	{
-		value = data[id];
-	}
-
-	__syncthreads();
-
-	// Now accumulate in log steps up the chain
-	// compute sums, with another thread's value who is
-	// distance delta away (i). Note
-	// those threads where the thread 'i' away would have
-	// been out of bounds of the warp are unaffected. This
-	// creates the scan sum.
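// (A minimal standalone sketch of the __shfl_up inclusive warp scan described
// above; it is not part of this patch, and warp_inclusive_scan is an
// illustrative name. It assumes width is the warp size and each lane starts
// with its own 0/1 predicate value.)
//
//   __device__ uint32_t warp_inclusive_scan(uint32_t value, int width)
//   {
//       int lane_id = threadIdx.x % width;
//       for (int i = 1; i < width; i *= 2) {
//           uint32_t n = __shfl_up((int)value, i, width); // value of lane (lane_id - i)
//           if (lane_id >= i)                             // lanes without a source keep theirs
//               value += n;
//       }
//       return value; // each lane now holds the sum of lanes 0..lane_id
//   }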
-#pragma unroll
-
-	for (int i=1; i<=width; i*=2)
-	{
-		uint32_t n = __shfl_up((int)value, i, width);
-
-		if (lane_id >= i) value += n;
-	}
-
-	// value now holds the scan value for the individual thread
-	// next sum the largest values for each warp
-
-	// write the sum of the warp to smem
-	//if (threadIdx.x % warpSize == warpSize-1)
-	if (threadIdx.x % width == width-1)
-	{
-		sums[warp_id] = value;
-	}
-
-	__syncthreads();
-
-	//
-	// scan sum the warp sums
-	// the same shfl scan operation, but performed on warp sums
-	//
-	if (warp_id == 0)
-	{
-		uint32_t warp_sum = sums[lane_id];
-
-		for (int i=1; i<=width; i*=2)
-		{
-			uint32_t n = __shfl_up((int)warp_sum, i, width);
-
-			if (lane_id >= i) warp_sum += n;
-		}
-
-		sums[lane_id] = warp_sum;
-	}
-
-	__syncthreads();
-
-	// perform a uniform add across warps in the block
-	// read neighbouring warp's sum and add it to the thread's value
-	uint32_t blockSum = 0;
-
-	if (warp_id > 0)
-	{
-		blockSum = sums[warp_id-1];
-	}
-
-	value += blockSum;
-
-	// Now write out our result
-	data[id] = value;
-
-	// last thread has sum, write out the block's sum
-	if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
-	{
-		partial_sums[blockIdx.x] = value;
-	}
-}
-
-// Uniform add: add partial sums array
-__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
-{
-	__shared__ uint32_t buf;
-	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-
-	if (id > len) return;
-
-	if (threadIdx.x == 0)
-	{
-		buf = partial_sums[blockIdx.x];
-	}
-
-	__syncthreads();
-	data[id] += buf;
-}
-
-// the scatter
-__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
-{
-	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-	uint32_t actNounce = id;
-	uint32_t value;
-	if (id < threads)
-	{
-//		uint32_t nounce = startNounce + id;
-		uint32_t *inpHash;
-		if(d_validNonceTable == NULL)
-		{
-			// no nonce list
-			inpHash = &inpHashes[id<<4];
-		}else
-		{
-			// nonce list available
-			int nonce = d_validNonceTable[id] - startNounce;
-			actNounce = nonce;
-			inpHash = &inpHashes[nonce<<4];
-		}
-
-		value = (*testFunc)(inpHash);
-	}else
-	{
-		value = 0;
-	}
-
-	if( value )
-	{
-		int idx = sum[id];
-		if(idx > 0)
-			outp[idx-1] = startNounce + actNounce;
-	}
-}
-
-__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
-{
-	if(val == 0)
-		return 0;
-
-	uint32_t mask = 0x80000000;
-	while( (val & mask) == 0 ) mask = mask >> 1;
-
-	if( (val & (~mask)) != 0 )
-		return mask << 1;
-
-	return mask;
-}
-
-__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
-											uint32_t *d_nonces1, cuda_compactTestFunction_t function,
-											uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
-{
-	int orgThreads = threads;
-	threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
-	// compute threadsPerBlock
-	int blockSize = 256;
-	int nSummen = threads / blockSize;
-
-	int thr1 = (threads+blockSize-1) / blockSize;
-	int thr2 = threads / (blockSize*blockSize);
-	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
-	int thr3 = (nSummen + blockSize2-1) / blockSize2;
-
-	bool callThrid = (thr2 > 0) ? 
true : false;
-
-	// first initial scan
-	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
-		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
-
-	// further scans
-	if(callThrid)
-	{
-		quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
-		quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
-	}else
-	{
-		quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
-	}
-
-	// sync + record the count
-	cudaStreamSynchronize(NULL);
-
-	if(callThrid)
-		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	else
-		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-
-	// add
-	if(callThrid)
-	{
-		quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
-	}
-	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
-
-	// scatter
-	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
-		function, orgThreads, startNounce, inpHashes, d_validNonceTable);
-
-	// sync
-	cudaStreamSynchronize(NULL);
-}
-
-////// WARNING: this function currently only works with threads > 65536 (ideally 256 * 1024 or 256*2048)
-__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
-													uint32_t *d_nonces1, uint32_t *d_nonces2,
-													uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
-{
-	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
-	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
-
-	/*
-	// compute threadsPerBlock
-	int blockSize = 256;
-	int thr1 = threads / blockSize;
-	int thr2 = threads / (blockSize*blockSize);
-
-	// 1
-	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
-	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
-	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 
32 : thr2);
-	cudaStreamSynchronize(NULL);
-	cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
-	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
-
-	// 2
-	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
-	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
-	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
-	cudaStreamSynchronize(NULL);
-	cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
-	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
-
-	// one peculiarity here: in d_tempBranch1Nonces the elements 1...nrm1 are the interesting ones
-	// step 3: scatter
-	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
-	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
-	cudaStreamSynchronize(NULL);
-	*/
-}
-
-__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-											uint32_t *d_nonces1, size_t *nrm1,
-											uint32_t *d_nonces2, size_t *nrm2,
-											int order)
-{
-	// if validNonceTable is used, only the nonces contained in it are considered
-	// in that case "threads" must be set to the length of that array!
-
-	quark_compactTest_cpu_dualCompaction(thr_id, threads,
-		h_numValid[thr_id], d_nonces1, d_nonces2,
-		startNounce, inpHashes, d_validNonceTable);
-
-	cudaStreamSynchronize(NULL); // the original costs some CPU load, but may be preferable at this point
-	*nrm1 = (size_t)h_numValid[thr_id][0];
-	*nrm2 = (size_t)h_numValid[thr_id][1];
-}
-
-__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-											uint32_t *d_nonces1, size_t *nrm1,
-											int order)
-{
-	// if validNonceTable is used, only the nonces contained in it are considered
-	// in that case "threads" must be set to the length of that array!
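// (A hedged usage sketch, not part of this patch: how a caller such as
// scanhash_quark might drive the dual compaction. The d_hash and
// d_branch*Nonces buffers and the "order" counter are the ones used in
// quark/quarkcoin.cu; nrm1/nrm2 receive the number of nonces that passed the
// true/false test respectively. Passing NULL instead of a nonce table scans
// the full [startNounce, startNounce+threads) range; otherwise "threads" must
// equal the length of that table, as the comment above notes.)
//
//   size_t nrm1 = 0, nrm2 = 0;
//   quark_compactTest_cpu_hash_64(thr_id, throughput, pdata[19],
//       d_hash[thr_id], NULL,
//       d_branch1Nonces[thr_id], &nrm1,
//       d_branch2Nonces[thr_id], &nrm2, order++);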
-
-	quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
-
-	cudaStreamSynchronize(NULL); // the original costs some CPU load, but may be preferable at this point
-	*nrm1 = (size_t)h_numValid[thr_id][0];
-}
+#include <stdio.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+#include <sm_30_intrinsics.h>
+
+// from cpu-miner.c
+extern int device_map[8];
+
+// this structure is requested in the init function
+static cudaDeviceProp props[8];
+
+static uint32_t *d_tempBranch1Nonces[8];
+static uint32_t *d_numValid[8];
+static uint32_t *h_numValid[8];
+
+static uint32_t *d_partSum[2][8]; // for up to four partial sums
+
+// from heavy.cu
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
+// True/False tester
+typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
+
+__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
+{
+	return ((inpHash[0] & 0x08) == 0x08);
+}
+
+__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
+{
+	return ((inpHash[0] & 0x08) == 0);
+}
+
+__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
+cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
+
+// setup functions
+__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
+{
+	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
+
+	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
+	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
+
+	// we also need memory on the device
+	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
+	cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
+	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
+
+	uint32_t s1;
+	s1 = (threads / 256) * 2;
+
+	cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
+	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
+}
+
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
+ */
+#undef __shfl_up
+#define __shfl_up(var, delta, width) (0)
+#endif
+
+// the sum (scan) function (from the NVIDIA SDK)
+__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+{
+	extern __shared__ uint32_t sums[];
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+	//int lane_id = id % warpSize;
+	int lane_id = id % width;
+	// determine a warp_id within a block
+	//int warp_id = threadIdx.x / warpSize;
+	int warp_id = threadIdx.x / width;
+
+	sums[lane_id] = 0;
+
+	// Below is the basic structure of using a shfl instruction
+	// for a scan.
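// (A small worked example, illustrative only: with width = 4 and lane
// predicate values {1,0,1,1}, the scan below proceeds as
//    i=1: {1, 0+1, 1+0, 1+1} = {1,1,1,2}
//    i=2: {1, 1,   1+1, 2+1} = {1,1,2,3}
// so each lane ends up with the inclusive prefix sum {1,1,2,3}; the last
// lane's value, 3, is the number of matching nonces in this group.)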
+	// Record "value" as a variable - we accumulate it along the way
+	uint32_t value;
+	if(testFunc != NULL)
+	{
+		if (id < threads)
+		{
+			uint32_t *inpHash;
+			if(d_validNonceTable == NULL)
+			{
+				// no nonce list
+				inpHash = &inpHashes[id<<4];
+			}else
+			{
+				// nonce list available
+				int nonce = d_validNonceTable[id] - startNounce;
+				inpHash = &inpHashes[nonce<<4];
+			}
+			value = (*testFunc)(inpHash);
+		}else
+		{
+			value = 0;
+		}
+	}else
+	{
+		value = data[id];
+	}
+
+	__syncthreads();
+
+	// Now accumulate in log steps up the chain
+	// compute sums, with another thread's value who is
+	// distance delta away (i). Note
+	// those threads where the thread 'i' away would have
+	// been out of bounds of the warp are unaffected. This
+	// creates the scan sum.
+#pragma unroll
+
+	for (int i=1; i<=width; i*=2)
+	{
+		uint32_t n = __shfl_up((int)value, i, width);
+
+		if (lane_id >= i) value += n;
+	}
+
+	// value now holds the scan value for the individual thread
+	// next sum the largest values for each warp
+
+	// write the sum of the warp to smem
+	//if (threadIdx.x % warpSize == warpSize-1)
+	if (threadIdx.x % width == width-1)
+	{
+		sums[warp_id] = value;
+	}
+
+	__syncthreads();
+
+	//
+	// scan sum the warp sums
+	// the same shfl scan operation, but performed on warp sums
+	//
+	if (warp_id == 0)
+	{
+		uint32_t warp_sum = sums[lane_id];
+
+		for (int i=1; i<=width; i*=2)
+		{
+			uint32_t n = __shfl_up((int)warp_sum, i, width);
+
+			if (lane_id >= i) warp_sum += n;
+		}
+
+		sums[lane_id] = warp_sum;
+	}
+
+	__syncthreads();
+
+	// perform a uniform add across warps in the block
+	// read neighbouring warp's sum and add it to the thread's value
+	uint32_t blockSum = 0;
+
+	if (warp_id > 0)
+	{
+		blockSum = sums[warp_id-1];
+	}
+
+	value += blockSum;
+
+	// Now write out our result
+	data[id] = value;
+
+	// last thread has sum, write out the block's sum
+	if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
+	{
+		partial_sums[blockIdx.x] = value;
+	}
+}
+
+// Uniform add: add partial sums array
+__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
+{
+	__shared__ uint32_t buf;
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+
+	if (id > len) return;
+
+	if (threadIdx.x == 0)
+	{
+		buf = partial_sums[blockIdx.x];
+	}
+
+	__syncthreads();
+	data[id] += buf;
+}
+
+// the scatter
+__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+{
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+	uint32_t actNounce = id;
+	uint32_t value;
+	if (id < threads)
+	{
+//		uint32_t nounce = startNounce + id;
+		uint32_t *inpHash;
+		if(d_validNonceTable == NULL)
+		{
+			// no nonce list
+			inpHash = &inpHashes[id<<4];
+		}else
+		{
+			// nonce list available
+			int nonce = d_validNonceTable[id] - startNounce;
+			actNounce = nonce;
+			inpHash = &inpHashes[nonce<<4];
+		}
+
+		value = (*testFunc)(inpHash);
+	}else
+	{
+		value = 0;
+	}
+
+	if( value )
+	{
+		int idx = sum[id];
+		if(idx > 0)
+			outp[idx-1] = startNounce + actNounce;
+	}
+}
+
+__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
+{
+	if(val == 0)
+		return 0;
+
+	uint32_t mask = 0x80000000;
+	while( (val & mask) == 0 ) mask = mask >> 1;
+
+	if( (val & (~mask)) != 0 )
+		return mask << 1;
+
+	return mask;
+}
+
+__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
+											uint32_t *d_nonces1, 
cuda_compactTestFunction_t function,
+											uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+{
+	int orgThreads = threads;
+	threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
+	// compute threadsPerBlock
+	int blockSize = 256;
+	int nSummen = threads / blockSize;
+
+	int thr1 = (threads+blockSize-1) / blockSize;
+	int thr2 = threads / (blockSize*blockSize);
+	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
+	int thr3 = (nSummen + blockSize2-1) / blockSize2;
+
+	bool callThrid = (thr2 > 0) ? true : false;
+
+	// first initial scan
+	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
+		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
+
+	// further scans
+	if(callThrid)
+	{
+		quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
+		quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
+	}else
+	{
+		quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
+	}
+
+	// sync + record the count
+	cudaStreamSynchronize(NULL);
+
+	if(callThrid)
+		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	else
+		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+
+	// add
+	if(callThrid)
+	{
+		quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
+	}
+	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
+
+	// scatter
+	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
+		function, orgThreads, startNounce, inpHashes, d_validNonceTable);
+
+	// sync
+	cudaStreamSynchronize(NULL);
+}
+
+////// WARNING: this function currently only works with threads > 65536 (ideally 256 * 1024 or 256*2048)
+__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
+													uint32_t *d_nonces1, uint32_t *d_nonces2,
+													uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+{
+	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+
+	/*
+	// compute threadsPerBlock
+	int blockSize = 256;
+	int thr1 = threads / blockSize;
+	int thr2 = threads / (blockSize*blockSize);
+
+	// 1
+	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
+	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 
32 : thr2);
+	cudaStreamSynchronize(NULL);
+	cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
+	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
+
+	// 2
+	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
+	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
+	cudaStreamSynchronize(NULL);
+	cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
+	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
+
+	// one peculiarity here: in d_tempBranch1Nonces the elements 1...nrm1 are the interesting ones
+	// step 3: scatter
+	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
+	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
+	cudaStreamSynchronize(NULL);
+	*/
+}
+
+__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+											uint32_t *d_nonces1, size_t *nrm1,
+											uint32_t *d_nonces2, size_t *nrm2,
+											int order)
+{
+	// if validNonceTable is used, only the nonces contained in it are considered
+	// in that case "threads" must be set to the length of that array!
+
+	quark_compactTest_cpu_dualCompaction(thr_id, threads,
+		h_numValid[thr_id], d_nonces1, d_nonces2,
+		startNounce, inpHashes, d_validNonceTable);
+
+	cudaStreamSynchronize(NULL); // the original costs some CPU load, but may be preferable at this point
+	*nrm1 = (size_t)h_numValid[thr_id][0];
+	*nrm2 = (size_t)h_numValid[thr_id][1];
+}
+
+__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+											uint32_t *d_nonces1, size_t *nrm1,
+											int order)
+{
+	// if validNonceTable is used, only the nonces contained in it are considered
+	// in that case "threads" must be set to the length of that array!
+
+	quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+
+	cudaStreamSynchronize(NULL); // the original costs some CPU load, but may be preferable at this point
+	*nrm1 = (size_t)h_numValid[thr_id][0];
+}
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index 1b1843d..44c29e8 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -1,23 +1,16 @@
 // Groestl version specialized for QuarkCoin, incl. bitslice
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>
 
+#include "cuda_helper.h"
+
 // from cpu-miner.c
 extern int device_map[8];
 
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // this structure is requested in the init function
 static cudaDeviceProp props[8];
 
@@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"
 
-__global__ void __launch_bounds__(256, 4)
- quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(256, 4)
+void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
 	// divide by 4 because 4 threads together compute one hash
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu
index 775d5e2..2a6f2dc 100644
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@@ -1,27 +1,19 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>
 
-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"
 
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
-#include "cuda_helper.h"
-
 #define U32TO64_LE(p) \
 	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
 
 #define U64TO32_LE(p, v) \
 	*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
 
-static const uint64_t host_keccak_round_constants[24] = {
+__device__ __constant__
+static const uint64_t c_keccak_round_constants[24] = {
 	0x0000000000000001ull, 0x0000000000008082ull,
 	0x800000000000808aull, 0x8000000080008000ull,
 	0x000000000000808bull, 0x0000000080000001ull,
@@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
 	0x0000000080000001ull, 0x8000000080008008ull
 };
 
-__constant__ uint64_t c_keccak_round_constants[24];
-
 static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
 	size_t i;
@@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
 // setup functions
 __host__ void quark_keccak512_cpu_init(int thr_id, int threads)
 {
-	// copy the hash tables into GPU memory
-	cudaMemcpyToSymbol( c_keccak_round_constants,
-						host_keccak_round_constants,
-						sizeof(host_keccak_round_constants),
-						0, cudaMemcpyHostToDevice);
 }
 
 __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index d32e9c0..7008807 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -1,16 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
+#include <stdint.h>
 #include <memory.h>
 
-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
+#include "cuda_helper.h"
 
 // from cpu-miner.c
 extern "C" extern int device_map[8];
@@ -19,21 +11,6 @@ 
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t // Take a look at: https://www.schneier.com/skein1.3.pdf -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -#endif #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 34ba0bc..02679dd 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -1,4 +1,3 @@ - extern "C" { #include "sph/sph_blake.h" @@ -8,9 +7,9 @@ extern "C" #include "sph/sph_jh.h" #include "sph/sph_keccak.h" #include "miner.h" -} -#include +#include "cuda_helper.h" +} // aus cpu-miner.c extern int device_map[8]; @@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); +extern void cuda_check_cpu_init(int thr_id, int threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, @@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, // Konstanten kopieren, Speicher belegen cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + quark_blake512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); quark_compactTest_cpu_init(thr_id, throughput); + cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); + init[thr_id] = true; } @@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int 
thr_id, uint32_t *pdata, be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); quark_blake512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); + cuda_check_cpu_setTarget(ptarget); do { int order = 0; @@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/util.c b/util.c index 55b8f86..980cc2b 100644 --- a/util.c +++ b/util.c @@ -1352,6 +1352,10 @@ void print_hash_tests(void) myriadhash(&hash[0], &buf[0]); printf("\nmyriad: "); print_hash(hash); + memset(hash, 0, sizeof hash); + nist5hash(&hash[0], &buf[0]); + printf("\nnist5: "); print_hash(hash); + memset(hash, 0, sizeof hash); quarkhash(&hash[0], &buf[0]); printf("\nquark: "); print_hash(hash); diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index 939569d..53f82ba 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -1,30 +1,13 @@ -#include +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); typedef unsigned char BitSequence; -typedef unsigned long long DataLength; - -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -#if 0 -__device__ static uint32_t cuda_swab32(uint32_t x) -{ - return __byte_perm(x, 0, 0x0123); -} -#endif - -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ -typedef unsigned int uint32_t; /* must be exactly 32 bits */ - #define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25)) #define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21)) #define SWAP(a,b) { uint32_t u = a; a = b; b = u; } diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 532aa64..f105484 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -1,33 +1,7 @@ -#include -#include -#include "device_launch_parameters.h" - #include -#include #include -// das Hi Word aus einem 64 Bit Typen extrahieren -#if 0 -static __device__ uint32_t HIWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} -#endif - -#define SPH_C64(x) ((uint64_t)(x ## ULL)) -#define SPH_C32(x) ((uint32_t)(x ## U)) +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index cd72a53..a976271 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -18,28 +18,18 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -#include +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); typedef unsigned char BitSequence; -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - typedef struct { uint32_t buffer[8]; /* Buffer to be hashed */ uint32_t chainv[40]; /* Chaining values */ } hashState; - __device__ __forceinline__ -static uint32_t BYTES_SWAP32(uint32_t x) -{ - return __byte_perm(x, x, 0x0123); -} - #define MULT2(a,j)\ tmp = a[7+(8*j)];\ a[7+(8*j)] = a[6+(8*j)];\ @@ -289,11 +279,11 @@ __device__ __forceinline__ void Update512(hashState *state, const BitSequence *data) { #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); + for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]); rnd512(state); #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); + for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]); rnd512(state); } @@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b) for(j=0;j<5;j++) { b[i] ^= state->chainv[i+8*j]; } - b[i] = BYTES_SWAP32((b[i])); + b[i] = cuda_swab32((b[i])); } #pragma unroll 8 @@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b) for(j=0;j<5;j++) { b[8+i] ^= state->chainv[i+8*j]; } - b[8+i] = BYTES_SWAP32((b[8+i])); + b[8 + i] = cuda_swab32((b[8 + i])); } } diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index b5cbe0b..a6e6ed7 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1,18 +1,13 @@ -#include -#include +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; +//typedef unsigned char BitSequence; +//typedef unsigned long long DataLength; -#define SPH_C64(x) ((uint64_t)(x ## ULL)) -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -static __constant__ uint32_t d_ShaviteInitVector[16]; -static const uint32_t h_ShaviteInitVector[] = { +__device__ __constant__ +static const uint32_t d_ShaviteInitVector[16] = { SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), @@ -1304,18 +1299,18 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui aes_gpu_init(sharedMemory); - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; // kopiere init-state uint32_t state[16]; -#pragma unroll 16 + #pragma unroll 16 for(int i=0;i<16;i++) state[i] = d_ShaviteInitVector[i]; @@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui uint32_t msg[32]; // fülle die Nachricht mit 64-byte (vorheriger Hash) -#pragma unroll 16 + #pragma unroll 16 for(int i=0;i<16;i++) msg[i] = Hash[i]; // Nachrichtenende msg[16] = 0x80; -#pragma unroll 10 + #pragma unroll 10 for(int i=17;i<27;i++) msg[i] = 0; @@ -1341,10 +1336,10 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui c512(sharedMemory, state, msg); -#pragma unroll 16 + #pragma unroll 16 for(int i=0;i<16;i++) Hash[i] = state[i]; - } + } } @@ -1352,25 +1347,19 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui __host__ void x11_shavite512_cpu_init(int thr_id, int threads) { aes_cpu_init(); - - cudaMemcpyToSymbol( d_ShaviteInitVector, - h_ShaviteInitVector, - sizeof(h_ShaviteInitVector), - 0, cudaMemcpyHostToDevice); } __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - const int threadsperblock = 256; + const int threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; - x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); } - diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 0d161c7..3b494d0 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -7,29 +7,17 @@ #define TPB 256 +#include "cuda_helper.h" + // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - int *d_state[8]; uint4 *d_temp4[8]; // texture bound to d_temp4[thr_id], for read access in Compaction kernel texture texRef1D_128; -#define C32(x) ((uint32_t)(x ## U)) -#define T32(x) ((x) & C32(0xFFFFFFFF)) - -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) -#else - // Kepler (Compute 3.5) - #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) -#endif - __device__ __constant__ const uint32_t c_IV_512[32] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, @@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \ #undef BUTTERFLY } -#if __CUDA_ARCH__ < 300 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 /** * __shfl() returns the value of var held by the thread whose ID is given by srcLane. * If srcLane is outside the range 0..width-1, the thread's own value of var is returned. 
@@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \ __device__ __forceinline__ void FFT_16(int *y) { -#if __CUDA_ARCH__ < 300 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 #ifndef WIN32 # warning FFT_16() function is not compatible with SM 2.1 devices! #endif @@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) { int i; -#if __CUDA_ARCH__ < 300 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 #ifndef WIN32 # warning Expansion() function is not compatible with SM 2.1 devices #endif diff --git a/x11/x11.cu b/x11/x11.cu index 2805302..c170f2f 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -15,10 +15,11 @@ extern "C" #include "sph/sph_echo.h" #include "miner.h" -} +#include "cuda_helper.h" -#include -#include +#include +#include +} // aus cpu-miner.c extern int device_map[8]; @@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); +extern void cuda_check_cpu_init(int thr_id, int threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, @@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; } @@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); quark_blake512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); + cuda_check_cpu_setTarget(ptarget); do { uint32_t foundNonce; @@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); // Scan nach Gewinner Hashes auf der GPU - foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu index adc1594..a231adb 100644 --- a/x13/cuda_x13_fugue512.cu +++ b/x13/cuda_x13_fugue512.cu @@ -5,26 +5,11 @@ * heavily based on phm's sgminer * */ -#include -#include -#include "device_launch_parameters.h" - -#include +#include "cuda_helper.h" // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#if __CUDA_ARCH__ < 350 -// Kepler (Compute 3.0) -#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) 
>> (32 - (n)))) -#else -// Kepler (Compute 3.5, 5.0) -#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) -#endif - /* * X13 kernel implementation. * @@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t * @author phm */ -#define SWAB32(x) ( __byte_perm(x, x, 0x0123) ) - #define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) #define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) @@ -584,97 +567,97 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint __syncthreads(); - int i; - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; - - #pragma unroll 16 - for( i = 0; i < 16; i++ ) - Hash[i] = SWAB32(Hash[i]); - - uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; - uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; - uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; - uint32_t S30, S31, S32, S33, S34, S35; - - uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; - uint64_t bc = (uint64_t) 64 << 3; - uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL); - uint32_t bchi = (uint32_t)(bc >> 32); - - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - - FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); - FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); - FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); - FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); - FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); - FUGUE512_3((Hash[0xF]), bchi, bclo); - - #pragma unroll 32 - for (i = 0; i < 32; i ++) { - ROR3; - CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); - SMIX(S00, S01, S02, S03); - } - #pragma unroll 13 - for (i = 0; i < 13; i ++) { - S04 ^= S00; - S09 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); - } - S04 ^= S00; - S09 ^= S00; - S18 ^= S00; - S27 ^= S00; - - Hash[0] = SWAB32(S01); - Hash[1] = SWAB32(S02); - Hash[2] = SWAB32(S03); - Hash[3] = SWAB32(S04); - Hash[4] = SWAB32(S09); - Hash[5] = SWAB32(S10); - Hash[6] = SWAB32(S11); - Hash[7] = SWAB32(S12); - Hash[8] = SWAB32(S18); - Hash[9] = SWAB32(S19); - Hash[10] = SWAB32(S20); - Hash[11] = SWAB32(S21); - Hash[12] = SWAB32(S27); - Hash[13] = SWAB32(S28); - Hash[14] = SWAB32(S29); - Hash[15] = SWAB32(S30); - } + int i; + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? 
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+
+		int hashPosition = nounce - startNounce;
+		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
+
+		#pragma unroll 16
+		for( i = 0; i < 16; i++ )
+			Hash[i] = cuda_swab32(Hash[i]);
+
+		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
+		uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
+		uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
+		uint32_t S30, S31, S32, S33, S34, S35;
+
+		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
+		uint64_t bc = (uint64_t) 64 << 3;
+		uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
+		uint32_t bchi = (uint32_t)(bc >> 32);
+
+		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
+		S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
+		S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1);
+		S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
+		S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
+
+		FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
+		FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
+		FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
+		FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
+		FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
+		FUGUE512_3((Hash[0xF]), bchi, bclo);
+
+		#pragma unroll 32
+		for (i = 0; i < 32; i ++) {
+			ROR3;
+			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
+			SMIX(S00, S01, S02, S03);
+		}
+		#pragma unroll 13
+		for (i = 0; i < 13; i ++) {
+			S04 ^= S00;
+			S09 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S27 ^= S00;
+			ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S28 ^= S00;
+			ROR8;
+			SMIX(S00, S01, S02, S03);
+		}
+		S04 ^= S00;
+		S09 ^= S00;
+		S18 ^= S00;
+		S27 ^= S00;
+
+		Hash[0] = cuda_swab32(S01);
+		Hash[1] = cuda_swab32(S02);
+		Hash[2] = cuda_swab32(S03);
+		Hash[3] = cuda_swab32(S04);
+		Hash[4] = cuda_swab32(S09);
+		Hash[5] = cuda_swab32(S10);
+		Hash[6] = cuda_swab32(S11);
+		Hash[7] = cuda_swab32(S12);
+		Hash[8] = cuda_swab32(S18);
+		Hash[9] = cuda_swab32(S19);
+		Hash[10] = cuda_swab32(S20);
+		Hash[11] = cuda_swab32(S21);
+		Hash[12] = cuda_swab32(S27);
+		Hash[13] = cuda_swab32(S28);
+		Hash[14] = cuda_swab32(S29);
+		Hash[15] = cuda_swab32(S30);
+	}
 }
 
 #define texDef(texname, texmem, texsource, texsize) \
@@ -697,17 +680,17 @@ __host__ void x13_fugue512_cpu_init(int thr_id, int threads)
 
 __host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 256;
+	const int threadsperblock = 256;
 
-    // compute how many thread blocks we need
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
+	// compute how many thread blocks we need
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
 
-    // size of the dynamic shared memory region
+	// size of the dynamic shared memory region
 	size_t shared_size = 4 * 256 * sizeof(uint32_t);
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+	// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 
-    x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-    MyStreamSynchronize(NULL, order, thr_id);
+	x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	MyStreamSynchronize(NULL, order, thr_id);
 }
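
For reference, the host wrappers touched here all share one launch-configuration pattern: the block count is rounded up with a ceiling division so that all `threads` work items are covered, and the dynamic shared-memory size is passed as the third launch parameter. Below is a minimal sketch of that pattern; example_kernel and launch_example are hypothetical placeholder names, while the 4 * 256 * sizeof(uint32_t) figure is fugue's mixtab allocation from the function above (hamsi passes 0 instead).

// Sketch only, not part of the patch. example_kernel / launch_example are
// placeholders for the *_gpu_hash_64 / *_cpu_hash_64 pairs in this commit.
__global__ void example_kernel(int threads);

void launch_example(int threads)
{
	const int threadsperblock = 256;

	// ceiling division: one extra block when threads % threadsperblock != 0
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	// fugue stages its four 256-entry mixtab tables in dynamic shared memory;
	// hamsi needs none and passes 0 here
	size_t shared_size = 4 * 256 * sizeof(uint32_t);

	example_kernel<<<grid, block, shared_size>>>(threads);
}
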
diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu
index 4a6b900..c43246c 100644
--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@@ -37,26 +37,11 @@
  * @author phm
  */
 
-#include
-#include
+#include "cuda_helper.h"
 
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-#define SPH_C32(x) ((uint32_t)(x ## U))
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
-#if __CUDA_ARCH__ < 350
-    // Kepler (Compute 3.0)
-    #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-    // Kepler (Compute 3.5)
-    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 __device__ __constant__
 static const uint32_t d_alpha_n[] = {
 	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
@@ -663,7 +648,7 @@ static const uint32_t d_T512[64][16] = {
 		mD = 0; \
 		mE = 0; \
 		mF = 0; \
-        for (u = 0; u < 8; u ++) { \
+		for (u = 0; u < 8; u ++) { \
 			unsigned db = buf(u); \
 			for (v = 0; v < 8; v ++, db >>= 1) { \
 				uint32_t dm = SPH_T32(-(uint32_t)(db & 1)); \
@@ -692,45 +677,47 @@ static const uint32_t d_T512[64][16] = {
 
 // The hash function
 __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-    if (thread < threads)
-    {
-        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
-        int hashPosition = nounce - startNounce;
-        uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
-        unsigned char *h1 = (unsigned char *)Hash;
-
-        uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
-        uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
-        uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
-        uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
-        uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
-        uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
-        uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+
+		int hashPosition = nounce - startNounce;
+		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
+		unsigned char *h1 = (unsigned char *)Hash;
+
+		uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
+		uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
+		uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
+		uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
+		uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
+		uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
+		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
 
 #define buf(u) (h1[i+u])
 #pragma unroll 8
-    for(int i = 0; i < 64; i += 8) {
-        INPUT_BIG;
-        P_BIG;
-        T_BIG;
-    }
+	for(int i = 0; i < 64; i += 8) {
+		INPUT_BIG;
+		P_BIG;
+		T_BIG;
+	}
+
 #undef buf
 #define buf(u) (u == 0 ? 0x80 : 0)
-    INPUT_BIG;
-    P_BIG;
-    T_BIG;
+	INPUT_BIG;
+	P_BIG;
+	T_BIG;
+
 #undef buf
 #define buf(u) (u == 6 ? 2 : 0)
-    INPUT_BIG;
-    PF_BIG;
-    T_BIG;
+	INPUT_BIG;
+	PF_BIG;
+	T_BIG;
 
 #pragma unroll 16
-    for (int i = 0; i < 16; i++)
-        Hash[i] = SWAB32(h[i]);
-    }
+	for (int i = 0; i < 16; i++)
+		Hash[i] = cuda_swab32(h[i]);
+	}
 }
 
@@ -740,18 +727,17 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, int threads)
 
 __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 256;
+	const int threadsperblock = 256;
 
-    // compute how many thread blocks we need
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
+	// compute how many thread blocks we need
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
 
-    // size of the dynamic shared memory region
-    size_t shared_size = 0;
+	// size of the dynamic shared memory region
+	size_t shared_size = 0;
 
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+	// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 
-    x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-    MyStreamSynchronize(NULL, order, thr_id);
+	x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	MyStreamSynchronize(NULL, order, thr_id);
 }
-
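
All of the _gpu_hash_64 kernels in this commit address their input the same way: each nonce owns a 64-byte hash slot (eight uint64_t words), so the slot index is shifted left by 3 to index the uint64_t array, and the nonce is either read from a compacted nonce vector or derived densely from the thread id. A minimal sketch of that pattern; example_hash_64 is a hypothetical name, the other identifiers mirror the kernels above:

// Sketch only: the bounds check and per-nonce hash addressing shared by
// the kernels in this patch. example_hash_64 is a placeholder name.
__global__ void example_hash_64(int threads, uint32_t startNounce,
	uint64_t *g_hash, uint32_t *g_nonceVector)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// dense scan when no nonce vector is given, compacted scan otherwise
		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

		// 64-byte slot per nonce: 8 x uint64_t, viewed here as 16 x uint32_t
		int hashPosition = nounce - startNounce;
		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];

		// ... transform Hash[0..15] in place ...
	}
}
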
diff --git a/x13/x13.cu b/x13/x13.cu
index ee94a6e..f4d6992 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -20,10 +20,9 @@ extern "C"
 #include "sph/sph_fugue.h"
 #include "miner.h"
-}
-#include
-#include
+#include "cuda_helper.h"
+}
 
 // from cpu-miner.c
 extern int device_map[8];
@@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
 extern void x13_fugue512_cpu_init(int thr_id, int threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 
 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
 	}
@@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 
 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		uint32_t foundNonce;
@@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		// scan for winning hashes on the GPU
-		foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
diff --git a/x15/cuda_x14_shabal512.cu b/x15/cuda_x14_shabal512.cu
index 76d151a..3d55747 100644
--- a/x15/cuda_x14_shabal512.cu
+++ b/x15/cuda_x14_shabal512.cu
@@ -1,26 +1,10 @@
 /*
  * Shabal-512 for X14/X15 (STUB)
  */
-#include
-#include
+#include "cuda_helper.h"
 
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-#define SPH_C32(x) ((uint32_t)(x ## U))
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
-#if __CUDA_ARCH__ < 350
-    // Kepler (Compute 3.0)
-    #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-    // Kepler (Compute 3.5)
-    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
 /*
  * Shabal implementation.
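
The SPH_C32/SPH_T32, SWAB32 and ROTL32 copies deleted from each of these files collapse into cuda_helper.h. Below is a sketch of what the consolidated helpers look like, reconstructed from the deleted lines; the real header may differ in detail. Note the defined(__CUDA_ARCH__) guard, the same host-pass-safe form this patch applies to the #if checks in FFT_16() and Expansion(): without it, the host compilation pass sees __CUDA_ARCH__ as undefined.

// Sketch reconstructed from the per-file definitions removed above;
// cuda_helper.h holds the authoritative versions.
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
	// byte-reverse a 32-bit word via the PRMT (byte permute) instruction
	return __byte_perm(x, x, 0x0123);
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 350
	// Compute 3.0: plain shift/or rotate
	#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#else
	// Compute 3.5+: single funnel-shift instruction
	#define ROTL32(x, n) __funnelshift_l((x), (x), (n))
#endif
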
diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu
index 787a510..7a14cfe 100644
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@@ -4,8 +4,8 @@
  * tpruvot@github
  */
 #include
-#include
-#include
+
+#include "cuda_helper.h"
 
 #define NULLTEST 0
 
@@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 #define SPH_64 (1)
 #define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)
 
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-
 // defined in cuda_helper.h
 #define SPH_ROTL64(x,n) ROTL64(x,n)
 
diff --git a/x15/x14.cu b/x15/x14.cu
index 31c3ef3..0b56584 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -22,10 +22,9 @@ extern "C" {
 #include "sph/sph_shabal.h"
 #include "miner.h"
-}
-#include
-#include
+#include "cuda_helper.h"
+}
 
 // from cpu-miner.c
 extern int device_map[8];
@@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
 extern void x14_shabal512_cpu_init(int thr_id, int threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 
 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
 	}
@@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 
 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		int order = 0;
@@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			/* check now with the CPU to confirm */
diff --git a/x15/x15.cu b/x15/x15.cu
index ebdb347..7da8aed 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -23,10 +23,9 @@ extern "C" {
 #include "sph/sph_whirlpool.h"
 #include "miner.h"
-}
-#include
-#include
+#include "cuda_helper.h"
+}
 
 // to test gpu hash on a null buffer
 #define NULLTEST 0
 
@@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
 extern void x15_whirlpool_cpu_init(int thr_id, int threads);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 
 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_init(thr_id, throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
 	}
@@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 
 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		int order = 0;
@@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		print_hash((unsigned char*)buf); printf("\n");
 #endif
 		/* Scan with GPU */
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{