
Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof
master
Tanguy Pruvot, 10 years ago
commit d9ea5f72ce
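
The second part of the commit message concerns profiler-friendly shutdown: nvprof only writes its trace when the CUDA context is torn down cleanly, so a Ctrl+C that kills the miner mid-run loses the profile. A minimal sketch of such a hook (handler name and placement are illustrative, not the commit's exact code):

#include <signal.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Illustrative handler: cudaDeviceReset() destroys the context, which
 * flushes the profiling buffers nvprof would otherwise lose on Ctrl+C. */
static void on_sigint(int sig)
{
	signal(sig, SIG_IGN);   /* ignore repeated signals while cleaning up */
	cudaDeviceReset();
	exit(0);
}

/* somewhere during startup: */
/* signal(SIGINT, on_sigint); */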
  1. JHA/cuda_jha_compactionTest.cu (11)
  2. JHA/cuda_jha_keccak512.cu (33)
  3. JHA/jackpotcoin.cu (19)
  4. bitslice_transformations_quad.cu (3)
  5. ccminer.vcxproj (9)
  6. ccminer.vcxproj.filters (6)
  7. cuda_fugue256.cu (15)
  8. cuda_groestlcoin.cu (18)
  9. cuda_helper.h (146)
 10. cuda_myriadgroestl.cu (11)
 11. cuda_nist5.cu (28)
 12. groestl_functions_quad.cu (1)
 13. heavy/cuda_blake512.cu (28)
 14. heavy/cuda_combine.cu (7)
 15. heavy/cuda_groestl512.cu (10)
 16. heavy/cuda_hefty1.cu (12)
 17. heavy/cuda_keccak512.cu (13)
 18. heavy/cuda_sha256.cu (7)
 19. heavy/heavy.cu (6)
 20. miner.h (1)
 21. quark/animecoin.cu (583)
 22. quark/cuda_bmw512.cu (151)
 23. quark/cuda_checkhash.cu (13)
 24. quark/cuda_jh512.cu (2)
 25. quark/cuda_quark_blake512.cu (230)
 26. quark/cuda_quark_compactionTest.cu (739)
 27. quark/cuda_quark_groestl512.cu (15)
 28. quark/cuda_quark_keccak512.cu (21)
 29. quark/cuda_skein512.cu (27)
 30. quark/quarkcoin.cu (20)
 31. util.c (4)
 32. x11/cuda_x11_cubehash512.cu (19)
 33. x11/cuda_x11_echo.cu (28)
 34. x11/cuda_x11_luffa512.cu (20)
 35. x11/cuda_x11_shavite512.cu (59)
 36. x11/cuda_x11_simd512.cu (22)
 37. x11/x11.cu (19)
 38. x13/cuda_x13_fugue512.cu (217)
 39. x13/cuda_x13_hamsi512.cu (100)
 40. x13/x13.cu (17)
 41. x15/cuda_x14_shabal512.cu (18)
 42. x15/cuda_x15_whirlpool.cu (6)
 43. x15/x14.cu (17)
 44. x15/x15.cu (17)

JHA/cuda_jha_compactionTest.cu (11)

@@ -1,11 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "sm_30_intrinsics.h"
 #include <stdio.h>
 #include <memory.h>

-#include <stdint.h>
+#include "cuda_helper.h"
+#include <sm_30_intrinsics.h>

 // from cpu-miner.c
 extern int device_map[8];
@@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }

-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
  * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
  */
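
The added defined() test is not cosmetic: __CUDA_ARCH__ is only defined during nvcc's device-code passes, and in the host pass an unknown identifier inside #if evaluates to 0, so the old `#if __CUDA_ARCH__ < 300` also pulled the __shfl_up() emulation into host compilation. A minimal illustration of the corrected guard:

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/* device pass for sm_2x: compile the warp-shuffle emulation */
#elif defined(__CUDA_ARCH__)
/* device pass for sm_30 and newer: the real __shfl_up() intrinsic exists */
#else
/* host pass: __CUDA_ARCH__ is undefined; neither branch belongs here */
#endif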

JHA/cuda_jha_keccak512.cu (33)

@@ -1,16 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-// the following definitions to be replaced by a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 __constant__ uint64_t c_State[25];
 __constant__ uint32_t c_PaddedMessage[18];

-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-// these 64-bit rotates are accelerated with the funnel shifter on Compute 3.5 (and better)
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset >= 32) {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
-
 #define U32TO64_LE(p) \
 	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

JHA/jackpotcoin.cu (19)

@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_keccak.h"
@@ -7,10 +6,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_skein.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // from cpu-miner.c
 extern int device_map[8];
@@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_skein512_cpu_init(int thr_id, int threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
 extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
@@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		}

 		// scan for winning hashes on the GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
+			unsigned int rounds;
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);

 			// this jackpothash function returns the number of rounds
-			unsigned int rounds = jackpothash(vhash64, endiandata);
+			rounds = jackpothash(vhash64, endiandata);

 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {

bitslice_transformations_quad.cu (3)

@@ -1,5 +1,4 @@
-
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
  * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
  * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.

ccminer.vcxproj (9)

@@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>false</Keep>
       <CodeGeneration>compute_50,sm_50</CodeGeneration>
-      <Defines></Defines>
+      <Defines>
+      </Defines>
     </CudaCompile>
     <CudaLink>
       <GPUDebugInfo>false</GPUDebugInfo>
@@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
     <ClInclude Include="uint256.h" />
   </ItemGroup>
   <ItemGroup>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </CudaCompile>
     <CudaCompile Include="cuda_fugue256.cu">
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
@@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
     </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </CudaCompile>
     <CudaCompile Include="heavy\cuda_blake512.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>

ccminer.vcxproj.filters (6)

@@ -391,5 +391,11 @@
     <CudaCompile Include="x15\cuda_x15_whirlpool.cu">
       <Filter>Source Files\CUDA\x15</Filter>
     </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
   </ItemGroup>
 </Project>

cuda_fugue256.cu (15)

@@ -1,12 +1,11 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

 #include "sph/sph_fugue.h"
+#include "cuda_helper.h"

-#include <host_defines.h>

 #define USE_SHARED 1
@@ -15,14 +14,6 @@ extern int device_map[8];
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// the following definitions to be replaced by a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-// already defined in sph_fugue.h
-//#define SPH_C32(x) ((uint32_t)(x ## U))
-
 uint32_t *d_fugue256_hashoutput[8];
 uint32_t *d_resultNonce[8];

cuda_groestlcoin.cu (18)

@@ -1,23 +1,17 @@
 // Version of Groestl specialized for Groestlcoin, incl. bitslice
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-#include <host_defines.h>
+#include "cuda_helper.h"

 // from cpu-miner.c
 extern int device_map[8];

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// the following definitions to be replaced by a header later
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // this structure is requested in the init function
 static cudaDeviceProp props[8];
@@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"

-#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+#define SWAB32(x) cuda_swab32(x)

-__global__ void __launch_bounds__(256, 4)
- groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
+__global__ __launch_bounds__(256, 4)
+void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
 	// divide by 4 because 4 threads together compute one hash
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
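
Swapping the shift-and-mask SWAB32 for cuda_swab32() is a strength reduction: on the device, __byte_perm() reverses the four bytes in a single PRMT instruction instead of four shifts, four masks and three ORs. A small sketch of the equivalence (function names are illustrative):

// Both functions reverse the byte order of x; they return identical values.
__device__ uint32_t swab32_macro(uint32_t x)
{
	return ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) |
	       ((x & 0x00FF0000) >> 8)  | ((x & 0xFF000000) >> 24);
}

__device__ uint32_t swab32_perm(uint32_t x)
{
	// selector 0x0123 picks bytes 3,2,1,0 of x, i.e. a full byte reversal
	return __byte_perm(x, x, 0x0123);
}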

cuda_helper.h (146)

@@ -1,33 +1,78 @@
 #ifndef CUDA_HELPER_H
 #define CUDA_HELPER_H

-#include <cuda.h>
 #include <cuda_runtime.h>

-static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-{
-#if __CUDA_ARCH__ >= 130
-	return __double_as_longlong(__hiloint2double(HI, LO));
-#else
-	return (unsigned long long)LO | (((unsigned long long)HI) << 32);
-#endif
-}
-
-// extract the hi word from a 64-bit type
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
+#if defined(_MSC_VER)
+/* reduce warnings */
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#endif
+
+#include <stdint.h>
+
+extern __device__ __device_builtin__ void __syncthreads(void);
+
+#ifndef __CUDA_ARCH__
+// define blockDim and threadIdx for host
+extern const dim3 blockDim;
+extern const uint3 threadIdx;
+#endif
+
+#ifndef SPH_C32
+#define SPH_C32(x) ((uint32_t)(x ## U))
+#endif
+#ifndef SPH_C64
+#define SPH_C64(x) ((uint64_t)(x ## ULL))
+#endif
+#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+
+#if __CUDA_ARCH__ < 350
+// Kepler (Compute 3.0)
+#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
+#else
+// Kepler (Compute 3.5, 5.0)
+#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
+#endif
+
+__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+{
+#if __CUDA_ARCH__ >= 130
+	return __double_as_longlong(__hiloint2double(HI, LO));
+#else
+	return (unsigned long long)LO | (((unsigned long long)HI) << 32);
+#endif
+}

 // replace the hi word in a 64-bit type
-static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
+__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
 	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
 }

+// replace the lo word in a 64-bit type
+__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
+	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+}
+
+// endian swap for 32-bit types
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
+{
+	/* device */
+	return __byte_perm(x, x, 0x0123);
+}
+#else
+/* host */
+#define cuda_swab32(x) \
+	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+	 (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+#endif
+
 // extract the lo word from a 64-bit type
-static __device__ uint32_t LOWORD(const uint64_t &x) {
+__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
 #if __CUDA_ARCH__ >= 130
 	return (uint32_t)__double2loint(__longlong_as_double(x));
 #else
@@ -35,34 +80,51 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
 #endif
 }

-// replace the lo word in a 64-bit type
-static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+// extract the hi word from a 64-bit type
+__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
+#if __CUDA_ARCH__ >= 130
+	return (uint32_t)__double2hiint(__longlong_as_double(x));
+#else
+	return (uint32_t)(x >> 32);
+#endif
 }

-// endian swap for 32-bit types
-static __device__ uint32_t cuda_swab32(uint32_t x)
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
 {
-	return __byte_perm(x, x, 0x0123);
-}
+	// Input:  77665544 33221100
+	// Output: 00112233 44556677
+	uint64_t temp[2];
+	temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
+	temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);

-// endian swap for 64-bit types
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-	return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
+	return temp[0] | (temp[1]<<32);
 }
+#else
+/* host */
+#define cuda_swab64(x) \
+	((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
+		(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
+		(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
+		(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8)  | \
+		(((uint64_t)(x) & 0x00000000ff000000ULL) << 8)  | \
+		(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
+		(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
+		(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+#endif

 // these 64-bit rotates are accelerated with the funnel shifter on Compute 3.5 (and better)
 #if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
+__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
 	uint2 result;
 	if(offset < 32) {
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
 	} else {
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 	}
 	return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
 #else
 #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
@@ -70,16 +132,16 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
 // these 64-bit rotates are accelerated with the funnel shifter on Compute 3.5 (and better)
 #if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
+__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
 	if(offset >= 32) {
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
 	} else {
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 	}
 	return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
 #else
 #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
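
With the per-file duplicates gone, a translation unit only needs this header to get the rotate, byte-swap and word helpers. A toy kernel, purely illustrative (the kernel name and its arithmetic are made up for the example), exercising the consolidated API:

#include "cuda_helper.h"

// ROTL64 compiles to two funnel-shift instructions on sm_35+ and to the
// portable shift/or form elsewhere; cuda_swab32 works in both host and
// device code thanks to the #ifdef __CUDA_ARCH__ split above.
__global__ void rot_swab_demo(uint64_t *out, const uint64_t *in)
{
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	uint64_t v = in[i] ^ SPH_C64(0x0123456789ABCDEF);
	v = ROTL64(v, 23);
	out[i] = REPLACE_HIWORD(v, cuda_swab32(_LOWORD(v)));
}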

cuda_myriadgroestl.cu (11)

@@ -1,23 +1,16 @@
 // Version of Groestl specialized for Myriadcoin, incl. bitslice
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // from cpu-miner.c
 extern int device_map[8];

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// the following definitions to be replaced by a header later
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // this structure is requested in the init function
 static cudaDeviceProp props[8];

cuda_nist5.cu (28)

@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_blake.h"
@@ -7,10 +6,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // from cpu-miner.c
 extern int device_map[8];
@@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_skein512_cpu_init(int thr_id, int threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 // original nist5hash function from a miner source
-inline void nist5hash(void *state, const void *input)
+extern "C" void nist5hash(void *state, const void *input)
 {
 	sph_blake512_context ctx_blake;
 	sph_groestl512_context ctx_groestl;
@@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);

 		init[thr_id] = true;
 	}
@@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;

-		// first Blake512 hash with CUDA
+		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		// this is the unconditional branch for Groestl512
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// this is the unconditional branch for JH512
 		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// this is the unconditional branch for Keccak512
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// this is the unconditional branch for Skein512
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		// scan for winning hashes on the GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];

groestl_functions_quad.cu (1)

@@ -1,3 +1,4 @@
+#include "cuda_helper.h"

 __device__ __forceinline__ void G256_Mul2(uint32_t *regs)
 {

heavy/cuda_blake512.cu (28)

@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-// the following definitions to be replaced by a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // global memory for all HeftyHashes of all threads
 extern uint32_t *d_heftyHashes[8];
@@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
 // the message (112 or 116 bytes) with padding for computation on the GPU
 __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
-#include "cuda_helper.h"

 // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
@@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
 	{ 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-// better to use these macros only for compile-time constants; they are slow
-#define SWAP32(x) \
-	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
-	 (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
-
-// better to use these macros only for compile-time constants; they are slow
-#define SWAP64(x) \
-	((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
-		(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
-		(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
-		(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8)  | \
-		(((uint64_t)(x) & 0x00000000ff000000ULL) << 8)  | \
-		(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
-		(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
-		(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+/* in cuda_helper */
+#define SWAP32(x) cuda_swab32(x)
+#define SWAP64(x) cuda_swab64(x)

 __constant__ uint64_t c_SecondRound[15];

heavy/cuda_combine.cu (7)

@@ -1,9 +1,4 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
-// the following definitions to be replaced by a header later
-typedef unsigned int uint32_t;
+#include "cuda_helper.h"

 // global memory for our results
 uint32_t *d_hashoutput[8];

heavy/cuda_groestl512.cu (10)

@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-// the following definitions to be replaced by a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // global memory for all HeftyHashes of all threads
 extern uint32_t *d_heftyHashes[8];
@@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
 	cudaMemcpyToSymbol( groestl_gpu_msg,
 						msgBlock,
 						128);
-
 	BLOCKSIZE = len;
 }

heavy/cuda_hefty1.cu (12)

@@ -1,10 +1,9 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+#include <device_functions.h>

 #define USE_SHARED 1
@@ -13,11 +12,6 @@ extern int device_map[8];
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// the following definitions to be replaced by a header later
-typedef unsigned int uint32_t;
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-
 // this structure is requested in the init function
 static cudaDeviceProp props[8];

heavy/cuda_keccak512.cu (13)

@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-// the following definitions to be replaced by a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // global memory for all HeftyHashes of all threads
 extern uint32_t *d_heftyHashes[8];
@@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants)
 	uint64_t t[5], u[5], v, w;

 	/* absorb input */
 #pragma unroll 9
-	for (i = 0; i < 72 / 8; i++, in += 2)
+	for (i = 0; i < 9 /* 72/8 */; i++, in += 2)
 		s[i] ^= U32TO64_LE(in);

 	for (i = 0; i < 24; i++) {

heavy/cuda_sha256.cu (7)

@@ -1,12 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-// the following definitions to be replaced by a header later
-typedef unsigned int uint32_t;
+#include "cuda_helper.h"

 // global memory for all HeftyHashes of all threads
 extern uint32_t *d_heftyHashes[8];

heavy/heavy.cu (6)

@@ -1,7 +1,3 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>
 #include <string.h>
@@ -34,6 +30,8 @@
 #include "heavy/cuda_blake512.h"
 #include "heavy/cuda_combine.h"

+#include "cuda_helper.h"
+
 extern uint32_t *d_hash2output[8];
 extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];

miner.h (1)

@@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
+void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
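
The new declaration pairs with the cuda_nist5.cu change earlier in this commit, where `inline void nist5hash(...)` became `extern "C" void nist5hash(...)`. A C++ inline definition need not emit an out-of-line symbol, and its name would be mangled anyway, so C code including miner.h could not link against it; extern "C" gives the definition external, unmangled linkage. A sketch of the matching pair (bodies elided for illustration):

/* miner.h, a C header included from both C and C++/CUDA files: */
void nist5hash(void *state, const void *input);

/* cuda_nist5.cu (C++): extern "C" makes the symbol match the C declaration */
extern "C" void nist5hash(void *state, const void *input)
{
	(void)state; (void)input; /* chained sph_* hashing as in the diff above */
}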

quark/animecoin.cu (583)

@@ -1,291 +1,292 @@
-extern "C"
-{
+extern "C" {
 #include "sph/sph_blake.h"
 #include "sph/sph_bmw.h"
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // from cpu-miner.c
 extern int device_map[8];

 // memory for input/output of the chained hash functions
 static uint32_t *d_hash[8];

 // memory for generating the nonce vectors for the conditional hashes
 static uint32_t *d_animeNonces[8];
 static uint32_t *d_branch1Nonces[8];
 static uint32_t *d_branch2Nonces[8];
 static uint32_t *d_branch3Nonces[8];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 extern void quark_bmw512_cpu_init(int thr_id, int threads);
 extern void quark_bmw512_cpu_setBlock_80(void *pdata);
 extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
 extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);

 extern void quark_groestl512_cpu_init(int thr_id, int threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 extern void quark_skein512_cpu_init(int thr_id, int threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 extern void quark_keccak512_cpu_init(int thr_id, int threads);
 extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 extern void quark_jh512_cpu_init(int thr_id, int threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 	uint32_t *d_nonces1, size_t *nrm1,
 	uint32_t *d_nonces2, size_t *nrm2,
 	int order);
 extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 	uint32_t *d_nonces1, size_t *nrm1,
 	int order);

 // original Quarkhash function from a miner source
 inline void animehash(void *state, const void *input)
 {
 	sph_blake512_context ctx_blake;
 	sph_bmw512_context ctx_bmw;
 	sph_groestl512_context ctx_groestl;
 	sph_jh512_context ctx_jh;
 	sph_keccak512_context ctx_keccak;
 	sph_skein512_context ctx_skein;

 	unsigned char hash[64];

 	sph_bmw512_init(&ctx_bmw);
 	// ZBMW;
 	sph_bmw512 (&ctx_bmw, (const void*) input, 80);
 	sph_bmw512_close(&ctx_bmw, (void*) hash);

 	sph_blake512_init(&ctx_blake);
 	// ZBLAKE;
 	sph_blake512 (&ctx_blake, hash, 64);
 	sph_blake512_close(&ctx_blake, (void*) hash);

 	if (hash[0] & 0x8)
 	{
 		sph_groestl512_init(&ctx_groestl);
 		// ZGROESTL;
 		sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
 		sph_groestl512_close(&ctx_groestl, (void*) hash);
 	}
 	else
 	{
 		sph_skein512_init(&ctx_skein);
 		// ZSKEIN;
 		sph_skein512 (&ctx_skein, (const void*) hash, 64);
 		sph_skein512_close(&ctx_skein, (void*) hash);
 	}

 	sph_groestl512_init(&ctx_groestl);
 	// ZGROESTL;
 	sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
 	sph_groestl512_close(&ctx_groestl, (void*) hash);

 	sph_jh512_init(&ctx_jh);
 	// ZJH;
 	sph_jh512 (&ctx_jh, (const void*) hash, 64);
 	sph_jh512_close(&ctx_jh, (void*) hash);

 	if (hash[0] & 0x8)
 	{
 		sph_blake512_init(&ctx_blake);
 		// ZBLAKE;
 		sph_blake512 (&ctx_blake, (const void*) hash, 64);
 		sph_blake512_close(&ctx_blake, (void*) hash);
 	}
 	else
 	{
 		sph_bmw512_init(&ctx_bmw);
 		// ZBMW;
 		sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
 		sph_bmw512_close(&ctx_bmw, (void*) hash);
 	}

 	sph_keccak512_init(&ctx_keccak);
 	// ZKECCAK;
 	sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
 	sph_keccak512_close(&ctx_keccak, (void*) hash);

 	sph_skein512_init(&ctx_skein);
 	// SKEIN;
 	sph_skein512 (&ctx_skein, (const void*) hash, 64);
 	sph_skein512_close(&ctx_skein, (void*) hash);

 	if (hash[0] & 0x8)
 	{
 		sph_keccak512_init(&ctx_keccak);
 		// ZKECCAK;
 		sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
 		sph_keccak512_close(&ctx_keccak, (void*) hash);
 	}
 	else
 	{
 		sph_jh512_init(&ctx_jh);
 		// ZJH;
 		sph_jh512 (&ctx_jh, (const void*) hash, 64);
 		sph_jh512_close(&ctx_jh, (void*) hash);
 	}

 	memcpy(state, hash, 32);
 }

 struct HashPredicate
 {
 	HashPredicate(uint32_t *hashes, uint32_t startNonce) :
 		m_hashes(hashes),
 		m_startNonce(startNonce)
 	{ }

 	__device__
 	bool operator()(const uint32_t x)
 	{
 		uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
 		return hash[0] & 0x8;
 	}

 	uint32_t *m_hashes;
 	uint32_t m_startNonce;
 };

 extern bool opt_benchmark;

 extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x00000f;

 	const uint32_t Htarg = ptarget[7];

 	const int throughput = 256*2048; // 100;

 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);

 		// copy constants, allocate memory
 		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);

 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		quark_compactTest_cpu_init(thr_id, throughput);

 		cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);

 		init[thr_id] = true;
 	}

 	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_bmw512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
 		size_t nrm1=0, nrm2=0, nrm3=0;

 		// first BMW512 hash with CUDA
 		quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);

 		// this is the unconditional branch for Blake512
 		quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
 			d_branch3Nonces[thr_id], &nrm3,
 			order++);

 		// only follow up the Skein branch
 		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);

 		// this is the unconditional branch for Groestl512
 		quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);

 		// this is the unconditional branch for JH512
 		quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);

 		// split quarkNonces into branch1 and branch2 according to if (hash[0] & 0x8)
 		quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
 			d_branch1Nonces[thr_id], &nrm1,
 			d_branch2Nonces[thr_id], &nrm2,
 			order++);

 		// this is the conditional branch for Blake512
 		quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);

 		// this is the conditional branch for Bmw512
 		quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 		// this is the unconditional branch for Keccak512
 		quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);

 		// this is the unconditional branch for Skein512
 		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);

 		// split quarkNonces into branch1 and branch2 according to if (hash[0] & 0x8)
 		quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
 			d_branch1Nonces[thr_id], &nrm1,
 			d_branch2Nonces[thr_id], &nrm2,
 			order++);

 		// this is the conditional branch for Keccak512
 		quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);

 		// this is the conditional branch for JH512
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 		// scan for winning hashes on the GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			animehash(vhash64, endiandata);

 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {

 				pdata[19] = foundNonce;
 				*hashes_done = (foundNonce - first_nonce + 1)/2;
 				return 1;
 			} else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 			}
 		}

 		pdata[19] += throughput;

 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

 	*hashes_done = (pdata[19] - first_nonce + 1)/2;
 	return 0;
 }

151
quark/cuda_bmw512.cu

@@ -1,140 +1,9 @@
 #if 1
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-
-// endian swap for 32-bit types
-/*
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
-		| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
-}
-*/
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-// replace the hi word in a 64-bit type
-static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
-}
-
-#if 0
-// endian swap for 64-bit types
-static __device__ unsigned long long cuda_swab64(unsigned long long x) {
-	uint32_t h = (x >> 32);
-	uint32_t l = (x & 0xFFFFFFFFULL);
-	return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
-}
-
-// extract the hi word from a 64-bit type
-static __device__ uint32_t HIWORD(const unsigned long long &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// extract the lo word from a 64-bit type
-static __device__ uint32_t LOWORD(const unsigned long long &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-
-static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-{
-#if __CUDA_ARCH__ >= 130
-	return __double_as_longlong(__hiloint2double(HI, LO));
-#else
-	return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
-#endif
-}
-
-// replace the lo word in a 64-bit type
-static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
-}
-#endif
-
-// an attempt to design a wrapper for a uint64_t type assembled from 32-bit registers...
-#if 1
-typedef unsigned long long uint64_t;
-#else
-typedef class uint64
-{
-public:
-	__device__ uint64()
-	{
-	}
-	__device__ uint64(unsigned long long init)
-	{
-		val = make_uint2( LOWORD(init), HIWORD(init) );
-	}
-	__device__ uint64(uint32_t lo, uint32_t hi)
-	{
-		val = make_uint2( lo, hi );
-	}
-	__device__ const uint64 operator^(uint64 const& rhs) const
-	{
-		return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
-	}
-	__device__ const uint64 operator|(uint64 const& rhs) const
-	{
-		return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
-	}
-	__device__ const uint64 operator+(unsigned long long const& rhs) const
-	{
-		return *this+uint64(rhs);
-	}
-	__device__ const uint64 operator+(uint64 const& rhs) const
-	{
-		uint64 res;
-		asm ("add.cc.u32 %0, %2, %4;\n\t"
-			"addc.cc.u32 %1, %3, %5;\n\t"
-			: "=r"(res.val.x), "=r"(res.val.y)
-			: "r"( val.x), "r"( val.y),
-			  "r"(rhs.val.x), "r"(rhs.val.y));
-		return res;
-	}
-	__device__ const uint64 operator-(uint64 const& rhs) const
-	{
-		uint64 res;
-		asm ("sub.cc.u32 %0, %2, %4;\n\t"
-			"subc.cc.u32 %1, %3, %5;\n\t"
-			: "=r"(res.val.x), "=r"(res.val.y)
-			: "r"( val.x), "r"( val.y),
-			  "r"(rhs.val.x), "r"(rhs.val.y));
-		return res;
-	}
-	__device__ const uint64 operator<<(int n) const
-	{
-		return uint64(unsigned long long(*this)<<n);
-	}
-	__device__ const uint64 operator>>(int n) const
-	{
-		return uint64(unsigned long long(*this)>>n);
-	}
-	__device__ operator unsigned long long() const
-	{
-		return MAKE_ULONGLONG(val.x, val.y);
-	}
-	uint2 val;
-} uint64_t;
-#endif
+#include "cuda_helper.h"

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

@@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // the message with padding for the computation on the GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)

-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// these 64-bit rotates are accelerated by the funnel shifter on compute 3.5 (and better)
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset >= 32) {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
-
 #define SHL(x, n) ((x) << (n))
 #define SHR(x, n) ((x) >> (n))
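
Every per-file copy of the byte-swap and hi/lo-word helpers deleted above is now expected to come from cuda_helper.h. That header itself is not part of this diff, so the following is only a sketch of the shapes these files rely on, reconstructed from the removed copies:

#include <stdint.h>

// endian swap for 32-bit values, via the PRMT byte-permute instruction
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) {
	return __byte_perm(x, 0, 0x0123);
}

// endian swap for 64-bit values, built from two 32-bit swaps
__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) {
	uint32_t h = (uint32_t)(x >> 32);
	uint32_t l = (uint32_t)(x & 0xFFFFFFFFULL);
	return (((uint64_t)cuda_swab32(l)) << 32) | (uint64_t)cuda_swab32(h);
}

// replace the high 32 bits of a 64-bit value (used to patch the nonce)
__device__ __forceinline__ uint64_t REPLACE_HIWORD(uint64_t x, uint32_t y) {
	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32);
}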

13
quark/cuda_checkhash.cu

@@ -1,11 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
-#include <stdint.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // hash target we test against
 __constant__ uint32_t pTarget[8];

@@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
 }

 // setup functions
-__host__ void quark_check_cpu_init(int thr_id, int threads)
+__host__ void cuda_check_cpu_init(int thr_id, int threads)
 {
 	cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
 	cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
 }

 // set the target difficulty
-__host__ void quark_check_cpu_setTarget(const void *ptarget)
+__host__ void cuda_check_cpu_setTarget(const void *ptarget)
 {
 	// the target data for the GPU-side computation
 	cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 }

-__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
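
For orientation, the host side of the renamed checker follows a simple pattern; this sketch is assembled from the fragments above, and the kernel launch shape is an assumption since the full kernel is not shown in this hunk:

// d_resNounce holds one slot, pre-set to 0xffffffff ("no hit");
// the kernel records the nonce of any hash that meets pTarget.
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
cuda_check_gpu_hash_64<<<grid, block>>>(threads, startNounce,
	d_nonceVector, d_inputHash /*, ... assumed trailing args */);
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id],
	sizeof(uint32_t), cudaMemcpyDeviceToHost);
uint32_t result = *h_resNounce[thr_id]; // still 0xffffffff if nothing found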

2
quark/cuda_jh512.cu

@@ -1,4 +1,4 @@
-#include <stdint.h>
+#include "cuda_helper.h"

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

230
quark/cuda_quark_blake512.cu

@@ -1,16 +1,11 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-#define USE_SHUFFLE 0
+#include "cuda_helper.h"

-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#define ROTR(x,n) ROTR64(x,n)
+
+#define USE_SHUFFLE 0

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

@@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
 	{ 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-// extract the hi word from a 64-bit type
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// replace the hi word in a 64-bit type
-static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
-}
-
-// extract the lo word from a 64-bit type
-static __device__ uint32_t LOWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-
-#if 0
-// replace the lo word in a 64-bit type
-static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
-}
-#endif
-
-__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
-{
-	// Input:	77665544 33221100
-	// Output:	00112233 44556677
-	uint64_t temp[2];
-	temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
-	temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
-	return temp[0] | (temp[1]<<32);
-}
-
-__constant__ uint64_t c_u512[16];
-
-const uint64_t host_u512[16] =
+__device__ __constant__
+const uint64_t c_u512[16] =
 {
 	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
 	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,

@@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
 	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };

-// these 64-bit rotates are accelerated by the funnel shifter on compute 3.5 (and better)
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset < 32) {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
-#endif
-
 #define G(a,b,c,d,e) \
 	v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
 	v[d] = ROTR( v[d] ^ v[a],32); \

@@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
 	v[b] = ROTR( v[b] ^ v[c],11);

-__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
+__device__ static
+void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
 {
 	uint64_t v[16], m[16], i;

 #pragma unroll 16
-	for( i = 0; i < 16; ++i )
-	{
-		m[i] = SWAP64(block[i]);
+	for( i = 0; i < 16; ++i ) {
+		m[i] = cuda_swab64(block[i]);
 	}

 #pragma unroll 8

@@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
 	for( i = 0; i < 16; ++i )  h[i % 8] ^= v[i];
 }

-// endian swap for 32-bit types
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-/*
-// endian swap for 64-bit types
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-	uint32_t h = (x >> 32);
-	uint32_t l = (x & 0xFFFFFFFFULL);
-	return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
-}
-*/
-
-static __constant__ uint64_t d_constMem[8];
-static const uint64_t h_constMem[8] = {
+__device__ __constant__
+static const uint64_t d_constMem[8] = {
 	0x6a09e667f3bcc908ULL,
 	0xbb67ae8584caa73bULL,
 	0x3c6ef372fe94f82bULL,

@@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
 	0x5be0cd19137e2179ULL };

 // hash padding
-static __constant__ uint64_t d_constHashPadding[8];
-static const uint64_t h_constHashPadding[8] = {
+__device__ __constant__
+static const uint64_t d_constHashPadding[8] = {
 	0x0000000000000080ull,
 	0,
 	0,

@@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
 	0,
 	0x0002000000000000ull };

-__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
+__global__ __launch_bounds__(256, 4)
+void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);

@@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
 	if (thread < threads)
 #endif
 	{
+		uint8_t i;
 		// determine the current counter
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

 		int hashPosition = nounce - startNounce;
-		//uint64_t *inpHash = &g_hash[8 * hashPosition];
-		uint64_t *inpHash = &g_hash[hashPosition<<3];
-
-		// 128 bytes for the message
-		uint64_t buf[16];
+		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8

 		// prepare the state
 		uint64_t h[8];
-/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-*/
 		#pragma unroll 8
-		for(int i=0;i<8;i++)
+		for (i=0;i<8;i++)
 			h[i] = d_constMem[i];

+		// 128 bytes for the message
+		uint64_t buf[16];
+
 		// fetch the message for the first round into registers
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
+		for (i=0; i < 8; ++i)
+			buf[i] = inpHash[i];

-/*
-		buf[ 8] = 0x0000000000000080ull;
-		buf[ 9] = 0;
-		buf[10] = 0;
-		buf[11] = 0;
-		buf[12] = 0;
-		buf[13] = 0x0100000000000000ull;
-		buf[14] = 0;
-		buf[15] = 0x0002000000000000ull;
-*/
 		#pragma unroll 8
-		for(int i=0;i<8;i++)
+		for (i=0; i < 8; i++)
 			buf[i+8] = d_constHashPadding[i];

 		// the only hashing round
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );

-		// output the hash
 #if __CUDA_ARCH__ >= 130
 		// exclusively 32-bit operations when the SM1.3 double intrinsics are available
 		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
#else
 		// this version also performs a few 64-bit shifts
 		uint64_t *outHash = &g_hash[8 * hashPosition];
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i)
+		for (i=0; i < 8; ++i)
 		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+			outHash[i] = cuda_swab64(h[i]);
 		}
 #endif
 	}

@@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// determine the current counter
-		uint32_t nounce = startNounce + thread;
-
 		// prepare the state
 		uint64_t h[8];
-/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-*/
-		#pragma unroll 8
-		for(int i=0;i<8;i++)
-			h[i] = d_constMem[i];

 		// 128 bytes for the message
 		uint64_t buf[16];
+		uint8_t i;
+		// determine the current counter
+		uint32_t nounce = startNounce + thread;
+
+		#pragma unroll 8
+		for(i=0;i<8;i++)
+			h[i] = d_constMem[i];

 		// fetch the message for the first round into registers
 		#pragma unroll 16
-		for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
+		for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];

 		// replace the nonce with the thread-specific one
 		buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));

@@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 #if __CUDA_ARCH__ >= 130
 		// exclusively 32-bit operations when the SM1.3 double intrinsics are available
 		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
 		// this version also performs a few 64-bit shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i)
-		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+		for (i=0; i < 8; ++i) {
+			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
 	}

@@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
 		host_sigma,
 		sizeof(host_sigma),
 		0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( c_u512,
-		host_u512,
-		sizeof(host_u512),
-		0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constMem,
-		h_constMem,
-		sizeof(h_constMem),
-		0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constHashPadding,
-		h_constHashPadding,
-		sizeof(h_constHashPadding),
-		0, cudaMemcpyHostToDevice);
 }

 // Blake512 for 80-byte input data
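
The two output paths in the kernels above produce identical bytes: on a little-endian device, storing the 32-bit swab of the high word followed by the swab of the low word is the same as one cuda_swab64 store. A small sketch of the identity:

// For any 64-bit h: the SM1.3 path writes swab32(hi), swab32(lo) as two
// consecutive 32-bit words; the generic path writes swab64(h) once.
// Both leave the same 8 bytes in memory.
__device__ void store_hash_word(uint32_t *out32, uint64_t h) {
	out32[0] = __byte_perm((uint32_t)(h >> 32), 0, 0x0123); // swab32 of the hi word
	out32[1] = __byte_perm((uint32_t)h, 0, 0x0123);         // swab32 of the lo word
	// equivalent single store: *(uint64_t*)out32 = cuda_swab64(h);
}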

739
quark/cuda_quark_compactionTest.cu

@@ -1,371 +1,368 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "sm_30_intrinsics.h"
-
 #include <stdio.h>
 #include <memory.h>
-#include <stdint.h>
+
+#include "cuda_helper.h"
+#include <sm_30_intrinsics.h>

 // from cpu-miner.c
 extern int device_map[8];

 // this structure is requested in the init function
 static cudaDeviceProp props[8];

 static uint32_t *d_tempBranch1Nonces[8];
 static uint32_t *d_numValid[8];
 static uint32_t *h_numValid[8];

 static uint32_t *d_partSum[2][8]; // for up to four partial sums

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

 // True/False tester
 typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);

 __device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
 {
 	return ((inpHash[0] & 0x08) == 0x08);
 }

 __device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
 {
 	return ((inpHash[0] & 0x08) == 0);
 }

 __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
 cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];

 // setup functions
 __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
 {
 	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);

 	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));

 	// we also need some memory on the device
 	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
 	cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
 	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));

 	uint32_t s1;
 	s1 = (threads / 256) * 2;

 	cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }

 #if __CUDA_ARCH__ < 300
 /**
  * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
  */
 #undef __shfl_up
 #define __shfl_up(var, delta, width) (0)
 #endif

 // the scan function (from the NVIDIA SDK)
 __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	extern __shared__ uint32_t sums[];
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
 	//int lane_id = id % warpSize;
 	int lane_id = id % width;
 	// determine a warp_id within a block
 	//int warp_id = threadIdx.x / warpSize;
 	int warp_id = threadIdx.x / width;

 	sums[lane_id] = 0;

 	// Below is the basic structure of using a shfl instruction
 	// for a scan.
 	// Record "value" as a variable - we accumulate it along the way
 	uint32_t value;
 	if(testFunc != NULL)
 	{
 		if (id < threads)
 		{
 			uint32_t *inpHash;
 			if(d_validNonceTable == NULL)
 			{
 				// no nonce list
 				inpHash = &inpHashes[id<<4];
 			}else
 			{
 				// nonce list available
 				int nonce = d_validNonceTable[id] - startNounce;
 				inpHash = &inpHashes[nonce<<4];
 			}
 			value = (*testFunc)(inpHash);
 		}else
 		{
 			value = 0;
 		}
 	}else
 	{
 		value = data[id];
 	}

 	__syncthreads();

 	// Now accumulate in log steps up the chain
 	// compute sums, with another thread's value who is
 	// distance delta away (i).  Note
 	// those threads where the thread 'i' away would have
 	// been out of bounds of the warp are unaffected.  This
 	// creates the scan sum.
 #pragma unroll

 	for (int i=1; i<=width; i*=2)
 	{
 		uint32_t n = __shfl_up((int)value, i, width);

 		if (lane_id >= i) value += n;
 	}

 	// value now holds the scan value for the individual thread
 	// next sum the largest values for each warp

 	// write the sum of the warp to smem
 	//if (threadIdx.x % warpSize == warpSize-1)
 	if (threadIdx.x % width == width-1)
 	{
 		sums[warp_id] = value;
 	}

 	__syncthreads();

 	//
 	// scan sum the warp sums
 	// the same shfl scan operation, but performed on warp sums
 	//
 	if (warp_id == 0)
 	{
 		uint32_t warp_sum = sums[lane_id];

 		for (int i=1; i<=width; i*=2)
 		{
 			uint32_t n = __shfl_up((int)warp_sum, i, width);

 			if (lane_id >= i) warp_sum += n;
 		}

 		sums[lane_id] = warp_sum;
 	}

 	__syncthreads();

 	// perform a uniform add across warps in the block
 	// read neighbouring warp's sum and add it to threads value
 	uint32_t blockSum = 0;

 	if (warp_id > 0)
 	{
 		blockSum = sums[warp_id-1];
 	}

 	value += blockSum;

 	// Now write out our result
 	data[id] = value;

 	// last thread has sum, write out the block's sum
 	if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
 	{
 		partial_sums[blockIdx.x] = value;
 	}
 }

 // Uniform add: add partial sums array
 __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
 {
 	__shared__ uint32_t buf;
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);

 	if (id > len) return;

 	if (threadIdx.x == 0)
 	{
 		buf = partial_sums[blockIdx.x];
 	}

 	__syncthreads();
 	data[id] += buf;
 }

 // the scatter
 __global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
 	uint32_t actNounce = id;
 	uint32_t value;
 	if (id < threads)
 	{
 		// uint32_t nounce = startNounce + id;
 		uint32_t *inpHash;
 		if(d_validNonceTable == NULL)
 		{
 			// no nonce list
 			inpHash = &inpHashes[id<<4];
 		}else
 		{
 			// nonce list available
 			int nonce = d_validNonceTable[id] - startNounce;
 			actNounce = nonce;
 			inpHash = &inpHashes[nonce<<4];
 		}

 		value = (*testFunc)(inpHash);
 	}else
 	{
 		value = 0;
 	}

 	if( value )
 	{
 		int idx = sum[id];
 		if(idx > 0)
 			outp[idx-1] = startNounce + actNounce;
 	}
 }

 __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
 {
 	if(val == 0)
 		return 0;

 	uint32_t mask = 0x80000000;
 	while( (val & mask) == 0 ) mask = mask >> 1;

 	if( (val & (~mask)) != 0 )
 		return mask << 1;

 	return mask;
 }

 __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
 	uint32_t *d_nonces1, cuda_compactTestFunction_t function,
 	uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
 	int orgThreads = threads;
 	threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
 	// compute threadsPerBlock
 	int blockSize = 256;
 	int nSummen = threads / blockSize;

 	int thr1 = (threads+blockSize-1) / blockSize;
 	int thr2 = threads / (blockSize*blockSize);
 	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
 	int thr3 = (nSummen + blockSize2-1) / blockSize2;

 	bool callThrid = (thr2 > 0) ? true : false;

 	// first initial scan
 	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
 		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);

 	// further scans
 	if(callThrid)
 	{
 		quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
 		quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
 	}else
 	{
 		quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
 	}

 	// sync + record the count
 	cudaStreamSynchronize(NULL);

 	if(callThrid)
 		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	else
 		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);

 	// add the partial sums
 	if(callThrid)
 	{
 		quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
 	}
 	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);

 	// scatter
 	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
 		function, orgThreads, startNounce, inpHashes, d_validNonceTable);

 	// sync
 	cudaStreamSynchronize(NULL);
 }

 ////// WARNING: this function currently only works with threads > 65536 (ideally 256 * 1024 or 256*2048)
 __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
 	uint32_t *d_nonces1, uint32_t *d_nonces2,
 	uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
 	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
 	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);

 	/*
 	// compute threadsPerBlock
 	int blockSize = 256;
 	int thr1 = threads / blockSize;
 	int thr2 = threads / (blockSize*blockSize);

 	// 1
 	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
 	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
 	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
 	cudaStreamSynchronize(NULL);
 	cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
 	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);

 	// 2
 	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
 	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
 	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
 	cudaStreamSynchronize(NULL);
 	cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
 	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);

 	// one more peculiarity here: in d_tempBranch1Nonces the elements 1...nrm1 are the interesting ones
 	// step 3: scatter
 	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
 	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
 	cudaStreamSynchronize(NULL);
 	*/
 }

 __host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 	uint32_t *d_nonces1, size_t *nrm1,
 	uint32_t *d_nonces2, size_t *nrm2,
 	int order)
 {
 	// if validNonceTable is used, only the nonces it contains are considered;
 	// in that case "threads" must be set to the length of that array!

 	quark_compactTest_cpu_dualCompaction(thr_id, threads,
 		h_numValid[thr_id], d_nonces1, d_nonces2,
 		startNounce, inpHashes, d_validNonceTable);

 	cudaStreamSynchronize(NULL); // the original costs a bit of CPU load, but may be the better choice here
 	*nrm1 = (size_t)h_numValid[thr_id][0];
 	*nrm2 = (size_t)h_numValid[thr_id][1];
 }

 __host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 	uint32_t *d_nonces1, size_t *nrm1,
 	int order)
 {
 	// if validNonceTable is used, only the nonces it contains are considered;
 	// in that case "threads" must be set to the length of that array!

 	quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);

 	cudaStreamSynchronize(NULL); // the original costs a bit of CPU load, but may be the better choice here
 	*nrm1 = (size_t)h_numValid[thr_id][0];
 }
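
The SCAN kernel above is the classic shuffle-based inclusive scan from the NVIDIA SDK. Stripped of the hash-test plumbing, its core is just the following standalone sketch (warp width 32, using the same pre-CUDA-9 __shfl_up intrinsic this codebase targets):

__global__ void warp_inclusive_scan(uint32_t *data)
{
	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
	int lane_id = threadIdx.x % 32;
	uint32_t value = data[id];

	// accumulate in log2(32) steps; lanes whose source lane would fall
	// outside the warp keep their value unchanged
	#pragma unroll
	for (int i = 1; i <= 32; i *= 2) {
		uint32_t n = __shfl_up((int)value, i, 32);
		if (lane_id >= i) value += n;
	}
	data[id] = value; // now the prefix sum of lanes 0..lane_id
}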

15
quark/cuda_quark_groestl512.cu

@@ -1,23 +1,16 @@
 // Groestl version specialized for QuarkCoin, incl. bitslice
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // from cpu-miner.c
 extern int device_map[8];

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // this structure is requested in the init function
 static cudaDeviceProp props[8];

@@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"

-__global__ void __launch_bounds__(256, 4)
- quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(256, 4)
+void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
 	// divide by 4 because 4 threads together compute one hash
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;

21
quark/cuda_quark_keccak512.cu

@@ -1,27 +1,19 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
 #include <memory.h>

-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-#include "cuda_helper.h"
-
 #define U32TO64_LE(p) \
 	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

 #define U64TO32_LE(p, v) \
 	*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);

-static const uint64_t host_keccak_round_constants[24] = {
+__device__ __constant__
+static const uint64_t c_keccak_round_constants[24] = {
 	0x0000000000000001ull, 0x0000000000008082ull,
 	0x800000000000808aull, 0x8000000080008000ull,
 	0x000000000000808bull, 0x0000000080000001ull,

@@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
 	0x0000000080000001ull, 0x8000000080008008ull
 };

-__constant__ uint64_t c_keccak_round_constants[24];
-
 static __device__ __forceinline__ void
 keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
 	size_t i;

@@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
 // setup functions
 __host__ void quark_keccak512_cpu_init(int thr_id, int threads)
 {
-	// copy the hash tables into GPU memory
-	cudaMemcpyToSymbol( c_keccak_round_constants,
-		host_keccak_round_constants,
-		sizeof(host_keccak_round_constants),
-		0, cudaMemcpyHostToDevice);
 }

 __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
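
This hunk shows the pattern the commit applies in several files: constants that used to be uploaded at runtime with cudaMemcpyToSymbol are now given compile-time initializers, so the *_cpu_init() function can shrink to a no-op. In miniature (values shortened to two of the 24 round constants, purely for illustration):

// before: empty __constant__ symbol plus a runtime upload in *_cpu_init():
//   __constant__ uint64_t c_rc[24];
//   cudaMemcpyToSymbol(c_rc, host_rc, sizeof(host_rc), 0, cudaMemcpyHostToDevice);

// after: static initializer, no host-side copy needed at all
__device__ __constant__ uint64_t c_rc[2] = {
	0x0000000000000001ull, 0x0000000000008082ull,
};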

27
quark/cuda_skein512.cu

@@ -1,16 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
 #include <stdio.h>
-#include <stdint.h>
 #include <memory.h>

-// replace the following definitions with a header later
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
+#include "cuda_helper.h"

 // from cpu-miner.c
 extern "C" extern int device_map[8];

@@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // Take a look at: https://www.schneier.com/skein1.3.pdf

-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset >= 32) {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
-
 #define SHL(x, n) ((x) << (n))
 #define SHR(x, n) ((x) >> (n))
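
The per-file ROTL64 copies deleted here hand-coded shf.l.wrap in inline PTX. The same wrapping 64-bit rotate can be written with the __funnelshift_l intrinsic (available on compute 3.2 and up); this is an alternative sketch, not necessarily what cuda_helper.h actually does:

__device__ __forceinline__ uint64_t rotl64_sketch(uint64_t x, int n)
{
	uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
	uint32_t a = (n & 32) ? hi : lo;  // pre-swap the halves when n >= 32
	uint32_t b = (n & 32) ? lo : hi;
	// __funnelshift_l shifts the 64-bit concat (b:a) left by n & 31
	uint32_t rhi = __funnelshift_l(a, b, n);
	uint32_t rlo = __funnelshift_l(b, a, n);
	return ((uint64_t)rhi << 32) | rlo;
}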

20
quark/quarkcoin.cu

@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_blake.h"

@@ -8,9 +7,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"

 #include "miner.h"
-}

-#include <stdint.h>
+#include "cuda_helper.h"
+}

 // from cpu-miner.c
 extern int device_map[8];

@@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void quark_jh512_cpu_init(int thr_id, int threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,

@@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		// copy constants, allocate memory
 		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		quark_compactTest_cpu_init(thr_id, throughput);
+
 		cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
+
 		init[thr_id] = true;
 	}

@@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;

@@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 			// scan for winning hashes on the GPU
-			uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+			uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 			if (foundNonce != 0xffffffff)
 			{
 				uint32_t vhash64[8];
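
In short, the quark_check_* entry points become the generic cuda_check_* family. Condensed from the diff above into a hypothetical wrapper (the helper name is illustrative; cuda_check_cpu_init must have run once at setup):

static bool check_round(int thr_id, int nrm3, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t *d_branchNonces, uint32_t *d_hash, int order)
{
	cuda_check_cpu_setTarget(ptarget); // refresh whenever a new target arrives
	uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19],
		d_branchNonces, d_hash, order);
	return foundNonce != 0xffffffff;   // caller re-hashes on the CPU and fulltest()s
}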

4
util.c

@@ -1352,6 +1352,10 @@ void print_hash_tests(void)
 	myriadhash(&hash[0], &buf[0]);
 	printf("\nmyriad: "); print_hash(hash);

+	memset(hash, 0, sizeof hash);
+	nist5hash(&hash[0], &buf[0]);
+	printf("\nnist5: "); print_hash(hash);
+
 	memset(hash, 0, sizeof hash);
 	quarkhash(&hash[0], &buf[0]);
 	printf("\nquark: "); print_hash(hash);

19
x11/cuda_x11_cubehash512.cu

@@ -1,30 +1,13 @@
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

 typedef unsigned char BitSequence;
 typedef unsigned long long DataLength;

-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-#if 0
-__device__ static uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-#endif
-
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
-
 #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
 #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */

-typedef unsigned int uint32_t; /* must be exactly 32 bits */
-
 #define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
 #define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
 #define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

28
x11/cuda_x11_echo.cu

@@ -1,33 +1,7 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
 #include <stdio.h>
-#include <stdint.h>
 #include <memory.h>

-// extract the hi word from a 64-bit type
-#if 0
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// extract the lo word from a 64-bit type
-static __device__ uint32_t LOWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-#endif
-
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-#define SPH_C32(x) ((uint32_t)(x ## U))
+#include "cuda_helper.h"

 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

20
x11/cuda_x11_luffa512.cu

@@ -18,28 +18,18 @@
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
-#include <cuda_runtime.h>
+#include "cuda_helper.h"
// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
typedef struct {
	uint32_t buffer[8]; /* Buffer to be hashed */
	uint32_t chainv[40]; /* Chaining values */
} hashState;
-__device__ __forceinline__
-static uint32_t BYTES_SWAP32(uint32_t x)
-{
-	return __byte_perm(x, x, 0x0123);
-}
#define MULT2(a,j)\
	tmp = a[7+(8*j)];\
	a[7+(8*j)] = a[6+(8*j)];\
@@ -289,11 +279,11 @@ __device__ __forceinline__
void Update512(hashState *state, const BitSequence *data)
{
#pragma unroll 8
-	for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
+	for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
	rnd512(state);
#pragma unroll 8
-	for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
+	for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
	rnd512(state);
}
@@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
	for(j=0;j<5;j++) {
		b[i] ^= state->chainv[i+8*j];
	}
-	b[i] = BYTES_SWAP32((b[i]));
+	b[i] = cuda_swab32((b[i]));
}
#pragma unroll 8
@@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
	for(j=0;j<5;j++) {
		b[8+i] ^= state->chainv[i+8*j];
	}
-	b[8+i] = BYTES_SWAP32((b[8+i]));
+	b[8 + i] = cuda_swab32((b[8 + i]));
}
}
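For reference, the BYTES_SWAP32 copy removed above (and the SWAB32 macros removed from the x13/x15 files below) all collapse into a single helper in cuda_helper.h. A minimal sketch of that helper, assuming it keeps the same __byte_perm form as the removed copies:

__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
	// selector 0x0123 picks bytes 3,2,1,0 of x, i.e. reverses the byte order
	return __byte_perm(x, x, 0x0123);
}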

59
x11/cuda_x11_shavite512.cu

@@ -1,18 +1,13 @@
-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"
// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
+//typedef unsigned char BitSequence;
+//typedef unsigned long long DataLength;
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-#define SPH_C32(x) ((uint32_t)(x ## U))
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-static __constant__ uint32_t d_ShaviteInitVector[16];
-static const uint32_t h_ShaviteInitVector[] = {
+__device__ __constant__
+static const uint32_t d_ShaviteInitVector[16] = {
	SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
	SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
	SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
@@ -1304,18 +1299,18 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
	aes_gpu_init(sharedMemory);
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
		int hashPosition = nounce - startNounce;
		uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
		// copy the init state
		uint32_t state[16];
#pragma unroll 16
		for(int i=0;i<16;i++)
			state[i] = d_ShaviteInitVector[i];
@@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
		uint32_t msg[32];
		// fill the message with 64 bytes (the previous hash)
#pragma unroll 16
		for(int i=0;i<16;i++)
			msg[i] = Hash[i];
		// end of message
		msg[16] = 0x80;
#pragma unroll 10
		for(int i=17;i<27;i++)
			msg[i] = 0;
@@ -1341,10 +1336,10 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
		c512(sharedMemory, state, msg);
#pragma unroll 16
		for(int i=0;i<16;i++)
			Hash[i] = state[i];
	}
}
@@ -1352,25 +1347,19 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
__host__ void x11_shavite512_cpu_init(int thr_id, int threads)
{
	aes_cpu_init();
-	cudaMemcpyToSymbol( d_ShaviteInitVector,
-		h_ShaviteInitVector,
-		sizeof(h_ShaviteInitVector),
-		0, cudaMemcpyHostToDevice);
}
__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
	const int threadsperblock = 256;
	// compute how many thread blocks we need
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);
	// size of the dynamic shared memory region
	size_t shared_size = 0;
	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
	MyStreamSynchronize(NULL, order, thr_id);
}
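The shavite hunk also changes how the IV reaches the GPU: instead of declaring empty __constant__ storage and uploading it with cudaMemcpyToSymbol() at init time, the table is now a statically initialized __constant__ array embedded by the compiler, which is why x11_shavite512_cpu_init() shrinks to just aes_cpu_init(). A sketch of the two patterns, with hypothetical d_table/h_table names:

// before: host-side upload required once at startup
__constant__ uint32_t d_table[4];
static const uint32_t h_table[4] = { 1, 2, 3, 4 };
void table_init()
{
	cudaMemcpyToSymbol(d_table, h_table, sizeof(h_table), 0, cudaMemcpyHostToDevice);
}

// after: values baked into the device image, no init call needed
__device__ __constant__ static const uint32_t d_table2[4] = { 1, 2, 3, 4 };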

22
x11/cuda_x11_simd512.cu

@@ -7,29 +7,17 @@
#define TPB 256
+#include "cuda_helper.h"
// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
int *d_state[8];
uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
-#define C32(x) ((uint32_t)(x ## U))
-#define T32(x) ((x) & C32(0xFFFFFFFF))
-#if __CUDA_ARCH__ < 350
-// Kepler (Compute 3.0)
-#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-// Kepler (Compute 3.5)
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
__device__ __constant__
const uint32_t c_IV_512[32] = {
	0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY
}
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
 * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
 * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
@@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \
__device__ __forceinline__ void FFT_16(int *y) {
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning FFT_16() function is not compatible with SM 2.1 devices!
#endif
@@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{
	int i;
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning Expansion() function is not compatible with SM 2.1 devices
#endif
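The defined(__CUDA_ARCH__) guard added in the three hunks above is not cosmetic: nvcc also preprocesses each .cu file for the host pass, where __CUDA_ARCH__ is undefined, so a bare #if __CUDA_ARCH__ < 300 evaluates as 0 < 300 and wrongly takes the SM 2.x branch on the host side. A minimal sketch of the corrected pattern:

__device__ int lane_broadcast(int var, int srcLane)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
	// device pass on Kepler or newer: warp shuffle intrinsic
	return __shfl(var, srcLane);
#else
	// host pass or pre-SM 3.0 device code: no shuffle available
	return var;
#endif
}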

19
x11/x11.cu

@@ -15,10 +15,11 @@ extern "C"
#include "sph/sph_echo.h"
#include "miner.h"
-}
-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+#include <stdio.h>
+#include <memory.h>
+}
// from cpu-miner.c
extern int device_map[8];
@@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
extern void x11_echo512_cpu_init(int thr_id, int threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
	x11_shavite512_cpu_init(thr_id, throughput);
	x11_simd512_cpu_init(thr_id, throughput);
	x11_echo512_cpu_init(thr_id, throughput);
-	quark_check_cpu_init(thr_id, throughput);
+	cuda_check_cpu_init(thr_id, throughput);
	init[thr_id] = true;
}
@@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
	be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
-quark_check_cpu_setTarget(ptarget);
+cuda_check_cpu_setTarget(ptarget);
do {
	uint32_t foundNonce;
@@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
	x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	// Scan for winning hashes on the GPU
-	foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+	foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	if (foundNonce != 0xffffffff)
	{
		uint32_t vhash64[8];
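Functionally nothing changes in scanhash_x11: the check kernel simply loses its quark_ prefix because it is shared by every algorithm. The calling sequence, as used above and in the x13/x14/x15 scanhash loops below, is:

cuda_check_cpu_init(thr_id, throughput);   // once per thread: allocate the result slot
cuda_check_cpu_setTarget(ptarget);         // per work unit: upload the share target
// per scan round, after the last hash kernel in the chain:
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput,
	pdata[19], NULL /* linear nonce range */, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) {
	// re-hash foundNonce on the CPU and compare against ptarget before submitting
}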

217
x13/cuda_x13_fugue512.cu

@@ -5,26 +5,11 @@
 * heavily based on phm's sgminer
 *
 */
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-#include <stdint.h>
+#include "cuda_helper.h"
// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-#define SPH_C32(x) ((uint32_t)(x ## U))
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#if __CUDA_ARCH__ < 350
-// Kepler (Compute 3.0)
-#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-// Kepler (Compute 3.5, 5.0)
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
/*
 * X13 kernel implementation.
 *
@@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 * @author phm <phm@inbox.com>
 */
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x))))
#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@@ -584,97 +567,97 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
	__syncthreads();
	int i;
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
		int hashPosition = nounce - startNounce;
		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
#pragma unroll 16
		for( i = 0; i < 16; i++ )
-			Hash[i] = SWAB32(Hash[i]);
+			Hash[i] = cuda_swab32(Hash[i]);
		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
		uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
		uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
		uint32_t S30, S31, S32, S33, S34, S35;
		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
		uint64_t bc = (uint64_t) 64 << 3;
		uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
		uint32_t bchi = (uint32_t)(bc >> 32);
		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
		S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
		S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1);
		S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
		S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
		FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
		FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
		FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
		FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
		FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
		FUGUE512_3((Hash[0xF]), bchi, bclo);
#pragma unroll 32
		for (i = 0; i < 32; i ++) {
			ROR3;
			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
			SMIX(S00, S01, S02, S03);
		}
#pragma unroll 13
		for (i = 0; i < 13; i ++) {
			S04 ^= S00;
			S09 ^= S00;
			S18 ^= S00;
			S27 ^= S00;
			ROR9;
			SMIX(S00, S01, S02, S03);
			S04 ^= S00;
			S10 ^= S00;
			S18 ^= S00;
			S27 ^= S00;
			ROR9;
			SMIX(S00, S01, S02, S03);
			S04 ^= S00;
			S10 ^= S00;
			S19 ^= S00;
			S27 ^= S00;
			ROR9;
			SMIX(S00, S01, S02, S03);
			S04 ^= S00;
			S10 ^= S00;
			S19 ^= S00;
			S28 ^= S00;
			ROR8;
			SMIX(S00, S01, S02, S03);
		}
		S04 ^= S00;
		S09 ^= S00;
		S18 ^= S00;
		S27 ^= S00;
-		Hash[0] = SWAB32(S01);
-		Hash[1] = SWAB32(S02);
-		Hash[2] = SWAB32(S03);
-		Hash[3] = SWAB32(S04);
-		Hash[4] = SWAB32(S09);
-		Hash[5] = SWAB32(S10);
-		Hash[6] = SWAB32(S11);
-		Hash[7] = SWAB32(S12);
-		Hash[8] = SWAB32(S18);
-		Hash[9] = SWAB32(S19);
-		Hash[10] = SWAB32(S20);
-		Hash[11] = SWAB32(S21);
-		Hash[12] = SWAB32(S27);
-		Hash[13] = SWAB32(S28);
-		Hash[14] = SWAB32(S29);
-		Hash[15] = SWAB32(S30);
+		Hash[0] = cuda_swab32(S01);
+		Hash[1] = cuda_swab32(S02);
+		Hash[2] = cuda_swab32(S03);
+		Hash[3] = cuda_swab32(S04);
+		Hash[4] = cuda_swab32(S09);
+		Hash[5] = cuda_swab32(S10);
+		Hash[6] = cuda_swab32(S11);
+		Hash[7] = cuda_swab32(S12);
+		Hash[8] = cuda_swab32(S18);
+		Hash[9] = cuda_swab32(S19);
+		Hash[10] = cuda_swab32(S20);
+		Hash[11] = cuda_swab32(S21);
+		Hash[12] = cuda_swab32(S27);
+		Hash[13] = cuda_swab32(S28);
+		Hash[14] = cuda_swab32(S29);
+		Hash[15] = cuda_swab32(S30);
	}
}
#define texDef(texname, texmem, texsource, texsize) \
@@ -697,17 +680,17 @@ __host__ void x13_fugue512_cpu_init(int thr_id, int threads)
__host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
	const int threadsperblock = 256;
	// compute how many thread blocks we need
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);
	// size of the dynamic shared memory region
	size_t shared_size = 4 * 256 * sizeof(uint32_t);
	// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
	x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
	MyStreamSynchronize(NULL, order, thr_id);
}
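As in the other *_cpu_hash_64 wrappers, the launch geometry is a plain ceil division so the grid covers every work item, with the in-kernel if (thread < threads) test absorbing the padding of the last block. A standalone sketch of the idiom (hypothetical my_kernel):

int threads = 1 << 20;                  // work items this scan round
const int threadsperblock = 256;
// round up so threads that don't fill a whole block still get one
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
my_kernel<<<grid, block>>>(threads /*, ... */);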

100
x13/cuda_x13_hamsi512.cu

@@ -37,26 +37,11 @@
 * @author phm <phm@inbox.com>
 */
-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"
// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-#define SPH_C32(x) ((uint32_t)(x ## U))
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-#if __CUDA_ARCH__ < 350
-// Kepler (Compute 3.0)
-#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-// Kepler (Compute 3.5)
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
__device__ __constant__
static const uint32_t d_alpha_n[] = {
	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
@@ -663,7 +648,7 @@ static const uint32_t d_T512[64][16] = {
	mD = 0; \
	mE = 0; \
	mF = 0; \
	for (u = 0; u < 8; u ++) { \
		unsigned db = buf(u); \
		for (v = 0; v < 8; v ++, db >>= 1) { \
			uint32_t dm = SPH_T32(-(uint32_t)(db & 1)); \
@@ -692,45 +677,47 @@ static const uint32_t d_T512[64][16] = {
// the hash function
__global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
		int hashPosition = nounce - startNounce;
		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
		unsigned char *h1 = (unsigned char *)Hash;
		uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
		uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
		uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
		uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
		uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
		uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) (h1[i+u])
#pragma unroll 8
		for(int i = 0; i < 64; i += 8) {
			INPUT_BIG;
			P_BIG;
			T_BIG;
		}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
		INPUT_BIG;
		P_BIG;
		T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
		INPUT_BIG;
		PF_BIG;
		T_BIG;
#pragma unroll 16
		for (int i = 0; i < 16; i++)
-			Hash[i] = SWAB32(h[i]);
+			Hash[i] = cuda_swab32(h[i]);
	}
}
@@ -740,18 +727,17 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, int threads)
__host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
	const int threadsperblock = 256;
	// compute how many thread blocks we need
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);
	// size of the dynamic shared memory region
	size_t shared_size = 0;
	// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
	x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
	MyStreamSynchronize(NULL, order, thr_id);
}

17
x13/x13.cu

@@ -20,10 +20,9 @@ extern "C"
#include "sph/sph_fugue.h"
#include "miner.h"
-}
-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}
// from cpu-miner.c
extern int device_map[8];
@@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x13_fugue512_cpu_init(int thr_id, int threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
	x11_echo512_cpu_init(thr_id, throughput);
	x13_hamsi512_cpu_init(thr_id, throughput);
	x13_fugue512_cpu_init(thr_id, throughput);
-	quark_check_cpu_init(thr_id, throughput);
+	cuda_check_cpu_init(thr_id, throughput);
	init[thr_id] = true;
}
@@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
	be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
-quark_check_cpu_setTarget(ptarget);
+cuda_check_cpu_setTarget(ptarget);
do {
	uint32_t foundNonce;
@@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
	x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	// Scan for winning hashes on the GPU
-	foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+	foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	if (foundNonce != 0xffffffff)
	{
		uint32_t vhash64[8];

18
x15/cuda_x14_shabal512.cu

@@ -1,26 +1,10 @@
/*
 * Shabal-512 for X14/X15 (STUB)
 */
-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
-#define SPH_C32(x) ((uint32_t)(x ## U))
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-#if __CUDA_ARCH__ < 350
-// Kepler (Compute 3.0)
-#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-// Kepler (Compute 3.5)
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
/*
 * Shabal implementation.
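This ROTL32 block, removed here as it was from simd, fugue and hamsi above, is the definition that now lives once in cuda_helper.h: a funnel shift where SM 3.5+ provides one, a portable shift/or otherwise. A sketch of the consolidated define, assuming it mirrors the removed copies:

// rotate a 32-bit word left by n bits
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
	// Kepler SM 3.5+: single funnel-shift instruction
	#define ROTL32(x, n) __funnelshift_l((x), (x), (n))
#else
	// older devices and the host pass: classic two-shift rotate
	#define ROTL32(x, n) ((((x) << (n)) | ((x) >> (32 - (n)))) & 0xFFFFFFFFU)
#endif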

6
x15/cuda_x15_whirlpool.cu

@@ -4,8 +4,8 @@
 * tpruvot@github
 */
#include <stdio.h>
-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
#define NULLTEST 0
@@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
#define SPH_64 (1)
#define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)
-#define SPH_C64(x) ((uint64_t)(x ## ULL))
// defined in cuda_helper.h
#define SPH_ROTL64(x,n) ROTL64(x,n)

17
x15/x14.cu

@@ -22,10 +22,9 @@ extern "C" {
#include "sph/sph_shabal.h"
#include "miner.h"
-}
-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}
// from cpu-miner.c
extern int device_map[8];
@@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x14_shabal512_cpu_init(int thr_id, int threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
	x13_fugue512_cpu_init(thr_id, throughput);
	x14_shabal512_cpu_init(thr_id, throughput);
-	quark_check_cpu_init(thr_id, throughput);
+	cuda_check_cpu_init(thr_id, throughput);
	init[thr_id] = true;
}
@@ -211,7 +210,7 @@
	be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
-quark_check_cpu_setTarget(ptarget);
+cuda_check_cpu_setTarget(ptarget);
do {
	int order = 0;
@@ -230,7 +229,7 @@
	x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-	uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+	uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	if (foundNonce != 0xffffffff)
	{
		/* check now with the CPU to confirm */

17
x15/x15.cu

@@ -23,10 +23,9 @@ extern "C" {
#include "sph/sph_whirlpool.h"
#include "miner.h"
-}
-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}
// to test gpu hash on a null buffer
#define NULLTEST 0
@@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
extern void x15_whirlpool_cpu_init(int thr_id, int threads);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
	x14_shabal512_cpu_init(thr_id, throughput);
	x15_whirlpool_cpu_init(thr_id, throughput);
-	quark_check_cpu_init(thr_id, throughput);
+	cuda_check_cpu_init(thr_id, throughput);
	init[thr_id] = true;
}
@@ -239,7 +238,7 @@
	be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
-quark_check_cpu_setTarget(ptarget);
+cuda_check_cpu_setTarget(ptarget);
do {
	int order = 0;
@@ -266,7 +265,7 @@
	print_hash((unsigned char*)buf); printf("\n");
#endif
	/* Scan with GPU */
-	uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+	uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	if (foundNonce != 0xffffffff)
	{
