Browse Source

Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof
master
Tanguy Pruvot 10 years ago
parent
commit
d9ea5f72ce
  1. 11
      JHA/cuda_jha_compactionTest.cu
  2. 33
      JHA/cuda_jha_keccak512.cu
  3. 19
      JHA/jackpotcoin.cu
  4. 3
      bitslice_transformations_quad.cu
  5. 9
      ccminer.vcxproj
  6. 6
      ccminer.vcxproj.filters
  7. 15
      cuda_fugue256.cu
  8. 18
      cuda_groestlcoin.cu
  9. 110
      cuda_helper.h
  10. 11
      cuda_myriadgroestl.cu
  11. 28
      cuda_nist5.cu
  12. 1
      groestl_functions_quad.cu
  13. 28
      heavy/cuda_blake512.cu
  14. 7
      heavy/cuda_combine.cu
  15. 10
      heavy/cuda_groestl512.cu
  16. 12
      heavy/cuda_hefty1.cu
  17. 13
      heavy/cuda_keccak512.cu
  18. 7
      heavy/cuda_sha256.cu
  19. 6
      heavy/heavy.cu
  20. 1
      miner.h
  21. 19
      quark/animecoin.cu
  22. 151
      quark/cuda_bmw512.cu
  23. 13
      quark/cuda_checkhash.cu
  24. 2
      quark/cuda_jh512.cu
  25. 230
      quark/cuda_quark_blake512.cu
  26. 9
      quark/cuda_quark_compactionTest.cu
  27. 15
      quark/cuda_quark_groestl512.cu
  28. 21
      quark/cuda_quark_keccak512.cu
  29. 27
      quark/cuda_skein512.cu
  30. 20
      quark/quarkcoin.cu
  31. 4
      util.c
  32. 19
      x11/cuda_x11_cubehash512.cu
  33. 28
      x11/cuda_x11_echo.cu
  34. 20
      x11/cuda_x11_luffa512.cu
  35. 29
      x11/cuda_x11_shavite512.cu
  36. 22
      x11/cuda_x11_simd512.cu
  37. 19
      x11/x11.cu
  38. 55
      x13/cuda_x13_fugue512.cu
  39. 24
      x13/cuda_x13_hamsi512.cu
  40. 17
      x13/x13.cu
  41. 18
      x15/cuda_x14_shabal512.cu
  42. 6
      x15/cuda_x15_whirlpool.cu
  43. 17
      x15/x14.cu
  44. 17
      x15/x15.cu

11
JHA/cuda_jha_compactionTest.cu

@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include <stdint.h>
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
} }
#if __CUDA_ARCH__ < 300 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/** /**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/ */

33
JHA/cuda_jha_keccak512.cu

@ -1,16 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__constant__ uint64_t c_State[25]; __constant__ uint64_t c_State[25];
__constant__ uint32_t c_PaddedMessage[18]; __constant__ uint32_t c_PaddedMessage[18];
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define U32TO64_LE(p) \ #define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

19
JHA/jackpotcoin.cu

@ -1,4 +1,3 @@
extern "C" extern "C"
{ {
#include "sph/sph_keccak.h" #include "sph/sph_keccak.h"
@ -7,10 +6,9 @@ extern "C"
#include "sph/sph_jh.h" #include "sph/sph_jh.h"
#include "sph/sph_skein.h" #include "sph/sph_skein.h"
#include "miner.h" #include "miner.h"
#include "cuda_helper.h"
} }
#include <stdint.h>
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void jackpot_compactTest_cpu_init(int thr_id, int threads); extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
quark_groestl512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2); cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2); cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2); cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
jackpot_keccak512_cpu_setBlock((void*)endiandata, 80); jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
int order = 0; int order = 0;
@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
} }
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
unsigned int rounds;
uint32_t vhash64[8]; uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce); be32enc(&endiandata[19], foundNonce);
// diese jackpothash Funktion gibt die Zahl der Runden zurück // diese jackpothash Funktion gibt die Zahl der Runden zurück
unsigned int rounds = jackpothash(vhash64, endiandata); rounds = jackpothash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {

3
bitslice_transformations_quad.cu

@ -1,5 +1,4 @@
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#if __CUDA_ARCH__ < 300
/** /**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane. * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned. * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.

9
ccminer.vcxproj

@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep> <Keep>false</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration> <CodeGeneration>compute_50,sm_50</CodeGeneration>
<Defines></Defines> <Defines>
</Defines>
</CudaCompile> </CudaCompile>
<CudaLink> <CudaLink>
<GPUDebugInfo>false</GPUDebugInfo> <GPUDebugInfo>false</GPUDebugInfo>
@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<ClInclude Include="uint256.h" /> <ClInclude Include="uint256.h" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CudaCompile Include="bitslice_transformations_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu"> <CudaCompile Include="cuda_fugue256.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
</CudaCompile> </CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="heavy\cuda_blake512.cu"> <CudaCompile Include="heavy\cuda_blake512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>

6
ccminer.vcxproj.filters

@ -391,5 +391,11 @@
<CudaCompile Include="x15\cuda_x15_whirlpool.cu"> <CudaCompile Include="x15\cuda_x15_whirlpool.cu">
<Filter>Source Files\CUDA\x15</Filter> <Filter>Source Files\CUDA\x15</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="bitslice_transformations_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup> </ItemGroup>
</Project> </Project>

15
cuda_fugue256.cu

@ -1,12 +1,11 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "sph/sph_fugue.h" #include "sph/sph_fugue.h"
#include "cuda_helper.h"
#include <host_defines.h>
#define USE_SHARED 1 #define USE_SHARED 1
// aus cpu-miner.c // aus cpu-miner.c
@ -15,14 +14,6 @@ extern int device_map[8];
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// schon in sph_fugue.h definiert
//#define SPH_C32(x) ((uint32_t)(x ## U))
uint32_t *d_fugue256_hashoutput[8]; uint32_t *d_fugue256_hashoutput[8];
uint32_t *d_resultNonce[8]; uint32_t *d_resultNonce[8];

18
cuda_groestlcoin.cu

@ -1,23 +1,17 @@
// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice // Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h"
#include <host_defines.h>
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8]; static cudaDeviceProp props[8];
@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
#include "groestl_functions_quad.cu" #include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu" #include "bitslice_transformations_quad.cu"
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) #define SWAB32(x) cuda_swab32(x)
__global__ void __launch_bounds__(256, 4) __global__ __launch_bounds__(256, 4)
groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce) void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
{ {
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;

110
cuda_helper.h

@ -1,33 +1,78 @@
#ifndef CUDA_HELPER_H #ifndef CUDA_HELPER_H
#define CUDA_HELPER_H #define CUDA_HELPER_H
#include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) #if defined(_MSC_VER)
{ /* reduce warnings */
#if __CUDA_ARCH__ >= 130 #include <device_functions.h>
return __double_as_longlong(__hiloint2double(HI, LO)); #include <device_launch_parameters.h>
#endif
#include <stdint.h>
extern __device__ __device_builtin__ void __syncthreads(void);
#ifndef __CUDA_ARCH__
// define blockDim and threadIdx for host
extern const dim3 blockDim;
extern const uint3 threadIdx;
#endif
#ifndef SPH_C32
#define SPH_C32(x) ((uint32_t)(x ## U))
#endif
#ifndef SPH_C64
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#endif
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else #else
return (unsigned long long)LO | (((unsigned long long)HI) << 32); // Kepler (Compute 3.5, 5.0)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif #endif
}
// das Hi Word aus einem 64 Bit Typen extrahieren __device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
static __device__ uint32_t HIWORD(const uint64_t &x) { {
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x)); return __double_as_longlong(__hiloint2double(HI, LO));
#else #else
return (uint32_t)(x >> 32); return (unsigned long long)LO | (((unsigned long long)HI) << 32);
#endif #endif
} }
// das Hi Word in einem 64 Bit Typen ersetzen // das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { __device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
} }
// das Lo Word in einem 64 Bit Typen ersetzen
__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
// Endian Drehung für 32 Bit Typen
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
/* device */
return __byte_perm(x, x, 0x0123);
}
#else
/* host */
#define cuda_swab32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
#endif
// das Lo Word aus einem 64 Bit Typen extrahieren // das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) { __device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x)); return (uint32_t)__double2loint(__longlong_as_double(x));
#else #else
@ -35,25 +80,42 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
#endif #endif
} }
// das Lo Word in einem 64 Bit Typen ersetzen // das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { __device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); #if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
} }
// Endian Drehung für 32 Bit Typen #ifdef __CUDA_ARCH__
static __device__ uint32_t cuda_swab32(uint32_t x) __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
{ {
return __byte_perm(x, x, 0x0123); // Input: 77665544 33221100
} // Output: 00112233 44556677
uint64_t temp[2];
temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
// Endian Drehung für 64 Bit Typen return temp[0] | (temp[1]<<32);
static __device__ uint64_t cuda_swab64(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
} }
#else
/* host */
#define cuda_swab64(x) \
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
#endif
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) { __device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
uint2 result; uint2 result;
if(offset < 32) { if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
@ -70,7 +132,7 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { __device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result; uint2 result;
if(offset >= 32) { if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));

11
cuda_myriadgroestl.cu

@ -1,23 +1,16 @@
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice // Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h"
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8]; static cudaDeviceProp props[8];

28
cuda_nist5.cu

@ -1,4 +1,3 @@
extern "C" extern "C"
{ {
#include "sph/sph_blake.h" #include "sph/sph_blake.h"
@ -7,10 +6,9 @@ extern "C"
#include "sph/sph_jh.h" #include "sph/sph_jh.h"
#include "sph/sph_keccak.h" #include "sph/sph_keccak.h"
#include "miner.h" #include "miner.h"
#include "cuda_helper.h"
} }
#include <stdint.h>
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// Original nist5hash Funktion aus einem miner Quelltext // Original nist5hash Funktion aus einem miner Quelltext
inline void nist5hash(void *state, const void *input) extern "C" void nist5hash(void *state, const void *input)
{ {
sph_blake512_context ctx_blake; sph_blake512_context ctx_blake;
sph_groestl512_context ctx_groestl; sph_groestl512_context ctx_groestl;
@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
quark_jh512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata); quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
int order = 0; int order = 0;
// erstes Blake512 Hash mit CUDA // Hash with CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
uint32_t vhash64[8]; uint32_t vhash64[8];

1
groestl_functions_quad.cu

@ -1,3 +1,4 @@
#include "cuda_helper.h"
__device__ __forceinline__ void G256_Mul2(uint32_t *regs) __device__ __forceinline__ void G256_Mul2(uint32_t *regs)
{ {

28
heavy/cuda_blake512.cu

@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *d_heftyHashes[8];
@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding) __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
#include "cuda_helper.h"
// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
}; };
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam. /* in cuda_helper */
#define SWAP32(x) \ #define SWAP32(x) cuda_swab32(x)
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ #define SWAP64(x) cuda_swab64(x)
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
#define SWAP64(x) \
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
__constant__ uint64_t c_SecondRound[15]; __constant__ uint64_t c_SecondRound[15];

7
heavy/cuda_combine.cu

@ -1,9 +1,4 @@
#include <cuda.h> #include "cuda_helper.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
// globaler Speicher für unsere Ergebnisse // globaler Speicher für unsere Ergebnisse
uint32_t *d_hashoutput[8]; uint32_t *d_hashoutput[8];

10
heavy/cuda_groestl512.cu

@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *d_heftyHashes[8];
@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
cudaMemcpyToSymbol( groestl_gpu_msg, cudaMemcpyToSymbol( groestl_gpu_msg,
msgBlock, msgBlock,
128); 128);
BLOCKSIZE = len; BLOCKSIZE = len;
} }

12
heavy/cuda_hefty1.cu

@ -1,10 +1,9 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h"
#include <device_functions.h>
#define USE_SHARED 1 #define USE_SHARED 1
// aus cpu-miner.c // aus cpu-miner.c
@ -13,11 +12,6 @@ extern int device_map[8];
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8]; static cudaDeviceProp props[8];

13
heavy/cuda_keccak512.cu

@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *d_heftyHashes[8];
@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
uint64_t t[5], u[5], v, w; uint64_t t[5], u[5], v, w;
/* absorb input */ /* absorb input */
#pragma unroll 9 #pragma unroll 9
for (i = 0; i < 72 / 8; i++, in += 2) for (i = 0; i < 9 /* 72/8 */; i++, in += 2)
s[i] ^= U32TO64_LE(in); s[i] ^= U32TO64_LE(in);
for (i = 0; i < 24; i++) { for (i = 0; i < 24; i++) {

7
heavy/cuda_sha256.cu

@ -1,12 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned int uint32_t;
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *d_heftyHashes[8];

6
heavy/heavy.cu

@ -1,7 +1,3 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include <string.h> #include <string.h>
@ -34,6 +30,8 @@
#include "heavy/cuda_blake512.h" #include "heavy/cuda_blake512.h"
#include "heavy/cuda_combine.h" #include "heavy/cuda_combine.h"
#include "cuda_helper.h"
extern uint32_t *d_hash2output[8]; extern uint32_t *d_hash2output[8];
extern uint32_t *d_hash3output[8]; extern uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8]; extern uint32_t *d_hash4output[8];

1
miner.h

@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void groestlhash(void *state, const void *input); void groestlhash(void *state, const void *input);
void myriadhash(void *state, const void *input); void myriadhash(void *state, const void *input);
void nist5hash(void *state, const void *input);
void quarkhash(void *state, const void *input); void quarkhash(void *state, const void *input);
void x11hash(void *output, const void *input); void x11hash(void *output, const void *input);
void x13hash(void *output, const void *input); void x13hash(void *output, const void *input);

19
quark/animecoin.cu

@ -1,4 +1,3 @@
extern "C" extern "C"
{ {
#include "sph/sph_blake.h" #include "sph/sph_blake.h"
@ -8,10 +7,9 @@ extern "C"
#include "sph/sph_jh.h" #include "sph/sph_jh.h"
#include "sph/sph_keccak.h" #include "sph/sph_keccak.h"
#include "miner.h" #include "miner.h"
#include "cuda_helper.h"
} }
#include <stdint.h>
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -45,9 +43,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -189,18 +187,21 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
// Konstanten kopieren, Speicher belegen // Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput); quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput); quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -209,7 +210,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_bmw512_cpu_setBlock_80((void*)endiandata); quark_bmw512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
int order = 0; int order = 0;
@ -265,7 +266,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
uint32_t vhash64[8]; uint32_t vhash64[8];

151
quark/cuda_bmw512.cu

@ -1,140 +1,9 @@
#if 1 #if 1
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
// Endian Drehung für 32 Bit Typen
/*
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
}
*/
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
}
#if 0
// Endian Drehung für 64 Bit Typen
static __device__ unsigned long long cuda_swab64(unsigned long long x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
}
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
#else
return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
#endif
}
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
}
#endif
// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen...
#if 1
typedef unsigned long long uint64_t;
#else
typedef class uint64
{
public:
__device__ uint64()
{
}
__device__ uint64(unsigned long long init)
{
val = make_uint2( LOWORD(init), HIWORD(init) );
}
__device__ uint64(uint32_t lo, uint32_t hi)
{
val = make_uint2( lo, hi );
}
__device__ const uint64 operator^(uint64 const& rhs) const
{
return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
}
__device__ const uint64 operator|(uint64 const& rhs) const
{
return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
}
__device__ const uint64 operator+(unsigned long long const& rhs) const
{
return *this+uint64(rhs);
}
__device__ const uint64 operator+(uint64 const& rhs) const
{
uint64 res;
asm ("add.cc.u32 %0, %2, %4;\n\t"
"addc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator-(uint64 const& rhs) const
{
uint64 res;
asm ("sub.cc.u32 %0, %2, %4;\n\t"
"subc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator<<(int n) const
{
return uint64(unsigned long long(*this)<<n);
}
__device__ const uint64 operator>>(int n) const
{
return uint64(unsigned long long(*this)>>n);
}
__device__ operator unsigned long long() const
{
return MAKE_ULONGLONG(val.x, val.y);
}
uint2 val;
} uint64_t;
#endif
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// die Message it Padding zur Berechnung auf der GPU // die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define SHL(x, n) ((x) << (n)) #define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n)) #define SHR(x, n) ((x) >> (n))

13
quark/cuda_checkhash.cu

@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <stdint.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h"
// Hash Target gegen das wir testen sollen // Hash Target gegen das wir testen sollen
__constant__ uint32_t pTarget[8]; __constant__ uint32_t pTarget[8];
@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
} }
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_check_cpu_init(int thr_id, int threads) __host__ void cuda_check_cpu_init(int thr_id, int threads)
{ {
cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)); cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)); cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
} }
// Target Difficulty setzen // Target Difficulty setzen
__host__ void quark_check_cpu_setTarget(const void *ptarget) __host__ void cuda_check_cpu_setTarget(const void *ptarget)
{ {
// die Message zur Berechnung auf der GPU // die Message zur Berechnung auf der GPU
cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
} }
__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) __host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{ {
uint32_t result = 0xffffffff; uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));

2
quark/cuda_jh512.cu

@ -1,4 +1,4 @@
#include <stdint.h> #include "cuda_helper.h"
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

230
quark/cuda_quark_blake512.cu

@ -1,16 +1,11 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#define USE_SHUFFLE 0 #include "cuda_helper.h"
// Folgende Definitionen später durch header ersetzen #define ROTR(x,n) ROTR64(x,n)
typedef unsigned char uint8_t;
typedef unsigned int uint32_t; #define USE_SHUFFLE 0
typedef unsigned long long uint64_t;
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
}; };
// das Hi Word aus einem 64 Bit Typen extrahieren __device__ __constant__
static __device__ uint32_t HIWORD(const uint64_t &x) { const uint64_t c_u512[16] =
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#if 0
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
#endif
__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
{
// Input: 77665544 33221100
// Output: 00112233 44556677
uint64_t temp[2];
temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
return temp[0] | (temp[1]<<32);
}
__constant__ uint64_t c_u512[16];
const uint64_t host_u512[16] =
{ {
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
}; };
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
uint2 result;
if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#define G(a,b,c,d,e) \ #define G(a,b,c,d,e) \
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
v[d] = ROTR( v[d] ^ v[a],32); \ v[d] = ROTR( v[d] ^ v[a],32); \
@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
v[b] = ROTR( v[b] ^ v[c],11); v[b] = ROTR( v[b] ^ v[c],11);
__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits ) __device__ static
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
{ {
uint64_t v[16], m[16], i; uint64_t v[16], m[16], i;
#pragma unroll 16 #pragma unroll 16
for( i = 0; i < 16; ++i ) for( i = 0; i < 16; ++i ) {
{ m[i] = cuda_swab64(block[i]);
m[i] = SWAP64(block[i]);
} }
#pragma unroll 8 #pragma unroll 8
@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
} }
// Endian Drehung für 32 Bit Typen __device__ __constant__
static const uint64_t d_constMem[8] = {
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
/*
// Endian Drehung für 64 Bit Typen
static __device__ uint64_t cuda_swab64(uint64_t x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
}
*/
static __constant__ uint64_t d_constMem[8];
static const uint64_t h_constMem[8] = {
0x6a09e667f3bcc908ULL, 0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0x3c6ef372fe94f82bULL,
@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
0x5be0cd19137e2179ULL }; 0x5be0cd19137e2179ULL };
// Hash-Padding // Hash-Padding
static __constant__ uint64_t d_constHashPadding[8]; __device__ __constant__
static const uint64_t h_constHashPadding[8] = { static const uint64_t d_constHashPadding[8] = {
0x0000000000000080ull, 0x0000000000000080ull,
0, 0,
0, 0,
@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
0, 0,
0x0002000000000000ull }; 0x0002000000000000ull };
__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) __global__ __launch_bounds__(256, 4)
void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
if (thread < threads) if (thread < threads)
#endif #endif
{ {
uint8_t i;
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; int hashPosition = nounce - startNounce;
//uint64_t *inpHash = &g_hash[8 * hashPosition]; uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
uint64_t *inpHash = &g_hash[hashPosition<<3];
// 128 Byte für die Message
uint64_t buf[16];
// State vorbereiten // State vorbereiten
uint64_t h[8]; uint64_t h[8];
/* #pragma unroll 8
h[0] = 0x6a09e667f3bcc908ULL; for (i=0;i<8;i++)
h[1] = 0xbb67ae8584caa73bULL;
h[2] = 0x3c6ef372fe94f82bULL;
h[3] = 0xa54ff53a5f1d36f1ULL;
h[4] = 0x510e527fade682d1ULL;
h[5] = 0x9b05688c2b3e6c1fULL;
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
h[i] = d_constMem[i]; h[i] = d_constMem[i];
// 128 Byte für die Message
uint64_t buf[16];
// Message für die erste Runde in Register holen // Message für die erste Runde in Register holen
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 8; ++i) buf[i] = inpHash[i]; for (i=0; i < 8; ++i)
buf[i] = inpHash[i];
/*
buf[ 8] = 0x0000000000000080ull; #pragma unroll 8
buf[ 9] = 0; for (i=0; i < 8; i++)
buf[10] = 0;
buf[11] = 0;
buf[12] = 0;
buf[13] = 0x0100000000000000ull;
buf[14] = 0;
buf[15] = 0x0002000000000000ull;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
buf[i+8] = d_constHashPadding[i]; buf[i+8] = d_constHashPadding[i];
// die einzige Hashing-Runde // die einzige Hashing-Runde
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 ); quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
// Hash rauslassen
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition]; uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 8; ++i) { for (i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
} }
#else #else
// in dieser Version passieren auch ein paar 64 Bit Shifts // in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = &g_hash[8 * hashPosition]; uint64_t *outHash = &g_hash[8 * hashPosition];
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 8; ++i) for (i=0; i < 8; ++i)
{ {
//outHash[i] = cuda_swab64( h[i] ); outHash[i] = cuda_swab64(h[i]);
outHash[i] = SWAP64(h[i]);
} }
#endif #endif
} }
@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
// State vorbereiten // State vorbereiten
uint64_t h[8]; uint64_t h[8];
/*
h[0] = 0x6a09e667f3bcc908ULL;
h[1] = 0xbb67ae8584caa73bULL;
h[2] = 0x3c6ef372fe94f82bULL;
h[3] = 0xa54ff53a5f1d36f1ULL;
h[4] = 0x510e527fade682d1ULL;
h[5] = 0x9b05688c2b3e6c1fULL;
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
h[i] = d_constMem[i];
// 128 Byte für die Message // 128 Byte für die Message
uint64_t buf[16]; uint64_t buf[16];
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
#pragma unroll 8
for(i=0;i<8;i++)
h[i] = d_constMem[i];
// Message für die erste Runde in Register holen // Message für die erste Runde in Register holen
#pragma unroll 16 #pragma unroll 16
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i]; for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
// die Nounce durch die thread-spezifische ersetzen // die Nounce durch die thread-spezifische ersetzen
buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce)); buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 8; ++i) { for (i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
} }
#else #else
// in dieser Version passieren auch ein paar 64 Bit Shifts // in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 8; ++i) for (i=0; i < 8; ++i) {
{ outHash[i] = cuda_swab64( h[i] );
//outHash[i] = cuda_swab64( h[i] );
outHash[i] = SWAP64(h[i]);
} }
#endif #endif
} }
@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
host_sigma, host_sigma,
sizeof(host_sigma), sizeof(host_sigma),
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_u512,
host_u512,
sizeof(host_u512),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_constMem,
h_constMem,
sizeof(h_constMem),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_constHashPadding,
h_constHashPadding,
sizeof(h_constHashPadding),
0, cudaMemcpyHostToDevice);
} }
// Blake512 für 80 Byte grosse Eingangsdaten // Blake512 für 80 Byte grosse Eingangsdaten

9
quark/cuda_quark_compactionTest.cu

@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include <stdint.h>
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];

15
quark/cuda_quark_groestl512.cu

@ -1,23 +1,16 @@
// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice // Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h"
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8]; static cudaDeviceProp props[8];
@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
#include "groestl_functions_quad.cu" #include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu" #include "bitslice_transformations_quad.cu"
__global__ void __launch_bounds__(256, 4) __global__ __launch_bounds__(256, 4)
quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{ {
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;

21
quark/cuda_quark_keccak512.cu

@ -1,27 +1,19 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#include "cuda_helper.h"
#define U32TO64_LE(p) \ #define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
#define U64TO32_LE(p, v) \ #define U64TO32_LE(p, v) \
*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
static const uint64_t host_keccak_round_constants[24] = { __device__ __constant__
static const uint64_t c_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull, 0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull, 0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull, 0x000000000000808bull, 0x0000000080000001ull,
@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
0x0000000080000001ull, 0x8000000080008008ull 0x0000000080000001ull, 0x8000000080008008ull
}; };
__constant__ uint64_t c_keccak_round_constants[24];
static __device__ __forceinline__ void static __device__ __forceinline__ void
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
size_t i; size_t i;
@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_keccak512_cpu_init(int thr_id, int threads) __host__ void quark_keccak512_cpu_init(int thr_id, int threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice);
} }
__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

27
quark/cuda_skein512.cu

@ -1,16 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <stdint.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen #include "cuda_helper.h"
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// aus cpu-miner.c // aus cpu-miner.c
extern "C" extern int device_map[8]; extern "C" extern int device_map[8];
@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// Take a look at: https://www.schneier.com/skein1.3.pdf // Take a look at: https://www.schneier.com/skein1.3.pdf
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define SHL(x, n) ((x) << (n)) #define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n)) #define SHR(x, n) ((x) >> (n))

20
quark/quarkcoin.cu

@ -1,4 +1,3 @@
extern "C" extern "C"
{ {
#include "sph/sph_blake.h" #include "sph/sph_blake.h"
@ -8,9 +7,9 @@ extern "C"
#include "sph/sph_jh.h" #include "sph/sph_jh.h"
#include "sph/sph_keccak.h" #include "sph/sph_keccak.h"
#include "miner.h" #include "miner.h"
}
#include <stdint.h> #include "cuda_helper.h"
}
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
// Konstanten kopieren, Speicher belegen // Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput); quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput); quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata); quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
int order = 0; int order = 0;
@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
uint32_t vhash64[8]; uint32_t vhash64[8];

4
util.c

@ -1352,6 +1352,10 @@ void print_hash_tests(void)
myriadhash(&hash[0], &buf[0]); myriadhash(&hash[0], &buf[0]);
printf("\nmyriad: "); print_hash(hash); printf("\nmyriad: "); print_hash(hash);
memset(hash, 0, sizeof hash);
nist5hash(&hash[0], &buf[0]);
printf("\nnist5: "); print_hash(hash);
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
quarkhash(&hash[0], &buf[0]); quarkhash(&hash[0], &buf[0]);
printf("\nquark: "); print_hash(hash); printf("\nquark: "); print_hash(hash);

19
x11/cuda_x11_cubehash512.cu

@ -1,30 +1,13 @@
#include <cuda_runtime.h> #include "cuda_helper.h"
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence; typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#if 0
__device__ static uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
#endif
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
typedef unsigned int uint32_t; /* must be exactly 32 bits */
#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25)) #define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21)) #define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } #define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

28
x11/cuda_x11_echo.cu

@ -1,33 +1,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <stdint.h>
#include <memory.h> #include <memory.h>
// das Hi Word aus einem 64 Bit Typen extrahieren #include "cuda_helper.h"
#if 0
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#endif
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

20
x11/cuda_x11_luffa512.cu

@ -18,28 +18,18 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/ */
#include <cuda_runtime.h> #include "cuda_helper.h"
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence; typedef unsigned char BitSequence;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef struct { typedef struct {
uint32_t buffer[8]; /* Buffer to be hashed */ uint32_t buffer[8]; /* Buffer to be hashed */
uint32_t chainv[40]; /* Chaining values */ uint32_t chainv[40]; /* Chaining values */
} hashState; } hashState;
__device__ __forceinline__
static uint32_t BYTES_SWAP32(uint32_t x)
{
return __byte_perm(x, x, 0x0123);
}
#define MULT2(a,j)\ #define MULT2(a,j)\
tmp = a[7+(8*j)];\ tmp = a[7+(8*j)];\
a[7+(8*j)] = a[6+(8*j)];\ a[7+(8*j)] = a[6+(8*j)];\
@ -289,11 +279,11 @@ __device__ __forceinline__
void Update512(hashState *state, const BitSequence *data) void Update512(hashState *state, const BitSequence *data)
{ {
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
rnd512(state); rnd512(state);
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
rnd512(state); rnd512(state);
} }
@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
b[i] ^= state->chainv[i+8*j]; b[i] ^= state->chainv[i+8*j];
} }
b[i] = BYTES_SWAP32((b[i])); b[i] = cuda_swab32((b[i]));
} }
#pragma unroll 8 #pragma unroll 8
@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
b[8+i] ^= state->chainv[i+8*j]; b[8+i] ^= state->chainv[i+8*j];
} }
b[8+i] = BYTES_SWAP32((b[8+i])); b[8 + i] = cuda_swab32((b[8 + i]));
} }
} }

29
x11/cuda_x11_shavite512.cu

@ -1,18 +1,13 @@
#include <stdint.h> #include "cuda_helper.h"
#include <cuda_runtime.h>
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence; //typedef unsigned char BitSequence;
typedef unsigned long long DataLength; //typedef unsigned long long DataLength;
#define SPH_C64(x) ((uint64_t)(x ## ULL)) __device__ __constant__
#define SPH_C32(x) ((uint32_t)(x ## U)) static const uint32_t d_ShaviteInitVector[16] = {
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
static __constant__ uint32_t d_ShaviteInitVector[16];
static const uint32_t h_ShaviteInitVector[] = {
SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
@ -1315,7 +1310,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
// kopiere init-state // kopiere init-state
uint32_t state[16]; uint32_t state[16];
#pragma unroll 16 #pragma unroll 16
for(int i=0;i<16;i++) for(int i=0;i<16;i++)
state[i] = d_ShaviteInitVector[i]; state[i] = d_ShaviteInitVector[i];
@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
uint32_t msg[32]; uint32_t msg[32];
// fülle die Nachricht mit 64-byte (vorheriger Hash) // fülle die Nachricht mit 64-byte (vorheriger Hash)
#pragma unroll 16 #pragma unroll 16
for(int i=0;i<16;i++) for(int i=0;i<16;i++)
msg[i] = Hash[i]; msg[i] = Hash[i];
// Nachrichtenende // Nachrichtenende
msg[16] = 0x80; msg[16] = 0x80;
#pragma unroll 10 #pragma unroll 10
for(int i=17;i<27;i++) for(int i=17;i<27;i++)
msg[i] = 0; msg[i] = 0;
@ -1341,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
c512(sharedMemory, state, msg); c512(sharedMemory, state, msg);
#pragma unroll 16 #pragma unroll 16
for(int i=0;i<16;i++) for(int i=0;i<16;i++)
Hash[i] = state[i]; Hash[i] = state[i];
} }
@ -1352,11 +1347,6 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
__host__ void x11_shavite512_cpu_init(int thr_id, int threads) __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
{ {
aes_cpu_init(); aes_cpu_init();
cudaMemcpyToSymbol( d_ShaviteInitVector,
h_ShaviteInitVector,
sizeof(h_ShaviteInitVector),
0, cudaMemcpyHostToDevice);
} }
__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
@ -1373,4 +1363,3 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

22
x11/cuda_x11_simd512.cu

@ -7,29 +7,17 @@
#define TPB 256 #define TPB 256
#include "cuda_helper.h"
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
int *d_state[8]; int *d_state[8];
uint4 *d_temp4[8]; uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel // texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128; texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
#define C32(x) ((uint32_t)(x ## U))
#define T32(x) ((x) & C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __constant__ __device__ __constant__
const uint32_t c_IV_512[32] = { const uint32_t c_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY #undef BUTTERFLY
} }
#if __CUDA_ARCH__ < 300 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/** /**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane. * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned. * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \
__device__ __forceinline__ void FFT_16(int *y) { __device__ __forceinline__ void FFT_16(int *y) {
#if __CUDA_ARCH__ < 300 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32 #ifndef WIN32
# warning FFT_16() function is not compatible with SM 2.1 devices! # warning FFT_16() function is not compatible with SM 2.1 devices!
#endif #endif
@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{ {
int i; int i;
#if __CUDA_ARCH__ < 300 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32 #ifndef WIN32
# warning Expansion() function is not compatible with SM 2.1 devices # warning Expansion() function is not compatible with SM 2.1 devices
#endif #endif

19
x11/x11.cu

@ -15,10 +15,11 @@ extern "C"
#include "sph/sph_echo.h" #include "sph/sph_echo.h"
#include "miner.h" #include "miner.h"
} #include "cuda_helper.h"
#include <stdint.h> #include <stdio.h>
#include <cuda_helper.h> #include <memory.h>
}
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, int threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_shavite512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata); quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
uint32_t foundNonce; uint32_t foundNonce;
@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
uint32_t vhash64[8]; uint32_t vhash64[8];

55
x13/cuda_x13_fugue512.cu

@ -5,26 +5,11 @@
* heavily based on phm's sgminer * heavily based on phm's sgminer
* *
*/ */
#include <cuda.h> #include "cuda_helper.h"
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdint.h>
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5, 5.0)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
/* /*
* X13 kernel implementation. * X13 kernel implementation.
* *
@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
* @author phm <phm@inbox.com> * @author phm <phm@inbox.com>
*/ */
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) #define mixtab0(x) (*((uint32_t*)mixtabs + ( (x))))
#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) #define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@ -595,7 +578,7 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
#pragma unroll 16 #pragma unroll 16
for( i = 0; i < 16; i++ ) for( i = 0; i < 16; i++ )
Hash[i] = SWAB32(Hash[i]); Hash[i] = cuda_swab32(Hash[i]);
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
@ -658,22 +641,22 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
S18 ^= S00; S18 ^= S00;
S27 ^= S00; S27 ^= S00;
Hash[0] = SWAB32(S01); Hash[0] = cuda_swab32(S01);
Hash[1] = SWAB32(S02); Hash[1] = cuda_swab32(S02);
Hash[2] = SWAB32(S03); Hash[2] = cuda_swab32(S03);
Hash[3] = SWAB32(S04); Hash[3] = cuda_swab32(S04);
Hash[4] = SWAB32(S09); Hash[4] = cuda_swab32(S09);
Hash[5] = SWAB32(S10); Hash[5] = cuda_swab32(S10);
Hash[6] = SWAB32(S11); Hash[6] = cuda_swab32(S11);
Hash[7] = SWAB32(S12); Hash[7] = cuda_swab32(S12);
Hash[8] = SWAB32(S18); Hash[8] = cuda_swab32(S18);
Hash[9] = SWAB32(S19); Hash[9] = cuda_swab32(S19);
Hash[10] = SWAB32(S20); Hash[10] = cuda_swab32(S20);
Hash[11] = SWAB32(S21); Hash[11] = cuda_swab32(S21);
Hash[12] = SWAB32(S27); Hash[12] = cuda_swab32(S27);
Hash[13] = SWAB32(S28); Hash[13] = cuda_swab32(S28);
Hash[14] = SWAB32(S29); Hash[14] = cuda_swab32(S29);
Hash[15] = SWAB32(S30); Hash[15] = cuda_swab32(S30);
} }
} }
@ -706,7 +689,7 @@ __host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 4 * 256 * sizeof(uint32_t); size_t shared_size = 4 * 256 * sizeof(uint32_t);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);

24
x13/cuda_x13_hamsi512.cu

@ -37,26 +37,11 @@
* @author phm <phm@inbox.com> * @author phm <phm@inbox.com>
*/ */
#include <stdint.h> #include "cuda_helper.h"
#include <cuda_runtime.h>
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __constant__ __device__ __constant__
static const uint32_t d_alpha_n[] = { static const uint32_t d_alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
@ -716,11 +701,13 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
P_BIG; P_BIG;
T_BIG; T_BIG;
} }
#undef buf #undef buf
#define buf(u) (u == 0 ? 0x80 : 0) #define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG; INPUT_BIG;
P_BIG; P_BIG;
T_BIG; T_BIG;
#undef buf #undef buf
#define buf(u) (u == 6 ? 2 : 0) #define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG; INPUT_BIG;
@ -729,7 +716,7 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
#pragma unroll 16 #pragma unroll 16
for (int i = 0; i < 16; i++) for (int i = 0; i < 16; i++)
Hash[i] = SWAB32(h[i]); Hash[i] = cuda_swab32(h[i]);
} }
} }
@ -749,9 +736,8 @@ __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

17
x13/x13.cu

@ -20,10 +20,9 @@ extern "C"
#include "sph/sph_fugue.h" #include "sph/sph_fugue.h"
#include "miner.h" #include "miner.h"
}
#include <stdint.h> #include "cuda_helper.h"
#include <cuda_helper.h> }
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x13_fugue512_cpu_init(int thr_id, int threads); extern void x13_fugue512_cpu_init(int thr_id, int threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x11_echo512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput); x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata); quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
uint32_t foundNonce; uint32_t foundNonce;
@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
uint32_t vhash64[8]; uint32_t vhash64[8];

18
x15/cuda_x14_shabal512.cu

@ -1,26 +1,10 @@
/* /*
* Shabal-512 for X14/X15 (STUB) * Shabal-512 for X14/X15 (STUB)
*/ */
#include <stdint.h> #include "cuda_helper.h"
#include <cuda_runtime.h>
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
/* /*
* Shabal implementation. * Shabal implementation.

6
x15/cuda_x15_whirlpool.cu

@ -4,8 +4,8 @@
* tpruvot@github * tpruvot@github
*/ */
#include <stdio.h> #include <stdio.h>
#include <stdint.h>
#include <cuda_helper.h> #include "cuda_helper.h"
#define NULLTEST 0 #define NULLTEST 0
@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
#define SPH_64 (1) #define SPH_64 (1)
#define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1) #define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// defined in cuda_helper.h // defined in cuda_helper.h
#define SPH_ROTL64(x,n) ROTL64(x,n) #define SPH_ROTL64(x,n) ROTL64(x,n)

17
x15/x14.cu

@ -22,10 +22,9 @@ extern "C" {
#include "sph/sph_shabal.h" #include "sph/sph_shabal.h"
#include "miner.h" #include "miner.h"
}
#include <stdint.h> #include "cuda_helper.h"
#include <cuda_helper.h> }
// from cpu-miner.c // from cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x14_shabal512_cpu_init(int thr_id, int threads); extern void x14_shabal512_cpu_init(int thr_id, int threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput); x14_shabal512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata); quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
int order = 0; int order = 0;
@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {
/* check now with the CPU to confirm */ /* check now with the CPU to confirm */

17
x15/x15.cu

@ -23,10 +23,9 @@ extern "C" {
#include "sph/sph_whirlpool.h" #include "sph/sph_whirlpool.h"
#include "miner.h" #include "miner.h"
}
#include <stdint.h> #include "cuda_helper.h"
#include <cuda_helper.h> }
// to test gpu hash on a null buffer // to test gpu hash on a null buffer
#define NULLTEST 0 #define NULLTEST 0
@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
extern void x15_whirlpool_cpu_init(int thr_id, int threads); extern void x15_whirlpool_cpu_init(int thr_id, int threads);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
x14_shabal512_cpu_init(thr_id, throughput); x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput); x15_whirlpool_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata); quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
int order = 0; int order = 0;
@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
print_hash((unsigned char*)buf); printf("\n"); print_hash((unsigned char*)buf); printf("\n");
#endif #endif
/* Scan with GPU */ /* Scan with GPU */
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {

Loading…
Cancel
Save