Browse Source

Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof
2upstream
Tanguy Pruvot 10 years ago
parent
commit
d9ea5f72ce
  1. 11
      JHA/cuda_jha_compactionTest.cu
  2. 33
      JHA/cuda_jha_keccak512.cu
  3. 19
      JHA/jackpotcoin.cu
  4. 3
      bitslice_transformations_quad.cu
  5. 9
      ccminer.vcxproj
  6. 6
      ccminer.vcxproj.filters
  7. 15
      cuda_fugue256.cu
  8. 18
      cuda_groestlcoin.cu
  9. 110
      cuda_helper.h
  10. 11
      cuda_myriadgroestl.cu
  11. 28
      cuda_nist5.cu
  12. 1
      groestl_functions_quad.cu
  13. 28
      heavy/cuda_blake512.cu
  14. 7
      heavy/cuda_combine.cu
  15. 10
      heavy/cuda_groestl512.cu
  16. 12
      heavy/cuda_hefty1.cu
  17. 13
      heavy/cuda_keccak512.cu
  18. 7
      heavy/cuda_sha256.cu
  19. 6
      heavy/heavy.cu
  20. 1
      miner.h
  21. 19
      quark/animecoin.cu
  22. 151
      quark/cuda_bmw512.cu
  23. 13
      quark/cuda_checkhash.cu
  24. 2
      quark/cuda_jh512.cu
  25. 230
      quark/cuda_quark_blake512.cu
  26. 9
      quark/cuda_quark_compactionTest.cu
  27. 15
      quark/cuda_quark_groestl512.cu
  28. 21
      quark/cuda_quark_keccak512.cu
  29. 27
      quark/cuda_skein512.cu
  30. 20
      quark/quarkcoin.cu
  31. 4
      util.c
  32. 19
      x11/cuda_x11_cubehash512.cu
  33. 28
      x11/cuda_x11_echo.cu
  34. 20
      x11/cuda_x11_luffa512.cu
  35. 29
      x11/cuda_x11_shavite512.cu
  36. 22
      x11/cuda_x11_simd512.cu
  37. 19
      x11/x11.cu
  38. 55
      x13/cuda_x13_fugue512.cu
  39. 24
      x13/cuda_x13_hamsi512.cu
  40. 17
      x13/x13.cu
  41. 18
      x15/cuda_x14_shabal512.cu
  42. 6
      x15/cuda_x15_whirlpool.cu
  43. 17
      x15/x14.cu
  44. 17
      x15/x15.cu

11
JHA/cuda_jha_compactionTest.cu

@ -1,11 +1,8 @@ @@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"
#include <stdio.h>
#include <memory.h>
#include <stdint.h>
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];
@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) @@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
}
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/

33
JHA/cuda_jha_keccak512.cu

@ -1,16 +1,7 @@ @@ -1,16 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__constant__ uint64_t c_State[25];
__constant__ uint32_t c_PaddedMessage[18];
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

19
JHA/jackpotcoin.cu

@ -1,4 +1,3 @@ @@ -1,4 +1,3 @@
extern "C"
{
#include "sph/sph_keccak.h"
@ -7,10 +6,9 @@ extern "C" @@ -7,10 +6,9 @@ extern "C"
#include "sph/sph_jh.h"
#include "sph/sph_skein.h"
#include "miner.h"
#include "cuda_helper.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc @@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
quark_groestl512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
}
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
unsigned int rounds;
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
// diese jackpothash Funktion gibt die Zahl der Runden zurück
unsigned int rounds = jackpothash(vhash64, endiandata);
rounds = jackpothash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {

3
bitslice_transformations_quad.cu

@ -1,5 +1,4 @@ @@ -1,5 +1,4 @@
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.

9
ccminer.vcxproj

@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command> @@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<Defines></Defines>
<Defines>
</Defines>
</CudaCompile>
<CudaLink>
<GPUDebugInfo>false</GPUDebugInfo>
@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command> @@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<ClInclude Include="uint256.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="bitslice_transformations_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command> @@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="heavy\cuda_blake512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>

6
ccminer.vcxproj.filters

@ -391,5 +391,11 @@ @@ -391,5 +391,11 @@
<CudaCompile Include="x15\cuda_x15_whirlpool.cu">
<Filter>Source Files\CUDA\x15</Filter>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="bitslice_transformations_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup>
</Project>

15
cuda_fugue256.cu

@ -1,12 +1,11 @@ @@ -1,12 +1,11 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "sph/sph_fugue.h"
#include "cuda_helper.h"
#include <host_defines.h>
#define USE_SHARED 1
// aus cpu-miner.c
@ -15,14 +14,6 @@ extern int device_map[8]; @@ -15,14 +14,6 @@ extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// schon in sph_fugue.h definiert
//#define SPH_C32(x) ((uint32_t)(x ## U))
uint32_t *d_fugue256_hashoutput[8];
uint32_t *d_resultNonce[8];

18
cuda_groestlcoin.cu

@ -1,23 +1,17 @@ @@ -1,23 +1,17 @@
// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include <host_defines.h>
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32]; @@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
#define SWAB32(x) cuda_swab32(x)
__global__ void __launch_bounds__(256, 4)
groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
__global__ __launch_bounds__(256, 4)
void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;

110
cuda_helper.h

@ -1,33 +1,78 @@ @@ -1,33 +1,78 @@
#ifndef CUDA_HELPER_H
#define CUDA_HELPER_H
#include <cuda.h>
#include <cuda_runtime.h>
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
#if defined(_MSC_VER)
/* reduce warnings */
#include <device_functions.h>
#include <device_launch_parameters.h>
#endif
#include <stdint.h>
extern __device__ __device_builtin__ void __syncthreads(void);
#ifndef __CUDA_ARCH__
// define blockDim and threadIdx for host
extern const dim3 blockDim;
extern const uint3 threadIdx;
#endif
#ifndef SPH_C32
#define SPH_C32(x) ((uint32_t)(x ## U))
#endif
#ifndef SPH_C64
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#endif
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
return (unsigned long long)LO | (((unsigned long long)HI) << 32);
// Kepler (Compute 3.5, 5.0)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
}
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const uint64_t &x) {
__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
return __double_as_longlong(__hiloint2double(HI, LO));
#else
return (uint32_t)(x >> 32);
return (unsigned long long)LO | (((unsigned long long)HI) << 32);
#endif
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
}
// das Lo Word in einem 64 Bit Typen ersetzen
__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
// Endian Drehung für 32 Bit Typen
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
/* device */
return __byte_perm(x, x, 0x0123);
}
#else
/* host */
#define cuda_swab32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
#endif
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
@ -35,25 +80,42 @@ static __device__ uint32_t LOWORD(const uint64_t &x) { @@ -35,25 +80,42 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
#endif
}
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
// das Hi Word aus einem 64 Bit Typen extrahieren
__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// Endian Drehung für 32 Bit Typen
static __device__ uint32_t cuda_swab32(uint32_t x)
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
{
return __byte_perm(x, x, 0x0123);
}
// Input: 77665544 33221100
// Output: 00112233 44556677
uint64_t temp[2];
temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
// Endian Drehung für 64 Bit Typen
static __device__ uint64_t cuda_swab64(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
return temp[0] | (temp[1]<<32);
}
#else
/* host */
#define cuda_swab64(x) \
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
#endif
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
uint2 result;
if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
@ -70,7 +132,7 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse @@ -70,7 +132,7 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));

11
cuda_myriadgroestl.cu

@ -1,23 +1,16 @@ @@ -1,23 +1,16 @@
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];

28
cuda_nist5.cu

@ -1,4 +1,3 @@ @@ -1,4 +1,3 @@
extern "C"
{
#include "sph/sph_blake.h"
@ -7,10 +6,9 @@ extern "C" @@ -7,10 +6,9 @@ extern "C"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
#include "cuda_helper.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN @@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// Original nist5hash Funktion aus einem miner Quelltext
inline void nist5hash(void *state, const void *input)
extern "C" void nist5hash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
sph_groestl512_context ctx_groestl;
@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, @@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, @@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
// erstes Blake512 Hash mit CUDA
// Hash with CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

1
groestl_functions_quad.cu

@ -1,3 +1,4 @@ @@ -1,3 +1,4 @@
#include "cuda_helper.h"
__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
{

28
heavy/cuda_blake512.cu

@ -1,14 +1,7 @@ @@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];
@ -20,7 +13,6 @@ uint32_t *d_hash5output[8]; @@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
#include "cuda_helper.h"
// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] = @@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
#define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
#define SWAP64(x) \
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
/* in cuda_helper */
#define SWAP32(x) cuda_swab32(x)
#define SWAP64(x) cuda_swab64(x)
__constant__ uint64_t c_SecondRound[15];

7
heavy/cuda_combine.cu

@ -1,9 +1,4 @@ @@ -1,9 +1,4 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
#include "cuda_helper.h"
// globaler Speicher für unsere Ergebnisse
uint32_t *d_hashoutput[8];

10
heavy/cuda_groestl512.cu

@ -1,14 +1,7 @@ @@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];
@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len) @@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
cudaMemcpyToSymbol( groestl_gpu_msg,
msgBlock,
128);
BLOCKSIZE = len;
}

12
heavy/cuda_hefty1.cu

@ -1,10 +1,9 @@ @@ -1,10 +1,9 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include <device_functions.h>
#define USE_SHARED 1
// aus cpu-miner.c
@ -13,11 +12,6 @@ extern int device_map[8]; @@ -13,11 +12,6 @@ extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];

13
heavy/cuda_keccak512.cu

@ -1,14 +1,7 @@ @@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];
@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const @@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
uint64_t t[5], u[5], v, w;
/* absorb input */
#pragma unroll 9
for (i = 0; i < 72 / 8; i++, in += 2)
#pragma unroll 9
for (i = 0; i < 9 /* 72/8 */; i++, in += 2)
s[i] ^= U32TO64_LE(in);
for (i = 0; i < 24; i++) {

7
heavy/cuda_sha256.cu

@ -1,12 +1,7 @@ @@ -1,12 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];

6
heavy/heavy.cu

@ -1,7 +1,3 @@ @@ -1,7 +1,3 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include <string.h>
@ -34,6 +30,8 @@ @@ -34,6 +30,8 @@
#include "heavy/cuda_blake512.h"
#include "heavy/cuda_combine.h"
#include "cuda_helper.h"
extern uint32_t *d_hash2output[8];
extern uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8];

1
miner.h

@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len); @@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void groestlhash(void *state, const void *input);
void myriadhash(void *state, const void *input);
void nist5hash(void *state, const void *input);
void quarkhash(void *state, const void *input);
void x11hash(void *output, const void *input);
void x13hash(void *output, const void *input);

19
quark/animecoin.cu

@ -1,4 +1,3 @@ @@ -1,4 +1,3 @@
extern "C"
{
#include "sph/sph_blake.h"
@ -8,10 +7,9 @@ extern "C" @@ -8,10 +7,9 @@ extern "C"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
#include "cuda_helper.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
@ -45,9 +43,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN @@ -45,9 +43,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -189,18 +187,21 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, @@ -189,18 +187,21 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true;
}
@ -209,7 +210,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, @@ -209,7 +210,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_bmw512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -265,7 +266,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, @@ -265,7 +266,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

151
quark/cuda_bmw512.cu

@ -1,140 +1,9 @@ @@ -1,140 +1,9 @@
#if 1
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
// Endian Drehung für 32 Bit Typen
/*
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
}
*/
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
}
#if 0
// Endian Drehung für 64 Bit Typen
static __device__ unsigned long long cuda_swab64(unsigned long long x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
}
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
#else
return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
#endif
}
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
}
#endif
// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen...
#if 1
typedef unsigned long long uint64_t;
#else
typedef class uint64
{
public:
__device__ uint64()
{
}
__device__ uint64(unsigned long long init)
{
val = make_uint2( LOWORD(init), HIWORD(init) );
}
__device__ uint64(uint32_t lo, uint32_t hi)
{
val = make_uint2( lo, hi );
}
__device__ const uint64 operator^(uint64 const& rhs) const
{
return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
}
__device__ const uint64 operator|(uint64 const& rhs) const
{
return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
}
__device__ const uint64 operator+(unsigned long long const& rhs) const
{
return *this+uint64(rhs);
}
__device__ const uint64 operator+(uint64 const& rhs) const
{
uint64 res;
asm ("add.cc.u32 %0, %2, %4;\n\t"
"addc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator-(uint64 const& rhs) const
{
uint64 res;
asm ("sub.cc.u32 %0, %2, %4;\n\t"
"subc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator<<(int n) const
{
return uint64(unsigned long long(*this)<<n);
}
__device__ const uint64 operator>>(int n) const
{
return uint64(unsigned long long(*this)>>n);
}
__device__ operator unsigned long long() const
{
return MAKE_ULONGLONG(val.x, val.y);
}
uint2 val;
} uint64_t;
#endif
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))

13
quark/cuda_checkhash.cu

@ -1,11 +1,8 @@ @@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_helper.h"
// Hash Target gegen das wir testen sollen
__constant__ uint32_t pTarget[8];
@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32 @@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
}
// Setup-Funktionen
__host__ void quark_check_cpu_init(int thr_id, int threads)
__host__ void cuda_check_cpu_init(int thr_id, int threads)
{
cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
}
// Target Difficulty setzen
__host__ void quark_check_cpu_setTarget(const void *ptarget)
__host__ void cuda_check_cpu_setTarget(const void *ptarget)
{
// die Message zur Berechnung auf der GPU
cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}
__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));

2
quark/cuda_jh512.cu

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
#include <stdint.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

230
quark/cuda_quark_blake512.cu

@ -1,16 +1,11 @@ @@ -1,16 +1,11 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#define USE_SHUFFLE 0
#include "cuda_helper.h"
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define ROTR(x,n) ROTR64(x,n)
#define USE_SHUFFLE 0
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] = @@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#if 0
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
#endif
__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
{
// Input: 77665544 33221100
// Output: 00112233 44556677
uint64_t temp[2];
temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
return temp[0] | (temp[1]<<32);
}
__constant__ uint64_t c_u512[16];
const uint64_t host_u512[16] =
__device__ __constant__
const uint64_t c_u512[16] =
{
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
@ -96,24 +50,6 @@ const uint64_t host_u512[16] = @@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
uint2 result;
if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#define G(a,b,c,d,e) \
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
v[d] = ROTR( v[d] ^ v[a],32); \
@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) @@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
v[b] = ROTR( v[b] ^ v[c],11);
__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
__device__ static
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
{
uint64_t v[16], m[16], i;
#pragma unroll 16
for( i = 0; i < 16; ++i )
{
m[i] = SWAP64(block[i]);
for( i = 0; i < 16; ++i ) {
m[i] = cuda_swab64(block[i]);
}
#pragma unroll 8
@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con @@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
}
// Endian Drehung für 32 Bit Typen
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
/*
// Endian Drehung für 64 Bit Typen
static __device__ uint64_t cuda_swab64(uint64_t x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
}
*/
static __constant__ uint64_t d_constMem[8];
static const uint64_t h_constMem[8] = {
__device__ __constant__
static const uint64_t d_constMem[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = { @@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
0x5be0cd19137e2179ULL };
// Hash-Padding
static __constant__ uint64_t d_constHashPadding[8];
static const uint64_t h_constHashPadding[8] = {
__device__ __constant__
static const uint64_t d_constHashPadding[8] = {
0x0000000000000080ull,
0,
0,
@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = { @@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
0,
0x0002000000000000ull };
__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
__global__ __launch_bounds__(256, 4)
void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads @@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
if (thread < threads)
#endif
{
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
//uint64_t *inpHash = &g_hash[8 * hashPosition];
uint64_t *inpHash = &g_hash[hashPosition<<3];
uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
// 128 Byte für die Message
uint64_t buf[16];
// State vorbereiten
uint64_t h[8];
/*
h[0] = 0x6a09e667f3bcc908ULL;
h[1] = 0xbb67ae8584caa73bULL;
h[2] = 0x3c6ef372fe94f82bULL;
h[3] = 0xa54ff53a5f1d36f1ULL;
h[4] = 0x510e527fade682d1ULL;
h[5] = 0x9b05688c2b3e6c1fULL;
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
#pragma unroll 8
for (i=0;i<8;i++)
h[i] = d_constMem[i];
// 128 Byte für die Message
uint64_t buf[16];
// Message für die erste Runde in Register holen
#pragma unroll 8
for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
/*
buf[ 8] = 0x0000000000000080ull;
buf[ 9] = 0;
buf[10] = 0;
buf[11] = 0;
buf[12] = 0;
buf[13] = 0x0100000000000000ull;
buf[14] = 0;
buf[15] = 0x0002000000000000ull;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
#pragma unroll 8
for (i=0; i < 8; ++i)
buf[i] = inpHash[i];
#pragma unroll 8
for (i=0; i < 8; i++)
buf[i+8] = d_constHashPadding[i];
// die einzige Hashing-Runde
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
// Hash rauslassen
#if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 8
for (int i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
#pragma unroll 8
for (i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = &g_hash[8 * hashPosition];
#pragma unroll 8
for (int i=0; i < 8; ++i)
#pragma unroll 8
for (i=0; i < 8; ++i)
{
//outHash[i] = cuda_swab64( h[i] );
outHash[i] = SWAP64(h[i]);
outHash[i] = cuda_swab64(h[i]);
}
#endif
}
@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo @@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
// State vorbereiten
uint64_t h[8];
/*
h[0] = 0x6a09e667f3bcc908ULL;
h[1] = 0xbb67ae8584caa73bULL;
h[2] = 0x3c6ef372fe94f82bULL;
h[3] = 0xa54ff53a5f1d36f1ULL;
h[4] = 0x510e527fade682d1ULL;
h[5] = 0x9b05688c2b3e6c1fULL;
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
h[i] = d_constMem[i];
// 128 Byte für die Message
uint64_t buf[16];
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
#pragma unroll 8
for(i=0;i<8;i++)
h[i] = d_constMem[i];
// Message für die erste Runde in Register holen
#pragma unroll 16
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
#pragma unroll 16
for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
// die Nounce durch die thread-spezifische ersetzen
buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo @@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
#if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 8
for (int i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
#pragma unroll 8
for (i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
#pragma unroll 8
for (int i=0; i < 8; ++i)
{
//outHash[i] = cuda_swab64( h[i] );
outHash[i] = SWAP64(h[i]);
#pragma unroll 8
for (i=0; i < 8; ++i) {
outHash[i] = cuda_swab64( h[i] );
}
#endif
}
@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads) @@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
host_sigma,
sizeof(host_sigma),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_u512,
host_u512,
sizeof(host_u512),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_constMem,
h_constMem,
sizeof(h_constMem),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_constHashPadding,
h_constHashPadding,
sizeof(h_constHashPadding),
0, cudaMemcpyHostToDevice);
}
// Blake512 für 80 Byte grosse Eingangsdaten

9
quark/cuda_quark_compactionTest.cu

@ -1,11 +1,8 @@ @@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"
#include <stdio.h>
#include <memory.h>
#include <stdint.h>
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];

15
quark/cuda_quark_groestl512.cu

@ -1,23 +1,16 @@ @@ -1,23 +1,16 @@
// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
@ -25,8 +18,8 @@ static cudaDeviceProp props[8]; @@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
__global__ void __launch_bounds__(256, 4)
quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
__global__ __launch_bounds__(256, 4)
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;

21
quark/cuda_quark_keccak512.cu

@ -1,27 +1,19 @@ @@ -1,27 +1,19 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#include "cuda_helper.h"
#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
#define U64TO32_LE(p, v) \
*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
static const uint64_t host_keccak_round_constants[24] = {
__device__ __constant__
static const uint64_t c_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = { @@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
0x0000000080000001ull, 0x8000000080008008ull
};
__constant__ uint64_t c_keccak_round_constants[24];
static __device__ __forceinline__ void
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
size_t i;
@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u @@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
// Setup-Funktionen
__host__ void quark_keccak512_cpu_init(int thr_id, int threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice);
}
__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

27
quark/cuda_skein512.cu

@ -1,16 +1,8 @@ @@ -1,16 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#include "cuda_helper.h"
// aus cpu-miner.c
extern "C" extern int device_map[8];
@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// Take a look at: https://www.schneier.com/skein1.3.pdf
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))

20
quark/quarkcoin.cu

@ -1,4 +1,3 @@ @@ -1,4 +1,3 @@
extern "C"
{
#include "sph/sph_blake.h"
@ -8,9 +7,9 @@ extern "C" @@ -8,9 +7,9 @@ extern "C"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
}
#include <stdint.h>
#include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8];
@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN @@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, @@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true;
}
@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, @@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, @@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

4
util.c

@ -1352,6 +1352,10 @@ void print_hash_tests(void) @@ -1352,6 +1352,10 @@ void print_hash_tests(void)
myriadhash(&hash[0], &buf[0]);
printf("\nmyriad: "); print_hash(hash);
memset(hash, 0, sizeof hash);
nist5hash(&hash[0], &buf[0]);
printf("\nnist5: "); print_hash(hash);
memset(hash, 0, sizeof hash);
quarkhash(&hash[0], &buf[0]);
printf("\nquark: "); print_hash(hash);

19
x11/cuda_x11_cubehash512.cu

@ -1,30 +1,13 @@ @@ -1,30 +1,13 @@
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#if 0
__device__ static uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
#endif
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
typedef unsigned int uint32_t; /* must be exactly 32 bits */
#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

28
x11/cuda_x11_echo.cu

@ -1,33 +1,7 @@ @@ -1,33 +1,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
// das Hi Word aus einem 64 Bit Typen extrahieren
#if 0
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#endif
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

20
x11/cuda_x11_luffa512.cu

@ -18,28 +18,18 @@ @@ -18,28 +18,18 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef struct {
uint32_t buffer[8]; /* Buffer to be hashed */
uint32_t chainv[40]; /* Chaining values */
} hashState;
__device__ __forceinline__
static uint32_t BYTES_SWAP32(uint32_t x)
{
return __byte_perm(x, x, 0x0123);
}
#define MULT2(a,j)\
tmp = a[7+(8*j)];\
a[7+(8*j)] = a[6+(8*j)];\
@ -289,11 +279,11 @@ __device__ __forceinline__ @@ -289,11 +279,11 @@ __device__ __forceinline__
void Update512(hashState *state, const BitSequence *data)
{
#pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
rnd512(state);
#pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
rnd512(state);
}
@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b) @@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
for(j=0;j<5;j++) {
b[i] ^= state->chainv[i+8*j];
}
b[i] = BYTES_SWAP32((b[i]));
b[i] = cuda_swab32((b[i]));
}
#pragma unroll 8
@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b) @@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
for(j=0;j<5;j++) {
b[8+i] ^= state->chainv[i+8*j];
}
b[8+i] = BYTES_SWAP32((b[8+i]));
b[8 + i] = cuda_swab32((b[8 + i]));
}
}

29
x11/cuda_x11_shavite512.cu

@ -1,18 +1,13 @@ @@ -1,18 +1,13 @@
#include <stdint.h>
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
//typedef unsigned char BitSequence;
//typedef unsigned long long DataLength;
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
static __constant__ uint32_t d_ShaviteInitVector[16];
static const uint32_t h_ShaviteInitVector[] = {
__device__ __constant__
static const uint32_t d_ShaviteInitVector[16] = {
SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
@ -1315,7 +1310,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1315,7 +1310,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
// kopiere init-state
uint32_t state[16];
#pragma unroll 16
#pragma unroll 16
for(int i=0;i<16;i++)
state[i] = d_ShaviteInitVector[i];
@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
uint32_t msg[32];
// fülle die Nachricht mit 64-byte (vorheriger Hash)
#pragma unroll 16
#pragma unroll 16
for(int i=0;i<16;i++)
msg[i] = Hash[i];
// Nachrichtenende
msg[16] = 0x80;
#pragma unroll 10
#pragma unroll 10
for(int i=17;i<27;i++)
msg[i] = 0;
@ -1341,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1341,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
c512(sharedMemory, state, msg);
#pragma unroll 16
#pragma unroll 16
for(int i=0;i<16;i++)
Hash[i] = state[i];
}
@ -1352,11 +1347,6 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1352,11 +1347,6 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
__host__ void x11_shavite512_cpu_init(int thr_id, int threads)
{
aes_cpu_init();
cudaMemcpyToSymbol( d_ShaviteInitVector,
h_ShaviteInitVector,
sizeof(h_ShaviteInitVector),
0, cudaMemcpyHostToDevice);
}
__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
@ -1373,4 +1363,3 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start @@ -1373,4 +1363,3 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

22
x11/cuda_x11_simd512.cu

@ -7,29 +7,17 @@ @@ -7,29 +7,17 @@
#define TPB 256
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
int *d_state[8];
uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
#define C32(x) ((uint32_t)(x ## U))
#define T32(x) ((x) & C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __constant__
const uint32_t c_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \ @@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY
}
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \ @@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \
__device__ __forceinline__ void FFT_16(int *y) {
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning FFT_16() function is not compatible with SM 2.1 devices!
#endif
@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { @@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{
int i;
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning Expansion() function is not compatible with SM 2.1 devices
#endif

19
x11/x11.cu

@ -15,10 +15,11 @@ extern "C" @@ -15,10 +15,11 @@ extern "C"
#include "sph/sph_echo.h"
#include "miner.h"
}
#include "cuda_helper.h"
#include <stdint.h>
#include <cuda_helper.h>
#include <stdio.h>
#include <memory.h>
}
// aus cpu-miner.c
extern int device_map[8];
@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc @@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
extern void x11_echo512_cpu_init(int thr_id, int threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
uint32_t foundNonce;
@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

55
x13/cuda_x13_fugue512.cu

@ -5,26 +5,11 @@ @@ -5,26 +5,11 @@
* heavily based on phm's sgminer
*
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdint.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5, 5.0)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
/*
* X13 kernel implementation.
*
@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
* @author phm <phm@inbox.com>
*/
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x))))
#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@ -595,7 +578,7 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint @@ -595,7 +578,7 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
#pragma unroll 16
for( i = 0; i < 16; i++ )
Hash[i] = SWAB32(Hash[i]);
Hash[i] = cuda_swab32(Hash[i]);
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
@ -658,22 +641,22 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint @@ -658,22 +641,22 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
S18 ^= S00;
S27 ^= S00;
Hash[0] = SWAB32(S01);
Hash[1] = SWAB32(S02);
Hash[2] = SWAB32(S03);
Hash[3] = SWAB32(S04);
Hash[4] = SWAB32(S09);
Hash[5] = SWAB32(S10);
Hash[6] = SWAB32(S11);
Hash[7] = SWAB32(S12);
Hash[8] = SWAB32(S18);
Hash[9] = SWAB32(S19);
Hash[10] = SWAB32(S20);
Hash[11] = SWAB32(S21);
Hash[12] = SWAB32(S27);
Hash[13] = SWAB32(S28);
Hash[14] = SWAB32(S29);
Hash[15] = SWAB32(S30);
Hash[0] = cuda_swab32(S01);
Hash[1] = cuda_swab32(S02);
Hash[2] = cuda_swab32(S03);
Hash[3] = cuda_swab32(S04);
Hash[4] = cuda_swab32(S09);
Hash[5] = cuda_swab32(S10);
Hash[6] = cuda_swab32(S11);
Hash[7] = cuda_swab32(S12);
Hash[8] = cuda_swab32(S18);
Hash[9] = cuda_swab32(S19);
Hash[10] = cuda_swab32(S20);
Hash[11] = cuda_swab32(S21);
Hash[12] = cuda_swab32(S27);
Hash[13] = cuda_swab32(S28);
Hash[14] = cuda_swab32(S29);
Hash[15] = cuda_swab32(S30);
}
}
@ -706,7 +689,7 @@ __host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNo @@ -706,7 +689,7 @@ __host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 4 * 256 * sizeof(uint32_t);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);

24
x13/cuda_x13_hamsi512.cu

@ -37,26 +37,11 @@ @@ -37,26 +37,11 @@
* @author phm <phm@inbox.com>
*/
#include <stdint.h>
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __constant__
static const uint32_t d_alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
@ -716,11 +701,13 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint @@ -716,11 +701,13 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG;
@ -729,7 +716,7 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint @@ -729,7 +716,7 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
#pragma unroll 16
for (int i = 0; i < 16; i++)
Hash[i] = SWAB32(h[i]);
Hash[i] = cuda_swab32(h[i]);
}
}
@ -749,9 +736,8 @@ __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNo @@ -749,9 +736,8 @@ __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

17
x13/x13.cu

@ -20,10 +20,9 @@ extern "C" @@ -20,10 +20,9 @@ extern "C"
#include "sph/sph_fugue.h"
#include "miner.h"
}
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8];
@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun @@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x13_fugue512_cpu_init(int thr_id, int threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, @@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x11_echo512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, @@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
uint32_t foundNonce;
@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, @@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

18
x15/cuda_x14_shabal512.cu

@ -1,26 +1,10 @@ @@ -1,26 +1,10 @@
/*
* Shabal-512 for X14/X15 (STUB)
*/
#include <stdint.h>
#include <cuda_runtime.h>
#include "cuda_helper.h"
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
/*
* Shabal implementation.

6
x15/cuda_x15_whirlpool.cu

@ -4,8 +4,8 @@ @@ -4,8 +4,8 @@
* tpruvot@github
*/
#include <stdio.h>
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
#define NULLTEST 0
@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
#define SPH_64 (1)
#define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// defined in cuda_helper.h
#define SPH_ROTL64(x,n) ROTL64(x,n)

17
x15/x14.cu

@ -22,10 +22,9 @@ extern "C" { @@ -22,10 +22,9 @@ extern "C" {
#include "sph/sph_shabal.h"
#include "miner.h"
}
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
}
// from cpu-miner.c
extern int device_map[8];
@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun @@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x14_shabal512_cpu_init(int thr_id, int threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, @@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, @@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, @@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
/* check now with the CPU to confirm */

17
x15/x15.cu

@ -23,10 +23,9 @@ extern "C" { @@ -23,10 +23,9 @@ extern "C" {
#include "sph/sph_whirlpool.h"
#include "miner.h"
}
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
}
// to test gpu hash on a null buffer
#define NULLTEST 0
@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou @@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
extern void x15_whirlpool_cpu_init(int thr_id, int threads);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, @@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, @@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, @@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
print_hash((unsigned char*)buf); printf("\n");
#endif
/* Scan with GPU */
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{

Loading…
Cancel
Save