Remove duplicated defines present in cuda_helper.h
also add cudaDeviceReset() on Ctrl+C for nvprof
This commit is contained in:
parent
a3ea36a5f9
commit
d9ea5f72ce
@ -1,11 +1,8 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
#include "sm_30_intrinsics.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
#include <sm_30_intrinsics.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
|
||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
|
||||
*/
|
||||
|
@ -1,16 +1,7 @@
|
||||
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
__constant__ uint64_t c_State[25];
|
||||
__constant__ uint32_t c_PaddedMessage[18];
|
||||
|
||||
static __device__ uint32_t cuda_swab32(uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, 0, 0x0123);
|
||||
}
|
||||
|
||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset >= 32) {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
#endif
|
||||
|
||||
#define U32TO64_LE(p) \
|
||||
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "sph/sph_keccak.h"
|
||||
@ -7,10 +6,9 @@ extern "C"
|
||||
#include "sph/sph_jh.h"
|
||||
#include "sph/sph_skein.h"
|
||||
#include "miner.h"
|
||||
#include "cuda_helper.h"
|
||||
}
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
|
||||
extern void quark_skein512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
|
||||
quark_groestl512_cpu_init(thr_id, throughput);
|
||||
quark_jh512_cpu_init(thr_id, throughput);
|
||||
quark_skein512_cpu_init(thr_id, throughput);
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
|
||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
|
||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
|
||||
@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
|
||||
}
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
unsigned int rounds;
|
||||
uint32_t vhash64[8];
|
||||
be32enc(&endiandata[19], foundNonce);
|
||||
|
||||
// diese jackpothash Funktion gibt die Zahl der Runden zurück
|
||||
unsigned int rounds = jackpothash(vhash64, endiandata);
|
||||
rounds = jackpothash(vhash64, endiandata);
|
||||
|
||||
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
|
||||
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
|
||||
|
@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
|
||||
<PtxAsOptionV>true</PtxAsOptionV>
|
||||
<Keep>false</Keep>
|
||||
<CodeGeneration>compute_50,sm_50</CodeGeneration>
|
||||
<Defines></Defines>
|
||||
<Defines>
|
||||
</Defines>
|
||||
</CudaCompile>
|
||||
<CudaLink>
|
||||
<GPUDebugInfo>false</GPUDebugInfo>
|
||||
@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
|
||||
<ClInclude Include="uint256.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CudaCompile Include="bitslice_transformations_quad.cu">
|
||||
<ExcludedFromBuild>true</ExcludedFromBuild>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="cuda_fugue256.cu">
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
|
||||
@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="groestl_functions_quad.cu">
|
||||
<ExcludedFromBuild>true</ExcludedFromBuild>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="heavy\cuda_blake512.cu">
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
|
||||
|
@ -391,5 +391,11 @@
|
||||
<CudaCompile Include="x15\cuda_x15_whirlpool.cu">
|
||||
<Filter>Source Files\CUDA\x15</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="groestl_functions_quad.cu">
|
||||
<Filter>Source Files\CUDA</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="bitslice_transformations_quad.cu">
|
||||
<Filter>Source Files\CUDA</Filter>
|
||||
</CudaCompile>
|
||||
</ItemGroup>
|
||||
</Project>
|
@ -1,12 +1,11 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "sph/sph_fugue.h"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
#include <host_defines.h>
|
||||
|
||||
#define USE_SHARED 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
@ -15,14 +14,6 @@ extern int device_map[8];
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
// schon in sph_fugue.h definiert
|
||||
//#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
|
||||
uint32_t *d_fugue256_hashoutput[8];
|
||||
uint32_t *d_resultNonce[8];
|
||||
|
||||
|
@ -1,23 +1,17 @@
|
||||
// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
#include <host_defines.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
|
||||
#include "groestl_functions_quad.cu"
|
||||
#include "bitslice_transformations_quad.cu"
|
||||
|
||||
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
||||
#define SWAB32(x) cuda_swab32(x)
|
||||
|
||||
__global__ void __launch_bounds__(256, 4)
|
||||
groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||
__global__ __launch_bounds__(256, 4)
|
||||
void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||
{
|
||||
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
|
||||
|
148
cuda_helper.h
148
cuda_helper.h
@ -1,33 +1,78 @@
|
||||
#ifndef CUDA_HELPER_H
|
||||
#define CUDA_HELPER_H
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
|
||||
#if defined(_MSC_VER)
|
||||
/* reduce warnings */
|
||||
#include <device_functions.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
extern __device__ __device_builtin__ void __syncthreads(void);
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
// define blockDim and threadIdx for host
|
||||
extern const dim3 blockDim;
|
||||
extern const uint3 threadIdx;
|
||||
#endif
|
||||
|
||||
#ifndef SPH_C32
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#endif
|
||||
|
||||
#ifndef SPH_C64
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
#endif
|
||||
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
|
||||
#if __CUDA_ARCH__ < 350
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#else
|
||||
// Kepler (Compute 3.5, 5.0)
|
||||
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return __double_as_longlong(__hiloint2double(HI, LO));
|
||||
return __double_as_longlong(__hiloint2double(HI, LO));
|
||||
#else
|
||||
return (unsigned long long)LO | (((unsigned long long)HI) << 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t HIWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x >> 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Hi Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
|
||||
__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
|
||||
}
|
||||
|
||||
// das Lo Word in einem 64 Bit Typen ersetzen
|
||||
__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
|
||||
}
|
||||
|
||||
// Endian Drehung für 32 Bit Typen
|
||||
#ifdef __CUDA_ARCH__
|
||||
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
|
||||
{
|
||||
/* device */
|
||||
return __byte_perm(x, x, 0x0123);
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
#define cuda_swab32(x) \
|
||||
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
|
||||
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
|
||||
#endif
|
||||
|
||||
// das Lo Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t LOWORD(const uint64_t &x) {
|
||||
__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2loint(__longlong_as_double(x));
|
||||
#else
|
||||
@ -35,34 +80,51 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Lo Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
|
||||
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||
__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x >> 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Endian Drehung für 32 Bit Typen
|
||||
static __device__ uint32_t cuda_swab32(uint32_t x)
|
||||
#ifdef __CUDA_ARCH__
|
||||
__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
|
||||
{
|
||||
return __byte_perm(x, x, 0x0123);
|
||||
}
|
||||
// Input: 77665544 33221100
|
||||
// Output: 00112233 44556677
|
||||
uint64_t temp[2];
|
||||
temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
|
||||
temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
|
||||
|
||||
// Endian Drehung für 64 Bit Typen
|
||||
static __device__ uint64_t cuda_swab64(uint64_t x) {
|
||||
return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
|
||||
return temp[0] | (temp[1]<<32);
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
#define cuda_swab64(x) \
|
||||
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
|
||||
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
|
||||
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
|
||||
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
|
||||
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
|
||||
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
|
||||
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
|
||||
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
|
||||
#endif
|
||||
|
||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset < 32) {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset < 32) {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
|
||||
@ -70,16 +132,16 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse
|
||||
|
||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset >= 32) {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset >= 32) {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
|
@ -1,23 +1,16 @@
|
||||
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "sph/sph_blake.h"
|
||||
@ -7,10 +6,9 @@ extern "C"
|
||||
#include "sph/sph_jh.h"
|
||||
#include "sph/sph_keccak.h"
|
||||
#include "miner.h"
|
||||
#include "cuda_helper.h"
|
||||
}
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
|
||||
extern void quark_skein512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
// Original nist5hash Funktion aus einem miner Quelltext
|
||||
inline void nist5hash(void *state, const void *input)
|
||||
extern "C" void nist5hash(void *state, const void *input)
|
||||
{
|
||||
sph_blake512_context ctx_blake;
|
||||
sph_groestl512_context ctx_groestl;
|
||||
@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
|
||||
quark_jh512_cpu_init(thr_id, throughput);
|
||||
quark_keccak512_cpu_init(thr_id, throughput);
|
||||
quark_skein512_cpu_init(thr_id, throughput);
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
|
||||
// erstes Blake512 Hash mit CUDA
|
||||
// Hash with CUDA
|
||||
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Groestl512
|
||||
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für JH512
|
||||
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Keccak512
|
||||
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Skein512
|
||||
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
uint32_t vhash64[8];
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "cuda_helper.h"
|
||||
|
||||
__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
|
||||
{
|
||||
|
@ -1,14 +1,7 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
extern uint32_t *d_heftyHashes[8];
|
||||
@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
|
||||
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
|
||||
__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
|
||||
|
||||
@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
|
||||
#define SWAP32(x) \
|
||||
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
|
||||
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
|
||||
|
||||
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
|
||||
#define SWAP64(x) \
|
||||
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
|
||||
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
|
||||
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
|
||||
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
|
||||
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
|
||||
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
|
||||
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
|
||||
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
|
||||
/* in cuda_helper */
|
||||
#define SWAP32(x) cuda_swab32(x)
|
||||
#define SWAP64(x) cuda_swab64(x)
|
||||
|
||||
__constant__ uint64_t c_SecondRound[15];
|
||||
|
||||
|
@ -1,9 +1,4 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned int uint32_t;
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// globaler Speicher für unsere Ergebnisse
|
||||
uint32_t *d_hashoutput[8];
|
||||
|
@ -1,14 +1,7 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
extern uint32_t *d_heftyHashes[8];
|
||||
@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
|
||||
cudaMemcpyToSymbol( groestl_gpu_msg,
|
||||
msgBlock,
|
||||
128);
|
||||
|
||||
BLOCKSIZE = len;
|
||||
}
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
#include <device_functions.h>
|
||||
|
||||
#define USE_SHARED 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
@ -13,11 +12,6 @@ extern int device_map[8];
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
|
@ -1,14 +1,7 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
extern uint32_t *d_heftyHashes[8];
|
||||
@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
|
||||
uint64_t t[5], u[5], v, w;
|
||||
|
||||
/* absorb input */
|
||||
#pragma unroll 9
|
||||
for (i = 0; i < 72 / 8; i++, in += 2)
|
||||
#pragma unroll 9
|
||||
for (i = 0; i < 9 /* 72/8 */; i++, in += 2)
|
||||
s[i] ^= U32TO64_LE(in);
|
||||
|
||||
for (i = 0; i < 24; i++) {
|
||||
|
@ -1,12 +1,7 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned int uint32_t;
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
extern uint32_t *d_heftyHashes[8];
|
||||
|
@ -1,7 +1,3 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
@ -34,6 +30,8 @@
|
||||
#include "heavy/cuda_blake512.h"
|
||||
#include "heavy/cuda_combine.h"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern uint32_t *d_hash2output[8];
|
||||
extern uint32_t *d_hash3output[8];
|
||||
extern uint32_t *d_hash4output[8];
|
||||
|
1
miner.h
1
miner.h
@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
|
||||
void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
|
||||
void groestlhash(void *state, const void *input);
|
||||
void myriadhash(void *state, const void *input);
|
||||
void nist5hash(void *state, const void *input);
|
||||
void quarkhash(void *state, const void *input);
|
||||
void x11hash(void *output, const void *input);
|
||||
void x13hash(void *output, const void *input);
|
||||
|
@ -1,291 +1,292 @@
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "sph/sph_blake.h"
|
||||
#include "sph/sph_bmw.h"
|
||||
#include "sph/sph_groestl.h"
|
||||
#include "sph/sph_skein.h"
|
||||
#include "sph/sph_jh.h"
|
||||
#include "sph/sph_keccak.h"
|
||||
#include "miner.h"
|
||||
}
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// Speicher für Input/Output der verketteten Hashfunktionen
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
|
||||
static uint32_t *d_animeNonces[8];
|
||||
static uint32_t *d_branch1Nonces[8];
|
||||
static uint32_t *d_branch2Nonces[8];
|
||||
static uint32_t *d_branch3Nonces[8];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_bmw512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_bmw512_cpu_setBlock_80(void *pdata);
|
||||
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
|
||||
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
|
||||
|
||||
extern void quark_groestl512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_skein512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_keccak512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_jh512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
uint32_t *d_nonces2, size_t *nrm2,
|
||||
int order);
|
||||
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
int order);
|
||||
|
||||
// Original Quarkhash Funktion aus einem miner Quelltext
|
||||
inline void animehash(void *state, const void *input)
|
||||
{
|
||||
sph_blake512_context ctx_blake;
|
||||
sph_bmw512_context ctx_bmw;
|
||||
sph_groestl512_context ctx_groestl;
|
||||
sph_jh512_context ctx_jh;
|
||||
sph_keccak512_context ctx_keccak;
|
||||
sph_skein512_context ctx_skein;
|
||||
|
||||
unsigned char hash[64];
|
||||
|
||||
sph_bmw512_init(&ctx_bmw);
|
||||
// ZBMW;
|
||||
sph_bmw512 (&ctx_bmw, (const void*) input, 80);
|
||||
sph_bmw512_close(&ctx_bmw, (void*) hash);
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
// ZBLAKE;
|
||||
sph_blake512 (&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, (void*) hash);
|
||||
|
||||
if (hash[0] & 0x8)
|
||||
{
|
||||
sph_groestl512_init(&ctx_groestl);
|
||||
// ZGROESTL;
|
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
|
||||
sph_groestl512_close(&ctx_groestl, (void*) hash);
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_skein512_init(&ctx_skein);
|
||||
// ZSKEIN;
|
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64);
|
||||
sph_skein512_close(&ctx_skein, (void*) hash);
|
||||
}
|
||||
|
||||
sph_groestl512_init(&ctx_groestl);
|
||||
// ZGROESTL;
|
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
|
||||
sph_groestl512_close(&ctx_groestl, (void*) hash);
|
||||
|
||||
sph_jh512_init(&ctx_jh);
|
||||
// ZJH;
|
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64);
|
||||
sph_jh512_close(&ctx_jh, (void*) hash);
|
||||
|
||||
if (hash[0] & 0x8)
|
||||
{
|
||||
sph_blake512_init(&ctx_blake);
|
||||
// ZBLAKE;
|
||||
sph_blake512 (&ctx_blake, (const void*) hash, 64);
|
||||
sph_blake512_close(&ctx_blake, (void*) hash);
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_bmw512_init(&ctx_bmw);
|
||||
// ZBMW;
|
||||
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
|
||||
sph_bmw512_close(&ctx_bmw, (void*) hash);
|
||||
}
|
||||
|
||||
sph_keccak512_init(&ctx_keccak);
|
||||
// ZKECCAK;
|
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
|
||||
sph_keccak512_close(&ctx_keccak, (void*) hash);
|
||||
|
||||
sph_skein512_init(&ctx_skein);
|
||||
// SKEIN;
|
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64);
|
||||
sph_skein512_close(&ctx_skein, (void*) hash);
|
||||
|
||||
if (hash[0] & 0x8)
|
||||
{
|
||||
sph_keccak512_init(&ctx_keccak);
|
||||
// ZKECCAK;
|
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
|
||||
sph_keccak512_close(&ctx_keccak, (void*) hash);
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_jh512_init(&ctx_jh);
|
||||
// ZJH;
|
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64);
|
||||
sph_jh512_close(&ctx_jh, (void*) hash);
|
||||
}
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
|
||||
struct HashPredicate
|
||||
{
|
||||
HashPredicate(uint32_t *hashes, uint32_t startNonce) :
|
||||
m_hashes(hashes),
|
||||
m_startNonce(startNonce)
|
||||
{ }
|
||||
|
||||
__device__
|
||||
bool operator()(const uint32_t x)
|
||||
{
|
||||
uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
|
||||
return hash[0] & 0x8;
|
||||
}
|
||||
|
||||
uint32_t *m_hashes;
|
||||
uint32_t m_startNonce;
|
||||
};
|
||||
|
||||
extern bool opt_benchmark;
|
||||
|
||||
extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
|
||||
const uint32_t *ptarget, uint32_t max_nonce,
|
||||
unsigned long *hashes_done)
|
||||
{
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x00000f;
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
const int throughput = 256*2048; // 100;
|
||||
|
||||
static bool init[8] = {0,0,0,0,0,0,0,0};
|
||||
if (!init[thr_id])
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
// Konstanten kopieren, Speicher belegen
|
||||
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
|
||||
quark_blake512_cpu_init(thr_id, throughput);
|
||||
quark_groestl512_cpu_init(thr_id, throughput);
|
||||
quark_skein512_cpu_init(thr_id, throughput);
|
||||
quark_bmw512_cpu_init(thr_id, throughput);
|
||||
quark_keccak512_cpu_init(thr_id, throughput);
|
||||
quark_jh512_cpu_init(thr_id, throughput);
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
quark_compactTest_cpu_init(thr_id, throughput);
|
||||
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
uint32_t endiandata[20];
|
||||
for (int k=0; k < 20; k++)
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_bmw512_cpu_setBlock_80((void*)endiandata);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
size_t nrm1=0, nrm2=0, nrm3=0;
|
||||
|
||||
// erstes BMW512 Hash mit CUDA
|
||||
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Blake512
|
||||
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
|
||||
d_branch3Nonces[thr_id], &nrm3,
|
||||
order++);
|
||||
|
||||
// nur den Skein Branch weiterverfolgen
|
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Groestl512
|
||||
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für JH512
|
||||
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||
d_branch1Nonces[thr_id], &nrm1,
|
||||
d_branch2Nonces[thr_id], &nrm2,
|
||||
order++);
|
||||
|
||||
// das ist der bedingte Branch für Blake512
|
||||
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der bedingte Branch für Bmw512
|
||||
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Keccak512
|
||||
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Skein512
|
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||
d_branch1Nonces[thr_id], &nrm1,
|
||||
d_branch2Nonces[thr_id], &nrm2,
|
||||
order++);
|
||||
|
||||
// das ist der bedingte Branch für Keccak512
|
||||
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der bedingte Branch für JH512
|
||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
uint32_t vhash64[8];
|
||||
be32enc(&endiandata[19], foundNonce);
|
||||
animehash(vhash64, endiandata);
|
||||
|
||||
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
|
||||
|
||||
pdata[19] = foundNonce;
|
||||
*hashes_done = (foundNonce - first_nonce + 1)/2;
|
||||
return 1;
|
||||
} else {
|
||||
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
|
||||
}
|
||||
}
|
||||
|
||||
pdata[19] += throughput;
|
||||
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = (pdata[19] - first_nonce + 1)/2;
|
||||
return 0;
|
||||
}
|
||||
extern "C"
|
||||
{
|
||||
#include "sph/sph_blake.h"
|
||||
#include "sph/sph_bmw.h"
|
||||
#include "sph/sph_groestl.h"
|
||||
#include "sph/sph_skein.h"
|
||||
#include "sph/sph_jh.h"
|
||||
#include "sph/sph_keccak.h"
|
||||
#include "miner.h"
|
||||
#include "cuda_helper.h"
|
||||
}
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// Speicher für Input/Output der verketteten Hashfunktionen
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
|
||||
static uint32_t *d_animeNonces[8];
|
||||
static uint32_t *d_branch1Nonces[8];
|
||||
static uint32_t *d_branch2Nonces[8];
|
||||
static uint32_t *d_branch3Nonces[8];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_bmw512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_bmw512_cpu_setBlock_80(void *pdata);
|
||||
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
|
||||
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
|
||||
|
||||
extern void quark_groestl512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_skein512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_keccak512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_jh512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
uint32_t *d_nonces2, size_t *nrm2,
|
||||
int order);
|
||||
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
int order);
|
||||
|
||||
// Original Quarkhash Funktion aus einem miner Quelltext
|
||||
inline void animehash(void *state, const void *input)
|
||||
{
|
||||
sph_blake512_context ctx_blake;
|
||||
sph_bmw512_context ctx_bmw;
|
||||
sph_groestl512_context ctx_groestl;
|
||||
sph_jh512_context ctx_jh;
|
||||
sph_keccak512_context ctx_keccak;
|
||||
sph_skein512_context ctx_skein;
|
||||
|
||||
unsigned char hash[64];
|
||||
|
||||
sph_bmw512_init(&ctx_bmw);
|
||||
// ZBMW;
|
||||
sph_bmw512 (&ctx_bmw, (const void*) input, 80);
|
||||
sph_bmw512_close(&ctx_bmw, (void*) hash);
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
// ZBLAKE;
|
||||
sph_blake512 (&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, (void*) hash);
|
||||
|
||||
if (hash[0] & 0x8)
|
||||
{
|
||||
sph_groestl512_init(&ctx_groestl);
|
||||
// ZGROESTL;
|
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
|
||||
sph_groestl512_close(&ctx_groestl, (void*) hash);
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_skein512_init(&ctx_skein);
|
||||
// ZSKEIN;
|
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64);
|
||||
sph_skein512_close(&ctx_skein, (void*) hash);
|
||||
}
|
||||
|
||||
sph_groestl512_init(&ctx_groestl);
|
||||
// ZGROESTL;
|
||||
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
|
||||
sph_groestl512_close(&ctx_groestl, (void*) hash);
|
||||
|
||||
sph_jh512_init(&ctx_jh);
|
||||
// ZJH;
|
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64);
|
||||
sph_jh512_close(&ctx_jh, (void*) hash);
|
||||
|
||||
if (hash[0] & 0x8)
|
||||
{
|
||||
sph_blake512_init(&ctx_blake);
|
||||
// ZBLAKE;
|
||||
sph_blake512 (&ctx_blake, (const void*) hash, 64);
|
||||
sph_blake512_close(&ctx_blake, (void*) hash);
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_bmw512_init(&ctx_bmw);
|
||||
// ZBMW;
|
||||
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
|
||||
sph_bmw512_close(&ctx_bmw, (void*) hash);
|
||||
}
|
||||
|
||||
sph_keccak512_init(&ctx_keccak);
|
||||
// ZKECCAK;
|
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
|
||||
sph_keccak512_close(&ctx_keccak, (void*) hash);
|
||||
|
||||
sph_skein512_init(&ctx_skein);
|
||||
// SKEIN;
|
||||
sph_skein512 (&ctx_skein, (const void*) hash, 64);
|
||||
sph_skein512_close(&ctx_skein, (void*) hash);
|
||||
|
||||
if (hash[0] & 0x8)
|
||||
{
|
||||
sph_keccak512_init(&ctx_keccak);
|
||||
// ZKECCAK;
|
||||
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
|
||||
sph_keccak512_close(&ctx_keccak, (void*) hash);
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_jh512_init(&ctx_jh);
|
||||
// ZJH;
|
||||
sph_jh512 (&ctx_jh, (const void*) hash, 64);
|
||||
sph_jh512_close(&ctx_jh, (void*) hash);
|
||||
}
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
|
||||
struct HashPredicate
|
||||
{
|
||||
HashPredicate(uint32_t *hashes, uint32_t startNonce) :
|
||||
m_hashes(hashes),
|
||||
m_startNonce(startNonce)
|
||||
{ }
|
||||
|
||||
__device__
|
||||
bool operator()(const uint32_t x)
|
||||
{
|
||||
uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
|
||||
return hash[0] & 0x8;
|
||||
}
|
||||
|
||||
uint32_t *m_hashes;
|
||||
uint32_t m_startNonce;
|
||||
};
|
||||
|
||||
extern bool opt_benchmark;
|
||||
|
||||
extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
|
||||
const uint32_t *ptarget, uint32_t max_nonce,
|
||||
unsigned long *hashes_done)
|
||||
{
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x00000f;
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
const int throughput = 256*2048; // 100;
|
||||
|
||||
static bool init[8] = {0,0,0,0,0,0,0,0};
|
||||
if (!init[thr_id])
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
// Konstanten kopieren, Speicher belegen
|
||||
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
|
||||
|
||||
quark_blake512_cpu_init(thr_id, throughput);
|
||||
quark_groestl512_cpu_init(thr_id, throughput);
|
||||
quark_skein512_cpu_init(thr_id, throughput);
|
||||
quark_bmw512_cpu_init(thr_id, throughput);
|
||||
quark_keccak512_cpu_init(thr_id, throughput);
|
||||
quark_jh512_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
quark_compactTest_cpu_init(thr_id, throughput);
|
||||
|
||||
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
uint32_t endiandata[20];
|
||||
for (int k=0; k < 20; k++)
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_bmw512_cpu_setBlock_80((void*)endiandata);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
size_t nrm1=0, nrm2=0, nrm3=0;
|
||||
|
||||
// erstes BMW512 Hash mit CUDA
|
||||
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Blake512
|
||||
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
|
||||
d_branch3Nonces[thr_id], &nrm3,
|
||||
order++);
|
||||
|
||||
// nur den Skein Branch weiterverfolgen
|
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Groestl512
|
||||
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für JH512
|
||||
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||
d_branch1Nonces[thr_id], &nrm1,
|
||||
d_branch2Nonces[thr_id], &nrm2,
|
||||
order++);
|
||||
|
||||
// das ist der bedingte Branch für Blake512
|
||||
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der bedingte Branch für Bmw512
|
||||
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Keccak512
|
||||
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der unbedingte Branch für Skein512
|
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||
d_branch1Nonces[thr_id], &nrm1,
|
||||
d_branch2Nonces[thr_id], &nrm2,
|
||||
order++);
|
||||
|
||||
// das ist der bedingte Branch für Keccak512
|
||||
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der bedingte Branch für JH512
|
||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
uint32_t vhash64[8];
|
||||
be32enc(&endiandata[19], foundNonce);
|
||||
animehash(vhash64, endiandata);
|
||||
|
||||
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
|
||||
|
||||
pdata[19] = foundNonce;
|
||||
*hashes_done = (foundNonce - first_nonce + 1)/2;
|
||||
return 1;
|
||||
} else {
|
||||
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
|
||||
}
|
||||
}
|
||||
|
||||
pdata[19] += throughput;
|
||||
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = (pdata[19] - first_nonce + 1)/2;
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,140 +1,9 @@
|
||||
#if 1
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
// Endian Drehung für 32 Bit Typen
|
||||
/*
|
||||
static __device__ uint32_t cuda_swab32(uint32_t x)
|
||||
{
|
||||
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
|
||||
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
|
||||
}
|
||||
*/
|
||||
static __device__ uint32_t cuda_swab32(uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, 0, 0x0123);
|
||||
}
|
||||
|
||||
// das Hi Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Endian Drehung für 64 Bit Typen
|
||||
static __device__ unsigned long long cuda_swab64(unsigned long long x) {
|
||||
uint32_t h = (x >> 32);
|
||||
uint32_t l = (x & 0xFFFFFFFFULL);
|
||||
return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
|
||||
}
|
||||
|
||||
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t HIWORD(const unsigned long long &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x >> 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Lo Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t LOWORD(const unsigned long long &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2loint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x & 0xFFFFFFFFULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return __double_as_longlong(__hiloint2double(HI, LO));
|
||||
#else
|
||||
return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Lo Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
|
||||
}
|
||||
#endif
|
||||
|
||||
// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen...
|
||||
#if 1
|
||||
typedef unsigned long long uint64_t;
|
||||
#else
|
||||
typedef class uint64
|
||||
{
|
||||
public:
|
||||
__device__ uint64()
|
||||
{
|
||||
}
|
||||
__device__ uint64(unsigned long long init)
|
||||
{
|
||||
val = make_uint2( LOWORD(init), HIWORD(init) );
|
||||
}
|
||||
__device__ uint64(uint32_t lo, uint32_t hi)
|
||||
{
|
||||
val = make_uint2( lo, hi );
|
||||
}
|
||||
__device__ const uint64 operator^(uint64 const& rhs) const
|
||||
{
|
||||
return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
|
||||
}
|
||||
__device__ const uint64 operator|(uint64 const& rhs) const
|
||||
{
|
||||
return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
|
||||
}
|
||||
__device__ const uint64 operator+(unsigned long long const& rhs) const
|
||||
{
|
||||
return *this+uint64(rhs);
|
||||
}
|
||||
__device__ const uint64 operator+(uint64 const& rhs) const
|
||||
{
|
||||
uint64 res;
|
||||
asm ("add.cc.u32 %0, %2, %4;\n\t"
|
||||
"addc.cc.u32 %1, %3, %5;\n\t"
|
||||
: "=r"(res.val.x), "=r"(res.val.y)
|
||||
: "r"( val.x), "r"( val.y),
|
||||
"r"(rhs.val.x), "r"(rhs.val.y));
|
||||
return res;
|
||||
}
|
||||
__device__ const uint64 operator-(uint64 const& rhs) const
|
||||
{
|
||||
uint64 res;
|
||||
asm ("sub.cc.u32 %0, %2, %4;\n\t"
|
||||
"subc.cc.u32 %1, %3, %5;\n\t"
|
||||
: "=r"(res.val.x), "=r"(res.val.y)
|
||||
: "r"( val.x), "r"( val.y),
|
||||
"r"(rhs.val.x), "r"(rhs.val.y));
|
||||
return res;
|
||||
}
|
||||
__device__ const uint64 operator<<(int n) const
|
||||
{
|
||||
return uint64(unsigned long long(*this)<<n);
|
||||
}
|
||||
__device__ const uint64 operator>>(int n) const
|
||||
{
|
||||
return uint64(unsigned long long(*this)>>n);
|
||||
}
|
||||
__device__ operator unsigned long long() const
|
||||
{
|
||||
return MAKE_ULONGLONG(val.x, val.y);
|
||||
}
|
||||
uint2 val;
|
||||
} uint64_t;
|
||||
#endif
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
// die Message it Padding zur Berechnung auf der GPU
|
||||
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
|
||||
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset >= 32) {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
#endif
|
||||
#define SHL(x, n) ((x) << (n))
|
||||
#define SHR(x, n) ((x) >> (n))
|
||||
|
||||
|
@ -1,11 +1,8 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// Hash Target gegen das wir testen sollen
|
||||
__constant__ uint32_t pTarget[8];
|
||||
|
||||
@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
|
||||
}
|
||||
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_check_cpu_init(int thr_id, int threads)
|
||||
__host__ void cuda_check_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
|
||||
cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
|
||||
}
|
||||
|
||||
// Target Difficulty setzen
|
||||
__host__ void quark_check_cpu_setTarget(const void *ptarget)
|
||||
__host__ void cuda_check_cpu_setTarget(const void *ptarget)
|
||||
{
|
||||
// die Message zur Berechnung auf der GPU
|
||||
cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
|
||||
__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
|
||||
{
|
||||
uint32_t result = 0xffffffff;
|
||||
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
|
||||
|
@ -1,4 +1,4 @@
|
||||
#include <stdint.h>
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
@ -1,16 +1,11 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#define USE_SHUFFLE 0
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#define ROTR(x,n) ROTR64(x,n)
|
||||
|
||||
#define USE_SHUFFLE 0
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t HIWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x >> 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Hi Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
|
||||
}
|
||||
|
||||
// das Lo Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t LOWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2loint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x & 0xFFFFFFFFULL);
|
||||
#endif
|
||||
}
|
||||
#if 0
|
||||
// das Lo Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
|
||||
}
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
|
||||
{
|
||||
// Input: 77665544 33221100
|
||||
// Output: 00112233 44556677
|
||||
uint64_t temp[2];
|
||||
temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
|
||||
temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
|
||||
|
||||
return temp[0] | (temp[1]<<32);
|
||||
}
|
||||
|
||||
__constant__ uint64_t c_u512[16];
|
||||
|
||||
const uint64_t host_u512[16] =
|
||||
__device__ __constant__
|
||||
const uint64_t c_u512[16] =
|
||||
{
|
||||
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
|
||||
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
|
||||
@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
|
||||
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
|
||||
};
|
||||
|
||||
|
||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset < 32) {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
|
||||
#endif
|
||||
|
||||
#define G(a,b,c,d,e) \
|
||||
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
|
||||
v[d] = ROTR( v[d] ^ v[a],32); \
|
||||
@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
|
||||
v[b] = ROTR( v[b] ^ v[c],11);
|
||||
|
||||
|
||||
__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
|
||||
__device__ static
|
||||
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
|
||||
{
|
||||
uint64_t v[16], m[16], i;
|
||||
|
||||
#pragma unroll 16
|
||||
for( i = 0; i < 16; ++i )
|
||||
{
|
||||
m[i] = SWAP64(block[i]);
|
||||
for( i = 0; i < 16; ++i ) {
|
||||
m[i] = cuda_swab64(block[i]);
|
||||
}
|
||||
|
||||
#pragma unroll 8
|
||||
@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
|
||||
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
|
||||
}
|
||||
|
||||
// Endian Drehung für 32 Bit Typen
|
||||
|
||||
static __device__ uint32_t cuda_swab32(uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, 0, 0x0123);
|
||||
}
|
||||
|
||||
/*
|
||||
// Endian Drehung für 64 Bit Typen
|
||||
static __device__ uint64_t cuda_swab64(uint64_t x) {
|
||||
uint32_t h = (x >> 32);
|
||||
uint32_t l = (x & 0xFFFFFFFFULL);
|
||||
return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
|
||||
}
|
||||
*/
|
||||
|
||||
static __constant__ uint64_t d_constMem[8];
|
||||
static const uint64_t h_constMem[8] = {
|
||||
__device__ __constant__
|
||||
static const uint64_t d_constMem[8] = {
|
||||
0x6a09e667f3bcc908ULL,
|
||||
0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL,
|
||||
@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
|
||||
0x5be0cd19137e2179ULL };
|
||||
|
||||
// Hash-Padding
|
||||
static __constant__ uint64_t d_constHashPadding[8];
|
||||
static const uint64_t h_constHashPadding[8] = {
|
||||
__device__ __constant__
|
||||
static const uint64_t d_constHashPadding[8] = {
|
||||
0x0000000000000080ull,
|
||||
0,
|
||||
0,
|
||||
@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
|
||||
0,
|
||||
0x0002000000000000ull };
|
||||
|
||||
__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
|
||||
__global__ __launch_bounds__(256, 4)
|
||||
void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
|
||||
{
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
|
||||
if (thread < threads)
|
||||
#endif
|
||||
{
|
||||
uint8_t i;
|
||||
// bestimme den aktuellen Zähler
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
int hashPosition = nounce - startNounce;
|
||||
//uint64_t *inpHash = &g_hash[8 * hashPosition];
|
||||
uint64_t *inpHash = &g_hash[hashPosition<<3];
|
||||
|
||||
// State vorbereiten
|
||||
uint64_t h[8];
|
||||
/*
|
||||
h[0] = 0x6a09e667f3bcc908ULL;
|
||||
h[1] = 0xbb67ae8584caa73bULL;
|
||||
h[2] = 0x3c6ef372fe94f82bULL;
|
||||
h[3] = 0xa54ff53a5f1d36f1ULL;
|
||||
h[4] = 0x510e527fade682d1ULL;
|
||||
h[5] = 0x9b05688c2b3e6c1fULL;
|
||||
h[6] = 0x1f83d9abfb41bd6bULL;
|
||||
h[7] = 0x5be0cd19137e2179ULL;
|
||||
*/
|
||||
#pragma unroll 8
|
||||
for(int i=0;i<8;i++)
|
||||
h[i] = d_constMem[i];
|
||||
uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
|
||||
|
||||
// 128 Byte für die Message
|
||||
uint64_t buf[16];
|
||||
|
||||
// Message für die erste Runde in Register holen
|
||||
#pragma unroll 8
|
||||
for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
|
||||
// State vorbereiten
|
||||
uint64_t h[8];
|
||||
#pragma unroll 8
|
||||
for (i=0;i<8;i++)
|
||||
h[i] = d_constMem[i];
|
||||
|
||||
/*
|
||||
buf[ 8] = 0x0000000000000080ull;
|
||||
buf[ 9] = 0;
|
||||
buf[10] = 0;
|
||||
buf[11] = 0;
|
||||
buf[12] = 0;
|
||||
buf[13] = 0x0100000000000000ull;
|
||||
buf[14] = 0;
|
||||
buf[15] = 0x0002000000000000ull;
|
||||
*/
|
||||
#pragma unroll 8
|
||||
for(int i=0;i<8;i++)
|
||||
// Message für die erste Runde in Register holen
|
||||
#pragma unroll 8
|
||||
for (i=0; i < 8; ++i)
|
||||
buf[i] = inpHash[i];
|
||||
|
||||
#pragma unroll 8
|
||||
for (i=0; i < 8; i++)
|
||||
buf[i+8] = d_constHashPadding[i];
|
||||
|
||||
// die einzige Hashing-Runde
|
||||
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
|
||||
|
||||
// Hash rauslassen
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
|
||||
uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
|
||||
#pragma unroll 8
|
||||
for (int i=0; i < 8; ++i) {
|
||||
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
|
||||
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
|
||||
#pragma unroll 8
|
||||
for (i=0; i < 8; ++i) {
|
||||
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
|
||||
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
|
||||
}
|
||||
#else
|
||||
// in dieser Version passieren auch ein paar 64 Bit Shifts
|
||||
uint64_t *outHash = &g_hash[8 * hashPosition];
|
||||
#pragma unroll 8
|
||||
for (int i=0; i < 8; ++i)
|
||||
#pragma unroll 8
|
||||
for (i=0; i < 8; ++i)
|
||||
{
|
||||
//outHash[i] = cuda_swab64( h[i] );
|
||||
outHash[i] = SWAP64(h[i]);
|
||||
outHash[i] = cuda_swab64(h[i]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
// State vorbereiten
|
||||
uint64_t h[8];
|
||||
// 128 Byte für die Message
|
||||
uint64_t buf[16];
|
||||
uint8_t i;
|
||||
// bestimme den aktuellen Zähler
|
||||
uint32_t nounce = startNounce + thread;
|
||||
|
||||
// State vorbereiten
|
||||
uint64_t h[8];
|
||||
/*
|
||||
h[0] = 0x6a09e667f3bcc908ULL;
|
||||
h[1] = 0xbb67ae8584caa73bULL;
|
||||
h[2] = 0x3c6ef372fe94f82bULL;
|
||||
h[3] = 0xa54ff53a5f1d36f1ULL;
|
||||
h[4] = 0x510e527fade682d1ULL;
|
||||
h[5] = 0x9b05688c2b3e6c1fULL;
|
||||
h[6] = 0x1f83d9abfb41bd6bULL;
|
||||
h[7] = 0x5be0cd19137e2179ULL;
|
||||
*/
|
||||
#pragma unroll 8
|
||||
for(int i=0;i<8;i++)
|
||||
#pragma unroll 8
|
||||
for(i=0;i<8;i++)
|
||||
h[i] = d_constMem[i];
|
||||
// 128 Byte für die Message
|
||||
uint64_t buf[16];
|
||||
|
||||
// Message für die erste Runde in Register holen
|
||||
#pragma unroll 16
|
||||
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
|
||||
#pragma unroll 16
|
||||
for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
|
||||
|
||||
// die Nounce durch die thread-spezifische ersetzen
|
||||
buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
|
||||
@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
|
||||
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
|
||||
#pragma unroll 8
|
||||
for (int i=0; i < 8; ++i) {
|
||||
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
|
||||
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
|
||||
#pragma unroll 8
|
||||
for (i=0; i < 8; ++i) {
|
||||
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
|
||||
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
|
||||
}
|
||||
#else
|
||||
// in dieser Version passieren auch ein paar 64 Bit Shifts
|
||||
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
|
||||
#pragma unroll 8
|
||||
for (int i=0; i < 8; ++i)
|
||||
{
|
||||
//outHash[i] = cuda_swab64( h[i] );
|
||||
outHash[i] = SWAP64(h[i]);
|
||||
#pragma unroll 8
|
||||
for (i=0; i < 8; ++i) {
|
||||
outHash[i] = cuda_swab64( h[i] );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
|
||||
host_sigma,
|
||||
sizeof(host_sigma),
|
||||
0, cudaMemcpyHostToDevice);
|
||||
|
||||
cudaMemcpyToSymbol( c_u512,
|
||||
host_u512,
|
||||
sizeof(host_u512),
|
||||
0, cudaMemcpyHostToDevice);
|
||||
|
||||
cudaMemcpyToSymbol( d_constMem,
|
||||
h_constMem,
|
||||
sizeof(h_constMem),
|
||||
0, cudaMemcpyHostToDevice);
|
||||
|
||||
cudaMemcpyToSymbol( d_constHashPadding,
|
||||
h_constHashPadding,
|
||||
sizeof(h_constHashPadding),
|
||||
0, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
// Blake512 für 80 Byte grosse Eingangsdaten
|
||||
|
@ -1,371 +1,368 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
#include "sm_30_intrinsics.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
static uint32_t *d_tempBranch1Nonces[8];
|
||||
static uint32_t *d_numValid[8];
|
||||
static uint32_t *h_numValid[8];
|
||||
|
||||
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// True/False tester
|
||||
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
|
||||
|
||||
__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
|
||||
{
|
||||
return ((inpHash[0] & 0x08) == 0x08);
|
||||
}
|
||||
|
||||
__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
|
||||
{
|
||||
return ((inpHash[0] & 0x08) == 0);
|
||||
}
|
||||
|
||||
__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
|
||||
cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
|
||||
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
|
||||
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
|
||||
|
||||
// wir brauchen auch Speicherplatz auf dem Device
|
||||
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
|
||||
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
|
||||
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
|
||||
|
||||
uint32_t s1;
|
||||
s1 = (threads / 256) * 2;
|
||||
|
||||
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
|
||||
*/
|
||||
#undef __shfl_up
|
||||
#define __shfl_up(var, delta, width) (0)
|
||||
#endif
|
||||
|
||||
// Die Summenfunktion (vom NVIDIA SDK)
|
||||
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
extern __shared__ uint32_t sums[];
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
//int lane_id = id % warpSize;
|
||||
int lane_id = id % width;
|
||||
// determine a warp_id within a block
|
||||
//int warp_id = threadIdx.x / warpSize;
|
||||
int warp_id = threadIdx.x / width;
|
||||
|
||||
sums[lane_id] = 0;
|
||||
|
||||
// Below is the basic structure of using a shfl instruction
|
||||
// for a scan.
|
||||
// Record "value" as a variable - we accumulate it along the way
|
||||
uint32_t value;
|
||||
if(testFunc != NULL)
|
||||
{
|
||||
if (id < threads)
|
||||
{
|
||||
uint32_t *inpHash;
|
||||
if(d_validNonceTable == NULL)
|
||||
{
|
||||
// keine Nonce-Liste
|
||||
inpHash = &inpHashes[id<<4];
|
||||
}else
|
||||
{
|
||||
// Nonce-Liste verfügbar
|
||||
int nonce = d_validNonceTable[id] - startNounce;
|
||||
inpHash = &inpHashes[nonce<<4];
|
||||
}
|
||||
value = (*testFunc)(inpHash);
|
||||
}else
|
||||
{
|
||||
value = 0;
|
||||
}
|
||||
}else
|
||||
{
|
||||
value = data[id];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Now accumulate in log steps up the chain
|
||||
// compute sums, with another thread's value who is
|
||||
// distance delta away (i). Note
|
||||
// those threads where the thread 'i' away would have
|
||||
// been out of bounds of the warp are unaffected. This
|
||||
// creates the scan sum.
|
||||
#pragma unroll
|
||||
|
||||
for (int i=1; i<=width; i*=2)
|
||||
{
|
||||
uint32_t n = __shfl_up((int)value, i, width);
|
||||
|
||||
if (lane_id >= i) value += n;
|
||||
}
|
||||
|
||||
// value now holds the scan value for the individual thread
|
||||
// next sum the largest values for each warp
|
||||
|
||||
// write the sum of the warp to smem
|
||||
//if (threadIdx.x % warpSize == warpSize-1)
|
||||
if (threadIdx.x % width == width-1)
|
||||
{
|
||||
sums[warp_id] = value;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
//
|
||||
// scan sum the warp sums
|
||||
// the same shfl scan operation, but performed on warp sums
|
||||
//
|
||||
if (warp_id == 0)
|
||||
{
|
||||
uint32_t warp_sum = sums[lane_id];
|
||||
|
||||
for (int i=1; i<=width; i*=2)
|
||||
{
|
||||
uint32_t n = __shfl_up((int)warp_sum, i, width);
|
||||
|
||||
if (lane_id >= i) warp_sum += n;
|
||||
}
|
||||
|
||||
sums[lane_id] = warp_sum;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// perform a uniform add across warps in the block
|
||||
// read neighbouring warp's sum and add it to threads value
|
||||
uint32_t blockSum = 0;
|
||||
|
||||
if (warp_id > 0)
|
||||
{
|
||||
blockSum = sums[warp_id-1];
|
||||
}
|
||||
|
||||
value += blockSum;
|
||||
|
||||
// Now write out our result
|
||||
data[id] = value;
|
||||
|
||||
// last thread has sum, write write out the block's sum
|
||||
if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
|
||||
{
|
||||
partial_sums[blockIdx.x] = value;
|
||||
}
|
||||
}
|
||||
|
||||
// Uniform add: add partial sums array
|
||||
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
|
||||
{
|
||||
__shared__ uint32_t buf;
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
|
||||
if (id > len) return;
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
buf = partial_sums[blockIdx.x];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
data[id] += buf;
|
||||
}
|
||||
|
||||
// Der Scatter
|
||||
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
uint32_t actNounce = id;
|
||||
uint32_t value;
|
||||
if (id < threads)
|
||||
{
|
||||
// uint32_t nounce = startNounce + id;
|
||||
uint32_t *inpHash;
|
||||
if(d_validNonceTable == NULL)
|
||||
{
|
||||
// keine Nonce-Liste
|
||||
inpHash = &inpHashes[id<<4];
|
||||
}else
|
||||
{
|
||||
// Nonce-Liste verfügbar
|
||||
int nonce = d_validNonceTable[id] - startNounce;
|
||||
actNounce = nonce;
|
||||
inpHash = &inpHashes[nonce<<4];
|
||||
}
|
||||
|
||||
value = (*testFunc)(inpHash);
|
||||
}else
|
||||
{
|
||||
value = 0;
|
||||
}
|
||||
|
||||
if( value )
|
||||
{
|
||||
int idx = sum[id];
|
||||
if(idx > 0)
|
||||
outp[idx-1] = startNounce + actNounce;
|
||||
}
|
||||
}
|
||||
|
||||
__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
|
||||
{
|
||||
if(val == 0)
|
||||
return 0;
|
||||
|
||||
uint32_t mask = 0x80000000;
|
||||
while( (val & mask) == 0 ) mask = mask >> 1;
|
||||
|
||||
if( (val & (~mask)) != 0 )
|
||||
return mask << 1;
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
|
||||
uint32_t *d_nonces1, cuda_compactTestFunction_t function,
|
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
{
|
||||
int orgThreads = threads;
|
||||
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
|
||||
// threadsPerBlock ausrechnen
|
||||
int blockSize = 256;
|
||||
int nSummen = threads / blockSize;
|
||||
|
||||
int thr1 = (threads+blockSize-1) / blockSize;
|
||||
int thr2 = threads / (blockSize*blockSize);
|
||||
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
|
||||
int thr3 = (nSummen + blockSize2-1) / blockSize2;
|
||||
|
||||
bool callThrid = (thr2 > 0) ? true : false;
|
||||
|
||||
// Erster Initialscan
|
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
|
||||
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
// weitere Scans
|
||||
if(callThrid)
|
||||
{
|
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
|
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
|
||||
}else
|
||||
{
|
||||
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
|
||||
}
|
||||
|
||||
// Sync + Anzahl merken
|
||||
cudaStreamSynchronize(NULL);
|
||||
|
||||
if(callThrid)
|
||||
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
else
|
||||
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
|
||||
|
||||
// Addieren
|
||||
if(callThrid)
|
||||
{
|
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
|
||||
}
|
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
|
||||
|
||||
// Scatter
|
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
|
||||
function, orgThreads, startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
// Sync
|
||||
cudaStreamSynchronize(NULL);
|
||||
}
|
||||
|
||||
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
|
||||
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
|
||||
uint32_t *d_nonces1, uint32_t *d_nonces2,
|
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
{
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
/*
|
||||
// threadsPerBlock ausrechnen
|
||||
int blockSize = 256;
|
||||
int thr1 = threads / blockSize;
|
||||
int thr2 = threads / (blockSize*blockSize);
|
||||
|
||||
// 1
|
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
|
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
|
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
|
||||
cudaStreamSynchronize(NULL);
|
||||
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
|
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
|
||||
|
||||
// 2
|
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
|
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
|
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
|
||||
cudaStreamSynchronize(NULL);
|
||||
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
|
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
|
||||
|
||||
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
|
||||
// Schritt 3: Scatter
|
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
|
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
|
||||
cudaStreamSynchronize(NULL);
|
||||
*/
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
uint32_t *d_nonces2, size_t *nrm2,
|
||||
int order)
|
||||
{
|
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
|
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
|
||||
|
||||
quark_compactTest_cpu_dualCompaction(thr_id, threads,
|
||||
h_numValid[thr_id], d_nonces1, d_nonces2,
|
||||
startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
|
||||
*nrm1 = (size_t)h_numValid[thr_id][0];
|
||||
*nrm2 = (size_t)h_numValid[thr_id][1];
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
int order)
|
||||
{
|
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
|
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
|
||||
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
|
||||
*nrm1 = (size_t)h_numValid[thr_id][0];
|
||||
}
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
#include <sm_30_intrinsics.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
static uint32_t *d_tempBranch1Nonces[8];
|
||||
static uint32_t *d_numValid[8];
|
||||
static uint32_t *h_numValid[8];
|
||||
|
||||
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// True/False tester
|
||||
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
|
||||
|
||||
__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
|
||||
{
|
||||
return ((inpHash[0] & 0x08) == 0x08);
|
||||
}
|
||||
|
||||
__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
|
||||
{
|
||||
return ((inpHash[0] & 0x08) == 0);
|
||||
}
|
||||
|
||||
__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
|
||||
cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
|
||||
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
|
||||
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
|
||||
|
||||
// wir brauchen auch Speicherplatz auf dem Device
|
||||
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
|
||||
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
|
||||
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
|
||||
|
||||
uint32_t s1;
|
||||
s1 = (threads / 256) * 2;
|
||||
|
||||
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
|
||||
*/
|
||||
#undef __shfl_up
|
||||
#define __shfl_up(var, delta, width) (0)
|
||||
#endif
|
||||
|
||||
// Die Summenfunktion (vom NVIDIA SDK)
|
||||
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
extern __shared__ uint32_t sums[];
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
//int lane_id = id % warpSize;
|
||||
int lane_id = id % width;
|
||||
// determine a warp_id within a block
|
||||
//int warp_id = threadIdx.x / warpSize;
|
||||
int warp_id = threadIdx.x / width;
|
||||
|
||||
sums[lane_id] = 0;
|
||||
|
||||
// Below is the basic structure of using a shfl instruction
|
||||
// for a scan.
|
||||
// Record "value" as a variable - we accumulate it along the way
|
||||
uint32_t value;
|
||||
if(testFunc != NULL)
|
||||
{
|
||||
if (id < threads)
|
||||
{
|
||||
uint32_t *inpHash;
|
||||
if(d_validNonceTable == NULL)
|
||||
{
|
||||
// keine Nonce-Liste
|
||||
inpHash = &inpHashes[id<<4];
|
||||
}else
|
||||
{
|
||||
// Nonce-Liste verfügbar
|
||||
int nonce = d_validNonceTable[id] - startNounce;
|
||||
inpHash = &inpHashes[nonce<<4];
|
||||
}
|
||||
value = (*testFunc)(inpHash);
|
||||
}else
|
||||
{
|
||||
value = 0;
|
||||
}
|
||||
}else
|
||||
{
|
||||
value = data[id];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Now accumulate in log steps up the chain
|
||||
// compute sums, with another thread's value who is
|
||||
// distance delta away (i). Note
|
||||
// those threads where the thread 'i' away would have
|
||||
// been out of bounds of the warp are unaffected. This
|
||||
// creates the scan sum.
|
||||
#pragma unroll
|
||||
|
||||
for (int i=1; i<=width; i*=2)
|
||||
{
|
||||
uint32_t n = __shfl_up((int)value, i, width);
|
||||
|
||||
if (lane_id >= i) value += n;
|
||||
}
|
||||
|
||||
// value now holds the scan value for the individual thread
|
||||
// next sum the largest values for each warp
|
||||
|
||||
// write the sum of the warp to smem
|
||||
//if (threadIdx.x % warpSize == warpSize-1)
|
||||
if (threadIdx.x % width == width-1)
|
||||
{
|
||||
sums[warp_id] = value;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
//
|
||||
// scan sum the warp sums
|
||||
// the same shfl scan operation, but performed on warp sums
|
||||
//
|
||||
if (warp_id == 0)
|
||||
{
|
||||
uint32_t warp_sum = sums[lane_id];
|
||||
|
||||
for (int i=1; i<=width; i*=2)
|
||||
{
|
||||
uint32_t n = __shfl_up((int)warp_sum, i, width);
|
||||
|
||||
if (lane_id >= i) warp_sum += n;
|
||||
}
|
||||
|
||||
sums[lane_id] = warp_sum;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// perform a uniform add across warps in the block
|
||||
// read neighbouring warp's sum and add it to threads value
|
||||
uint32_t blockSum = 0;
|
||||
|
||||
if (warp_id > 0)
|
||||
{
|
||||
blockSum = sums[warp_id-1];
|
||||
}
|
||||
|
||||
value += blockSum;
|
||||
|
||||
// Now write out our result
|
||||
data[id] = value;
|
||||
|
||||
// last thread has sum, write write out the block's sum
|
||||
if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
|
||||
{
|
||||
partial_sums[blockIdx.x] = value;
|
||||
}
|
||||
}
|
||||
|
||||
// Uniform add: add partial sums array
|
||||
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
|
||||
{
|
||||
__shared__ uint32_t buf;
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
|
||||
if (id > len) return;
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
buf = partial_sums[blockIdx.x];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
data[id] += buf;
|
||||
}
|
||||
|
||||
// Der Scatter
|
||||
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
uint32_t actNounce = id;
|
||||
uint32_t value;
|
||||
if (id < threads)
|
||||
{
|
||||
// uint32_t nounce = startNounce + id;
|
||||
uint32_t *inpHash;
|
||||
if(d_validNonceTable == NULL)
|
||||
{
|
||||
// keine Nonce-Liste
|
||||
inpHash = &inpHashes[id<<4];
|
||||
}else
|
||||
{
|
||||
// Nonce-Liste verfügbar
|
||||
int nonce = d_validNonceTable[id] - startNounce;
|
||||
actNounce = nonce;
|
||||
inpHash = &inpHashes[nonce<<4];
|
||||
}
|
||||
|
||||
value = (*testFunc)(inpHash);
|
||||
}else
|
||||
{
|
||||
value = 0;
|
||||
}
|
||||
|
||||
if( value )
|
||||
{
|
||||
int idx = sum[id];
|
||||
if(idx > 0)
|
||||
outp[idx-1] = startNounce + actNounce;
|
||||
}
|
||||
}
|
||||
|
||||
__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
|
||||
{
|
||||
if(val == 0)
|
||||
return 0;
|
||||
|
||||
uint32_t mask = 0x80000000;
|
||||
while( (val & mask) == 0 ) mask = mask >> 1;
|
||||
|
||||
if( (val & (~mask)) != 0 )
|
||||
return mask << 1;
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
|
||||
uint32_t *d_nonces1, cuda_compactTestFunction_t function,
|
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
{
|
||||
int orgThreads = threads;
|
||||
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
|
||||
// threadsPerBlock ausrechnen
|
||||
int blockSize = 256;
|
||||
int nSummen = threads / blockSize;
|
||||
|
||||
int thr1 = (threads+blockSize-1) / blockSize;
|
||||
int thr2 = threads / (blockSize*blockSize);
|
||||
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
|
||||
int thr3 = (nSummen + blockSize2-1) / blockSize2;
|
||||
|
||||
bool callThrid = (thr2 > 0) ? true : false;
|
||||
|
||||
// Erster Initialscan
|
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
|
||||
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
// weitere Scans
|
||||
if(callThrid)
|
||||
{
|
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
|
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
|
||||
}else
|
||||
{
|
||||
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
|
||||
}
|
||||
|
||||
// Sync + Anzahl merken
|
||||
cudaStreamSynchronize(NULL);
|
||||
|
||||
if(callThrid)
|
||||
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
else
|
||||
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
|
||||
|
||||
// Addieren
|
||||
if(callThrid)
|
||||
{
|
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
|
||||
}
|
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
|
||||
|
||||
// Scatter
|
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
|
||||
function, orgThreads, startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
// Sync
|
||||
cudaStreamSynchronize(NULL);
|
||||
}
|
||||
|
||||
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
|
||||
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
|
||||
uint32_t *d_nonces1, uint32_t *d_nonces2,
|
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
{
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
/*
|
||||
// threadsPerBlock ausrechnen
|
||||
int blockSize = 256;
|
||||
int thr1 = threads / blockSize;
|
||||
int thr2 = threads / (blockSize*blockSize);
|
||||
|
||||
// 1
|
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
|
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
|
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
|
||||
cudaStreamSynchronize(NULL);
|
||||
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
|
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
|
||||
|
||||
// 2
|
||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
|
||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
|
||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
|
||||
cudaStreamSynchronize(NULL);
|
||||
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
|
||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
|
||||
|
||||
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
|
||||
// Schritt 3: Scatter
|
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
|
||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
|
||||
cudaStreamSynchronize(NULL);
|
||||
*/
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
uint32_t *d_nonces2, size_t *nrm2,
|
||||
int order)
|
||||
{
|
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
|
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
|
||||
|
||||
quark_compactTest_cpu_dualCompaction(thr_id, threads,
|
||||
h_numValid[thr_id], d_nonces1, d_nonces2,
|
||||
startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
|
||||
*nrm1 = (size_t)h_numValid[thr_id][0];
|
||||
*nrm2 = (size_t)h_numValid[thr_id][1];
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, size_t *nrm1,
|
||||
int order)
|
||||
{
|
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
|
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
|
||||
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
|
||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
|
||||
*nrm1 = (size_t)h_numValid[thr_id][0];
|
||||
}
|
||||
|
@ -1,23 +1,16 @@
|
||||
// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
|
||||
#include "groestl_functions_quad.cu"
|
||||
#include "bitslice_transformations_quad.cu"
|
||||
|
||||
__global__ void __launch_bounds__(256, 4)
|
||||
quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
||||
__global__ __launch_bounds__(256, 4)
|
||||
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
||||
{
|
||||
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
|
||||
|
@ -1,27 +1,19 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
#define U32TO64_LE(p) \
|
||||
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
|
||||
|
||||
#define U64TO32_LE(p, v) \
|
||||
*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
|
||||
|
||||
static const uint64_t host_keccak_round_constants[24] = {
|
||||
__device__ __constant__
|
||||
static const uint64_t c_keccak_round_constants[24] = {
|
||||
0x0000000000000001ull, 0x0000000000008082ull,
|
||||
0x800000000000808aull, 0x8000000080008000ull,
|
||||
0x000000000000808bull, 0x0000000080000001ull,
|
||||
@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
|
||||
0x0000000080000001ull, 0x8000000080008008ull
|
||||
};
|
||||
|
||||
__constant__ uint64_t c_keccak_round_constants[24];
|
||||
|
||||
static __device__ __forceinline__ void
|
||||
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
|
||||
size_t i;
|
||||
@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_keccak512_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||
cudaMemcpyToSymbol( c_keccak_round_constants,
|
||||
host_keccak_round_constants,
|
||||
sizeof(host_keccak_round_constants),
|
||||
0, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
|
@ -1,16 +1,8 @@
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern "C" extern int device_map[8];
|
||||
@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
|
||||
// Take a look at: https://www.schneier.com/skein1.3.pdf
|
||||
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset >= 32) {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
#endif
|
||||
#define SHL(x, n) ((x) << (n))
|
||||
#define SHR(x, n) ((x) >> (n))
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "sph/sph_blake.h"
|
||||
@ -8,9 +7,9 @@ extern "C"
|
||||
#include "sph/sph_jh.h"
|
||||
#include "sph/sph_keccak.h"
|
||||
#include "miner.h"
|
||||
}
|
||||
|
||||
#include <stdint.h>
|
||||
#include "cuda_helper.h"
|
||||
}
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
|
||||
extern void quark_jh512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
|
||||
|
||||
// Konstanten kopieren, Speicher belegen
|
||||
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
|
||||
|
||||
quark_blake512_cpu_init(thr_id, throughput);
|
||||
quark_groestl512_cpu_init(thr_id, throughput);
|
||||
quark_skein512_cpu_init(thr_id, throughput);
|
||||
quark_bmw512_cpu_init(thr_id, throughput);
|
||||
quark_keccak512_cpu_init(thr_id, throughput);
|
||||
quark_jh512_cpu_init(thr_id, throughput);
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
quark_compactTest_cpu_init(thr_id, throughput);
|
||||
|
||||
cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
|
||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
uint32_t vhash64[8];
|
||||
|
4
util.c
4
util.c
@ -1352,6 +1352,10 @@ void print_hash_tests(void)
|
||||
myriadhash(&hash[0], &buf[0]);
|
||||
printf("\nmyriad: "); print_hash(hash);
|
||||
|
||||
memset(hash, 0, sizeof hash);
|
||||
nist5hash(&hash[0], &buf[0]);
|
||||
printf("\nnist5: "); print_hash(hash);
|
||||
|
||||
memset(hash, 0, sizeof hash);
|
||||
quarkhash(&hash[0], &buf[0]);
|
||||
printf("\nquark: "); print_hash(hash);
|
||||
|
@ -1,30 +1,13 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
typedef unsigned long long DataLength;
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
#if 0
|
||||
__device__ static uint32_t cuda_swab32(uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, 0, 0x0123);
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
typedef unsigned long long DataLength;
|
||||
|
||||
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
|
||||
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
|
||||
|
||||
typedef unsigned int uint32_t; /* must be exactly 32 bits */
|
||||
|
||||
#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
|
||||
#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
|
||||
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
|
||||
|
@ -1,33 +1,7 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||
#if 0
|
||||
static __device__ uint32_t HIWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x >> 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Lo Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t LOWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2loint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x & 0xFFFFFFFFULL);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
@ -18,28 +18,18 @@
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t buffer[8]; /* Buffer to be hashed */
|
||||
uint32_t chainv[40]; /* Chaining values */
|
||||
} hashState;
|
||||
|
||||
__device__ __forceinline__
|
||||
static uint32_t BYTES_SWAP32(uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, x, 0x0123);
|
||||
}
|
||||
|
||||
#define MULT2(a,j)\
|
||||
tmp = a[7+(8*j)];\
|
||||
a[7+(8*j)] = a[6+(8*j)];\
|
||||
@ -289,11 +279,11 @@ __device__ __forceinline__
|
||||
void Update512(hashState *state, const BitSequence *data)
|
||||
{
|
||||
#pragma unroll 8
|
||||
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
|
||||
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
|
||||
rnd512(state);
|
||||
|
||||
#pragma unroll 8
|
||||
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
|
||||
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
|
||||
rnd512(state);
|
||||
}
|
||||
|
||||
@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
|
||||
for(j=0;j<5;j++) {
|
||||
b[i] ^= state->chainv[i+8*j];
|
||||
}
|
||||
b[i] = BYTES_SWAP32((b[i]));
|
||||
b[i] = cuda_swab32((b[i]));
|
||||
}
|
||||
|
||||
#pragma unroll 8
|
||||
@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
|
||||
for(j=0;j<5;j++) {
|
||||
b[8+i] ^= state->chainv[i+8*j];
|
||||
}
|
||||
b[8+i] = BYTES_SWAP32((b[8+i]));
|
||||
b[8 + i] = cuda_swab32((b[8 + i]));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,18 +1,13 @@
|
||||
#include <stdint.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
typedef unsigned long long DataLength;
|
||||
//typedef unsigned char BitSequence;
|
||||
//typedef unsigned long long DataLength;
|
||||
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
|
||||
static __constant__ uint32_t d_ShaviteInitVector[16];
|
||||
static const uint32_t h_ShaviteInitVector[] = {
|
||||
__device__ __constant__
|
||||
static const uint32_t d_ShaviteInitVector[16] = {
|
||||
SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
|
||||
SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
|
||||
SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
|
||||
@ -1304,18 +1299,18 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
|
||||
|
||||
aes_gpu_init(sharedMemory);
|
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
|
||||
|
||||
// kopiere init-state
|
||||
uint32_t state[16];
|
||||
|
||||
#pragma unroll 16
|
||||
#pragma unroll 16
|
||||
for(int i=0;i<16;i++)
|
||||
state[i] = d_ShaviteInitVector[i];
|
||||
|
||||
@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
|
||||
uint32_t msg[32];
|
||||
|
||||
// fülle die Nachricht mit 64-byte (vorheriger Hash)
|
||||
#pragma unroll 16
|
||||
#pragma unroll 16
|
||||
for(int i=0;i<16;i++)
|
||||
msg[i] = Hash[i];
|
||||
|
||||
// Nachrichtenende
|
||||
msg[16] = 0x80;
|
||||
#pragma unroll 10
|
||||
#pragma unroll 10
|
||||
for(int i=17;i<27;i++)
|
||||
msg[i] = 0;
|
||||
|
||||
@ -1341,10 +1336,10 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
|
||||
|
||||
c512(sharedMemory, state, msg);
|
||||
|
||||
#pragma unroll 16
|
||||
#pragma unroll 16
|
||||
for(int i=0;i<16;i++)
|
||||
Hash[i] = state[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1352,25 +1347,19 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
|
||||
__host__ void x11_shavite512_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
aes_cpu_init();
|
||||
|
||||
cudaMemcpyToSymbol( d_ShaviteInitVector,
|
||||
h_ShaviteInitVector,
|
||||
sizeof(h_ShaviteInitVector),
|
||||
0, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
{
|
||||
const int threadsperblock = 256;
|
||||
const int threadsperblock = 256;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
||||
|
@ -7,29 +7,17 @@
|
||||
|
||||
#define TPB 256
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
int *d_state[8];
|
||||
uint4 *d_temp4[8];
|
||||
|
||||
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
|
||||
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
|
||||
|
||||
#define C32(x) ((uint32_t)(x ## U))
|
||||
#define T32(x) ((x) & C32(0xFFFFFFFF))
|
||||
|
||||
#if __CUDA_ARCH__ < 350
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#else
|
||||
// Kepler (Compute 3.5)
|
||||
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||
#endif
|
||||
|
||||
__device__ __constant__
|
||||
const uint32_t c_IV_512[32] = {
|
||||
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
|
||||
@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
|
||||
#undef BUTTERFLY
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
|
||||
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
|
||||
@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \
|
||||
|
||||
__device__ __forceinline__ void FFT_16(int *y) {
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
#ifndef WIN32
|
||||
# warning FFT_16() function is not compatible with SM 2.1 devices!
|
||||
#endif
|
||||
@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
|
||||
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
|
||||
{
|
||||
int i;
|
||||
#if __CUDA_ARCH__ < 300
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
#ifndef WIN32
|
||||
# warning Expansion() function is not compatible with SM 2.1 devices
|
||||
#endif
|
||||
|
19
x11/x11.cu
19
x11/x11.cu
@ -15,10 +15,11 @@ extern "C"
|
||||
#include "sph/sph_echo.h"
|
||||
|
||||
#include "miner.h"
|
||||
}
|
||||
#include "cuda_helper.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cuda_helper.h>
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
}
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
|
||||
extern void x11_echo512_cpu_init(int thr_id, int threads);
|
||||
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||
@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||
x11_shavite512_cpu_init(thr_id, throughput);
|
||||
x11_simd512_cpu_init(thr_id, throughput);
|
||||
x11_echo512_cpu_init(thr_id, throughput);
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
|
||||
init[thr_id] = true;
|
||||
}
|
||||
@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
uint32_t foundNonce;
|
||||
@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
uint32_t vhash64[8];
|
||||
|
@ -5,26 +5,11 @@
|
||||
* heavily based on phm's sgminer
|
||||
*
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
|
||||
#if __CUDA_ARCH__ < 350
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#else
|
||||
// Kepler (Compute 3.5, 5.0)
|
||||
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||
#endif
|
||||
|
||||
/*
|
||||
* X13 kernel implementation.
|
||||
*
|
||||
@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
* @author phm <phm@inbox.com>
|
||||
*/
|
||||
|
||||
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
|
||||
|
||||
#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x))))
|
||||
#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
|
||||
#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
|
||||
@ -584,97 +567,97 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int i;
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
int i;
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
|
||||
|
||||
#pragma unroll 16
|
||||
for( i = 0; i < 16; i++ )
|
||||
Hash[i] = SWAB32(Hash[i]);
|
||||
#pragma unroll 16
|
||||
for( i = 0; i < 16; i++ )
|
||||
Hash[i] = cuda_swab32(Hash[i]);
|
||||
|
||||
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
|
||||
uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
|
||||
uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
|
||||
uint32_t S30, S31, S32, S33, S34, S35;
|
||||
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
|
||||
uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
|
||||
uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
|
||||
uint32_t S30, S31, S32, S33, S34, S35;
|
||||
|
||||
uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
|
||||
uint64_t bc = (uint64_t) 64 << 3;
|
||||
uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
|
||||
uint32_t bchi = (uint32_t)(bc >> 32);
|
||||
uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
|
||||
uint64_t bc = (uint64_t) 64 << 3;
|
||||
uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
|
||||
uint32_t bchi = (uint32_t)(bc >> 32);
|
||||
|
||||
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
|
||||
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
|
||||
S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1);
|
||||
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
|
||||
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
|
||||
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
|
||||
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
|
||||
S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1);
|
||||
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
|
||||
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
|
||||
|
||||
FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
|
||||
FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
|
||||
FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
|
||||
FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
|
||||
FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
|
||||
FUGUE512_3((Hash[0xF]), bchi, bclo);
|
||||
FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
|
||||
FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
|
||||
FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
|
||||
FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
|
||||
FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
|
||||
FUGUE512_3((Hash[0xF]), bchi, bclo);
|
||||
|
||||
#pragma unroll 32
|
||||
for (i = 0; i < 32; i ++) {
|
||||
ROR3;
|
||||
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
|
||||
SMIX(S00, S01, S02, S03);
|
||||
}
|
||||
#pragma unroll 13
|
||||
for (i = 0; i < 13; i ++) {
|
||||
S04 ^= S00;
|
||||
S09 ^= S00;
|
||||
S18 ^= S00;
|
||||
S27 ^= S00;
|
||||
ROR9;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
S04 ^= S00;
|
||||
S10 ^= S00;
|
||||
S18 ^= S00;
|
||||
S27 ^= S00;
|
||||
ROR9;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
S04 ^= S00;
|
||||
S10 ^= S00;
|
||||
S19 ^= S00;
|
||||
S27 ^= S00;
|
||||
ROR9;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
S04 ^= S00;
|
||||
S10 ^= S00;
|
||||
S19 ^= S00;
|
||||
S28 ^= S00;
|
||||
ROR8;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
}
|
||||
S04 ^= S00;
|
||||
S09 ^= S00;
|
||||
S18 ^= S00;
|
||||
S27 ^= S00;
|
||||
#pragma unroll 32
|
||||
for (i = 0; i < 32; i ++) {
|
||||
ROR3;
|
||||
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
|
||||
SMIX(S00, S01, S02, S03);
|
||||
}
|
||||
#pragma unroll 13
|
||||
for (i = 0; i < 13; i ++) {
|
||||
S04 ^= S00;
|
||||
S09 ^= S00;
|
||||
S18 ^= S00;
|
||||
S27 ^= S00;
|
||||
ROR9;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
S04 ^= S00;
|
||||
S10 ^= S00;
|
||||
S18 ^= S00;
|
||||
S27 ^= S00;
|
||||
ROR9;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
S04 ^= S00;
|
||||
S10 ^= S00;
|
||||
S19 ^= S00;
|
||||
S27 ^= S00;
|
||||
ROR9;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
S04 ^= S00;
|
||||
S10 ^= S00;
|
||||
S19 ^= S00;
|
||||
S28 ^= S00;
|
||||
ROR8;
|
||||
SMIX(S00, S01, S02, S03);
|
||||
}
|
||||
S04 ^= S00;
|
||||
S09 ^= S00;
|
||||
S18 ^= S00;
|
||||
S27 ^= S00;
|
||||
|
||||
Hash[0] = SWAB32(S01);
|
||||
Hash[1] = SWAB32(S02);
|
||||
Hash[2] = SWAB32(S03);
|
||||
Hash[3] = SWAB32(S04);
|
||||
Hash[4] = SWAB32(S09);
|
||||
Hash[5] = SWAB32(S10);
|
||||
Hash[6] = SWAB32(S11);
|
||||
Hash[7] = SWAB32(S12);
|
||||
Hash[8] = SWAB32(S18);
|
||||
Hash[9] = SWAB32(S19);
|
||||
Hash[10] = SWAB32(S20);
|
||||
Hash[11] = SWAB32(S21);
|
||||
Hash[12] = SWAB32(S27);
|
||||
Hash[13] = SWAB32(S28);
|
||||
Hash[14] = SWAB32(S29);
|
||||
Hash[15] = SWAB32(S30);
|
||||
}
|
||||
Hash[0] = cuda_swab32(S01);
|
||||
Hash[1] = cuda_swab32(S02);
|
||||
Hash[2] = cuda_swab32(S03);
|
||||
Hash[3] = cuda_swab32(S04);
|
||||
Hash[4] = cuda_swab32(S09);
|
||||
Hash[5] = cuda_swab32(S10);
|
||||
Hash[6] = cuda_swab32(S11);
|
||||
Hash[7] = cuda_swab32(S12);
|
||||
Hash[8] = cuda_swab32(S18);
|
||||
Hash[9] = cuda_swab32(S19);
|
||||
Hash[10] = cuda_swab32(S20);
|
||||
Hash[11] = cuda_swab32(S21);
|
||||
Hash[12] = cuda_swab32(S27);
|
||||
Hash[13] = cuda_swab32(S28);
|
||||
Hash[14] = cuda_swab32(S29);
|
||||
Hash[15] = cuda_swab32(S30);
|
||||
}
|
||||
}
|
||||
|
||||
#define texDef(texname, texmem, texsource, texsize) \
|
||||
@ -697,17 +680,17 @@ __host__ void x13_fugue512_cpu_init(int thr_id, int threads)
|
||||
|
||||
__host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
{
|
||||
const int threadsperblock = 256;
|
||||
const int threadsperblock = 256;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 4 * 256 * sizeof(uint32_t);
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -37,26 +37,11 @@
|
||||
* @author phm <phm@inbox.com>
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
|
||||
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
|
||||
|
||||
#if __CUDA_ARCH__ < 350
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#else
|
||||
// Kepler (Compute 3.5)
|
||||
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||
#endif
|
||||
|
||||
__device__ __constant__
|
||||
static const uint32_t d_alpha_n[] = {
|
||||
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
|
||||
@ -663,7 +648,7 @@ static const uint32_t d_T512[64][16] = {
|
||||
mD = 0; \
|
||||
mE = 0; \
|
||||
mF = 0; \
|
||||
for (u = 0; u < 8; u ++) { \
|
||||
for (u = 0; u < 8; u ++) { \
|
||||
unsigned db = buf(u); \
|
||||
for (v = 0; v < 8; v ++, db >>= 1) { \
|
||||
uint32_t dm = SPH_T32(-(uint32_t)(db & 1)); \
|
||||
@ -692,45 +677,47 @@ static const uint32_t d_T512[64][16] = {
|
||||
// Die Hash-Funktion
|
||||
__global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||
{
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
|
||||
unsigned char *h1 = (unsigned char *)Hash;
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
|
||||
unsigned char *h1 = (unsigned char *)Hash;
|
||||
|
||||
uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
|
||||
uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
|
||||
uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
|
||||
uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
|
||||
uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
|
||||
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
|
||||
uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
|
||||
uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
|
||||
uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
|
||||
uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
|
||||
uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
|
||||
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
|
||||
|
||||
#define buf(u) (h1[i+u])
|
||||
#pragma unroll 8
|
||||
for(int i = 0; i < 64; i += 8) {
|
||||
INPUT_BIG;
|
||||
P_BIG;
|
||||
T_BIG;
|
||||
}
|
||||
for(int i = 0; i < 64; i += 8) {
|
||||
INPUT_BIG;
|
||||
P_BIG;
|
||||
T_BIG;
|
||||
}
|
||||
|
||||
#undef buf
|
||||
#define buf(u) (u == 0 ? 0x80 : 0)
|
||||
INPUT_BIG;
|
||||
P_BIG;
|
||||
T_BIG;
|
||||
INPUT_BIG;
|
||||
P_BIG;
|
||||
T_BIG;
|
||||
|
||||
#undef buf
|
||||
#define buf(u) (u == 6 ? 2 : 0)
|
||||
INPUT_BIG;
|
||||
PF_BIG;
|
||||
T_BIG;
|
||||
INPUT_BIG;
|
||||
PF_BIG;
|
||||
T_BIG;
|
||||
|
||||
#pragma unroll 16
|
||||
for (int i = 0; i < 16; i++)
|
||||
Hash[i] = SWAB32(h[i]);
|
||||
}
|
||||
for (int i = 0; i < 16; i++)
|
||||
Hash[i] = cuda_swab32(h[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -740,18 +727,17 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, int threads)
|
||||
|
||||
__host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
{
|
||||
const int threadsperblock = 256;
|
||||
const int threadsperblock = 256;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
||||
|
17
x13/x13.cu
17
x13/x13.cu
@ -20,10 +20,9 @@ extern "C"
|
||||
#include "sph/sph_fugue.h"
|
||||
|
||||
#include "miner.h"
|
||||
}
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cuda_helper.h>
|
||||
#include "cuda_helper.h"
|
||||
}
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
|
||||
extern void x13_fugue512_cpu_init(int thr_id, int threads);
|
||||
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||
@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
|
||||
x11_echo512_cpu_init(thr_id, throughput);
|
||||
x13_hamsi512_cpu_init(thr_id, throughput);
|
||||
x13_fugue512_cpu_init(thr_id, throughput);
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
|
||||
init[thr_id] = true;
|
||||
}
|
||||
@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
uint32_t foundNonce;
|
||||
@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
|
||||
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
uint32_t vhash64[8];
|
||||
|
@ -1,26 +1,10 @@
|
||||
/*
|
||||
* Shabal-512 for X14/X15 (STUB)
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
|
||||
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
|
||||
|
||||
#if __CUDA_ARCH__ < 350
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#else
|
||||
// Kepler (Compute 3.5)
|
||||
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||
#endif
|
||||
|
||||
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
|
||||
/*
|
||||
* Shabal implementation.
|
||||
|
@ -4,8 +4,8 @@
|
||||
* tpruvot@github
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <cuda_helper.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
#define NULLTEST 0
|
||||
|
||||
@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
#define SPH_64 (1)
|
||||
#define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)
|
||||
|
||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
|
||||
// defined in cuda_helper.h
|
||||
#define SPH_ROTL64(x,n) ROTL64(x,n)
|
||||
|
||||
|
17
x15/x14.cu
17
x15/x14.cu
@ -22,10 +22,9 @@ extern "C" {
|
||||
#include "sph/sph_shabal.h"
|
||||
|
||||
#include "miner.h"
|
||||
}
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cuda_helper.h>
|
||||
#include "cuda_helper.h"
|
||||
}
|
||||
|
||||
// from cpu-miner.c
|
||||
extern int device_map[8];
|
||||
@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
|
||||
extern void x14_shabal512_cpu_init(int thr_id, int threads);
|
||||
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||
@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
|
||||
x13_fugue512_cpu_init(thr_id, throughput);
|
||||
x14_shabal512_cpu_init(thr_id, throughput);
|
||||
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
|
||||
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
/* check now with the CPU to confirm */
|
||||
|
17
x15/x15.cu
17
x15/x15.cu
@ -23,10 +23,9 @@ extern "C" {
|
||||
#include "sph/sph_whirlpool.h"
|
||||
|
||||
#include "miner.h"
|
||||
}
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cuda_helper.h>
|
||||
#include "cuda_helper.h"
|
||||
}
|
||||
|
||||
// to test gpu hash on a null buffer
|
||||
#define NULLTEST 0
|
||||
@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
|
||||
extern void x15_whirlpool_cpu_init(int thr_id, int threads);
|
||||
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||
|
||||
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||
@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
|
||||
x14_shabal512_cpu_init(thr_id, throughput);
|
||||
x15_whirlpool_cpu_init(thr_id, throughput);
|
||||
|
||||
quark_check_cpu_init(thr_id, throughput);
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||
|
||||
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||
quark_check_cpu_setTarget(ptarget);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
|
||||
print_hash((unsigned char*)buf); printf("\n");
|
||||
#endif
|
||||
/* Scan with GPU */
|
||||
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
if (foundNonce != 0xffffffff)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user