Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof
This commit is contained in:
Tanguy Pruvot 2014-08-18 03:45:48 +02:00
parent a3ea36a5f9
commit d9ea5f72ce
44 changed files with 1121 additions and 1611 deletions

View File

@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"
#include <stdio.h>
#include <memory.h>
#include <stdint.h>
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];
@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
}
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/

View File

@ -1,16 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
__constant__ uint64_t c_State[25];
__constant__ uint32_t c_PaddedMessage[18];
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

View File

@ -1,4 +1,3 @@
extern "C"
{
#include "sph/sph_keccak.h"
@ -7,10 +6,9 @@ extern "C"
#include "sph/sph_jh.h"
#include "sph/sph_skein.h"
#include "miner.h"
#include "cuda_helper.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
quark_groestl512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
}
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
unsigned int rounds;
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
// diese jackpothash Funktion gibt die Zahl der Runden zurück
unsigned int rounds = jackpothash(vhash64, endiandata);
rounds = jackpothash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {

View File

@ -1,5 +1,4 @@
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.

View File

@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<Defines></Defines>
<Defines>
</Defines>
</CudaCompile>
<CudaLink>
<GPUDebugInfo>false</GPUDebugInfo>
@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<ClInclude Include="uint256.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="bitslice_transformations_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="heavy\cuda_blake512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>

View File

@ -391,5 +391,11 @@
<CudaCompile Include="x15\cuda_x15_whirlpool.cu">
<Filter>Source Files\CUDA\x15</Filter>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="bitslice_transformations_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup>
</Project>

View File

@ -1,12 +1,11 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "sph/sph_fugue.h"
#include "cuda_helper.h"
#include <host_defines.h>
#define USE_SHARED 1
// aus cpu-miner.c
@ -15,14 +14,6 @@ extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// schon in sph_fugue.h definiert
//#define SPH_C32(x) ((uint32_t)(x ## U))
uint32_t *d_fugue256_hashoutput[8];
uint32_t *d_resultNonce[8];

View File

@ -1,23 +1,17 @@
// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include <host_defines.h>
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
#define SWAB32(x) cuda_swab32(x)
__global__ void __launch_bounds__(256, 4)
groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
__global__ __launch_bounds__(256, 4)
void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;

View File

@ -1,33 +1,78 @@
#ifndef CUDA_HELPER_H
#define CUDA_HELPER_H
#include <cuda.h>
#include <cuda_runtime.h>
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
#if defined(_MSC_VER)
/* reduce warnings */
#include <device_functions.h>
#include <device_launch_parameters.h>
#endif
#include <stdint.h>
extern __device__ __device_builtin__ void __syncthreads(void);
#ifndef __CUDA_ARCH__
// define blockDim and threadIdx for host
extern const dim3 blockDim;
extern const uint3 threadIdx;
#endif
#ifndef SPH_C32
#define SPH_C32(x) ((uint32_t)(x ## U))
#endif
#ifndef SPH_C64
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#endif
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5, 5.0)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
return __double_as_longlong(__hiloint2double(HI, LO));
#else
return (unsigned long long)LO | (((unsigned long long)HI) << 32);
#endif
}
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
}
// das Lo Word in einem 64 Bit Typen ersetzen
__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
// Endian Drehung für 32 Bit Typen
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
/* device */
return __byte_perm(x, x, 0x0123);
}
#else
/* host */
#define cuda_swab32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
#endif
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
@ -35,34 +80,51 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
#endif
}
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
// das Hi Word aus einem 64 Bit Typen extrahieren
__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// Endian Drehung für 32 Bit Typen
static __device__ uint32_t cuda_swab32(uint32_t x)
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
{
return __byte_perm(x, x, 0x0123);
}
// Input: 77665544 33221100
// Output: 00112233 44556677
uint64_t temp[2];
temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
// Endian Drehung für 64 Bit Typen
static __device__ uint64_t cuda_swab64(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
return temp[0] | (temp[1]<<32);
}
#else
/* host */
#define cuda_swab64(x) \
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
#endif
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
uint2 result;
if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
uint2 result;
if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
@ -70,16 +132,16 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))

View File

@ -1,23 +1,16 @@
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];

View File

@ -1,4 +1,3 @@
extern "C"
{
#include "sph/sph_blake.h"
@ -7,10 +6,9 @@ extern "C"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
#include "cuda_helper.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// Original nist5hash Funktion aus einem miner Quelltext
inline void nist5hash(void *state, const void *input)
extern "C" void nist5hash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
sph_groestl512_context ctx_groestl;
@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
// erstes Blake512 Hash mit CUDA
// Hash with CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

View File

@ -1,3 +1,4 @@
#include "cuda_helper.h"
__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
{

View File

@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];
@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
#include "cuda_helper.h"
// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
#define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
#define SWAP64(x) \
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
/* in cuda_helper */
#define SWAP32(x) cuda_swab32(x)
#define SWAP64(x) cuda_swab64(x)
__constant__ uint64_t c_SecondRound[15];

View File

@ -1,9 +1,4 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
#include "cuda_helper.h"
// globaler Speicher für unsere Ergebnisse
uint32_t *d_hashoutput[8];

View File

@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];
@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
cudaMemcpyToSymbol( groestl_gpu_msg,
msgBlock,
128);
BLOCKSIZE = len;
}

View File

@ -1,10 +1,9 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include <device_functions.h>
#define USE_SHARED 1
// aus cpu-miner.c
@ -13,11 +12,6 @@ extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];

View File

@ -1,14 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];
@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
uint64_t t[5], u[5], v, w;
/* absorb input */
#pragma unroll 9
for (i = 0; i < 72 / 8; i++, in += 2)
#pragma unroll 9
for (i = 0; i < 9 /* 72/8 */; i++, in += 2)
s[i] ^= U32TO64_LE(in);
for (i = 0; i < 24; i++) {

View File

@ -1,12 +1,7 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
#include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];

View File

@ -1,7 +1,3 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include <string.h>
@ -34,6 +30,8 @@
#include "heavy/cuda_blake512.h"
#include "heavy/cuda_combine.h"
#include "cuda_helper.h"
extern uint32_t *d_hash2output[8];
extern uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8];

View File

@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void groestlhash(void *state, const void *input);
void myriadhash(void *state, const void *input);
void nist5hash(void *state, const void *input);
void quarkhash(void *state, const void *input);
void x11hash(void *output, const void *input);
void x13hash(void *output, const void *input);

View File

@ -1,291 +1,292 @@
extern "C"
{
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_animeNonces[8];
static uint32_t *d_branch1Nonces[8];
static uint32_t *d_branch2Nonces[8];
static uint32_t *d_branch3Nonces[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_setBlock_80(void *pdata);
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order);
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
int order);
// Original Quarkhash Funktion aus einem miner Quelltext
inline void animehash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
unsigned char hash[64];
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) input, 80);
sph_bmw512_close(&ctx_bmw, (void*) hash);
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
if (hash[0] & 0x8)
{
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
}
else
{
sph_skein512_init(&ctx_skein);
// ZSKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
}
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
if (hash[0] & 0x8)
{
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
}
else
{
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
}
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_skein512_init(&ctx_skein);
// SKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
if (hash[0] & 0x8)
{
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
}
else
{
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
}
memcpy(state, hash, 32);
}
struct HashPredicate
{
HashPredicate(uint32_t *hashes, uint32_t startNonce) :
m_hashes(hashes),
m_startNonce(startNonce)
{ }
__device__
bool operator()(const uint32_t x)
{
uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
return hash[0] & 0x8;
}
uint32_t *m_hashes;
uint32_t m_startNonce;
};
extern bool opt_benchmark;
extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00000f;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*2048; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_bmw512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
do {
int order = 0;
size_t nrm1=0, nrm2=0, nrm3=0;
// erstes BMW512 Hash mit CUDA
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Blake512
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
d_branch3Nonces[thr_id], &nrm3,
order++);
// nur den Skein Branch weiterverfolgen
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Blake512
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für Bmw512
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
animehash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/2;
return 1;
} else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = (pdata[19] - first_nonce + 1)/2;
return 0;
}
extern "C"
{
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
#include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_animeNonces[8];
static uint32_t *d_branch1Nonces[8];
static uint32_t *d_branch2Nonces[8];
static uint32_t *d_branch3Nonces[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_setBlock_80(void *pdata);
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order);
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
int order);
// Original Quarkhash Funktion aus einem miner Quelltext
inline void animehash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
unsigned char hash[64];
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) input, 80);
sph_bmw512_close(&ctx_bmw, (void*) hash);
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
if (hash[0] & 0x8)
{
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
}
else
{
sph_skein512_init(&ctx_skein);
// ZSKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
}
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
if (hash[0] & 0x8)
{
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
}
else
{
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
}
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_skein512_init(&ctx_skein);
// SKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
if (hash[0] & 0x8)
{
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
}
else
{
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
}
memcpy(state, hash, 32);
}
struct HashPredicate
{
HashPredicate(uint32_t *hashes, uint32_t startNonce) :
m_hashes(hashes),
m_startNonce(startNonce)
{ }
__device__
bool operator()(const uint32_t x)
{
uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
return hash[0] & 0x8;
}
uint32_t *m_hashes;
uint32_t m_startNonce;
};
extern bool opt_benchmark;
extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00000f;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*2048; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_bmw512_cpu_setBlock_80((void*)endiandata);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
size_t nrm1=0, nrm2=0, nrm3=0;
// erstes BMW512 Hash mit CUDA
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Blake512
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
d_branch3Nonces[thr_id], &nrm3,
order++);
// nur den Skein Branch weiterverfolgen
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Blake512
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für Bmw512
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
animehash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/2;
return 1;
} else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = (pdata[19] - first_nonce + 1)/2;
return 0;
}

View File

@ -1,140 +1,9 @@
#if 1
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
// Endian Drehung für 32 Bit Typen
/*
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
}
*/
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
}
#if 0
// Endian Drehung für 64 Bit Typen
static __device__ unsigned long long cuda_swab64(unsigned long long x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
}
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
#else
return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
#endif
}
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
}
#endif
// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen...
#if 1
typedef unsigned long long uint64_t;
#else
typedef class uint64
{
public:
__device__ uint64()
{
}
__device__ uint64(unsigned long long init)
{
val = make_uint2( LOWORD(init), HIWORD(init) );
}
__device__ uint64(uint32_t lo, uint32_t hi)
{
val = make_uint2( lo, hi );
}
__device__ const uint64 operator^(uint64 const& rhs) const
{
return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
}
__device__ const uint64 operator|(uint64 const& rhs) const
{
return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
}
__device__ const uint64 operator+(unsigned long long const& rhs) const
{
return *this+uint64(rhs);
}
__device__ const uint64 operator+(uint64 const& rhs) const
{
uint64 res;
asm ("add.cc.u32 %0, %2, %4;\n\t"
"addc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator-(uint64 const& rhs) const
{
uint64 res;
asm ("sub.cc.u32 %0, %2, %4;\n\t"
"subc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator<<(int n) const
{
return uint64(unsigned long long(*this)<<n);
}
__device__ const uint64 operator>>(int n) const
{
return uint64(unsigned long long(*this)>>n);
}
__device__ operator unsigned long long() const
{
return MAKE_ULONGLONG(val.x, val.y);
}
uint2 val;
} uint64_t;
#endif
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))

View File

@ -1,11 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_helper.h"
// Hash Target gegen das wir testen sollen
__constant__ uint32_t pTarget[8];
@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
}
// Setup-Funktionen
__host__ void quark_check_cpu_init(int thr_id, int threads)
__host__ void cuda_check_cpu_init(int thr_id, int threads)
{
cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
}
// Target Difficulty setzen
__host__ void quark_check_cpu_setTarget(const void *ptarget)
__host__ void cuda_check_cpu_setTarget(const void *ptarget)
{
// die Message zur Berechnung auf der GPU
cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}
__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));

View File

@ -1,4 +1,4 @@
#include <stdint.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

View File

@ -1,16 +1,11 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#define USE_SHUFFLE 0
#include "cuda_helper.h"
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define ROTR(x,n) ROTR64(x,n)
#define USE_SHUFFLE 0
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#if 0
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
#endif
__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
{
// Input: 77665544 33221100
// Output: 00112233 44556677
uint64_t temp[2];
temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
return temp[0] | (temp[1]<<32);
}
__constant__ uint64_t c_u512[16];
const uint64_t host_u512[16] =
__device__ __constant__
const uint64_t c_u512[16] =
{
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
uint2 result;
if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#define G(a,b,c,d,e) \
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
v[d] = ROTR( v[d] ^ v[a],32); \
@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
v[b] = ROTR( v[b] ^ v[c],11);
__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
__device__ static
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
{
uint64_t v[16], m[16], i;
#pragma unroll 16
for( i = 0; i < 16; ++i )
{
m[i] = SWAP64(block[i]);
for( i = 0; i < 16; ++i ) {
m[i] = cuda_swab64(block[i]);
}
#pragma unroll 8
@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
}
// Endian Drehung für 32 Bit Typen
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
/*
// Endian Drehung für 64 Bit Typen
static __device__ uint64_t cuda_swab64(uint64_t x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
}
*/
static __constant__ uint64_t d_constMem[8];
static const uint64_t h_constMem[8] = {
__device__ __constant__
static const uint64_t d_constMem[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
0x5be0cd19137e2179ULL };
// Hash-Padding
static __constant__ uint64_t d_constHashPadding[8];
static const uint64_t h_constHashPadding[8] = {
__device__ __constant__
static const uint64_t d_constHashPadding[8] = {
0x0000000000000080ull,
0,
0,
@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
0,
0x0002000000000000ull };
__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
__global__ __launch_bounds__(256, 4)
void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
if (thread < threads)
#endif
{
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
//uint64_t *inpHash = &g_hash[8 * hashPosition];
uint64_t *inpHash = &g_hash[hashPosition<<3];
// State vorbereiten
uint64_t h[8];
/*
h[0] = 0x6a09e667f3bcc908ULL;
h[1] = 0xbb67ae8584caa73bULL;
h[2] = 0x3c6ef372fe94f82bULL;
h[3] = 0xa54ff53a5f1d36f1ULL;
h[4] = 0x510e527fade682d1ULL;
h[5] = 0x9b05688c2b3e6c1fULL;
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
h[i] = d_constMem[i];
uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
// 128 Byte für die Message
uint64_t buf[16];
// Message für die erste Runde in Register holen
#pragma unroll 8
for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
// State vorbereiten
uint64_t h[8];
#pragma unroll 8
for (i=0;i<8;i++)
h[i] = d_constMem[i];
/*
buf[ 8] = 0x0000000000000080ull;
buf[ 9] = 0;
buf[10] = 0;
buf[11] = 0;
buf[12] = 0;
buf[13] = 0x0100000000000000ull;
buf[14] = 0;
buf[15] = 0x0002000000000000ull;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
// Message für die erste Runde in Register holen
#pragma unroll 8
for (i=0; i < 8; ++i)
buf[i] = inpHash[i];
#pragma unroll 8
for (i=0; i < 8; i++)
buf[i+8] = d_constHashPadding[i];
// die einzige Hashing-Runde
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
// Hash rauslassen
#if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 8
for (int i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
#pragma unroll 8
for (i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = &g_hash[8 * hashPosition];
#pragma unroll 8
for (int i=0; i < 8; ++i)
#pragma unroll 8
for (i=0; i < 8; ++i)
{
//outHash[i] = cuda_swab64( h[i] );
outHash[i] = SWAP64(h[i]);
outHash[i] = cuda_swab64(h[i]);
}
#endif
}
@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// State vorbereiten
uint64_t h[8];
// 128 Byte für die Message
uint64_t buf[16];
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
// State vorbereiten
uint64_t h[8];
/*
h[0] = 0x6a09e667f3bcc908ULL;
h[1] = 0xbb67ae8584caa73bULL;
h[2] = 0x3c6ef372fe94f82bULL;
h[3] = 0xa54ff53a5f1d36f1ULL;
h[4] = 0x510e527fade682d1ULL;
h[5] = 0x9b05688c2b3e6c1fULL;
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;
*/
#pragma unroll 8
for(int i=0;i<8;i++)
#pragma unroll 8
for(i=0;i<8;i++)
h[i] = d_constMem[i];
// 128 Byte für die Message
uint64_t buf[16];
// Message für die erste Runde in Register holen
#pragma unroll 16
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
#pragma unroll 16
for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
// die Nounce durch die thread-spezifische ersetzen
buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
#if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 8
for (int i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
#pragma unroll 8
for (i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
#pragma unroll 8
for (int i=0; i < 8; ++i)
{
//outHash[i] = cuda_swab64( h[i] );
outHash[i] = SWAP64(h[i]);
#pragma unroll 8
for (i=0; i < 8; ++i) {
outHash[i] = cuda_swab64( h[i] );
}
#endif
}
@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
host_sigma,
sizeof(host_sigma),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_u512,
host_u512,
sizeof(host_u512),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_constMem,
h_constMem,
sizeof(h_constMem),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_constHashPadding,
h_constHashPadding,
sizeof(h_constHashPadding),
0, cudaMemcpyHostToDevice);
}
// Blake512 für 80 Byte grosse Eingangsdaten

View File

@ -1,371 +1,368 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"
#include <stdio.h>
#include <memory.h>
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_numValid[8];
static uint32_t *h_numValid[8];
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// True/False tester
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
{
return ((inpHash[0] & 0x08) == 0x08);
}
__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
{
return ((inpHash[0] & 0x08) == 0);
}
__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
// Setup-Funktionen
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
// wir brauchen auch Speicherplatz auf dem Device
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
uint32_t s1;
s1 = (threads / 256) * 2;
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
}
#if __CUDA_ARCH__ < 300
/**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// Die Summenfunktion (vom NVIDIA SDK)
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
extern __shared__ uint32_t sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
//int lane_id = id % warpSize;
int lane_id = id % width;
// determine a warp_id within a block
//int warp_id = threadIdx.x / warpSize;
int warp_id = threadIdx.x / width;
sums[lane_id] = 0;
// Below is the basic structure of using a shfl instruction
// for a scan.
// Record "value" as a variable - we accumulate it along the way
uint32_t value;
if(testFunc != NULL)
{
if (id < threads)
{
uint32_t *inpHash;
if(d_validNonceTable == NULL)
{
// keine Nonce-Liste
inpHash = &inpHashes[id<<4];
}else
{
// Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce;
inpHash = &inpHashes[nonce<<4];
}
value = (*testFunc)(inpHash);
}else
{
value = 0;
}
}else
{
value = data[id];
}
__syncthreads();
// Now accumulate in log steps up the chain
// compute sums, with another thread's value who is
// distance delta away (i). Note
// those threads where the thread 'i' away would have
// been out of bounds of the warp are unaffected. This
// creates the scan sum.
#pragma unroll
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)value, i, width);
if (lane_id >= i) value += n;
}
// value now holds the scan value for the individual thread
// next sum the largest values for each warp
// write the sum of the warp to smem
//if (threadIdx.x % warpSize == warpSize-1)
if (threadIdx.x % width == width-1)
{
sums[warp_id] = value;
}
__syncthreads();
//
// scan sum the warp sums
// the same shfl scan operation, but performed on warp sums
//
if (warp_id == 0)
{
uint32_t warp_sum = sums[lane_id];
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)warp_sum, i, width);
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
// perform a uniform add across warps in the block
// read neighbouring warp's sum and add it to threads value
uint32_t blockSum = 0;
if (warp_id > 0)
{
blockSum = sums[warp_id-1];
}
value += blockSum;
// Now write out our result
data[id] = value;
// last thread has sum, write write out the block's sum
if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
{
partial_sums[blockIdx.x] = value;
}
}
// Uniform add: add partial sums array
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{
__shared__ uint32_t buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
if (id > len) return;
if (threadIdx.x == 0)
{
buf = partial_sums[blockIdx.x];
}
__syncthreads();
data[id] += buf;
}
// Der Scatter
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
uint32_t actNounce = id;
uint32_t value;
if (id < threads)
{
// uint32_t nounce = startNounce + id;
uint32_t *inpHash;
if(d_validNonceTable == NULL)
{
// keine Nonce-Liste
inpHash = &inpHashes[id<<4];
}else
{
// Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce;
actNounce = nonce;
inpHash = &inpHashes[nonce<<4];
}
value = (*testFunc)(inpHash);
}else
{
value = 0;
}
if( value )
{
int idx = sum[id];
if(idx > 0)
outp[idx-1] = startNounce + actNounce;
}
}
__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
{
if(val == 0)
return 0;
uint32_t mask = 0x80000000;
while( (val & mask) == 0 ) mask = mask >> 1;
if( (val & (~mask)) != 0 )
return mask << 1;
return mask;
}
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
uint32_t *d_nonces1, cuda_compactTestFunction_t function,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
int orgThreads = threads;
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
// threadsPerBlock ausrechnen
int blockSize = 256;
int nSummen = threads / blockSize;
int thr1 = (threads+blockSize-1) / blockSize;
int thr2 = threads / (blockSize*blockSize);
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
int thr3 = (nSummen + blockSize2-1) / blockSize2;
bool callThrid = (thr2 > 0) ? true : false;
// Erster Initialscan
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
// weitere Scans
if(callThrid)
{
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
}else
{
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
}
// Sync + Anzahl merken
cudaStreamSynchronize(NULL);
if(callThrid)
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
else
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
// Addieren
if(callThrid)
{
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
}
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
// Scatter
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
function, orgThreads, startNounce, inpHashes, d_validNonceTable);
// Sync
cudaStreamSynchronize(NULL);
}
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
uint32_t *d_nonces1, uint32_t *d_nonces2,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
/*
// threadsPerBlock ausrechnen
int blockSize = 256;
int thr1 = threads / blockSize;
int thr2 = threads / (blockSize*blockSize);
// 1
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
// 2
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
// Schritt 3: Scatter
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
cudaStreamSynchronize(NULL);
*/
}
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
quark_compactTest_cpu_dualCompaction(thr_id, threads,
h_numValid[thr_id], d_nonces1, d_nonces2,
startNounce, inpHashes, d_validNonceTable);
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
*nrm1 = (size_t)h_numValid[thr_id][0];
*nrm2 = (size_t)h_numValid[thr_id][1];
}
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
*nrm1 = (size_t)h_numValid[thr_id][0];
}
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_numValid[8];
static uint32_t *h_numValid[8];
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// True/False tester
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
{
return ((inpHash[0] & 0x08) == 0x08);
}
__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
{
return ((inpHash[0] & 0x08) == 0);
}
__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
// Setup-Funktionen
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
// wir brauchen auch Speicherplatz auf dem Device
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
uint32_t s1;
s1 = (threads / 256) * 2;
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
}
#if __CUDA_ARCH__ < 300
/**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// Die Summenfunktion (vom NVIDIA SDK)
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
extern __shared__ uint32_t sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
//int lane_id = id % warpSize;
int lane_id = id % width;
// determine a warp_id within a block
//int warp_id = threadIdx.x / warpSize;
int warp_id = threadIdx.x / width;
sums[lane_id] = 0;
// Below is the basic structure of using a shfl instruction
// for a scan.
// Record "value" as a variable - we accumulate it along the way
uint32_t value;
if(testFunc != NULL)
{
if (id < threads)
{
uint32_t *inpHash;
if(d_validNonceTable == NULL)
{
// keine Nonce-Liste
inpHash = &inpHashes[id<<4];
}else
{
// Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce;
inpHash = &inpHashes[nonce<<4];
}
value = (*testFunc)(inpHash);
}else
{
value = 0;
}
}else
{
value = data[id];
}
__syncthreads();
// Now accumulate in log steps up the chain
// compute sums, with another thread's value who is
// distance delta away (i). Note
// those threads where the thread 'i' away would have
// been out of bounds of the warp are unaffected. This
// creates the scan sum.
#pragma unroll
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)value, i, width);
if (lane_id >= i) value += n;
}
// value now holds the scan value for the individual thread
// next sum the largest values for each warp
// write the sum of the warp to smem
//if (threadIdx.x % warpSize == warpSize-1)
if (threadIdx.x % width == width-1)
{
sums[warp_id] = value;
}
__syncthreads();
//
// scan sum the warp sums
// the same shfl scan operation, but performed on warp sums
//
if (warp_id == 0)
{
uint32_t warp_sum = sums[lane_id];
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)warp_sum, i, width);
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
// perform a uniform add across warps in the block
// read neighbouring warp's sum and add it to threads value
uint32_t blockSum = 0;
if (warp_id > 0)
{
blockSum = sums[warp_id-1];
}
value += blockSum;
// Now write out our result
data[id] = value;
// last thread has sum, write write out the block's sum
if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
{
partial_sums[blockIdx.x] = value;
}
}
// Uniform add: add partial sums array
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{
__shared__ uint32_t buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
if (id > len) return;
if (threadIdx.x == 0)
{
buf = partial_sums[blockIdx.x];
}
__syncthreads();
data[id] += buf;
}
// Der Scatter
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
uint32_t actNounce = id;
uint32_t value;
if (id < threads)
{
// uint32_t nounce = startNounce + id;
uint32_t *inpHash;
if(d_validNonceTable == NULL)
{
// keine Nonce-Liste
inpHash = &inpHashes[id<<4];
}else
{
// Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce;
actNounce = nonce;
inpHash = &inpHashes[nonce<<4];
}
value = (*testFunc)(inpHash);
}else
{
value = 0;
}
if( value )
{
int idx = sum[id];
if(idx > 0)
outp[idx-1] = startNounce + actNounce;
}
}
__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
{
if(val == 0)
return 0;
uint32_t mask = 0x80000000;
while( (val & mask) == 0 ) mask = mask >> 1;
if( (val & (~mask)) != 0 )
return mask << 1;
return mask;
}
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
uint32_t *d_nonces1, cuda_compactTestFunction_t function,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
int orgThreads = threads;
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
// threadsPerBlock ausrechnen
int blockSize = 256;
int nSummen = threads / blockSize;
int thr1 = (threads+blockSize-1) / blockSize;
int thr2 = threads / (blockSize*blockSize);
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
int thr3 = (nSummen + blockSize2-1) / blockSize2;
bool callThrid = (thr2 > 0) ? true : false;
// Erster Initialscan
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
// weitere Scans
if(callThrid)
{
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
}else
{
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
}
// Sync + Anzahl merken
cudaStreamSynchronize(NULL);
if(callThrid)
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
else
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
// Addieren
if(callThrid)
{
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
}
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
// Scatter
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
function, orgThreads, startNounce, inpHashes, d_validNonceTable);
// Sync
cudaStreamSynchronize(NULL);
}
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
uint32_t *d_nonces1, uint32_t *d_nonces2,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
/*
// threadsPerBlock ausrechnen
int blockSize = 256;
int thr1 = threads / blockSize;
int thr2 = threads / (blockSize*blockSize);
// 1
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
// 2
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
// Schritt 3: Scatter
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
cudaStreamSynchronize(NULL);
*/
}
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
quark_compactTest_cpu_dualCompaction(thr_id, threads,
h_numValid[thr_id], d_nonces1, d_nonces2,
startNounce, inpHashes, d_validNonceTable);
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
*nrm1 = (size_t)h_numValid[thr_id][0];
*nrm2 = (size_t)h_numValid[thr_id][1];
}
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
*nrm1 = (size_t)h_numValid[thr_id][0];
}

View File

@ -1,23 +1,16 @@
// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
__global__ void __launch_bounds__(256, 4)
quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
__global__ __launch_bounds__(256, 4)
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;

View File

@ -1,27 +1,19 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#include "cuda_helper.h"
#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
#define U64TO32_LE(p, v) \
*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
static const uint64_t host_keccak_round_constants[24] = {
__device__ __constant__
static const uint64_t c_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
0x0000000080000001ull, 0x8000000080008008ull
};
__constant__ uint64_t c_keccak_round_constants[24];
static __device__ __forceinline__ void
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
size_t i;
@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
// Setup-Funktionen
__host__ void quark_keccak512_cpu_init(int thr_id, int threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice);
}
__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

View File

@ -1,16 +1,8 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#include "cuda_helper.h"
// aus cpu-miner.c
extern "C" extern int device_map[8];
@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// Take a look at: https://www.schneier.com/skein1.3.pdf
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))

View File

@ -1,4 +1,3 @@
extern "C"
{
#include "sph/sph_blake.h"
@ -8,9 +7,9 @@ extern "C"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
}
#include <stdint.h>
#include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8];
@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true;
}
@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

4
util.c
View File

@ -1352,6 +1352,10 @@ void print_hash_tests(void)
myriadhash(&hash[0], &buf[0]);
printf("\nmyriad: "); print_hash(hash);
memset(hash, 0, sizeof hash);
nist5hash(&hash[0], &buf[0]);
printf("\nnist5: "); print_hash(hash);
memset(hash, 0, sizeof hash);
quarkhash(&hash[0], &buf[0]);
printf("\nquark: "); print_hash(hash);

View File

@ -1,30 +1,13 @@
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#if 0
__device__ static uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
#endif
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
typedef unsigned int uint32_t; /* must be exactly 32 bits */
#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

View File

@ -1,33 +1,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
// das Hi Word aus einem 64 Bit Typen extrahieren
#if 0
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#endif
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

View File

@ -18,28 +18,18 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef struct {
uint32_t buffer[8]; /* Buffer to be hashed */
uint32_t chainv[40]; /* Chaining values */
} hashState;
__device__ __forceinline__
static uint32_t BYTES_SWAP32(uint32_t x)
{
return __byte_perm(x, x, 0x0123);
}
#define MULT2(a,j)\
tmp = a[7+(8*j)];\
a[7+(8*j)] = a[6+(8*j)];\
@ -289,11 +279,11 @@ __device__ __forceinline__
void Update512(hashState *state, const BitSequence *data)
{
#pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
rnd512(state);
#pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
rnd512(state);
}
@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
for(j=0;j<5;j++) {
b[i] ^= state->chainv[i+8*j];
}
b[i] = BYTES_SWAP32((b[i]));
b[i] = cuda_swab32((b[i]));
}
#pragma unroll 8
@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
for(j=0;j<5;j++) {
b[8+i] ^= state->chainv[i+8*j];
}
b[8+i] = BYTES_SWAP32((b[8+i]));
b[8 + i] = cuda_swab32((b[8 + i]));
}
}

View File

@ -1,18 +1,13 @@
#include <stdint.h>
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
//typedef unsigned char BitSequence;
//typedef unsigned long long DataLength;
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
static __constant__ uint32_t d_ShaviteInitVector[16];
static const uint32_t h_ShaviteInitVector[] = {
__device__ __constant__
static const uint32_t d_ShaviteInitVector[16] = {
SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
@ -1304,18 +1299,18 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
aes_gpu_init(sharedMemory);
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
// kopiere init-state
uint32_t state[16];
#pragma unroll 16
#pragma unroll 16
for(int i=0;i<16;i++)
state[i] = d_ShaviteInitVector[i];
@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
uint32_t msg[32];
// fülle die Nachricht mit 64-byte (vorheriger Hash)
#pragma unroll 16
#pragma unroll 16
for(int i=0;i<16;i++)
msg[i] = Hash[i];
// Nachrichtenende
msg[16] = 0x80;
#pragma unroll 10
#pragma unroll 10
for(int i=17;i<27;i++)
msg[i] = 0;
@ -1341,10 +1336,10 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
c512(sharedMemory, state, msg);
#pragma unroll 16
#pragma unroll 16
for(int i=0;i<16;i++)
Hash[i] = state[i];
}
}
}
@ -1352,25 +1347,19 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
__host__ void x11_shavite512_cpu_init(int thr_id, int threads)
{
aes_cpu_init();
cudaMemcpyToSymbol( d_ShaviteInitVector,
h_ShaviteInitVector,
sizeof(h_ShaviteInitVector),
0, cudaMemcpyHostToDevice);
}
__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

View File

@ -7,29 +7,17 @@
#define TPB 256
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
int *d_state[8];
uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
#define C32(x) ((uint32_t)(x ## U))
#define T32(x) ((x) & C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __constant__
const uint32_t c_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY
}
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \
__device__ __forceinline__ void FFT_16(int *y) {
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning FFT_16() function is not compatible with SM 2.1 devices!
#endif
@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{
int i;
#if __CUDA_ARCH__ < 300
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#ifndef WIN32
# warning Expansion() function is not compatible with SM 2.1 devices
#endif

View File

@ -15,10 +15,11 @@ extern "C"
#include "sph/sph_echo.h"
#include "miner.h"
}
#include "cuda_helper.h"
#include <stdint.h>
#include <cuda_helper.h>
#include <stdio.h>
#include <memory.h>
}
// aus cpu-miner.c
extern int device_map[8];
@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
extern void x11_echo512_cpu_init(int thr_id, int threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
uint32_t foundNonce;
@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

View File

@ -5,26 +5,11 @@
* heavily based on phm's sgminer
*
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdint.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5, 5.0)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
/*
* X13 kernel implementation.
*
@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
* @author phm <phm@inbox.com>
*/
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x))))
#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@ -584,97 +567,97 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
__syncthreads();
int i;
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int i;
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
#pragma unroll 16
for( i = 0; i < 16; i++ )
Hash[i] = SWAB32(Hash[i]);
#pragma unroll 16
for( i = 0; i < 16; i++ )
Hash[i] = cuda_swab32(Hash[i]);
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
uint32_t S30, S31, S32, S33, S34, S35;
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
uint32_t S30, S31, S32, S33, S34, S35;
uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
uint64_t bc = (uint64_t) 64 << 3;
uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
uint32_t bchi = (uint32_t)(bc >> 32);
uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
uint64_t bc = (uint64_t) 64 << 3;
uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
uint32_t bchi = (uint32_t)(bc >> 32);
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1);
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1);
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
FUGUE512_3((Hash[0xF]), bchi, bclo);
FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
FUGUE512_3((Hash[0xF]), bchi, bclo);
#pragma unroll 32
for (i = 0; i < 32; i ++) {
ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
#pragma unroll 13
for (i = 0; i < 13; i ++) {
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S18 ^= S00;
S27 ^= S00;
ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S27 ^= S00;
ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S28 ^= S00;
ROR8;
SMIX(S00, S01, S02, S03);
}
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
#pragma unroll 32
for (i = 0; i < 32; i ++) {
ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
#pragma unroll 13
for (i = 0; i < 13; i ++) {
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S18 ^= S00;
S27 ^= S00;
ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S27 ^= S00;
ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S28 ^= S00;
ROR8;
SMIX(S00, S01, S02, S03);
}
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
Hash[0] = SWAB32(S01);
Hash[1] = SWAB32(S02);
Hash[2] = SWAB32(S03);
Hash[3] = SWAB32(S04);
Hash[4] = SWAB32(S09);
Hash[5] = SWAB32(S10);
Hash[6] = SWAB32(S11);
Hash[7] = SWAB32(S12);
Hash[8] = SWAB32(S18);
Hash[9] = SWAB32(S19);
Hash[10] = SWAB32(S20);
Hash[11] = SWAB32(S21);
Hash[12] = SWAB32(S27);
Hash[13] = SWAB32(S28);
Hash[14] = SWAB32(S29);
Hash[15] = SWAB32(S30);
}
Hash[0] = cuda_swab32(S01);
Hash[1] = cuda_swab32(S02);
Hash[2] = cuda_swab32(S03);
Hash[3] = cuda_swab32(S04);
Hash[4] = cuda_swab32(S09);
Hash[5] = cuda_swab32(S10);
Hash[6] = cuda_swab32(S11);
Hash[7] = cuda_swab32(S12);
Hash[8] = cuda_swab32(S18);
Hash[9] = cuda_swab32(S19);
Hash[10] = cuda_swab32(S20);
Hash[11] = cuda_swab32(S21);
Hash[12] = cuda_swab32(S27);
Hash[13] = cuda_swab32(S28);
Hash[14] = cuda_swab32(S29);
Hash[15] = cuda_swab32(S30);
}
}
#define texDef(texname, texmem, texsource, texsize) \
@ -697,17 +680,17 @@ __host__ void x13_fugue512_cpu_init(int thr_id, int threads)
__host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 4 * 256 * sizeof(uint32_t);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

View File

@ -37,26 +37,11 @@
* @author phm <phm@inbox.com>
*/
#include <stdint.h>
#include <cuda_runtime.h>
#include "cuda_helper.h"
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__device__ __constant__
static const uint32_t d_alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
@ -663,7 +648,7 @@ static const uint32_t d_T512[64][16] = {
mD = 0; \
mE = 0; \
mF = 0; \
for (u = 0; u < 8; u ++) { \
for (u = 0; u < 8; u ++) { \
unsigned db = buf(u); \
for (v = 0; v < 8; v ++, db >>= 1) { \
uint32_t dm = SPH_T32(-(uint32_t)(db & 1)); \
@ -692,45 +677,47 @@ static const uint32_t d_T512[64][16] = {
// Die Hash-Funktion
__global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
unsigned char *h1 = (unsigned char *)Hash;
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
unsigned char *h1 = (unsigned char *)Hash;
uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) (h1[i+u])
#pragma unroll 8
for(int i = 0; i < 64; i += 8) {
INPUT_BIG;
P_BIG;
T_BIG;
}
for(int i = 0; i < 64; i += 8) {
INPUT_BIG;
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG;
P_BIG;
T_BIG;
INPUT_BIG;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG;
PF_BIG;
T_BIG;
INPUT_BIG;
PF_BIG;
T_BIG;
#pragma unroll 16
for (int i = 0; i < 16; i++)
Hash[i] = SWAB32(h[i]);
}
for (int i = 0; i < 16; i++)
Hash[i] = cuda_swab32(h[i]);
}
}
@ -740,18 +727,17 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, int threads)
__host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

View File

@ -20,10 +20,9 @@ extern "C"
#include "sph/sph_fugue.h"
#include "miner.h"
}
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8];
@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x13_fugue512_cpu_init(int thr_id, int threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x11_echo512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
uint32_t foundNonce;
@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

View File

@ -1,26 +1,10 @@
/*
* Shabal-512 for X14/X15 (STUB)
*/
#include <stdint.h>
#include <cuda_runtime.h>
#include "cuda_helper.h"
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
/*
* Shabal implementation.

View File

@ -4,8 +4,8 @@
* tpruvot@github
*/
#include <stdio.h>
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
#define NULLTEST 0
@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
#define SPH_64 (1)
#define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// defined in cuda_helper.h
#define SPH_ROTL64(x,n) ROTL64(x,n)

View File

@ -22,10 +22,9 @@ extern "C" {
#include "sph/sph_shabal.h"
#include "miner.h"
}
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
}
// from cpu-miner.c
extern int device_map[8];
@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
extern void x14_shabal512_cpu_init(int thr_id, int threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
/* check now with the CPU to confirm */

View File

@ -23,10 +23,9 @@ extern "C" {
#include "sph/sph_whirlpool.h"
#include "miner.h"
}
#include <stdint.h>
#include <cuda_helper.h>
#include "cuda_helper.h"
}
// to test gpu hash on a null buffer
#define NULLTEST 0
@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
extern void x15_whirlpool_cpu_init(int thr_id, int threads);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
print_hash((unsigned char*)buf); printf("\n");
#endif
/* Scan with GPU */
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{