@ -5,19 +5,36 @@
#include "cuda_helper.h"
#include "cuda_helper.h"
// from cpu-miner.c
// from cpu-miner.c
// NOTE(review): device_map is declared twice below, once as `extern "C" extern`
// and once as plain `extern`. This looks like two revisions of the same
// declaration merged line by line; confirm which one is intended.
extern "C" extern int device_map[8];
extern int device_map[8];
// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Take a look at: https://www.schneier.com/skein1.3.pdf
// Take a look at: https://www.schneier.com/skein1.3.pdf
// Shift helper macros: SHL shifts x left by n bits, SHR shifts x right by n bits.
// Both arguments are parenthesized inside the expansion.
#define SHL(x, n) ((x) << (n))
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define SHR(x, n) ((x) >> (n))
// Host code for testing...
// skein_rotl64: device function that rotates the 64-bit value x left by
// `offset` bits using inline PTX and returns the rotated value.
//
// The asm block unpacks x into two 32-bit registers (tl = low half,
// th = high half), computes two funnel-shifted words with shf.l.wrap.b32,
// and packs them back into the 64-bit result. The predicate p (offset < 32)
// selects the packing order: {vh,vl} when offset < 32, {vl,vh} otherwise.
//
// NOTE(review): shf.l.wrap uses the shift amount modulo 32, and the predicate
// only distinguishes offset < 32 from offset >= 32, so this presumably expects
// offset in [0, 63] -- confirm against the callers.
// NOTE(review): the shf instruction needs a target that supports it (sm_32 or
// higher per the PTX ISA); the function itself is not wrapped in an
// architecture guard here -- confirm that is intended.
// NOTE(review): in this text the German block comment below is interleaved
// with the function signature and opening brace, which therefore sit inside
// the comment. This looks like two revisions merged line by line; confirm
// against the real file.
__device__
/* First, the table with the constants for the mix function. Could perhaps later
uint64_t skein_rotl64(const uint64_t x, const int offset)
be inserted directly into the code
{
*/
uint64_t res;
asm("{\n\t"
".reg .u32 tl,th,vl,vh;\n\t"
".reg .pred p;\n\t"
"mov.b64 {tl,th}, %1;\n\t"
"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
"setp.lt.u32 p, %2, 32;\n\t"
"@!p mov.b64 %0, {vl,vh};\n\t"
"@p mov.b64 %0, {vh,vl};\n\t"
"}"
: "=l"(res) : "l"(x) , "r"(offset)
);
return res;
}
// When compiling device code with __CUDA_ARCH__ >= 350, drop any existing
// ROTL64 macro and redirect ROTL64 to the inline-PTX skein_rotl64 function.
// For other targets (and the host pass, where __CUDA_ARCH__ is undefined)
// ROTL64 keeps whatever definition it had before this point.
#if __CUDA_ARCH__ >= 350
#undef ROTL64
#define ROTL64 skein_rotl64
#endif
/*
/*
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
}
}
static __constant__ uint64_t d_constMem[8];
__global__
static const uint64_t h_constMem[8] = {
void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
SPH_C64(0x4903ADFF749C51CE),
SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE),
SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0),
SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523),
SPH_C64(0xAE18A40B660FCC33) };
__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
if (thread < threads)
@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
uint64_t *inpHash = &g_hash[8 * hashPosition];
uint64_t *inpHash = &g_hash[8 * hashPosition];
// Initialisierung
// Initialisierung
h0 = d_constMem[0] ;
h0 = 0x4903ADFF749C51CEull ;
h1 = d_constMem[1] ;
h1 = 0x0D95DE399746DF03ull ;
h2 = d_constMem[2] ;
h2 = 0x8FD1934127C79BCEull ;
h3 = d_constMem[3] ;
h3 = 0x9A255629FF352CB1ull ;
h4 = d_constMem[4] ;
h4 = 0x5DB62599DF6CA7B0ull ;
h5 = d_constMem[5] ;
h5 = 0xEABE394CA9D5C3F4ull ;
h6 = d_constMem[6] ;
h6 = 0x991112C71A75B523ull ;
h7 = d_constMem[7] ;
h7 = 0xAE18A40B660FCC33ull ;
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
#pragma unroll 8
#pragma unroll 8
@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
}
}
// Setup functions
// Setup functions
// quark_skein512_cpu_init: host-side setup hook for the Skein-512 kernel.
//
// NOTE(review): two variants of this function appear interleaved line by line
// here. One has a body that is only a "nothing to do" comment; the other
// copies the host array h_constMem into the device symbol d_constMem with
// cudaMemcpyToSymbol. Confirm which variant is current: the kernel lines
// visible earlier in this file assign h0..h7 both from d_constMem[] and from
// literal constants.
// NOTE(review): the return value of cudaMemcpyToSymbol is not checked.
// Neither thr_id nor threads is used by any statement visible in this body.
__host__ void quark_skein512_cpu_init(int thr_id, int threads)
__host__
void quark_skein512_cpu_init(int thr_id, int threads)
{
{
// nothing to do ;-)
cudaMemcpyToSymbol( d_constMem,
h_constMem,
sizeof(h_constMem),
0, cudaMemcpyHostToDevice);
}
}
__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
__host__
void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
{
const int threadsperblock = 256;
const int threadsperblock = 256;