mirror of
https://github.com/GOSTSec/ccminer
synced 2025-02-02 01:44:32 +00:00
whirlpool midstate and debug/trace defines
+ new cuda_debug.cuh include to trace gpu data Happy new year! Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
This commit is contained in:
parent
394d50987c
commit
76a22479b1
@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
ccMiner release 1.7.1 (Dec 2015) "Mining Diff & basic MultiAlgo"
|
ccMiner release 1.7.1 (Dec 2015) "Sibcoin & Whirlpool midstate"
|
||||||
---------------------------------------------------------------
|
---------------------------------------------------------------
|
||||||
|
|
||||||
***************************************************************
|
***************************************************************
|
||||||
@ -235,6 +235,7 @@ features.
|
|||||||
|
|
||||||
Dec. 31th 2015 v1.7.1
|
Dec. 31th 2015 v1.7.1
|
||||||
Implement sib algo (X11 + Russian Streebog-512/GOST)
|
Implement sib algo (X11 + Russian Streebog-512/GOST)
|
||||||
|
Whirlpool speed x2 with the midstate precompute
|
||||||
Small bug fixes about device ids mapping (and vendor names)
|
Small bug fixes about device ids mapping (and vendor names)
|
||||||
|
|
||||||
Nov. 06th 2015 v1.7
|
Nov. 06th 2015 v1.7
|
||||||
|
47
cuda_debug.cuh
Normal file
47
cuda_debug.cuh
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
/**
|
||||||
|
* Helper to trace gpu computed data with --cputest
|
||||||
|
*
|
||||||
|
* Sample usage in an algo scan cuda unit :
|
||||||
|
*
|
||||||
|
* #define _DEBUG
|
||||||
|
* #define _DEBUG_PREFIX "x11-"
|
||||||
|
* #include "cuda_debug.cuh"
|
||||||
|
*
|
||||||
|
* TRACE64("luffa", d_hash);
|
||||||
|
* or
|
||||||
|
* TRACE("luffa")
|
||||||
|
*
|
||||||
|
* Dont forget to link the scan function in util.cpp (do_gpu_tests)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
//#include "cuda_helper.h"
|
||||||
|
|
||||||
|
#ifndef _DEBUG_PREFIX
|
||||||
|
#define _DEBUG_PREFIX ""
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _DEBUG
|
||||||
|
#define TRACE64(algo, d_buf) { \
|
||||||
|
if (max_nonce == 1 && pdata[19] <= 1 && !opt_benchmark) { \
|
||||||
|
uint32_t oft = 0; \
|
||||||
|
uint32_t* debugbuf = NULL; \
|
||||||
|
cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
|
||||||
|
cudaMemcpy(debugbuf, d_buf[thr_id] + oft, 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
|
||||||
|
printf(_DEBUG_PREFIX "%s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", \
|
||||||
|
algo, \
|
||||||
|
swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3]), \
|
||||||
|
swab32(debugbuf[4]), swab32(debugbuf[5]), swab32(debugbuf[6]), swab32(debugbuf[7]), \
|
||||||
|
swab32(debugbuf[8]), swab32(debugbuf[9]), swab32(debugbuf[10]),swab32(debugbuf[11]), \
|
||||||
|
swab32(debugbuf[12]),swab32(debugbuf[13]),swab32(debugbuf[14]),swab32(debugbuf[15])); \
|
||||||
|
cudaFreeHost(debugbuf); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#define TRACE64(algo, d_buf) {}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// simplified default
|
||||||
|
#define TRACE(algo) TRACE64(algo, d_hash)
|
||||||
|
|
24
x11/sib.cu
24
x11/sib.cu
@ -22,8 +22,6 @@ extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
|
|
||||||
//#define _DEBUG
|
|
||||||
|
|
||||||
static uint32_t *d_hash[MAX_GPUS];
|
static uint32_t *d_hash[MAX_GPUS];
|
||||||
|
|
||||||
// Sibcoin CPU Hash
|
// Sibcoin CPU Hash
|
||||||
@ -71,7 +69,6 @@ extern "C" void sibhash(void *output, const void *input)
|
|||||||
sph_gost512_init(&ctx_gost);
|
sph_gost512_init(&ctx_gost);
|
||||||
sph_gost512(&ctx_gost, (const void*) hash, 64);
|
sph_gost512(&ctx_gost, (const void*) hash, 64);
|
||||||
sph_gost512_close(&ctx_gost, (void*) hash);
|
sph_gost512_close(&ctx_gost, (void*) hash);
|
||||||
//applog_hash64(hash);
|
|
||||||
|
|
||||||
sph_luffa512_init(&ctx_luffa);
|
sph_luffa512_init(&ctx_luffa);
|
||||||
sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
|
sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
|
||||||
@ -96,24 +93,9 @@ extern "C" void sibhash(void *output, const void *input)
|
|||||||
memcpy(output, hash, 32);
|
memcpy(output, hash, 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _DEBUG
|
//#define _DEBUG
|
||||||
#define TRACE(algo) { \
|
#define _DEBUG_PREFIX "sib"
|
||||||
if (max_nonce == 1 && pdata[19] <= 1 && !opt_benchmark) { \
|
#include "cuda_debug.cuh"
|
||||||
uint32_t oft = 0; \
|
|
||||||
uint32_t* debugbuf = NULL; \
|
|
||||||
cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
|
|
||||||
cudaMemcpy(debugbuf, d_hash[thr_id] + oft, 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
|
|
||||||
printf("SIB %s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", algo, \
|
|
||||||
swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3]), \
|
|
||||||
swab32(debugbuf[4]), swab32(debugbuf[5]), swab32(debugbuf[6]), swab32(debugbuf[7]), \
|
|
||||||
swab32(debugbuf[8]), swab32(debugbuf[9]), swab32(debugbuf[10]),swab32(debugbuf[11]), \
|
|
||||||
swab32(debugbuf[12]),swab32(debugbuf[13]),swab32(debugbuf[14]),swab32(debugbuf[15])); \
|
|
||||||
cudaFreeHost(debugbuf); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#define TRACE(algo) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static bool init[MAX_GPUS] = { 0 };
|
static bool init[MAX_GPUS] = { 0 };
|
||||||
|
|
||||||
|
17
x11/x11.cu
17
x11/x11.cu
@ -87,20 +87,9 @@ extern "C" void x11hash(void *output, const void *input)
|
|||||||
memcpy(output, hash, 32);
|
memcpy(output, hash, 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _DEBUG
|
//#define _DEBUG
|
||||||
#define TRACE(algo) { \
|
#define _DEBUG_PREFIX "x11"
|
||||||
if (max_nonce == 1 && pdata[19] <= 1) { \
|
#include "cuda_debug.cuh"
|
||||||
uint32_t* debugbuf = NULL; \
|
|
||||||
cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
|
|
||||||
cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
|
|
||||||
printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
|
|
||||||
swab32(debugbuf[2]), swab32(debugbuf[3])); \
|
|
||||||
cudaFreeHost(debugbuf); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#define TRACE(algo) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static bool init[MAX_GPUS] = { 0 };
|
static bool init[MAX_GPUS] = { 0 };
|
||||||
|
|
||||||
|
@ -1,43 +1,9 @@
|
|||||||
/*
|
|
||||||
* Built on cbuchner1's implementation, actual hashing code
|
|
||||||
* based on sphlib 3.0
|
|
||||||
*/
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <memory.h>
|
|
||||||
|
|
||||||
#define threadsperblock 256
|
|
||||||
|
|
||||||
//#define __DEV_STORAGE__ __constant__
|
|
||||||
#define __DEV_STORAGE__ __device__
|
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
|
||||||
extern __device__ __device_builtin__ void __threadfence_block(void);
|
|
||||||
|
|
||||||
__DEV_STORAGE__ static uint64_t c_PaddedMessage80[16]; // input end block after midstate
|
|
||||||
__DEV_STORAGE__ static uint32_t pTarget[8];
|
|
||||||
|
|
||||||
static uint32_t *h_wnounce[MAX_GPUS] = { 0 };
|
|
||||||
static uint32_t *d_WNonce[MAX_GPUS] = { 0 };
|
|
||||||
|
|
||||||
#define USE_ALL_TABLES 1
|
|
||||||
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob0Tox[256];
|
|
||||||
#if USE_ALL_TABLES
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob1Tox[256];
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob2Tox[256];
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob3Tox[256];
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob4Tox[256];
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob5Tox[256];
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob6Tox[256];
|
|
||||||
__DEV_STORAGE__ static uint64_t mixTob7Tox[256];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Whirlpool CUDA kernel implementation.
|
* Whirlpool-512 CUDA implementation.
|
||||||
*
|
*
|
||||||
* ==========================(LICENSE BEGIN)============================
|
* ==========================(LICENSE BEGIN)============================
|
||||||
*
|
*
|
||||||
* Copyright (c) 2014 djm34 & tpruvot & SP
|
* Copyright (c) 2014-2016 djm34, tpruvot, SP
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining
|
* Permission is hereby granted, free of charge, to any person obtaining
|
||||||
* a copy of this software and associated documentation files (the
|
* a copy of this software and associated documentation files (the
|
||||||
@ -59,10 +25,40 @@ __DEV_STORAGE__ static uint64_t mixTob7Tox[256];
|
|||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*
|
*
|
||||||
* ===========================(LICENSE END)=============================
|
* ===========================(LICENSE END)=============================
|
||||||
* @author djm34
|
* @author djm34 (initial draft)
|
||||||
* @author tpruvot
|
* @author tpruvot (dual old/whirlpool modes, midstate)
|
||||||
* @author SP
|
* @author SP ("final" function opt and tuning)
|
||||||
*/
|
*/
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <memory.h>
|
||||||
|
|
||||||
|
// don't change, used by shared mem fetch!
|
||||||
|
#define threadsperblock 256
|
||||||
|
|
||||||
|
#include "cuda_helper.h"
|
||||||
|
#include "miner.h"
|
||||||
|
|
||||||
|
extern __device__ __device_builtin__ void __threadfence_block(void);
|
||||||
|
|
||||||
|
__device__ static uint64_t c_PaddedMessage80[16];
|
||||||
|
__device__ static uint32_t pTarget[8];
|
||||||
|
|
||||||
|
static uint32_t *h_wnounce[MAX_GPUS] = { 0 };
|
||||||
|
static uint32_t *d_WNonce[MAX_GPUS] = { 0 };
|
||||||
|
|
||||||
|
#define HOST_MIDSTATE 1
|
||||||
|
#define USE_ALL_TABLES 1
|
||||||
|
|
||||||
|
__constant__ static uint64_t mixTob0Tox[256];
|
||||||
|
#if USE_ALL_TABLES
|
||||||
|
__constant__ static uint64_t mixTob1Tox[256];
|
||||||
|
__constant__ static uint64_t mixTob2Tox[256];
|
||||||
|
__constant__ static uint64_t mixTob3Tox[256];
|
||||||
|
__constant__ static uint64_t mixTob4Tox[256];
|
||||||
|
__constant__ static uint64_t mixTob5Tox[256];
|
||||||
|
__constant__ static uint64_t mixTob6Tox[256];
|
||||||
|
__constant__ static uint64_t mixTob7Tox[256];
|
||||||
|
#endif
|
||||||
|
|
||||||
static const uint64_t old1_T0[256] = {
|
static const uint64_t old1_T0[256] = {
|
||||||
SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323),
|
SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323),
|
||||||
@ -2181,7 +2177,7 @@ static const uint64_t plain_T7[256] = {
|
|||||||
/**
|
/**
|
||||||
* Round constants.
|
* Round constants.
|
||||||
*/
|
*/
|
||||||
__DEV_STORAGE__ uint64_t InitVector_RC[10];
|
__device__ uint64_t InitVector_RC[10];
|
||||||
|
|
||||||
static const uint64_t plain_RC[10] = {
|
static const uint64_t plain_RC[10] = {
|
||||||
SPH_C64(0x4F01B887E8C62318),
|
SPH_C64(0x4F01B887E8C62318),
|
||||||
@ -2291,7 +2287,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
|
|||||||
|
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
|
void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
|
||||||
{
|
{
|
||||||
__shared__ uint64_t sharedMemory[2048];
|
__shared__ uint64_t sharedMemory[2048];
|
||||||
|
|
||||||
@ -2307,29 +2303,30 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
|
|||||||
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
|
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
__threadfence_block(); // ensure shared mem is ready
|
||||||
|
|
||||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint32_t nounce = startNounce + thread;
|
|
||||||
union {
|
|
||||||
uint8_t h1[64];
|
|
||||||
uint32_t h4[16];
|
|
||||||
uint64_t h8[8];
|
|
||||||
} hash;
|
|
||||||
|
|
||||||
uint64_t n[8];
|
uint64_t n[8];
|
||||||
uint64_t h[8];
|
uint64_t h[8];
|
||||||
|
uint32_t nonce = startNounce + thread;
|
||||||
|
nonce = swab ? cuda_swab32(nonce) : nonce;
|
||||||
|
|
||||||
|
#if HOST_MIDSTATE
|
||||||
|
uint64_t state[8];
|
||||||
|
#pragma unroll 8
|
||||||
|
for (int i=0; i < 8; i++) {
|
||||||
|
state[i] = c_PaddedMessage80[i];
|
||||||
|
}
|
||||||
|
#else
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (int i=0; i<8; i++) {
|
for (int i=0; i<8; i++) {
|
||||||
n[i] = c_PaddedMessage80[i]; // read data
|
n[i] = c_PaddedMessage80[i]; // read data
|
||||||
h[i] = 0; // read state
|
h[i] = 0; // read state
|
||||||
}
|
}
|
||||||
|
|
||||||
__threadfence_block(); // ensure shared mem is ready
|
#pragma unroll 1
|
||||||
|
|
||||||
// #pragma unroll 10
|
|
||||||
for (unsigned r=0; r < 10; r++) {
|
for (unsigned r=0; r < 10; r++) {
|
||||||
uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||||
ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
|
ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
|
||||||
@ -2341,11 +2338,11 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
|
|||||||
for (int i=0; i < 8; i++) {
|
for (int i=0; i < 8; i++) {
|
||||||
state[i] = xor1(n[i],c_PaddedMessage80[i]);
|
state[i] = xor1(n[i],c_PaddedMessage80[i]);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
/// round 2 ///////
|
/// round 2 ///////
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
n[0] = c_PaddedMessage80[8]; //read data
|
n[0] = c_PaddedMessage80[8]; //read data
|
||||||
n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); //whirlpool
|
n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], nonce); //whirlpool
|
||||||
n[2] = 0x0000000000000080; //whirlpool
|
n[2] = 0x0000000000000080; //whirlpool
|
||||||
n[3] = 0;
|
n[3] = 0;
|
||||||
n[4] = 0;
|
n[4] = 0;
|
||||||
@ -2359,7 +2356,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
|
|||||||
n[i] = xor1(n[i],h[i]);
|
n[i] = xor1(n[i],h[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// #pragma unroll 10
|
// #pragma unroll
|
||||||
for (unsigned r=0; r < 10; r++) {
|
for (unsigned r=0; r < 10; r++) {
|
||||||
uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||||
ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
|
ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
|
||||||
@ -2367,7 +2364,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
|
|||||||
}
|
}
|
||||||
|
|
||||||
state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]);
|
state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]);
|
||||||
state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)) );
|
state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], nonce) );
|
||||||
state[2] = xor3(state[2], n[2], 0x0000000000000080);
|
state[2] = xor3(state[2], n[2], 0x0000000000000080);
|
||||||
state[3] = xor1(state[3], n[3]);
|
state[3] = xor1(state[3], n[3]);
|
||||||
state[4] = xor1(state[4], n[4]);
|
state[4] = xor1(state[4], n[4]);
|
||||||
@ -2375,15 +2372,10 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
|
|||||||
state[6] = xor1(state[6], n[6]);
|
state[6] = xor1(state[6], n[6]);
|
||||||
state[7] = xor3(state[7], n[7], 0x8002000000000000);
|
state[7] = xor3(state[7], n[7], 0x8002000000000000);
|
||||||
|
|
||||||
|
uint64_t* outHash = &(((uint64_t*)outputHash)[(size_t)8 * thread]);
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (unsigned i = 0; i < 8; i++)
|
for (unsigned i = 0; i < 8; i++)
|
||||||
hash.h8[i] = state[i];
|
outHash[i] = state[i];
|
||||||
|
|
||||||
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
|
|
||||||
|
|
||||||
#pragma unroll 16
|
|
||||||
for (int i=0; i<16; i++)
|
|
||||||
outHash[i] = hash.h4[i];
|
|
||||||
|
|
||||||
} // thread < threads
|
} // thread < threads
|
||||||
}
|
}
|
||||||
@ -2405,6 +2397,7 @@ void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
|
|||||||
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
|
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
__threadfence_block(); // ensure shared mem is ready
|
||||||
|
|
||||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
@ -2465,7 +2458,7 @@ void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce)
|
void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNounce)
|
||||||
{
|
{
|
||||||
__shared__ uint64_t sharedMemory[2048];
|
__shared__ uint64_t sharedMemory[2048];
|
||||||
|
|
||||||
@ -2482,14 +2475,13 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6
|
|||||||
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
|
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
__threadfence_block(); // ensure shared mem is ready
|
||||||
|
|
||||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
|
uint32_t nonce = startNounce + thread;
|
||||||
|
uint64_t *inpHash = (uint64_t*) &g_hash[(size_t)8 * thread];
|
||||||
int hashPosition = nounce - startNounce;
|
|
||||||
uint64_t *inpHash = (uint64_t*) &g_hash[8 * hashPosition];
|
|
||||||
uint64_t h8[8];
|
uint64_t h8[8];
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
@ -2529,7 +2521,7 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6
|
|||||||
n[i] = xor1(n[i], h[i]);
|
n[i] = xor1(n[i], h[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll 10
|
// #pragma unroll 10
|
||||||
for (unsigned r=0; r < 10; r++) {
|
for (unsigned r=0; r < 10; r++) {
|
||||||
uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||||
ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
|
ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
|
||||||
@ -2546,8 +2538,8 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6
|
|||||||
state[7] = xor3(state[7], n[7], 0x2000000000000);
|
state[7] = xor3(state[7], n[7], 0x2000000000000);
|
||||||
|
|
||||||
bool rc = (state[3] <= ((uint64_t*)pTarget)[3]);
|
bool rc = (state[3] <= ((uint64_t*)pTarget)[3]);
|
||||||
if (rc && resNounce[0] > nounce)
|
if (rc && resNounce[0] > nonce)
|
||||||
resNounce[0] = nounce;
|
resNounce[0] = nonce;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2581,18 +2573,20 @@ extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode)
|
|||||||
cudaMemcpyToSymbol(mixTob6Tox, old1_T6, (256*8), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(mixTob6Tox, old1_T6, (256*8), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(mixTob7Tox, old1_T7, (256*8), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(mixTob7Tox, old1_T7, (256*8), 0, cudaMemcpyHostToDevice);
|
||||||
#endif
|
#endif
|
||||||
|
cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t));
|
||||||
|
cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t));
|
|
||||||
cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
extern void x15_whirlpool_cpu_free(int thr_id)
|
extern void x15_whirlpool_cpu_free(int thr_id)
|
||||||
{
|
{
|
||||||
cudaFree(d_WNonce[thr_id]);
|
if (h_wnounce[thr_id]) {
|
||||||
cudaFreeHost(h_wnounce[thr_id]);
|
cudaFree(d_WNonce[thr_id]);
|
||||||
|
cudaFreeHost(h_wnounce[thr_id]);
|
||||||
|
h_wnounce[thr_id] = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
@ -2614,13 +2608,10 @@ extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint
|
|||||||
dim3 grid((threads + threadsperblock-1) / threadsperblock);
|
dim3 grid((threads + threadsperblock-1) / threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
size_t shared_size = 0;
|
|
||||||
|
|
||||||
cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t));
|
cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t));
|
||||||
|
|
||||||
oldwhirlpool_gpu_finalhash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]);
|
oldwhirlpool_gpu_finalhash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_WNonce[thr_id]);
|
||||||
|
|
||||||
MyStreamSynchronize(NULL, order, thr_id);
|
|
||||||
cudaMemcpy(h_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
cudaMemcpy(h_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
result = *h_wnounce[thr_id];
|
result = *h_wnounce[thr_id];
|
||||||
@ -2634,18 +2625,30 @@ void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
|
|||||||
dim3 grid((threads + threadsperblock-1) / threadsperblock);
|
dim3 grid((threads + threadsperblock-1) / threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNounce, d_outputHash);
|
if (threads < 256)
|
||||||
|
applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
|
||||||
|
|
||||||
MyStreamSynchronize(NULL, order, thr_id);
|
oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern void whirl_midstate(void *state, const void *input);
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
void whirlpool512_setBlock_80(void *pdata, const void *ptarget)
|
void whirlpool512_setBlock_80(void *pdata, const void *ptarget)
|
||||||
{
|
{
|
||||||
unsigned char PaddedMessage[128];
|
unsigned char PaddedMessage[128];
|
||||||
|
|
||||||
memcpy(PaddedMessage, pdata, 80);
|
memcpy(PaddedMessage, pdata, 80);
|
||||||
memset(PaddedMessage+80, 0, 48);
|
memset(PaddedMessage+80, 0, 48);
|
||||||
PaddedMessage[80] = 0x80; /* ending */
|
PaddedMessage[80] = 0x80; /* ending */
|
||||||
cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
|
|
||||||
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
|
#if HOST_MIDSTATE
|
||||||
|
// compute constant first block
|
||||||
|
unsigned char midstate[64] = { 0 };
|
||||||
|
whirl_midstate(midstate, pdata);
|
||||||
|
memcpy(PaddedMessage, midstate, 64);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
|
||||||
}
|
}
|
||||||
|
@ -19,6 +19,9 @@ extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
|
|||||||
extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||||
extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
//#define _DEBUG
|
||||||
|
#define _DEBUG_PREFIX "whirl"
|
||||||
|
#include "cuda_debug.cuh"
|
||||||
|
|
||||||
// CPU Hash function
|
// CPU Hash function
|
||||||
extern "C" void wcoinhash(void *state, const void *input)
|
extern "C" void wcoinhash(void *state, const void *input)
|
||||||
@ -49,6 +52,16 @@ extern "C" void wcoinhash(void *state, const void *input)
|
|||||||
memcpy(state, hash, 32);
|
memcpy(state, hash, 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void whirl_midstate(void *state, const void *input)
|
||||||
|
{
|
||||||
|
sph_whirlpool_context ctx;
|
||||||
|
|
||||||
|
sph_whirlpool1_init(&ctx);
|
||||||
|
sph_whirlpool1(&ctx, input, 64);
|
||||||
|
|
||||||
|
memcpy(state, ctx.state, 64);
|
||||||
|
}
|
||||||
|
|
||||||
static bool init[MAX_GPUS] = { 0 };
|
static bool init[MAX_GPUS] = { 0 };
|
||||||
|
|
||||||
extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||||
@ -60,6 +73,7 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
|
|
||||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8;
|
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8;
|
||||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||||
|
if (init[thr_id]) throughput = max(throughput, 256); // shared mem requirement
|
||||||
|
|
||||||
if (opt_benchmark)
|
if (opt_benchmark)
|
||||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||||
@ -91,8 +105,11 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||||
|
|
||||||
whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||||
|
TRACE64(" 80 :", d_hash);
|
||||||
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
TRACE64(" 64 :", d_hash);
|
||||||
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
TRACE64(" 64 :", d_hash);
|
||||||
|
|
||||||
foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
if (foundNonce != UINT32_MAX && bench_algo < 0)
|
if (foundNonce != UINT32_MAX && bench_algo < 0)
|
||||||
@ -115,7 +132,7 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
pdata[19] = foundNonce;
|
pdata[19] = foundNonce;
|
||||||
return res;
|
return res;
|
||||||
} else {
|
} else {
|
||||||
applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
|
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
|
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user