Browse Source

Add fresh algo (based on djm34 code)

Cleaned up and adapted to my changes (cputest added)

Remove Makefile.in which should be in gitignore

(Please regenerate it with ./config.sh to compile on Linux)
2upstream
Tanguy Pruvot 10 years ago
parent
commit
bc2eb75758
  1. 16
      .gitignore
  2. 4
      Makefile.am
  3. 1488
      Makefile.in
  4. 6
      ccminer.vcxproj
  5. 3
      ccminer.vcxproj.filters
  6. 10
      cpu-miner.c
  7. 5
      miner.h
  8. 15
      quark/cuda_quark_groestl512.cu
  9. 4
      util.c
  10. 95
      x11/cuda_x11_shavite512.cu
  11. 169
      x11/fresh.cu

16
.gitignore vendored

@@ -1,5 +1,5 @@
minerd
ccminer
*.o
autom4te.cache
@@ -26,6 +26,9 @@ config.sub
mingw32-config.cache
*/.dirstamp
.DS_Store
Desktop.ini
Thumbs.db
*.iml
@ -33,5 +36,14 @@ Debug/ @@ -33,5 +36,14 @@ Debug/
Release/
x64/Debug/
x64/Release/
ccminer.*.suo
*.suo
*.user
.settings/
.project
.metadata
.classpath
.loadpath
.cproject
.buildpath

4
Makefile.am

@ -37,9 +37,9 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -37,9 +37,9 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/cubehash.c sph/echo.c sph/luffa.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
sph/shabal.c sph/whirlpool.c \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@

1488
Makefile.in

File diff suppressed because it is too large Load Diff

6
ccminer.vcxproj

@ -503,6 +503,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command> @@ -503,6 +503,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\fresh.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\simd_functions.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>

3
ccminer.vcxproj.filters

@ -361,6 +361,9 @@ @@ -361,6 +361,9 @@
<CudaCompile Include="x11\cuda_x11_simd512.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\fresh.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\x11.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>

10
cpu-miner.c

@ -132,6 +132,7 @@ typedef enum { @@ -132,6 +132,7 @@ typedef enum {
ALGO_JACKPOT,
ALGO_QUARK,
ALGO_ANIME,
ALGO_FRESH,
ALGO_NIST5,
ALGO_X11,
ALGO_X13,
@ -149,6 +150,7 @@ static const char *algo_names[] = { @@ -149,6 +150,7 @@ static const char *algo_names[] = {
"jackpot",
"quark",
"anime",
"fresh",
"nist5",
"x11",
"x13",
@ -225,6 +227,7 @@ Options:\n\ @@ -225,6 +227,7 @@ Options:\n\
jackpot Jackpot hash\n\
quark Quark hash\n\
anime Animecoin hash\n\
fresh Freshcoin hash (shavite 80)\n\
nist5 NIST5 (TalkCoin) hash\n\
x11 X11 (DarkCoin) hash\n\
x13 X13 (MaruCoin) hash\n\
@ -782,7 +785,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -782,7 +785,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
if (opt_algo == ALGO_JACKPOT)
diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR)
else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
else
diff_to_target(work->target, sctx->job.diff / opt_difficulty);
@ -918,6 +921,11 @@ static void *miner_thread(void *userdata) @@ -918,6 +921,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
case ALGO_FRESH:
rc = scanhash_fresh(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_NIST5:
rc = scanhash_nist5(thr_id, work.data, work.target,
max_nonce, &hashes_done);

5
miner.h

@ -234,6 +234,10 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata, @@ -234,6 +234,10 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_fresh(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@ -355,6 +359,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len); @@ -355,6 +359,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void groestlhash(void *state, const void *input);
void myriadhash(void *state, const void *input);
void fresh_hash(void *state, const void *input);
void nist5hash(void *state, const void *input);
void quarkhash(void *state, const void *input);
void x11hash(void *output, const void *input);

15
quark/cuda_quark_groestl512.cu

@ -5,6 +5,9 @@ @@ -5,6 +5,9 @@
#include "cuda_helper.h"
#define TPB 256
#define THF 4
// aus cpu-miner.c
extern int device_map[8];
@ -18,7 +21,7 @@ static cudaDeviceProp props[8]; @@ -18,7 +21,7 @@ static cudaDeviceProp props[8];
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
__global__ __launch_bounds__(256, 4)
__global__ __launch_bounds__(TPB, THF)
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
@ -60,7 +63,7 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32 @@ -60,7 +63,7 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
}
}
__global__ void __launch_bounds__(256, 4)
__global__ void __launch_bounds__(TPB, THF)
quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
@ -125,11 +128,11 @@ __host__ void quark_groestl512_cpu_init(int thr_id, int threads) @@ -125,11 +128,11 @@ __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
int threadsperblock = 256;
int threadsperblock = TPB;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
const int factor = 4;
const int factor = THF;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
@ -146,11 +149,11 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta @@ -146,11 +149,11 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
int threadsperblock = 256;
int threadsperblock = TPB;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
const int factor = 4;
const int factor = THF;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));

4
util.c

@ -1360,6 +1360,10 @@ void print_hash_tests(void) @@ -1360,6 +1360,10 @@ void print_hash_tests(void)
quarkhash(&hash[0], &buf[0]);
printf("\nquark: "); print_hash(hash);
memset(hash, 0, sizeof hash);
fresh_hash(&hash[0], &buf[0]);
printf("\nfresh: "); print_hash(hash);
memset(hash, 0, sizeof hash);
x11hash(&hash[0], &buf[0]);
printf("\nX11: "); print_hash(hash);

95
x11/cuda_x11_shavite512.cu

@ -1,10 +1,11 @@ @@ -1,10 +1,11 @@
#include "cuda_helper.h"
#define TPB 256
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
//typedef unsigned char BitSequence;
//typedef unsigned long long DataLength;
__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
__device__ __constant__
static const uint32_t d_ShaviteInitVector[16] = {
@ -16,7 +17,8 @@ static const uint32_t d_ShaviteInitVector[16] = { @@ -16,7 +17,8 @@ static const uint32_t d_ShaviteInitVector[16] = {
#include "cuda_x11_aes.cu"
static __device__ __forceinline__ void AES_ROUND_NOKEY(
__device__ __forceinline__
static void AES_ROUND_NOKEY(
const uint32_t* __restrict__ sharedMemory,
uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3)
{
@ -31,7 +33,8 @@ static __device__ __forceinline__ void AES_ROUND_NOKEY( @@ -31,7 +33,8 @@ static __device__ __forceinline__ void AES_ROUND_NOKEY(
x3 = y3;
}
static __device__ __forceinline__ void KEY_EXPAND_ELT(
__device__ __forceinline__
static void KEY_EXPAND_ELT(
const uint32_t* __restrict__ sharedMemory,
uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
{
@ -46,8 +49,8 @@ static __device__ __forceinline__ void KEY_EXPAND_ELT( @@ -46,8 +49,8 @@ static __device__ __forceinline__ void KEY_EXPAND_ELT(
k3 = y0;
}
static __device__ void
c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
__device__
static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, uint32_t count)
{
uint32_t p0, p1, p2, p3, p4, p5, p6, p7;
uint32_t p8, p9, pA, pB, pC, pD, pE, pF;
@ -56,7 +59,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) @@ -56,7 +59,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F;
uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
const uint32_t counter = 512;
const uint32_t counter = count;
p0 = state[0x0];
p1 = state[0x1];
@ -1291,8 +1294,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) @@ -1291,8 +1294,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
state[0xF] ^= p7;
}
// Die Hash-Funktion
// GPU Hash
__global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
__shared__ uint32_t sharedMemory[1024];
@ -1305,7 +1307,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1305,7 +1307,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
// kopiere init-state
uint32_t state[16];
@ -1320,7 +1322,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1320,7 +1322,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
// fülle die Nachricht mit 64-byte (vorheriger Hash)
#pragma unroll 16
for(int i=0;i<16;i++)
msg[i] = Hash[i];
msg[i] = Hash[i];
// Nachrichtenende
msg[16] = 0x80;
@ -1334,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1334,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
msg[30] = 0;
msg[31] = 0x02000000;
c512(sharedMemory, state, msg);
c512(sharedMemory, state, msg, 512);
#pragma unroll 16
for(int i=0;i<16;i++)
@ -1342,8 +1344,46 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui @@ -1342,8 +1344,46 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
}
}
// SHAvite-512 over the 80-byte block header, one nonce per thread.
// Requires x11_shavite512_setBlock_80() to have filled c_PaddedMessage80;
// the counter passed to c512 is 640 (= 80 bytes in bits).
__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
{
	__shared__ uint32_t sharedMemory[1024];

	// cooperative fill of the shared AES tables
	aes_gpu_init(sharedMemory);

	const int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
	{
		const uint32_t nounce = startNounce + thread;

		// start from the SHAvite-512 init vector
		uint32_t state[16];
		#pragma unroll 16
		for (int i = 0; i < 16; i++)
			state[i] = d_ShaviteInitVector[i];

		// copy the pre-padded 128-byte message block, then patch in
		// this thread's nonce (header word 19) and the padding/length
		// fields for an 80-byte message
		uint32_t msg[32];
		#pragma unroll 32
		for (int i = 0; i < 32; i++)
			msg[i] = c_PaddedMessage80[i];

		msg[19] = cuda_swab32(nounce); // nonce at byte offset 76, byte-swapped
		msg[20] = 0x80;                // first padding byte right after the header
		msg[27] = 0x2800000;           // length/padding encoding (640-bit message)
		msg[31] = 0x2000000;

		c512(sharedMemory, state, msg, 640);

		// write the 64-byte result to this thread's output slot
		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
		#pragma unroll 16
		for (int i = 0; i < 16; i++)
			outHash[i] = state[i];
	} //thread < threads
}
// Setup-Funktionen
__host__ void x11_shavite512_cpu_init(int thr_id, int threads)
{
aes_cpu_init();
@ -1351,15 +1391,40 @@ __host__ void x11_shavite512_cpu_init(int thr_id, int threads) @@ -1351,15 +1391,40 @@ __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
const int threadsperblock = TPB;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}
// Host-side launcher for the 80-byte SHAvite-512 kernel.
__host__ void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{
	const int tpb = TPB;

	// ceil-div so the last, possibly partial, block is still covered
	dim3 grid((threads + tpb - 1) / tpb);
	dim3 block(tpb);

	// only statically declared shared memory is used by the kernel
	size_t shared_size = 0;

	x11_shavite512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
	MyStreamSynchronize(NULL, order, thr_id);
}
// Upload the 80-byte block header, zero-padded to a full 128-byte message
// block, into constant memory (c_PaddedMessage80). Only the nonce at byte
// offset 76 is still missing; the kernel patches it in per thread.
__host__ void x11_shavite512_setBlock_80(void *pdata)
{
	unsigned char paddedMsg[128];

	memcpy(paddedMsg, pdata, 80);
	memset(paddedMsg + 80, 0, 48);

	cudaMemcpyToSymbol(c_PaddedMessage80, paddedMsg, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}

169
x11/fresh.cu

@ -0,0 +1,169 @@ @@ -0,0 +1,169 @@
/**
* Fresh algorithm
*/
extern "C" {
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "miner.h"
#include "cuda_helper.h"
}
// to test gpu hash on a null buffer
#define NULLTEST 0
static uint32_t *d_hash[8];
extern int device_map[8];
extern bool opt_benchmark;
extern void x11_shavite512_cpu_init(int thr_id, int threads);
extern void x11_shavite512_setBlock_80(void *pdata);
extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_simd512_cpu_init(int thr_id, int threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
int order);
// CPU Hash
// CPU reference implementation of the Fresh hash chain:
// shavite512 -> simd512 -> shavite512 -> simd512 -> echo512.
// input: 80-byte block header; state: receives the first 32 bytes of the result.
extern "C" void fresh_hash(void *state, const void *input)
{
	sph_shavite512_context ctx_shavite;
	sph_simd512_context ctx_simd;
	sph_echo512_context ctx_echo;

	// two 64-byte halves of one scratch buffer are ping-ponged between rounds
	// (same layout as the original hashA/hashB macros)
	unsigned char hash[128];
	unsigned char *hashA = hash;
	unsigned char *hashB = hash + 64;

	memset(hash, 0, sizeof(hash));

	// round 1: shavite512 over the 80-byte header
	sph_shavite512_init(&ctx_shavite);
	sph_shavite512(&ctx_shavite, input, 80);
	sph_shavite512_close(&ctx_shavite, hashA);

	// round 2: simd512
	sph_simd512_init(&ctx_simd);
	sph_simd512(&ctx_simd, hashA, 64);
	sph_simd512_close(&ctx_simd, hashB);

	// round 3: shavite512 again
	sph_shavite512_init(&ctx_shavite);
	sph_shavite512(&ctx_shavite, hashB, 64);
	sph_shavite512_close(&ctx_shavite, hashA);

	// round 4: simd512 again
	sph_simd512_init(&ctx_simd);
	sph_simd512(&ctx_simd, hashA, 64);
	sph_simd512_close(&ctx_simd, hashB);

	// round 5: echo512, keep the leading 256 bits as the final hash
	sph_echo512_init(&ctx_echo);
	sph_echo512(&ctx_echo, hashB, 64);
	sph_echo512_close(&ctx_echo, hashA);

	memcpy(state, hash, 32);
}
#if NULLTEST
// Debug helper: dump a 32-byte hash as eight space-separated 32-bit hex groups.
static void print_hash(unsigned char *hash)
{
	for (int i = 0; i < 32; i += 4)
		printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]);
}
#endif
// Scan a range of nonces on the GPU for the Fresh algorithm.
// pdata: 20-word block header (nonce is word 19); ptarget: 8-word share target.
// Returns 1 and writes the winning nonce into pdata[19] when a CPU-validated
// share is found, 0 when the range is exhausted or a restart was requested.
extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
// nonces hashed per GPU launch
const int throughput = 256*256*8;
// one-time init flag per mining thread (max 8, matching d_hash[8])
static bool init[8] = {0,0,0,0,0,0,0,0};
uint32_t endiandata[20];
uint32_t Htarg = ptarget[7];
if (opt_benchmark)
// benchmark mode: force an easy target so results come quickly
// (NOTE(review): casts away const on ptarget — upstream idiom)
((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
#if NULLTEST
// zero the header so the GPU hash can be compared against a known null-input value
for (int k=0; k < 20; k++)
pdata[k] = 0;
#endif
if (!init[thr_id])
{
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
// 16 words (64 bytes) of intermediate hash state per nonce
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4));
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
// the GPU-side message expects the header big-endian encoded
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
x11_shavite512_setBlock_80((void*)endiandata);
cuda_check_cpu_setTarget(ptarget);
do {
uint32_t foundNonce;
int order = 0;
// GPU Hash: shavite(80) -> simd -> shavite -> simd -> echo, then target check
x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
#if NULLTEST
// dump the first 32 bytes of the GPU result for manual verification
uint32_t buf[8]; memset(buf, 0, sizeof buf);
CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaThreadSynchronize());
print_hash((unsigned char*)buf); printf("\n");
#endif
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
// re-hash the candidate on the CPU before claiming a share
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
fresh_hash(vhash64, endiandata);
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = foundNonce - first_nonce + 1;
return 1;
}
else if (vhash64[7] > Htarg) {
applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
}
else {
applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
Loading…
Cancel
Save