
Add fresh algo (based on djm34 code)

Cleaned up and adapted to my changes (cputest added).

Remove Makefile.in, which should be in .gitignore.

(Please regenerate it with ./config.sh to compile on Linux.)
Tanguy Pruvot committed 10 years ago (branch master, commit bc2eb75758)
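
For reference, the algorithm wired in by this commit is a five-step chain over the 80-byte block header: SHAvite-512, SIMD-512, SHAvite-512, SIMD-512, ECHO-512, truncated to 256 bits. The CPU validation path in x11/fresh.cu below implements exactly this chain; the standalone sketch here only mirrors it (the helper name fresh_chain and the single reused 64-byte buffer are illustrative, not part of the commit):

#include <string.h>
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"

/* fresh: shavite512(80-byte header) -> simd512 -> shavite512 -> simd512 -> echo512,
 * keeping the first 32 bytes of the final digest as the proof-of-work hash. */
static void fresh_chain(void *output32, const void *header80)
{
	unsigned char hash[64];             /* each sph round yields a 512-bit digest */
	sph_shavite512_context ctx_shavite;
	sph_simd512_context    ctx_simd;
	sph_echo512_context    ctx_echo;

	sph_shavite512_init(&ctx_shavite);
	sph_shavite512(&ctx_shavite, header80, 80);   /* full header, nonce included */
	sph_shavite512_close(&ctx_shavite, hash);

	sph_simd512_init(&ctx_simd);
	sph_simd512(&ctx_simd, hash, 64);
	sph_simd512_close(&ctx_simd, hash);

	sph_shavite512_init(&ctx_shavite);
	sph_shavite512(&ctx_shavite, hash, 64);
	sph_shavite512_close(&ctx_shavite, hash);

	sph_simd512_init(&ctx_simd);
	sph_simd512(&ctx_simd, hash, 64);
	sph_simd512_close(&ctx_simd, hash);

	sph_echo512_init(&ctx_echo);
	sph_echo512(&ctx_echo, hash, 64);
	sph_echo512_close(&ctx_echo, hash);

	memcpy(output32, hash, 32);
}

The GPU side follows the same order: a new SHAvite-512 kernel over the padded 80-byte header, then the existing x11 SIMD, SHAvite and ECHO 64-byte kernels.
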
Changed files:

 1. .gitignore (16)
 2. Makefile.am (4)
 3. Makefile.in (1488)
 4. ccminer.vcxproj (6)
 5. ccminer.vcxproj.filters (3)
 6. cpu-miner.c (10)
 7. miner.h (5)
 8. quark/cuda_quark_groestl512.cu (15)
 9. util.c (4)
10. x11/cuda_x11_shavite512.cu (93)
11. x11/fresh.cu (169)

.gitignore (16 changed lines)

@@ -1,5 +1,5 @@
-minerd
+ccminer
 *.o
 autom4te.cache
@@ -26,6 +26,9 @@ config.sub
 mingw32-config.cache
 */.dirstamp
+.DS_Store
+Desktop.ini
+Thumbs.db
 *.iml
@@ -33,5 +36,14 @@ Debug/
 Release/
 x64/Debug/
 x64/Release/
-ccminer.*.suo
+*.suo
+*.user
+.settings/
+.project
+.metadata
+.classpath
+.loadpath
+.cproject
+.buildpath

Makefile.am (4 changed lines)

@@ -37,9 +37,9 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 sph/cubehash.c sph/echo.c sph/luffa.c sph/shavite.c sph/simd.c \
 sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
 sph/shabal.c sph/whirlpool.c \
-x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
-x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
+x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
+x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
 x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu
 ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@

Makefile.in (1488 lines, removed)

File diff suppressed because it is too large.

ccminer.vcxproj (6 changed lines)

@@ -503,6 +503,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
 <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
 <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
 </CudaCompile>
+<CudaCompile Include="x11\fresh.cu">
+<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
+<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
+<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
+<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
+</CudaCompile>
 <CudaCompile Include="x11\simd_functions.cu">
 <ExcludedFromBuild>true</ExcludedFromBuild>
 <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>

ccminer.vcxproj.filters (3 changed lines)

@@ -361,6 +361,9 @@
 <CudaCompile Include="x11\cuda_x11_simd512.cu">
 <Filter>Source Files\CUDA\x11</Filter>
 </CudaCompile>
+<CudaCompile Include="x11\fresh.cu">
+<Filter>Source Files\CUDA\x11</Filter>
+</CudaCompile>
 <CudaCompile Include="x11\x11.cu">
 <Filter>Source Files\CUDA\x11</Filter>
 </CudaCompile>

cpu-miner.c (10 changed lines)

@@ -132,6 +132,7 @@ typedef enum {
 ALGO_JACKPOT,
 ALGO_QUARK,
 ALGO_ANIME,
+ALGO_FRESH,
 ALGO_NIST5,
 ALGO_X11,
 ALGO_X13,
@@ -149,6 +150,7 @@ static const char *algo_names[] = {
 "jackpot",
 "quark",
 "anime",
+"fresh",
 "nist5",
 "x11",
 "x13",
@@ -225,6 +227,7 @@ Options:\n\
 jackpot Jackpot hash\n\
 quark Quark hash\n\
 anime Animecoin hash\n\
+fresh Freshcoin hash (shavite 80)\n\
 nist5 NIST5 (TalkCoin) hash\n\
 x11 X11 (DarkCoin) hash\n\
 x13 X13 (MaruCoin) hash\n\
@@ -782,7 +785,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 if (opt_algo == ALGO_JACKPOT)
 diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
-else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR)
+else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
 else
 diff_to_target(work->target, sctx->job.diff / opt_difficulty);
@@ -918,6 +921,11 @@ static void *miner_thread(void *userdata)
 max_nonce, &hashes_done);
 break;
+case ALGO_FRESH:
+rc = scanhash_fresh(thr_id, work.data, work.target,
+max_nonce, &hashes_done);
+break;
 case ALGO_NIST5:
 rc = scanhash_nist5(thr_id, work.data, work.target,
 max_nonce, &hashes_done);
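
A note on the stratum_gen_work() change above: the pool's share difficulty is converted to a 256-bit target with an algorithm-dependent divisor, and fresh joins the group that scales by 256. A minimal sketch of that selection logic (the helper name share_diff_divisor is hypothetical; the ALGO_* constants and numbers come from the hunk above):

/* Hypothetical helper capturing the divisor choice in stratum_gen_work();
 * the ALGO_* values are the enum constants from cpu-miner.c. */
static double share_diff_divisor(int algo)
{
	switch (algo) {
	case ALGO_JACKPOT:
		return 65536.0;   /* jackpot shares use a 2^16 scale */
	case ALGO_FUGUE256:
	case ALGO_GROESTL:
	case ALGO_DMD_GR:
	case ALGO_FRESH:          /* added by this commit */
		return 256.0;     /* these algos use a 2^8 scale */
	default:
		return 1.0;
	}
}

/* usage:
 *   diff_to_target(work->target,
 *                  sctx->job.diff / (share_diff_divisor(opt_algo) * opt_difficulty));
 */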

miner.h (5 changed lines)

@@ -234,6 +234,10 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata,
 const uint32_t *ptarget, uint32_t max_nonce,
 unsigned long *hashes_done);
+extern int scanhash_fresh(int thr_id, uint32_t *pdata,
+const uint32_t *ptarget, uint32_t max_nonce,
+unsigned long *hashes_done);
 extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 const uint32_t *ptarget, uint32_t max_nonce,
 unsigned long *hashes_done);
@@ -355,6 +359,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
+void fresh_hash(void *state, const void *input);
 void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void x11hash(void *output, const void *input);

quark/cuda_quark_groestl512.cu (15 changed lines)

@@ -5,6 +5,9 @@
 #include "cuda_helper.h"
+#define TPB 256
+#define THF 4
 // from cpu-miner.c
 extern int device_map[8];
@@ -18,7 +21,7 @@ static cudaDeviceProp props[8];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"
-__global__ __launch_bounds__(256, 4)
+__global__ __launch_bounds__(TPB, THF)
 void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
 // divide by 4, because 4 threads together compute one hash
@@ -60,7 +63,7 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
 }
 }
-__global__ void __launch_bounds__(256, 4)
+__global__ void __launch_bounds__(TPB, THF)
 quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
 int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
@@ -125,11 +128,11 @@ __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-int threadsperblock = 256;
+int threadsperblock = TPB;
 // Compute 3.0 uses the register-optimized quad variant with warp shuffle
 // with the quad functions we now need 4 threads per hash, hence factor 4 for the block count
-const int factor = 4;
+const int factor = THF;
 // compute how many thread blocks we need
 dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
@@ -146,11 +149,11 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
 __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-int threadsperblock = 256;
+int threadsperblock = TPB;
 // Compute 3.0 uses the register-optimized quad variant with warp shuffle
 // with the quad functions we now need 4 threads per hash, hence factor 4 for the block count
-const int factor = 4;
+const int factor = THF;
 // compute how many thread blocks we need
 dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
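
The TPB/THF defines above only name what the launch code was already doing: the quad kernels use four cooperating CUDA threads per hash, so the host multiplies the block count by THF and the kernel shifts the thread index back down. A small sketch of that geometry (the helper groestl_quad_grid is illustrative, not part of the commit):

#define TPB 256   /* threads per block (as defined above)      */
#define THF 4     /* CUDA threads cooperating on one hash      */

/* Illustrative helper: grid size so that `threads` hashes get THF threads each. */
static dim3 groestl_quad_grid(int threads)
{
	const int threadsperblock = TPB;
	/* THF times as many blocks as a one-thread-per-hash launch would need */
	return dim3(THF * ((threads + threadsperblock - 1) / threadsperblock));
}

/* inside the kernel the hash index is recovered with
 *   int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;   // >> 2 == / THF
 */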

util.c (4 changed lines)

@@ -1360,6 +1360,10 @@ void print_hash_tests(void)
 quarkhash(&hash[0], &buf[0]);
 printf("\nquark: "); print_hash(hash);
+memset(hash, 0, sizeof hash);
+fresh_hash(&hash[0], &buf[0]);
+printf("\nfresh: "); print_hash(hash);
 memset(hash, 0, sizeof hash);
 x11hash(&hash[0], &buf[0]);
 printf("\nX11: "); print_hash(hash);

x11/cuda_x11_shavite512.cu (93 changed lines)

@@ -1,10 +1,11 @@
 #include "cuda_helper.h"
+#define TPB 256
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-//typedef unsigned char BitSequence;
-//typedef unsigned long long DataLength;
+__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
 __device__ __constant__
 static const uint32_t d_ShaviteInitVector[16] = {
@@ -16,7 +17,8 @@ static const uint32_t d_ShaviteInitVector[16] = {
 #include "cuda_x11_aes.cu"
-static __device__ __forceinline__ void AES_ROUND_NOKEY(
+__device__ __forceinline__
+static void AES_ROUND_NOKEY(
 const uint32_t* __restrict__ sharedMemory,
 uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3)
 {
@@ -31,7 +33,8 @@ static __device__ __forceinline__ void AES_ROUND_NOKEY(
 x3 = y3;
 }
-static __device__ __forceinline__ void KEY_EXPAND_ELT(
+__device__ __forceinline__
+static void KEY_EXPAND_ELT(
 const uint32_t* __restrict__ sharedMemory,
 uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
 {
@@ -46,8 +49,8 @@ static __device__ __forceinline__ void KEY_EXPAND_ELT(
 k3 = y0;
 }
-static __device__ void
-c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
+__device__
+static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, uint32_t count)
 {
 uint32_t p0, p1, p2, p3, p4, p5, p6, p7;
 uint32_t p8, p9, pA, pB, pC, pD, pE, pF;
@@ -56,7 +59,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
 uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F;
 uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
 uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
-const uint32_t counter = 512;
+const uint32_t counter = count;
 p0 = state[0x0];
 p1 = state[0x1];
@@ -1291,8 +1294,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
 state[0xF] ^= p7;
 }
-// the hash function
+// GPU Hash
 __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 __shared__ uint32_t sharedMemory[1024];
@@ -1305,7 +1307,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 int hashPosition = nounce - startNounce;
-uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
+uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
 // copy the init state
 uint32_t state[16];
@@ -1334,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 msg[30] = 0;
 msg[31] = 0x02000000;
-c512(sharedMemory, state, msg);
+c512(sharedMemory, state, msg, 512);
 #pragma unroll 16
 for(int i=0;i<16;i++)
@@ -1342,8 +1344,46 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 }
 }
+__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+{
+__shared__ uint32_t sharedMemory[1024];
+aes_gpu_init(sharedMemory);
+int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+if (thread < threads)
+{
+const uint32_t nounce = startNounce + thread;
+// copy the init state
+uint32_t state[16];
+#pragma unroll 16
+for(int i=0;i<16;i++) {
+state[i] = d_ShaviteInitVector[i];}
+uint32_t msg[32];
+#pragma unroll 32
+for(int i=0;i<32;i++) {
+msg[i] = c_PaddedMessage80[i];
+}
+msg[19] = cuda_swab32(nounce);
+msg[20] = 0x80;
+msg[27] = 0x2800000;
+msg[31] = 0x2000000;
+c512(sharedMemory, state, msg, 640);
+uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
+#pragma unroll 16
+for(int i=0;i<16;i++)
+outHash[i] = state[i];
+} //thread < threads
+}
-// setup functions
 __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 {
 aes_cpu_init();
@@ -1351,15 +1391,40 @@ __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-const int threadsperblock = 256;
+const int threadsperblock = TPB;
 // compute how many thread blocks we need
 dim3 grid((threads + threadsperblock-1)/threadsperblock);
 dim3 block(threadsperblock);
-// size of the dynamic shared memory region
 size_t shared_size = 0;
 x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 MyStreamSynchronize(NULL, order, thr_id);
 }
+__host__ void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+const int threadsperblock = TPB;
+// compute how many thread blocks we need
+dim3 grid((threads + threadsperblock-1)/threadsperblock);
+dim3 block(threadsperblock);
+size_t shared_size = 0;
+x11_shavite512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
+MyStreamSynchronize(NULL, order, thr_id);
+}
+__host__ void x11_shavite512_setBlock_80(void *pdata)
+{
+// provide the message with padding
+// only the correct nonce still needs to be inserted starting at byte 76
+unsigned char PaddedMessage[128];
+memcpy(PaddedMessage, pdata, 80);
+memset(PaddedMessage+80, 0, 48);
+cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+}

x11/fresh.cu (169 lines, new file)

@@ -0,0 +1,169 @@
/**
* Fresh algorithm
*/
extern "C" {
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "miner.h"
#include "cuda_helper.h"
}
// to test gpu hash on a null buffer
#define NULLTEST 0
static uint32_t *d_hash[8];
extern int device_map[8];
extern bool opt_benchmark;
extern void x11_shavite512_cpu_init(int thr_id, int threads);
extern void x11_shavite512_setBlock_80(void *pdata);
extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_simd512_cpu_init(int thr_id, int threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
int order);
// CPU Hash
extern "C" void fresh_hash(void *state, const void *input)
{
// shavite-simd-shavite-simd-echo
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
#define hashA hash
#define hashB hash+64
memset(hash, 0, sizeof hash);
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, input, 80);
sph_shavite512_close(&ctx_shavite, hashA);
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, hashA, 64);
sph_simd512_close(&ctx_simd, hashB);
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, hashB, 64);
sph_shavite512_close(&ctx_shavite, hashA);
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, hashA, 64);
sph_simd512_close(&ctx_simd, hashB);
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, hashB, 64);
sph_echo512_close(&ctx_echo, hashA);
memcpy(state, hash, 32);
}
#if NULLTEST
static void print_hash(unsigned char *hash)
{
for (int i=0; i < 32; i += 4) {
printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]);
}
}
#endif
extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
const int throughput = 256*256*8;
static bool init[8] = {0,0,0,0,0,0,0,0};
uint32_t endiandata[20];
uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
#if NULLTEST
for (int k=0; k < 20; k++)
pdata[k] = 0;
#endif
if (!init[thr_id])
{
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4));
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
x11_shavite512_setBlock_80((void*)endiandata);
cuda_check_cpu_setTarget(ptarget);
do {
uint32_t foundNonce;
int order = 0;
// GPU Hash
x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
#if NULLTEST
uint32_t buf[8]; memset(buf, 0, sizeof buf);
CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaThreadSynchronize());
print_hash((unsigned char*)buf); printf("\n");
#endif
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
fresh_hash(vhash64, endiandata);
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = foundNonce - first_nonce + 1;
return 1;
}
else if (vhash64[7] > Htarg) {
applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
}
else {
applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
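
The NULLTEST switch gives a quick GPU-vs-CPU sanity check: with NULLTEST set to 1 the header is zeroed, so the first digest copied back from d_hash should equal fresh_hash() over the same all-zero 80-byte header. A minimal comparison under that assumption (the helper name is hypothetical, and it assumes GPU and CPU digests share the same byte order):

#include <string.h>
#include "miner.h"   /* fresh_hash() */

/* Hypothetical check mirroring the NULLTEST path: compare the first GPU digest
 * (8 words read back from d_hash) against the CPU hash of a zeroed header. */
static int fresh_nulltest_check(const uint32_t *gpu_hash8)
{
	uint32_t header[20] = { 0 };   /* NULLTEST zeroes pdata, so endiandata is zero too */
	uint32_t cpu_hash[8];

	fresh_hash(cpu_hash, header);
	return memcmp(cpu_hash, gpu_hash8, 32) == 0;   /* 1 if GPU and CPU agree */
}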