Browse Source

heavy: add error checks, fix strict aliasing and linux

The core problem was the cuda hefty Thread per block set to high
but took me several hours to find that...

btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512
if max reg count is set to 80...
master
Tanguy Pruvot 10 years ago
parent
commit
1b65cd05cc
  1. 3
      Makefile.am
  2. 10
      blake32.cu
  3. 10
      ccminer.vcxproj
  4. 22
      ccminer.vcxproj.filters
  5. 32
      heavy/cuda_blake512.cu
  6. 7
      heavy/cuda_blake512.h
  7. 34
      heavy/cuda_combine.cu
  8. 7
      heavy/cuda_combine.h
  9. 32
      heavy/cuda_groestl512.cu
  10. 9
      heavy/cuda_groestl512.h
  11. 101
      heavy/cuda_hefty1.cu
  12. 8
      heavy/cuda_hefty1.h
  13. 45
      heavy/cuda_keccak512.cu
  14. 9
      heavy/cuda_keccak512.h
  15. 31
      heavy/cuda_sha256.cu
  16. 8
      heavy/cuda_sha256.h
  17. 130
      heavy/heavy.cu
  18. 30
      heavy/heavy.h
  19. 2
      stats.cpp
  20. 18
      util.cpp

3
Makefile.am

@ -80,6 +80,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
blake32.o: blake32.cu blake32.o: blake32.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<

10
blake32.cu

@ -303,7 +303,11 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
ending[3] = nonce; /* our tested value */ ending[3] = nonce; /* our tested value */
blake256_compress(h, ending, 640, rounds); blake256_compress(h, ending, 640, rounds);
#if 0
if (trace) {
printf("blake hash[6][7]: %08x %08x\n", h[6], h[7]);
}
#endif
//if (h[7] == 0 && high64 <= highTarget) { //if (h[7] == 0 && high64 <= highTarget) {
if (h[7] == 0) { if (h[7] == 0) {
#if NBN == 2 #if NBN == 2
@ -318,14 +322,14 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
#else #else
resNonce[0] = nonce; resNonce[0] = nonce;
#endif #endif
if (trace) {
#ifdef _DEBUG #ifdef _DEBUG
if (trace) {
uint64_t high64 = ((uint64_t*)h)[3]; uint64_t high64 = ((uint64_t*)h)[3];
printf("gpu: %16llx\n", high64); printf("gpu: %16llx\n", high64);
printf("gpu: %08x.%08x\n", h[7], h[6]); printf("gpu: %08x.%08x\n", h[7], h[6]);
printf("tgt: %16llx\n", highTarget); printf("tgt: %16llx\n", highTarget);
#endif
} }
#endif
} }
} }
} }

10
ccminer.vcxproj

@ -173,7 +173,7 @@
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep> <Keep>false</Keep>
<CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration> <CodeGeneration>compute_50,sm_50</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Defines> <Defines>
</Defines> </Defines>
@ -306,12 +306,7 @@
<ClInclude Include="cuda_groestlcoin.h" /> <ClInclude Include="cuda_groestlcoin.h" />
<ClInclude Include="cuda_helper.h" /> <ClInclude Include="cuda_helper.h" />
<ClInclude Include="elist.h" /> <ClInclude Include="elist.h" />
<ClInclude Include="heavy\cuda_blake512.h" /> <ClInclude Include="heavy\heavy.h" />
<ClInclude Include="heavy\cuda_combine.h" />
<ClInclude Include="heavy\cuda_groestl512.h" />
<ClInclude Include="heavy\cuda_hefty1.h" />
<ClInclude Include="heavy\cuda_keccak512.h" />
<ClInclude Include="heavy\cuda_sha256.h" />
<ClInclude Include="hefty1.h" /> <ClInclude Include="hefty1.h" />
<ClInclude Include="miner.h" /> <ClInclude Include="miner.h" />
<ClInclude Include="nvml.h" /> <ClInclude Include="nvml.h" />
@ -358,6 +353,7 @@
<CudaCompile Include="heavy\cuda_groestl512.cu"> <CudaCompile Include="heavy\cuda_groestl512.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="heavy\cuda_hefty1.cu"> <CudaCompile Include="heavy\cuda_hefty1.cu">
<MaxRegCount>80</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile Include="heavy\cuda_keccak512.cu"> <CudaCompile Include="heavy\cuda_keccak512.cu">
</CudaCompile> </CudaCompile>

22
ccminer.vcxproj.filters

@ -43,9 +43,6 @@
<Filter Include="Source Files\CUDA\heavy"> <Filter Include="Source Files\CUDA\heavy">
<UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier> <UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Header Files\CUDA\heavy">
<UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\qubit"> <Filter Include="Source Files\CUDA\qubit">
<UniqueIdentifier>{f3ed23a2-8ce7-41a5-b051-6da56047dc35}</UniqueIdentifier> <UniqueIdentifier>{f3ed23a2-8ce7-41a5-b051-6da56047dc35}</UniqueIdentifier>
</Filter> </Filter>
@ -293,23 +290,8 @@
<ClInclude Include="sph\sph_types.h"> <ClInclude Include="sph\sph_types.h">
<Filter>Header Files\sph</Filter> <Filter>Header Files\sph</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="heavy\cuda_blake512.h"> <ClInclude Include="heavy\heavy.h">
<Filter>Header Files\CUDA\heavy</Filter> <Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_combine.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_groestl512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_hefty1.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_keccak512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_sha256.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="cuda_helper.h"> <ClInclude Include="cuda_helper.h">
<Filter>Header Files\CUDA</Filter> <Filter>Header Files\CUDA</Filter>

32
heavy/cuda_blake512.cu

@ -3,11 +3,11 @@
#include "cuda_helper.h" #include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *heavy_heftyHashes[8];
extern uint32_t *d_nonceVector[8]; extern uint32_t *heavy_nonceVector[8];
// globaler Speicher für unsere Ergebnisse // globaler Speicher für unsere Ergebnisse
uint32_t *d_hash5output[8]; uint32_t *d_hash5output[8];
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
@ -123,7 +123,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
//uint32_t nounce = startNounce + thread; //uint32_t nounce = startNounce + thread;
uint32_t nounce = nonceVector[thread]; uint32_t nounce = nonceVector[thread];
@ -141,10 +141,10 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
h[6] = 0x1f83d9abfb41bd6bULL; h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL; h[7] = 0x5be0cd19137e2179ULL;
// 128 Byte für die Message // 128 Byte für die Message
uint64_t buf[16]; uint64_t buf[16];
// Message für die erste Runde in Register holen // Message für die erste Runde in Register holen
#pragma unroll 16 #pragma unroll 16
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i]; for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
@ -154,7 +154,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
uint32_t *hefty = heftyHashes + 8 * hashPosition; uint32_t *hefty = heftyHashes + 8 * hashPosition;
if (BLOCKSIZE == 84) { if (BLOCKSIZE == 84) {
// den thread-spezifischen Hefty1 hash einsetzen // den thread-spezifischen Hefty1 hash einsetzen
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist. // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
@ -210,8 +210,8 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
sizeof(host_SecondRound), sizeof(host_SecondRound),
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
// Speicher für alle Ergebnisse belegen // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads); CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
} }
static int BLOCKSIZE = 84; static int BLOCKSIZE = 84;
@ -222,14 +222,14 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
if (len == 84) { if (len == 84) {
// Message mit Padding für erste Runde bereitstellen // Message mit Padding für erste Runde bereitstellen
memcpy(PaddedMessage, pdata, 84); memcpy(PaddedMessage, pdata, 84);
memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+116, 0, 12); memset(PaddedMessage+116, 0, 12);
PaddedMessage[116] = 0x80; PaddedMessage[116] = 0x80;
} else if (len == 80) { } else if (len == 80) {
memcpy(PaddedMessage, pdata, 80); memcpy(PaddedMessage, pdata, 80);
memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+112, 0, 16); memset(PaddedMessage+112, 0, 16);
PaddedMessage[112] = 0x80; PaddedMessage[112] = 0x80;
} }
@ -246,11 +246,11 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
if (BLOCKSIZE == 80) if (BLOCKSIZE == 80)
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE == 84) else if (BLOCKSIZE == 84)
blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
} }

7
heavy/cuda_blake512.h

@ -1,7 +0,0 @@
#ifndef _CUDA_BLAKE512_H
#define _CUDA_BLAKE512_H
void blake512_cpu_init(int thr_id, int threads);
void blake512_cpu_setBlock(void *pdata, int len);
void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
#endif

34
heavy/cuda_combine.cu

@ -1,16 +1,19 @@
#include "cuda_helper.h" #include <stdio.h>
// globaler Speicher für unsere Ergebnisse #include "cuda_helper.h"
uint32_t *d_hashoutput[8];
// globaler Speicher für unsere Ergebnisse
static uint32_t *d_hashoutput[8];
extern uint32_t *d_hash2output[8]; extern uint32_t *d_hash2output[8];
extern uint32_t *d_hash3output[8]; extern uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8]; extern uint32_t *d_hash4output[8];
extern uint32_t *d_hash5output[8]; extern uint32_t *d_hash5output[8];
extern uint32_t *d_nonceVector[8];
extern uint32_t *heavy_nonceVector[8];
/* Combines top 64-bits from each hash into a single hash */ /* Combines top 64-bits from each hash into a single hash */
static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) __device__
static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
{ {
uint32_t lout[8]; // Combining in Registern machen uint32_t lout[8]; // Combining in Registern machen
@ -98,7 +101,8 @@ static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *
out[i] = lout[i]; out[i] = lout[i];
} }
__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) __global__
void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
@ -116,13 +120,14 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou
} }
} }
// Setup-Funktionen __host__
__host__ void combine_cpu_init(int thr_id, int threads) void combine_cpu_init(int thr_id, int threads)
{ {
// Speicher für alle Ergebnisse belegen // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
} }
__host__
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash) void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
{ {
// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten // diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
@ -133,11 +138,8 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs combine_gpu_hash <<<grid, block>>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]);
size_t shared_size = 0;
combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost); CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost));
} }

7
heavy/cuda_combine.h

@ -1,7 +0,0 @@
#ifndef _CUDA_COMBINE_H
#define _CUDA_COMBINE_H
void combine_cpu_init(int thr_id, int threads);
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
#endif

32
heavy/cuda_groestl512.cu

@ -3,11 +3,11 @@
#include "cuda_helper.h" #include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *heavy_heftyHashes[8];
extern uint32_t *d_nonceVector[8]; extern uint32_t *heavy_nonceVector[8];
// globaler Speicher für unsere Ergebnisse // globaler Speicher für unsere Ergebnisse
uint32_t *d_hash4output[8]; uint32_t *d_hash4output[8];
__constant__ uint32_t groestl_gpu_state[32]; __constant__ uint32_t groestl_gpu_state[32];
@ -677,7 +677,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
uint32_t message[32]; uint32_t message[32];
uint32_t state[32]; uint32_t state[32];
// lese message ein & verknüpfe diese mit dem hash1 von hefty1 // lese message ein & verknüpfe diese mit dem hash1 von hefty1
// lese den state ein // lese den state ein
#pragma unroll 32 #pragma unroll 32
@ -753,7 +753,7 @@ __host__ void groestl512_cpu_init(int thr_id, int threads)
texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
// Speicher für alle Ergebnisse belegen // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
} }
@ -778,8 +778,8 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
msgBlock[28] = 0x80; msgBlock[28] = 0x80;
msgBlock[31] = 0x01000000; msgBlock[31] = 0x01000000;
} }
// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
// auf der GPU ausgeführt) // auf der GPU ausgeführt)
// setze register // setze register
uint32_t groestl_state_init[32]; uint32_t groestl_state_init[32];
@ -787,14 +787,10 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
groestl_state_init[31] = 0x20000; groestl_state_init[31] = 0x20000;
// state speichern // state speichern
cudaMemcpyToSymbol( groestl_gpu_state, cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128);
groestl_state_init,
128);
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
cudaMemcpyToSymbol( groestl_gpu_msg, cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128);
msgBlock,
128);
BLOCKSIZE = len; BLOCKSIZE = len;
} }
@ -802,7 +798,7 @@ __host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyH
{ {
// Hefty1 Hashes kopieren (eigentlich nur zum debuggen) // Hefty1 Hashes kopieren (eigentlich nur zum debuggen)
if (copy) if (copy)
cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
} }
__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
@ -813,11 +809,11 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
if (BLOCKSIZE == 84) if (BLOCKSIZE == 84)
groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE == 80) else if (BLOCKSIZE == 80)
groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
} }

9
heavy/cuda_groestl512.h

@ -1,9 +0,0 @@
#ifndef _CUDA_GROESTL512_H
#define _CUDA_GROESTL512_H
void groestl512_cpu_init(int thr_id, int threads);
void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
void groestl512_cpu_setBlock(void *data, int len);
void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
#endif

101
heavy/cuda_hefty1.cu

@ -1,12 +1,14 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#include "miner.h"
#include "cuda_helper.h" #include "cuda_helper.h"
#define USE_SHARED 1 #define USE_SHARED 1
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8]; uint32_t *heavy_heftyHashes[8];
/* Hash-Tabellen */ /* Hash-Tabellen */
__constant__ uint32_t hefty_gpu_constantTable[64]; __constant__ uint32_t hefty_gpu_constantTable[64];
@ -50,11 +52,16 @@ uint32_t hefty_cpu_constantTable[] = {
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
}; };
//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #if 0
static __host__ __device__ uint32_t S(uint32_t x, int n) #define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else
__host__ __device__
static uint32_t S(uint32_t x, int n)
{ {
return (((x) >> (n)) | ((x) << (32 - (n)))); return (((x) >> (n)) | ((x) << (32 - (n))));
} }
#endif
#define R(x, n) ((x) >> (n)) #define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z)) #define Maj(x, y, z) ((x & (y | z)) | (y & z))
@ -67,7 +74,9 @@ static __host__ __device__ uint32_t S(uint32_t x, int n)
// uint8_t // uint8_t
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) #define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
__host__ __forceinline__ __device__
uint8_t smoosh2(uint32_t x)
{ {
uint16_t w = (x >> 16) ^ (x & 0xffff); uint16_t w = (x >> 16) ^ (x & 0xffff);
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
@ -77,13 +86,14 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) #define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) #define getByte(x,y) ( ((x) >> (y)) & 0xFF )
__host__ __forceinline__ __device__ void Mangle(uint32_t *inp) __host__ __forceinline__ __device__
void Mangle(uint32_t *inp)
{ {
uint32_t r = smoosh4Quad(inp[0]); uint32_t r = smoosh4Quad(inp[0]);
uint32_t inp0org; uint32_t inp0org;
uint32_t tmp0Mask, tmp1Mask; uint32_t tmp0Mask, tmp1Mask;
uint32_t in1, in2, isAddition; uint32_t in1, in2, isAddition;
uint32_t tmp; int32_t tmp;
uint8_t b; uint8_t b;
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
@ -92,8 +102,8 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
tmp = smoosh2(inp[1]); tmp = smoosh2(inp[1]);
b = getByte(r,tmp); b = getByte(r,tmp);
inp0org = S(inp[0], b); inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
in1 = (inp[2] & ~inp0org) | in1 = (inp[2] & ~inp0org) |
(tmp1Mask & ~inp[2] & inp0org) | (tmp1Mask & ~inp[2] & inp0org) |
@ -106,8 +116,8 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
tmp = smoosh2(inp[1] ^ inp[2]); tmp = smoosh2(inp[1] ^ inp[2]);
b = getByte(r,tmp); b = getByte(r,tmp);
inp0org = S(inp[0], b); inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
in1 = (inp[3] & ~inp0org) | in1 = (inp[3] & ~inp0org) |
(tmp1Mask & ~inp[3] & inp0org) | (tmp1Mask & ~inp[3] & inp0org) |
@ -119,20 +129,23 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
} }
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x) __host__ __forceinline__ __device__
void Absorb(uint32_t *inp, uint32_t x)
{ {
inp[0] ^= x; inp[0] ^= x;
Mangle(inp); Mangle(inp);
} }
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) __host__ __forceinline__ __device__
uint32_t Squeeze(uint32_t *inp)
{ {
uint32_t y = inp[0]; uint32_t y = inp[0];
Mangle(inp); Mangle(inp);
return y; return y;
} }
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) __host__ __forceinline__ __device__
uint32_t Br(uint32_t *sponge, uint32_t x)
{ {
uint32_t r = Squeeze(sponge); uint32_t r = Squeeze(sponge);
uint32_t t = ((r >> 8) & 0x1F); uint32_t t = ((r >> 8) & 0x1F);
@ -146,7 +159,8 @@ __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
return retVal; return retVal;
} }
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) __device__ __forceinline__
void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{ {
uint32_t tmpBr; uint32_t tmpBr;
@ -169,7 +183,8 @@ __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint
regs[4] += tmpBr; regs[4] += tmpBr;
} }
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) __host__
void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{ {
uint32_t tmpBr; uint32_t tmpBr;
@ -191,11 +206,11 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *
regs[4] += tmpBr; regs[4] += tmpBr;
} }
// Die Hash-Funktion __global__
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash) void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash)
{ {
#if USE_SHARED #if USE_SHARED
extern __shared__ char heftytab[]; extern __shared__ unsigned char heftytab[];
if(threadIdx.x < 64) if(threadIdx.x < 64)
{ {
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x]; *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
@ -207,7 +222,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
@ -290,8 +305,8 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
} }
} }
// Setup-Funktionen __host__
__host__ void hefty_cpu_init(int thr_id, int threads) void hefty_cpu_init(int thr_id, int threads)
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
@ -300,17 +315,18 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
hefty_cpu_constantTable, hefty_cpu_constantTable,
sizeof(uint32_t) * 64 ); sizeof(uint32_t) * 64 );
// Speicher für alle Hefty1 hashes belegen // Speicher für alle Hefty1 hashes belegen
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));
} }
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len) __host__
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
// data muss 80/84-Byte haben! // data muss 80/84-Byte haben!
{ {
// Nachricht expandieren und setzen // Nachricht expandieren und setzen
uint32_t msgBlock[32]; uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32); memset(msgBlock, 0, sizeof(msgBlock));
memcpy(&msgBlock[0], data, len); memcpy(&msgBlock[0], data, len);
if (len == 84) { if (len == 84) {
msgBlock[21] |= 0x80; msgBlock[21] |= 0x80;
@ -323,9 +339,9 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
for(int i=0;i<31;i++) // Byteorder drehen for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]); msgBlock[i] = SWAB32(msgBlock[i]);
// die erste Runde wird auf der CPU durchgeführt, da diese für // die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads // alle Threads gleich ist. Der Hash wird dann an die Threads
// übergeben // übergeben
// Erstelle expandierten Block W // Erstelle expandierten Block W
uint32_t W[64]; uint32_t W[64];
@ -366,39 +382,30 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
hash[k] += regs[k]; hash[k] += regs[k];
// sponge speichern // sponge speichern
cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16);
cudaMemcpyToSymbol( hefty_gpu_sponge,
sponge,
sizeof(uint32_t) * 4 );
// hash speichern // hash speichern
cudaMemcpyToSymbol( hefty_gpu_register, cudaMemcpyToSymbol(hefty_gpu_register, hash, 32);
hash,
sizeof(uint32_t) * 8 );
// Blockheader setzen (korrekte Nonce fehlt da drin noch) // Blockheader setzen (korrekte Nonce fehlt da drin noch)
cudaMemcpyToSymbol( hefty_gpu_blockHeader, CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64));
&msgBlock[16],
64);
} }
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) __host__
void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{ {
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, int threadsperblock = 256;
// alle anderen mit 512 Threads.
int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED #if USE_SHARED
size_t shared_size = 8 * 64 * sizeof(uint32_t); int shared_size = 8 * 64 * sizeof(uint32_t);
#else #else
size_t shared_size = 0; int shared_size = 0;
#endif #endif
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id); MyStreamSynchronize(NULL, 0, thr_id);

8
heavy/cuda_hefty1.h

@ -1,8 +0,0 @@
#ifndef _CUDA_HEFTY1_H
#define _CUDA_HEFTY1_H
void hefty_cpu_hash(int thr_id, int threads, int startNounce);
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
void hefty_cpu_init(int thr_id, int threads);
#endif

45
heavy/cuda_keccak512.cu

@ -3,11 +3,11 @@
#include "cuda_helper.h" #include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *heavy_heftyHashes[8];
extern uint32_t *d_nonceVector[8]; extern uint32_t *heavy_nonceVector[8];
// globaler Speicher für unsere Ergebnisse // globaler Speicher für unsere Ergebnisse
uint32_t *d_hash3output[8]; uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8]; extern uint32_t *d_hash4output[8];
extern uint32_t *d_hash5output[8]; extern uint32_t *d_hash5output[8];
@ -15,13 +15,11 @@ extern uint32_t *d_hash5output[8];
// der Keccak512 State nach der ersten Runde (72 Bytes) // der Keccak512 State nach der ersten Runde (72 Bytes)
__constant__ uint64_t c_State[25]; __constant__ uint64_t c_State[25];
// die Message (72 Bytes) für die zweite Runde auf der GPU // die Message (72 Bytes) für die zweite Runde auf der GPU
__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding __constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding
// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ // ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
#include "cuda_helper.h"
#define U32TO64_LE(p) \ #define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@ -144,7 +142,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
//uint32_t nounce = startNounce + thread; //uint32_t nounce = startNounce + thread;
uint32_t nounce = nonceVector[thread]; uint32_t nounce = nonceVector[thread];
@ -167,7 +165,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
// den individuellen Hefty1 Hash einsetzen // den individuellen Hefty1 Hash einsetzen
mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]); mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
// den Block einmal gut durchschütteln // den Block einmal gut durchschütteln
keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
// das Hash erzeugen // das Hash erzeugen
@ -187,8 +185,8 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
// ---------------------------- END CUDA keccak512 functions ------------------------------------ // ---------------------------- END CUDA keccak512 functions ------------------------------------
// Setup-Funktionen __host__
__host__ void keccak512_cpu_init(int thr_id, int threads) void keccak512_cpu_init(int thr_id, int threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants, cudaMemcpyToSymbol( c_keccak_round_constants,
@ -196,7 +194,7 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
sizeof(host_keccak_round_constants), sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
// Speicher für alle Ergebnisse belegen // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
} }
@ -212,7 +210,8 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
static int BLOCKSIZE = 84; static int BLOCKSIZE = 84;
__host__ void keccak512_cpu_setBlock(void *data, int len) __host__
void keccak512_cpu_setBlock(void *data, int len)
// data muss 80 oder 84-Byte haben! // data muss 80 oder 84-Byte haben!
// heftyHash hat 32-Byte // heftyHash hat 32-Byte
{ {
@ -227,7 +226,7 @@ __host__ void keccak512_cpu_setBlock(void *data, int len)
// state kopieren // state kopieren
cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
// zu jeweils // zu jeweils
uint32_t msgBlock[18]; uint32_t msgBlock[18];
memset(msgBlock, 0, 18 * sizeof(uint32_t)); memset(msgBlock, 0, 18 * sizeof(uint32_t));
@ -238,7 +237,7 @@ __host__ void keccak512_cpu_setBlock(void *data, int len)
else if (len == 80) else if (len == 80)
memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8); memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
// Nachricht abschließen // Nachricht abschließen
if (len == 84) if (len == 84)
msgBlock[11] = 0x01; msgBlock[11] = 0x01;
else if (len == 80) else if (len == 80)
@ -252,15 +251,17 @@ __host__ void keccak512_cpu_setBlock(void *data, int len)
BLOCKSIZE = len; BLOCKSIZE = len;
} }
__host__
__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
{ {
// Hefty1 Hashes kopieren // Hefty1 Hashes kopieren
if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); if (copy)
CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
//else cudaThreadSynchronize(); //else cudaThreadSynchronize();
} }
__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) __host__
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
{ {
const int threadsperblock = 128; const int threadsperblock = 128;
@ -268,11 +269,11 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
if (BLOCKSIZE==84) if (BLOCKSIZE==84)
keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE==80) else if (BLOCKSIZE==80)
keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
} }

9
heavy/cuda_keccak512.h

@ -1,9 +0,0 @@
#ifndef _CUDA_KECCAK512_H
#define _CUDA_KECCAK512_H
void keccak512_cpu_init(int thr_id, int threads);
void keccak512_cpu_setBlock(void *data, int len);
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
#endif

31
heavy/cuda_sha256.cu

@ -3,11 +3,11 @@
#include "cuda_helper.h" #include "cuda_helper.h"
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8]; extern uint32_t *heavy_heftyHashes[8];
extern uint32_t *d_nonceVector[8]; extern uint32_t *heavy_nonceVector[8];
// globaler Speicher für unsere Ergebnisse // globaler Speicher für unsere Ergebnisse
uint32_t *d_hash2output[8]; uint32_t *d_hash2output[8];
@ -47,7 +47,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
nonceVector[thread] = nounce; nonceVector[thread] = nounce;
@ -68,7 +68,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
} }
// 2. Runde // 2. Runde
//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
#pragma unroll 16 #pragma unroll 16
for(int k=0;k<16;k++) for(int k=0;k<16;k++)
@ -168,7 +168,7 @@ __host__ void sha256_cpu_init(int thr_id, int threads)
sha256_cpu_constantTable, sha256_cpu_constantTable,
sizeof(uint32_t) * 64 ); sizeof(uint32_t) * 64 );
// Speicher für alle Ergebnisse belegen // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
} }
@ -184,11 +184,11 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
memset(msgBlock, 0, sizeof(uint32_t) * 32); memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, len); memcpy(&msgBlock[0], data, len);
if (len == 84) { if (len == 84) {
memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
msgBlock[29] |= 0x80; msgBlock[29] |= 0x80;
msgBlock[31] = 928; // bitlen msgBlock[31] = 928; // bitlen
} else if (len == 80) { } else if (len == 80) {
memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
msgBlock[28] |= 0x80; msgBlock[28] |= 0x80;
msgBlock[31] = 896; // bitlen msgBlock[31] = 896; // bitlen
} }
@ -196,9 +196,9 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
for(int i=0;i<31;i++) // Byteorder drehen for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]); msgBlock[i] = SWAB32(msgBlock[i]);
// die erste Runde wird auf der CPU durchgeführt, da diese für // die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads // alle Threads gleich ist. Der Hash wird dann an die Threads
// übergeben // übergeben
uint32_t W[64]; uint32_t W[64];
// Erstelle expandierten Block W // Erstelle expandierten Block W
@ -251,7 +251,8 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
{ {
// Hefty1 Hashes kopieren // Hefty1 Hashes kopieren
if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); if (copy)
CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
//else cudaThreadSynchronize(); //else cudaThreadSynchronize();
} }
@ -263,12 +264,12 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
if (BLOCKSIZE == 84) if (BLOCKSIZE == 84)
sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE == 80) { else if (BLOCKSIZE == 80) {
sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
} }
} }

8
heavy/cuda_sha256.h

@ -1,8 +0,0 @@
#ifndef _CUDA_SHA256_H
#define _CUDA_SHA256_H
void sha256_cpu_init(int thr_id, int threads);
void sha256_cpu_setBlock(void *data, int len);
void sha256_cpu_hash(int thr_id, int threads, int startNounce);
void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
#endif

130
heavy/heavy.cu

@ -1,35 +1,19 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h>
#include <string.h>
#include <map>
#include <openssl/sha.h> #include <openssl/sha.h>
#include <map>
#ifndef _WIN32
#include <unistd.h>
#endif
// include thrust // include thrust
#include <thrust/version.h>
#include <thrust/remove.h> #include <thrust/remove.h>
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include "miner.h" #include "miner.h"
#include "hefty1.h" extern "C" {
#include "sph/sph_keccak.h" #include "sph/sph_keccak.h"
#include "sph/sph_blake.h" #include "sph/sph_blake.h"
#include "sph/sph_groestl.h" #include "sph/sph_groestl.h"
}
#include "heavy/cuda_hefty1.h" #include "hefty1.h"
#include "heavy/cuda_sha256.h" #include "heavy/heavy.h"
#include "heavy/cuda_keccak512.h"
#include "heavy/cuda_groestl512.h"
#include "heavy/cuda_blake512.h"
#include "heavy/cuda_combine.h"
#include "cuda_helper.h" #include "cuda_helper.h"
extern uint32_t *d_hash2output[8]; extern uint32_t *d_hash2output[8];
@ -40,8 +24,10 @@ extern uint32_t *d_hash5output[8];
#define HEAVYCOIN_BLKHDR_SZ 84 #define HEAVYCOIN_BLKHDR_SZ 84
#define MNR_BLKHDR_SZ 80 #define MNR_BLKHDR_SZ 80
// nonce-array für die threads // nonce-array für die threads
uint32_t *d_nonceVector[8]; uint32_t *heavy_nonceVector[8];
extern uint32_t *heavy_heftyHashes[8];
/* Combines top 64-bits from each hash into a single hash */ /* Combines top 64-bits from each hash into a single hash */
static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
@ -98,14 +84,14 @@ static int findhighbit(const uint32_t *ptarget, int words)
} }
// Generiere ein Multiword-Integer das die Zahl // Generiere ein Multiword-Integer das die Zahl
// (2 << highbit) - 1 repräsentiert. // (2 << highbit) - 1 repräsentiert.
static void genmask(uint32_t *ptarget, int words, int highbit) static void genmask(uint32_t *ptarget, int words, int highbit)
{ {
int i; int i;
for (i=words-1; i >= 0; --i) for (i=words-1; i >= 0; --i)
{ {
if ((i+1)*32 <= highbit) if ((i+1)*32 <= highbit)
ptarget[i] = 0xffffffff; ptarget[i] = UINT32_MAX;
else if (i*32 > highbit) else if (i*32 > highbit)
ptarget[i] = 0x00000000; ptarget[i] = 0x00000000;
else else
@ -121,6 +107,11 @@ struct check_nonce_for_remove
m_hashlen(hashlen), m_hashlen(hashlen),
m_startNonce(startNonce) { } m_startNonce(startNonce) { }
uint64_t m_target;
uint32_t *m_hashes;
uint32_t m_hashlen;
uint32_t m_startNonce;
__device__ __device__
bool operator()(const uint32_t x) bool operator()(const uint32_t x)
{ {
@ -129,53 +120,39 @@ struct check_nonce_for_remove
// Wert des Hashes (als uint64_t) auslesen. // Wert des Hashes (als uint64_t) auslesen.
// Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits)
uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. bool res = (hashValue & m_target) != hashValue;
return (hashValue & m_target) != hashValue; //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target);
// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
return res;
} }
uint64_t m_target;
uint32_t *m_hashes;
uint32_t m_hashlen;
uint32_t m_startNonce;
}; };
int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, static bool init[8] = {0,0,0,0,0,0,0,0};
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t maxvote, int blocklen);
extern "C" __host__
int scanhash_heavy(int thr_id, uint32_t *pdata, int scanhash_heavy(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t maxvote, int blocklen) unsigned long *hashes_done, uint32_t maxvote, int blocklen)
{
return scanhash_heavy_cpp(thr_id, pdata,
ptarget, max_nonce, hashes_done, maxvote, blocklen);
}
int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t maxvote, int blocklen)
{ {
const uint32_t first_nonce = pdata[19]; /* to check */ const uint32_t first_nonce = pdata[19]; /* to check */
// CUDA will process thousands of threads. // CUDA will process thousands of threads.
int throughput = opt_work_size ? opt_work_size : (1 << 19); // 128*4096 int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*2048
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, (int)(max_nonce - first_nonce));
int rc = 0; int rc = 0;
uint32_t *hash = NULL; uint32_t *hash = NULL;
cudaMallocHost(&hash, throughput*8*sizeof(uint32_t));
uint32_t *cpu_nonceVector = NULL; uint32_t *cpu_nonceVector = NULL;
cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)); CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)));
int nrmCalls[6]; int nrmCalls[6];
memset(nrmCalls, 0, sizeof(int) * 6); memset(nrmCalls, 0, sizeof(int) * 6);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000000ff; ((uint32_t*)ptarget)[7] = 0x00ff;
// für jeden Hash ein individuelles Target erstellen basierend // für jeden Hash ein individuelles Target erstellen basierend
// auf dem höchsten Bit, das in ptarget gesetzt ist. // auf dem höchsten Bit, das in ptarget gesetzt ist.
int highbit = findhighbit(ptarget, 8); int highbit = findhighbit(ptarget, 8);
uint32_t target2[2], target3[2], target4[2], target5[2]; uint32_t target2[2], target3[2], target4[2], target5[2];
genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
@ -183,7 +160,6 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id]) if (!init[thr_id])
{ {
hefty_cpu_init(thr_id, throughput); hefty_cpu_init(thr_id, throughput);
@ -192,8 +168,10 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
groestl512_cpu_init(thr_id, throughput); groestl512_cpu_init(thr_id, throughput);
blake512_cpu_init(thr_id, throughput); blake512_cpu_init(thr_id, throughput);
combine_cpu_init(thr_id, throughput); combine_cpu_init(thr_id, throughput);
CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput));
init[thr_id] = true; init[thr_id] = true;
cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput);
} }
if (blocklen == HEAVYCOIN_BLKHDR_SZ) if (blocklen == HEAVYCOIN_BLKHDR_SZ)
@ -201,13 +179,13 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
uint16_t *ext = (uint16_t *)&pdata[20]; uint16_t *ext = (uint16_t *)&pdata[20];
if (opt_vote > maxvote) { if (opt_vote > maxvote) {
printf("Warning: Your block reward vote (%hu) exceeds " applog(LOG_WARNING, "Your block reward vote (%hu) exceeds "
"the maxvote reported by the pool (%hu).\n", "the maxvote reported by the pool (%hu).",
opt_vote, maxvote); opt_vote, maxvote);
} }
if (opt_trust_pool && opt_vote > maxvote) { if (opt_trust_pool && opt_vote > maxvote) {
printf("Warning: Capping block reward vote to maxvote reported by pool.\n"); applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool.");
ext[0] = maxvote; ext[0] = maxvote;
} }
else else
@ -222,23 +200,24 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
blake512_cpu_setBlock(pdata, blocklen); blake512_cpu_setBlock(pdata, blocklen);
do { do {
uint32_t i;
////// Compaction init ////// Compaction init
thrust::device_ptr<uint32_t> devNoncePtr(d_nonceVector[thr_id]); thrust::device_ptr<uint32_t> devNoncePtr(heavy_nonceVector[thr_id]);
thrust::device_ptr<uint32_t> devNoncePtrEnd((d_nonceVector[thr_id]) + throughput); thrust::device_ptr<uint32_t> devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput);
uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;
uint64_t *t;
hefty_cpu_hash(thr_id, throughput, pdata[19]); hefty_cpu_hash(thr_id, throughput, pdata[19]);
//cudaThreadSynchronize(); //cudaThreadSynchronize();
sha256_cpu_hash(thr_id, throughput, pdata[19]); sha256_cpu_hash(thr_id, throughput, pdata[19]);
//cudaThreadSynchronize(); //cudaThreadSynchronize();
// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
MyStreamSynchronize(NULL, 1, thr_id); MyStreamSynchronize(NULL, 1, thr_id);
////// Compaction ////// Compaction
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19])); t = (uint64_t*) target2;
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19]));
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
if(actualNumberOfValuesInNonceVectorGPU == 0) if(actualNumberOfValuesInNonceVectorGPU == 0)
goto emptyNonceVector; goto emptyNonceVector;
@ -247,7 +226,8 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
//cudaThreadSynchronize(); //cudaThreadSynchronize();
////// Compaction ////// Compaction
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19])); t = (uint64_t*) target3;
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19]));
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
if(actualNumberOfValuesInNonceVectorGPU == 0) if(actualNumberOfValuesInNonceVectorGPU == 0)
goto emptyNonceVector; goto emptyNonceVector;
@ -256,7 +236,8 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
//cudaThreadSynchronize(); //cudaThreadSynchronize();
////// Compaction ////// Compaction
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19])); t = (uint64_t*) target5;
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19]));
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
if(actualNumberOfValuesInNonceVectorGPU == 0) if(actualNumberOfValuesInNonceVectorGPU == 0)
goto emptyNonceVector; goto emptyNonceVector;
@ -265,7 +246,8 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
//cudaThreadSynchronize(); //cudaThreadSynchronize();
////// Compaction ////// Compaction
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19])); t = (uint64_t*) target4;
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19]));
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
if(actualNumberOfValuesInNonceVectorGPU == 0) if(actualNumberOfValuesInNonceVectorGPU == 0)
goto emptyNonceVector; goto emptyNonceVector;
@ -273,17 +255,22 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
// combine // combine
combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);
if (opt_tracegpu) {
applog(LOG_BLUE, "heavy GPU hash:");
applog_hash((uchar*)hash);
}
// Ergebnisse kopieren // Ergebnisse kopieren
if(actualNumberOfValuesInNonceVectorGPU > 0) if(actualNumberOfValuesInNonceVectorGPU > 0)
{ {
cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost); size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
for (i=0; i<actualNumberOfValuesInNonceVectorGPU;++i) for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++)
{ {
uint32_t nonce = cpu_nonceVector[i]; uint32_t nonce = cpu_nonceVector[i];
//uint32_t index = nonce - pdata[19]; uint32_t *foundhash = &hash[8*i];
uint32_t index = i;
uint32_t *foundhash = &hash[8*index];
if (foundhash[7] <= ptarget[7]) { if (foundhash[7] <= ptarget[7]) {
if (fulltest(foundhash, ptarget)) { if (fulltest(foundhash, ptarget)) {
uint32_t verification[8]; uint32_t verification[8];
@ -291,9 +278,7 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
heavycoin_hash((uchar*)verification, (uchar*)pdata, blocklen); heavycoin_hash((uchar*)verification, (uchar*)pdata, blocklen);
if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) { if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) {
applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce); applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce);
} } else {
else
{
*hashes_done = pdata[19] - first_nonce; *hashes_done = pdata[19] - first_nonce;
rc = 1; rc = 1;
goto exit; goto exit;
@ -316,6 +301,7 @@ exit:
return rc; return rc;
} }
__host__
void heavycoin_hash(uchar* output, const uchar* input, int len) void heavycoin_hash(uchar* output, const uchar* input, int len)
{ {
unsigned char hash1[32]; unsigned char hash1[32];

30
heavy/heavy.h

@ -0,0 +1,30 @@
#ifndef _CUDA_HEAVY_H
#define _CUDA_HEAVY_H
void blake512_cpu_init(int thr_id, int threads);
void blake512_cpu_setBlock(void *pdata, int len);
void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
void groestl512_cpu_init(int thr_id, int threads);
void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
void groestl512_cpu_setBlock(void *data, int len);
void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
void hefty_cpu_hash(int thr_id, int threads, int startNounce);
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
void hefty_cpu_init(int thr_id, int threads);
void keccak512_cpu_init(int thr_id, int threads);
void keccak512_cpu_setBlock(void *data, int len);
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
void sha256_cpu_init(int thr_id, int threads);
void sha256_cpu_setBlock(void *data, int len);
void sha256_cpu_hash(int thr_id, int threads, int startNounce);
void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
void combine_cpu_init(int thr_id, int threads);
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
#endif

2
stats.cpp

@ -37,7 +37,7 @@ void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8
return; return;
memset(&data, 0, sizeof(data)); memset(&data, 0, sizeof(data));
data.uid = uid; data.uid = (uint32_t) uid;
data.gpu_id = gpu; data.gpu_id = gpu;
data.thr_id = (uint8_t)thr_id; data.thr_id = (uint8_t)thr_id;
data.tm_stat = (uint32_t) time(NULL); data.tm_stat = (uint32_t) time(NULL);

18
util.cpp

@ -102,10 +102,8 @@ void applog(int prio, const char *fmt, ...)
const char* color = ""; const char* color = "";
char *f; char *f;
int len; int len;
time_t now;
struct tm tm, *tm_p; struct tm tm, *tm_p;
time_t now = time(NULL);
time(&now);
pthread_mutex_lock(&applog_lock); pthread_mutex_lock(&applog_lock);
tm_p = localtime(&now); tm_p = localtime(&now);
@ -735,9 +733,7 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
if (!strstr(sctx->sockbuf, "\n")) { if (!strstr(sctx->sockbuf, "\n")) {
bool ret = true; bool ret = true;
time_t rstart; time_t rstart = time(NULL);
time(&rstart);
if (!socket_full(sctx->sock, 60)) { if (!socket_full(sctx->sock, 60)) {
applog(LOG_ERR, "stratum_recv_line timed out"); applog(LOG_ERR, "stratum_recv_line timed out");
goto out; goto out;
@ -1578,16 +1574,18 @@ void do_gpu_tests(void)
uchar buf[128]; uchar buf[128];
uint32_t tgt[8] = { 0 }; uint32_t tgt[8] = { 0 };
memset(buf, 0, sizeof buf);
buf[0] = 1; buf[64] = 2;
opt_tracegpu = true; opt_tracegpu = true;
work_restart = (struct work_restart*) malloc(sizeof(struct work_restart)); work_restart = (struct work_restart*) malloc(sizeof(struct work_restart));
work_restart[0].restart = 1; work_restart[0].restart = 1;
tgt[6] = 0xffff; tgt[7] = 0xffff;
memset(buf, 0, sizeof buf);
// buf[0] = 1; buf[64] = 2; // for endian tests
scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
memset(buf, 0, sizeof buf);
scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
free(work_restart); free(work_restart);
work_restart = NULL; work_restart = NULL;
opt_tracegpu = false; opt_tracegpu = false;

Loading…
Cancel
Save