heavy: add error checks, fix strict aliasing and linux
The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80...
This commit is contained in:
parent
1032f191bf
commit
1b65cd05cc
@ -80,6 +80,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
|
|||||||
blake32.o: blake32.cu
|
blake32.o: blake32.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
|
||||||
|
|
||||||
|
heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
|
||||||
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
|
||||||
|
|
||||||
keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
|
keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
|
||||||
|
|
||||||
|
10
blake32.cu
10
blake32.cu
@ -303,7 +303,11 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
|
|||||||
ending[3] = nonce; /* our tested value */
|
ending[3] = nonce; /* our tested value */
|
||||||
|
|
||||||
blake256_compress(h, ending, 640, rounds);
|
blake256_compress(h, ending, 640, rounds);
|
||||||
|
#if 0
|
||||||
|
if (trace) {
|
||||||
|
printf("blake hash[6][7]: %08x %08x\n", h[6], h[7]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
//if (h[7] == 0 && high64 <= highTarget) {
|
//if (h[7] == 0 && high64 <= highTarget) {
|
||||||
if (h[7] == 0) {
|
if (h[7] == 0) {
|
||||||
#if NBN == 2
|
#if NBN == 2
|
||||||
@ -318,14 +322,14 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
|
|||||||
#else
|
#else
|
||||||
resNonce[0] = nonce;
|
resNonce[0] = nonce;
|
||||||
#endif
|
#endif
|
||||||
if (trace) {
|
|
||||||
#ifdef _DEBUG
|
#ifdef _DEBUG
|
||||||
|
if (trace) {
|
||||||
uint64_t high64 = ((uint64_t*)h)[3];
|
uint64_t high64 = ((uint64_t*)h)[3];
|
||||||
printf("gpu: %16llx\n", high64);
|
printf("gpu: %16llx\n", high64);
|
||||||
printf("gpu: %08x.%08x\n", h[7], h[6]);
|
printf("gpu: %08x.%08x\n", h[7], h[6]);
|
||||||
printf("tgt: %16llx\n", highTarget);
|
printf("tgt: %16llx\n", highTarget);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -173,7 +173,7 @@
|
|||||||
<MaxRegCount>80</MaxRegCount>
|
<MaxRegCount>80</MaxRegCount>
|
||||||
<PtxAsOptionV>true</PtxAsOptionV>
|
<PtxAsOptionV>true</PtxAsOptionV>
|
||||||
<Keep>false</Keep>
|
<Keep>false</Keep>
|
||||||
<CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
|
<CodeGeneration>compute_50,sm_50</CodeGeneration>
|
||||||
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
|
||||||
<Defines>
|
<Defines>
|
||||||
</Defines>
|
</Defines>
|
||||||
@ -306,12 +306,7 @@
|
|||||||
<ClInclude Include="cuda_groestlcoin.h" />
|
<ClInclude Include="cuda_groestlcoin.h" />
|
||||||
<ClInclude Include="cuda_helper.h" />
|
<ClInclude Include="cuda_helper.h" />
|
||||||
<ClInclude Include="elist.h" />
|
<ClInclude Include="elist.h" />
|
||||||
<ClInclude Include="heavy\cuda_blake512.h" />
|
<ClInclude Include="heavy\heavy.h" />
|
||||||
<ClInclude Include="heavy\cuda_combine.h" />
|
|
||||||
<ClInclude Include="heavy\cuda_groestl512.h" />
|
|
||||||
<ClInclude Include="heavy\cuda_hefty1.h" />
|
|
||||||
<ClInclude Include="heavy\cuda_keccak512.h" />
|
|
||||||
<ClInclude Include="heavy\cuda_sha256.h" />
|
|
||||||
<ClInclude Include="hefty1.h" />
|
<ClInclude Include="hefty1.h" />
|
||||||
<ClInclude Include="miner.h" />
|
<ClInclude Include="miner.h" />
|
||||||
<ClInclude Include="nvml.h" />
|
<ClInclude Include="nvml.h" />
|
||||||
@ -358,6 +353,7 @@
|
|||||||
<CudaCompile Include="heavy\cuda_groestl512.cu">
|
<CudaCompile Include="heavy\cuda_groestl512.cu">
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="heavy\cuda_hefty1.cu">
|
<CudaCompile Include="heavy\cuda_hefty1.cu">
|
||||||
|
<MaxRegCount>80</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="heavy\cuda_keccak512.cu">
|
<CudaCompile Include="heavy\cuda_keccak512.cu">
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
|
@ -43,9 +43,6 @@
|
|||||||
<Filter Include="Source Files\CUDA\heavy">
|
<Filter Include="Source Files\CUDA\heavy">
|
||||||
<UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier>
|
<UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier>
|
||||||
</Filter>
|
</Filter>
|
||||||
<Filter Include="Header Files\CUDA\heavy">
|
|
||||||
<UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
|
|
||||||
</Filter>
|
|
||||||
<Filter Include="Source Files\CUDA\qubit">
|
<Filter Include="Source Files\CUDA\qubit">
|
||||||
<UniqueIdentifier>{f3ed23a2-8ce7-41a5-b051-6da56047dc35}</UniqueIdentifier>
|
<UniqueIdentifier>{f3ed23a2-8ce7-41a5-b051-6da56047dc35}</UniqueIdentifier>
|
||||||
</Filter>
|
</Filter>
|
||||||
@ -293,23 +290,8 @@
|
|||||||
<ClInclude Include="sph\sph_types.h">
|
<ClInclude Include="sph\sph_types.h">
|
||||||
<Filter>Header Files\sph</Filter>
|
<Filter>Header Files\sph</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
<ClInclude Include="heavy\cuda_blake512.h">
|
<ClInclude Include="heavy\heavy.h">
|
||||||
<Filter>Header Files\CUDA\heavy</Filter>
|
<Filter>Header Files\CUDA</Filter>
|
||||||
</ClInclude>
|
|
||||||
<ClInclude Include="heavy\cuda_combine.h">
|
|
||||||
<Filter>Header Files\CUDA\heavy</Filter>
|
|
||||||
</ClInclude>
|
|
||||||
<ClInclude Include="heavy\cuda_groestl512.h">
|
|
||||||
<Filter>Header Files\CUDA\heavy</Filter>
|
|
||||||
</ClInclude>
|
|
||||||
<ClInclude Include="heavy\cuda_hefty1.h">
|
|
||||||
<Filter>Header Files\CUDA\heavy</Filter>
|
|
||||||
</ClInclude>
|
|
||||||
<ClInclude Include="heavy\cuda_keccak512.h">
|
|
||||||
<Filter>Header Files\CUDA\heavy</Filter>
|
|
||||||
</ClInclude>
|
|
||||||
<ClInclude Include="heavy\cuda_sha256.h">
|
|
||||||
<Filter>Header Files\CUDA\heavy</Filter>
|
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
<ClInclude Include="cuda_helper.h">
|
<ClInclude Include="cuda_helper.h">
|
||||||
<Filter>Header Files\CUDA</Filter>
|
<Filter>Header Files\CUDA</Filter>
|
||||||
@ -539,4 +521,4 @@
|
|||||||
<Filter>Source Files\CUDA\x11</Filter>
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
</Project>
|
</Project>
|
@ -3,11 +3,11 @@
|
|||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
// globaler Speicher für alle HeftyHashes aller Threads
|
// globaler Speicher für alle HeftyHashes aller Threads
|
||||||
extern uint32_t *d_heftyHashes[8];
|
extern uint32_t *heavy_heftyHashes[8];
|
||||||
extern uint32_t *d_nonceVector[8];
|
extern uint32_t *heavy_nonceVector[8];
|
||||||
|
|
||||||
// globaler Speicher für unsere Ergebnisse
|
// globaler Speicher für unsere Ergebnisse
|
||||||
uint32_t *d_hash5output[8];
|
uint32_t *d_hash5output[8];
|
||||||
|
|
||||||
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
|
// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
|
||||||
@ -53,13 +53,13 @@ __constant__ uint64_t c_u512[16];
|
|||||||
|
|
||||||
const uint64_t host_u512[16] =
|
const uint64_t host_u512[16] =
|
||||||
{
|
{
|
||||||
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
|
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
|
||||||
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
|
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
|
||||||
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
|
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
|
||||||
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
|
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
|
||||||
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
|
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
|
||||||
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
|
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
|
||||||
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
|
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
|
||||||
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
|
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -123,7 +123,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
|
|||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
// bestimme den aktuellen Zähler
|
// bestimme den aktuellen Zähler
|
||||||
//uint32_t nounce = startNounce + thread;
|
//uint32_t nounce = startNounce + thread;
|
||||||
uint32_t nounce = nonceVector[thread];
|
uint32_t nounce = nonceVector[thread];
|
||||||
|
|
||||||
@ -141,10 +141,10 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
|
|||||||
h[6] = 0x1f83d9abfb41bd6bULL;
|
h[6] = 0x1f83d9abfb41bd6bULL;
|
||||||
h[7] = 0x5be0cd19137e2179ULL;
|
h[7] = 0x5be0cd19137e2179ULL;
|
||||||
|
|
||||||
// 128 Byte für die Message
|
// 128 Byte für die Message
|
||||||
uint64_t buf[16];
|
uint64_t buf[16];
|
||||||
|
|
||||||
// Message für die erste Runde in Register holen
|
// Message für die erste Runde in Register holen
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
|
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
|
||||||
|
|
||||||
@ -154,7 +154,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
|
|||||||
uint32_t *hefty = heftyHashes + 8 * hashPosition;
|
uint32_t *hefty = heftyHashes + 8 * hashPosition;
|
||||||
if (BLOCKSIZE == 84) {
|
if (BLOCKSIZE == 84) {
|
||||||
// den thread-spezifischen Hefty1 hash einsetzen
|
// den thread-spezifischen Hefty1 hash einsetzen
|
||||||
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
|
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
|
||||||
buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
|
buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
|
||||||
buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
|
buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
|
||||||
buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
|
buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
|
||||||
@ -173,14 +173,14 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
|
|||||||
|
|
||||||
// erste Runde
|
// erste Runde
|
||||||
blake512_compress<BLOCKSIZE>( h, buf, 0, c_sigma, c_u512 );
|
blake512_compress<BLOCKSIZE>( h, buf, 0, c_sigma, c_u512 );
|
||||||
|
|
||||||
|
|
||||||
// zweite Runde
|
// zweite Runde
|
||||||
#pragma unroll 15
|
#pragma unroll 15
|
||||||
for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
|
for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
|
||||||
buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
|
buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
|
||||||
blake512_compress<BLOCKSIZE>( h, buf, 1, c_sigma, c_u512 );
|
blake512_compress<BLOCKSIZE>( h, buf, 1, c_sigma, c_u512 );
|
||||||
|
|
||||||
// Hash rauslassen
|
// Hash rauslassen
|
||||||
uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
|
uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
@ -210,8 +210,8 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
|
|||||||
sizeof(host_SecondRound),
|
sizeof(host_SecondRound),
|
||||||
0, cudaMemcpyHostToDevice);
|
0, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
// Speicher für alle Ergebnisse belegen
|
// Speicher für alle Ergebnisse belegen
|
||||||
cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
|
CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
|
||||||
}
|
}
|
||||||
|
|
||||||
static int BLOCKSIZE = 84;
|
static int BLOCKSIZE = 84;
|
||||||
@ -222,14 +222,14 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
|
|||||||
{
|
{
|
||||||
unsigned char PaddedMessage[128];
|
unsigned char PaddedMessage[128];
|
||||||
if (len == 84) {
|
if (len == 84) {
|
||||||
// Message mit Padding für erste Runde bereitstellen
|
// Message mit Padding für erste Runde bereitstellen
|
||||||
memcpy(PaddedMessage, pdata, 84);
|
memcpy(PaddedMessage, pdata, 84);
|
||||||
memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
|
memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
|
||||||
memset(PaddedMessage+116, 0, 12);
|
memset(PaddedMessage+116, 0, 12);
|
||||||
PaddedMessage[116] = 0x80;
|
PaddedMessage[116] = 0x80;
|
||||||
} else if (len == 80) {
|
} else if (len == 80) {
|
||||||
memcpy(PaddedMessage, pdata, 80);
|
memcpy(PaddedMessage, pdata, 80);
|
||||||
memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
|
memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
|
||||||
memset(PaddedMessage+112, 0, 16);
|
memset(PaddedMessage+112, 0, 16);
|
||||||
PaddedMessage[112] = 0x80;
|
PaddedMessage[112] = 0x80;
|
||||||
}
|
}
|
||||||
@ -246,11 +246,11 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
if (BLOCKSIZE == 80)
|
if (BLOCKSIZE == 80)
|
||||||
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
else if (BLOCKSIZE == 84)
|
else if (BLOCKSIZE == 84)
|
||||||
blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
}
|
}
|
||||||
|
@ -1,7 +0,0 @@
|
|||||||
#ifndef _CUDA_BLAKE512_H
|
|
||||||
#define _CUDA_BLAKE512_H
|
|
||||||
|
|
||||||
void blake512_cpu_init(int thr_id, int threads);
|
|
||||||
void blake512_cpu_setBlock(void *pdata, int len);
|
|
||||||
void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
|
|
||||||
#endif
|
|
@ -1,16 +1,19 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
// globaler Speicher für unsere Ergebnisse
|
// globaler Speicher für unsere Ergebnisse
|
||||||
uint32_t *d_hashoutput[8];
|
static uint32_t *d_hashoutput[8];
|
||||||
|
|
||||||
extern uint32_t *d_hash2output[8];
|
extern uint32_t *d_hash2output[8];
|
||||||
extern uint32_t *d_hash3output[8];
|
extern uint32_t *d_hash3output[8];
|
||||||
extern uint32_t *d_hash4output[8];
|
extern uint32_t *d_hash4output[8];
|
||||||
extern uint32_t *d_hash5output[8];
|
extern uint32_t *d_hash5output[8];
|
||||||
extern uint32_t *d_nonceVector[8];
|
|
||||||
|
extern uint32_t *heavy_nonceVector[8];
|
||||||
|
|
||||||
/* Combines top 64-bits from each hash into a single hash */
|
/* Combines top 64-bits from each hash into a single hash */
|
||||||
static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
|
__device__
|
||||||
|
static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
|
||||||
{
|
{
|
||||||
uint32_t lout[8]; // Combining in Registern machen
|
uint32_t lout[8]; // Combining in Registern machen
|
||||||
|
|
||||||
@ -98,7 +101,8 @@ static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *
|
|||||||
out[i] = lout[i];
|
out[i] = lout[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
|
__global__
|
||||||
|
void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
@ -116,13 +120,14 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup-Funktionen
|
__host__
|
||||||
__host__ void combine_cpu_init(int thr_id, int threads)
|
void combine_cpu_init(int thr_id, int threads)
|
||||||
{
|
{
|
||||||
// Speicher für alle Ergebnisse belegen
|
// Speicher für alle Ergebnisse belegen
|
||||||
cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
|
CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
|
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
|
||||||
{
|
{
|
||||||
// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
|
// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
|
||||||
@ -133,11 +138,8 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
combine_gpu_hash <<<grid, block>>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]);
|
||||||
size_t shared_size = 0;
|
|
||||||
|
|
||||||
combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
|
// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
|
||||||
|
CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost));
|
||||||
// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
|
|
||||||
cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
|
|
||||||
}
|
}
|
||||||
|
@ -1,7 +0,0 @@
|
|||||||
#ifndef _CUDA_COMBINE_H
|
|
||||||
#define _CUDA_COMBINE_H
|
|
||||||
|
|
||||||
void combine_cpu_init(int thr_id, int threads);
|
|
||||||
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
|
|
||||||
|
|
||||||
#endif
|
|
@ -3,11 +3,11 @@
|
|||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
// globaler Speicher für alle HeftyHashes aller Threads
|
// globaler Speicher für alle HeftyHashes aller Threads
|
||||||
extern uint32_t *d_heftyHashes[8];
|
extern uint32_t *heavy_heftyHashes[8];
|
||||||
extern uint32_t *d_nonceVector[8];
|
extern uint32_t *heavy_nonceVector[8];
|
||||||
|
|
||||||
// globaler Speicher für unsere Ergebnisse
|
// globaler Speicher für unsere Ergebnisse
|
||||||
uint32_t *d_hash4output[8];
|
uint32_t *d_hash4output[8];
|
||||||
|
|
||||||
__constant__ uint32_t groestl_gpu_state[32];
|
__constant__ uint32_t groestl_gpu_state[32];
|
||||||
@ -603,22 +603,22 @@ __device__ void groestl512_perm_P(uint32_t *a)
|
|||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int k=0;k<32;k+=2)
|
for(int k=0;k<32;k+=2)
|
||||||
{
|
{
|
||||||
t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^
|
t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^
|
||||||
T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
|
T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
|
||||||
T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
|
T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
|
||||||
T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
|
T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
|
||||||
T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
|
T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
|
||||||
T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
|
T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
|
||||||
T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
|
T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
|
||||||
T3dn( B32_3(a[(k + 23) & 0x1f]) );
|
T3dn( B32_3(a[(k + 23) & 0x1f]) );
|
||||||
|
|
||||||
t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^
|
t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^
|
||||||
T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
|
T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
|
||||||
T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
|
T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
|
||||||
T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
|
T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
|
||||||
T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
|
T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
|
||||||
T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
|
T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
|
||||||
T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
|
T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
|
||||||
T3up( B32_3(a[(k + 23) & 0x1f]) );
|
T3up( B32_3(a[(k + 23) & 0x1f]) );
|
||||||
}
|
}
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
@ -645,22 +645,22 @@ __device__ void groestl512_perm_Q(uint32_t *a)
|
|||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int k=0;k<32;k+=2)
|
for(int k=0;k<32;k+=2)
|
||||||
{
|
{
|
||||||
t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
|
t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
|
||||||
T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
|
T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
|
||||||
T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
|
T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
|
||||||
T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
|
T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
|
||||||
T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
|
T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
|
||||||
T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
|
T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
|
||||||
T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
|
T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
|
||||||
T3dn( B32_3(a[(k + 13) & 0x1f]) );
|
T3dn( B32_3(a[(k + 13) & 0x1f]) );
|
||||||
|
|
||||||
t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
|
t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
|
||||||
T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
|
T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
|
||||||
T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
|
T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
|
||||||
T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
|
T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
|
||||||
T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
|
T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
|
||||||
T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
|
T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
|
||||||
T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
|
T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
|
||||||
T3up( B32_3(a[(k + 13) & 0x1f]) );
|
T3up( B32_3(a[(k + 13) & 0x1f]) );
|
||||||
}
|
}
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
@ -677,7 +677,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
|
|||||||
uint32_t message[32];
|
uint32_t message[32];
|
||||||
uint32_t state[32];
|
uint32_t state[32];
|
||||||
|
|
||||||
// lese message ein & verknüpfe diese mit dem hash1 von hefty1
|
// lese message ein & verknüpfe diese mit dem hash1 von hefty1
|
||||||
// lese den state ein
|
// lese den state ein
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
@ -700,7 +700,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
|
|||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (int k=0; k<8; ++k)
|
for (int k=0; k<8; ++k)
|
||||||
message[BLOCKSIZE/4+k] = heftyHash[k];
|
message[BLOCKSIZE/4+k] = heftyHash[k];
|
||||||
|
|
||||||
uint32_t g[32];
|
uint32_t g[32];
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++)
|
||||||
@ -709,7 +709,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
|
|||||||
// Perm
|
// Perm
|
||||||
groestl512_perm_P(g);
|
groestl512_perm_P(g);
|
||||||
groestl512_perm_Q(message);
|
groestl512_perm_Q(message);
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++)
|
||||||
{
|
{
|
||||||
@ -753,7 +753,7 @@ __host__ void groestl512_cpu_init(int thr_id, int threads)
|
|||||||
texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
||||||
texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
||||||
|
|
||||||
// Speicher für alle Ergebnisse belegen
|
// Speicher für alle Ergebnisse belegen
|
||||||
cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
|
cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -778,31 +778,27 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
|
|||||||
msgBlock[28] = 0x80;
|
msgBlock[28] = 0x80;
|
||||||
msgBlock[31] = 0x01000000;
|
msgBlock[31] = 0x01000000;
|
||||||
}
|
}
|
||||||
// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
|
// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
|
||||||
// auf der GPU ausgeführt)
|
// auf der GPU ausgeführt)
|
||||||
|
|
||||||
// setze register
|
// setze register
|
||||||
uint32_t groestl_state_init[32];
|
uint32_t groestl_state_init[32];
|
||||||
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
|
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
|
||||||
groestl_state_init[31] = 0x20000;
|
groestl_state_init[31] = 0x20000;
|
||||||
|
|
||||||
// state speichern
|
// state speichern
|
||||||
cudaMemcpyToSymbol( groestl_gpu_state,
|
cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128);
|
||||||
groestl_state_init,
|
|
||||||
128);
|
|
||||||
|
|
||||||
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
|
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
|
||||||
cudaMemcpyToSymbol( groestl_gpu_msg,
|
cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128);
|
||||||
msgBlock,
|
|
||||||
128);
|
|
||||||
BLOCKSIZE = len;
|
BLOCKSIZE = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
|
__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
|
||||||
{
|
{
|
||||||
// Hefty1 Hashes kopieren (eigentlich nur zum debuggen)
|
// Hefty1 Hashes kopieren (eigentlich nur zum debuggen)
|
||||||
if (copy)
|
if (copy)
|
||||||
cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
|
CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||||
@ -813,11 +809,11 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
if (BLOCKSIZE == 84)
|
if (BLOCKSIZE == 84)
|
||||||
groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
else if (BLOCKSIZE == 80)
|
else if (BLOCKSIZE == 80)
|
||||||
groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
}
|
}
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
#ifndef _CUDA_GROESTL512_H
|
|
||||||
#define _CUDA_GROESTL512_H
|
|
||||||
|
|
||||||
void groestl512_cpu_init(int thr_id, int threads);
|
|
||||||
void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
|
|
||||||
void groestl512_cpu_setBlock(void *data, int len);
|
|
||||||
void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,12 +1,14 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
|
|
||||||
|
#include "miner.h"
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
#define USE_SHARED 1
|
#define USE_SHARED 1
|
||||||
|
|
||||||
// globaler Speicher für alle HeftyHashes aller Threads
|
// globaler Speicher für alle HeftyHashes aller Threads
|
||||||
uint32_t *d_heftyHashes[8];
|
uint32_t *heavy_heftyHashes[8];
|
||||||
|
|
||||||
/* Hash-Tabellen */
|
/* Hash-Tabellen */
|
||||||
__constant__ uint32_t hefty_gpu_constantTable[64];
|
__constant__ uint32_t hefty_gpu_constantTable[64];
|
||||||
@ -30,7 +32,7 @@ uint32_t hefty_cpu_hashTable[] = {
|
|||||||
0x9b05688cUL,
|
0x9b05688cUL,
|
||||||
0x1f83d9abUL,
|
0x1f83d9abUL,
|
||||||
0x5be0cd19UL };
|
0x5be0cd19UL };
|
||||||
|
|
||||||
uint32_t hefty_cpu_constantTable[] = {
|
uint32_t hefty_cpu_constantTable[] = {
|
||||||
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
|
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
|
||||||
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
|
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
|
||||||
@ -50,11 +52,16 @@ uint32_t hefty_cpu_constantTable[] = {
|
|||||||
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
|
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
|
||||||
};
|
};
|
||||||
|
|
||||||
//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
#if 0
|
||||||
static __host__ __device__ uint32_t S(uint32_t x, int n)
|
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||||
|
#else
|
||||||
|
__host__ __device__
|
||||||
|
static uint32_t S(uint32_t x, int n)
|
||||||
{
|
{
|
||||||
return (((x) >> (n)) | ((x) << (32 - (n))));
|
return (((x) >> (n)) | ((x) << (32 - (n))));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#define R(x, n) ((x) >> (n))
|
#define R(x, n) ((x) >> (n))
|
||||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||||
@ -67,7 +74,9 @@ static __host__ __device__ uint32_t S(uint32_t x, int n)
|
|||||||
|
|
||||||
// uint8_t
|
// uint8_t
|
||||||
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
|
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
|
||||||
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
|
|
||||||
|
__host__ __forceinline__ __device__
|
||||||
|
uint8_t smoosh2(uint32_t x)
|
||||||
{
|
{
|
||||||
uint16_t w = (x >> 16) ^ (x & 0xffff);
|
uint16_t w = (x >> 16) ^ (x & 0xffff);
|
||||||
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
|
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
|
||||||
@ -77,13 +86,14 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
|
|||||||
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
|
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
|
||||||
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
|
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
|
||||||
|
|
||||||
__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
|
__host__ __forceinline__ __device__
|
||||||
|
void Mangle(uint32_t *inp)
|
||||||
{
|
{
|
||||||
uint32_t r = smoosh4Quad(inp[0]);
|
uint32_t r = smoosh4Quad(inp[0]);
|
||||||
uint32_t inp0org;
|
uint32_t inp0org;
|
||||||
uint32_t tmp0Mask, tmp1Mask;
|
uint32_t tmp0Mask, tmp1Mask;
|
||||||
uint32_t in1, in2, isAddition;
|
uint32_t in1, in2, isAddition;
|
||||||
uint32_t tmp;
|
int32_t tmp;
|
||||||
uint8_t b;
|
uint8_t b;
|
||||||
|
|
||||||
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
|
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
|
||||||
@ -92,24 +102,24 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
|
|||||||
tmp = smoosh2(inp[1]);
|
tmp = smoosh2(inp[1]);
|
||||||
b = getByte(r,tmp);
|
b = getByte(r,tmp);
|
||||||
inp0org = S(inp[0], b);
|
inp0org = S(inp[0], b);
|
||||||
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
|
tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
|
||||||
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
|
tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
|
||||||
|
|
||||||
in1 = (inp[2] & ~inp0org) |
|
in1 = (inp[2] & ~inp0org) |
|
||||||
(tmp1Mask & ~inp[2] & inp0org) |
|
(tmp1Mask & ~inp[2] & inp0org) |
|
||||||
(~tmp0Mask & ~inp[2] & inp0org);
|
(~tmp0Mask & ~inp[2] & inp0org);
|
||||||
in2 = inp[2] += ~inp0org;
|
in2 = inp[2] += ~inp0org;
|
||||||
isAddition = ~tmp0Mask & tmp1Mask;
|
isAddition = ~tmp0Mask & tmp1Mask;
|
||||||
inp[2] = isAddition ? in2 : in1;
|
inp[2] = isAddition ? in2 : in1;
|
||||||
|
|
||||||
r += 0x01010101;
|
r += 0x01010101;
|
||||||
tmp = smoosh2(inp[1] ^ inp[2]);
|
tmp = smoosh2(inp[1] ^ inp[2]);
|
||||||
b = getByte(r,tmp);
|
b = getByte(r,tmp);
|
||||||
inp0org = S(inp[0], b);
|
inp0org = S(inp[0], b);
|
||||||
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
|
tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
|
||||||
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
|
tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
|
||||||
|
|
||||||
in1 = (inp[3] & ~inp0org) |
|
in1 = (inp[3] & ~inp0org) |
|
||||||
(tmp1Mask & ~inp[3] & inp0org) |
|
(tmp1Mask & ~inp[3] & inp0org) |
|
||||||
(~tmp0Mask & ~inp[3] & inp0org);
|
(~tmp0Mask & ~inp[3] & inp0org);
|
||||||
in2 = inp[3] += ~inp0org;
|
in2 = inp[3] += ~inp0org;
|
||||||
@ -119,20 +129,23 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
|
|||||||
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
|
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
|
__host__ __forceinline__ __device__
|
||||||
|
void Absorb(uint32_t *inp, uint32_t x)
|
||||||
{
|
{
|
||||||
inp[0] ^= x;
|
inp[0] ^= x;
|
||||||
Mangle(inp);
|
Mangle(inp);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
|
__host__ __forceinline__ __device__
|
||||||
|
uint32_t Squeeze(uint32_t *inp)
|
||||||
{
|
{
|
||||||
uint32_t y = inp[0];
|
uint32_t y = inp[0];
|
||||||
Mangle(inp);
|
Mangle(inp);
|
||||||
return y;
|
return y;
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
|
__host__ __forceinline__ __device__
|
||||||
|
uint32_t Br(uint32_t *sponge, uint32_t x)
|
||||||
{
|
{
|
||||||
uint32_t r = Squeeze(sponge);
|
uint32_t r = Squeeze(sponge);
|
||||||
uint32_t t = ((r >> 8) & 0x1F);
|
uint32_t t = ((r >> 8) & 0x1F);
|
||||||
@ -146,11 +159,12 @@ __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
|
|||||||
return retVal;
|
return retVal;
|
||||||
}
|
}
|
||||||
|
|
||||||
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
__device__ __forceinline__
|
||||||
|
void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
||||||
{
|
{
|
||||||
uint32_t tmpBr;
|
uint32_t tmpBr;
|
||||||
|
|
||||||
uint32_t brG = Br(sponge, regs[6]);
|
uint32_t brG = Br(sponge, regs[6]);
|
||||||
uint32_t brF = Br(sponge, regs[5]);
|
uint32_t brF = Br(sponge, regs[5]);
|
||||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||||
uint32_t brE = Br(sponge, regs[4]);
|
uint32_t brE = Br(sponge, regs[4]);
|
||||||
@ -169,11 +183,12 @@ __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint
|
|||||||
regs[4] += tmpBr;
|
regs[4] += tmpBr;
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
__host__
|
||||||
|
void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
||||||
{
|
{
|
||||||
uint32_t tmpBr;
|
uint32_t tmpBr;
|
||||||
|
|
||||||
uint32_t brG = Br(sponge, regs[6]);
|
uint32_t brG = Br(sponge, regs[6]);
|
||||||
uint32_t brF = Br(sponge, regs[5]);
|
uint32_t brF = Br(sponge, regs[5]);
|
||||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||||
uint32_t brE = Br(sponge, regs[4]);
|
uint32_t brE = Br(sponge, regs[4]);
|
||||||
@ -191,11 +206,11 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *
|
|||||||
regs[4] += tmpBr;
|
regs[4] += tmpBr;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Die Hash-Funktion
|
__global__
|
||||||
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
|
void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash)
|
||||||
{
|
{
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
extern __shared__ char heftytab[];
|
extern __shared__ unsigned char heftytab[];
|
||||||
if(threadIdx.x < 64)
|
if(threadIdx.x < 64)
|
||||||
{
|
{
|
||||||
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
|
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
|
||||||
@ -207,9 +222,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
|
|||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
// bestimme den aktuellen Zähler
|
// bestimme den aktuellen Zähler
|
||||||
uint32_t nounce = startNounce + thread;
|
uint32_t nounce = startNounce + thread;
|
||||||
|
|
||||||
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
||||||
// reduktion von 256 byte auf 128 byte
|
// reduktion von 256 byte auf 128 byte
|
||||||
uint32_t W1[16];
|
uint32_t W1[16];
|
||||||
@ -219,7 +234,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
|
|||||||
uint32_t regs[8];
|
uint32_t regs[8];
|
||||||
uint32_t hash[8];
|
uint32_t hash[8];
|
||||||
uint32_t sponge[4];
|
uint32_t sponge[4];
|
||||||
|
|
||||||
#pragma unroll 4
|
#pragma unroll 4
|
||||||
for(int k=0; k < 4; k++)
|
for(int k=0; k < 4; k++)
|
||||||
sponge[k] = hefty_gpu_sponge[k];
|
sponge[k] = hefty_gpu_sponge[k];
|
||||||
@ -231,7 +246,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
|
|||||||
regs[k] = hefty_gpu_register[k];
|
regs[k] = hefty_gpu_register[k];
|
||||||
hash[k] = regs[k];
|
hash[k] = regs[k];
|
||||||
}
|
}
|
||||||
|
|
||||||
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
|
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int k=0;k<16;k++)
|
for(int k=0;k<16;k++)
|
||||||
@ -252,7 +267,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
|
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
|
||||||
|
|
||||||
#pragma unroll 3
|
#pragma unroll 3
|
||||||
for(int k=0;k<3;k++)
|
for(int k=0;k<3;k++)
|
||||||
{
|
{
|
||||||
@ -279,7 +294,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
|
|||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
W1[j] = W2[j];
|
W1[j] = W2[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(int k=0;k<8;k++)
|
for(int k=0;k<8;k++)
|
||||||
hash[k] += regs[k];
|
hash[k] += regs[k];
|
||||||
@ -290,27 +305,28 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup-Funktionen
|
__host__
|
||||||
__host__ void hefty_cpu_init(int thr_id, int threads)
|
void hefty_cpu_init(int thr_id, int threads)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device_map[thr_id]);
|
cudaSetDevice(device_map[thr_id]);
|
||||||
|
|
||||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||||
cudaMemcpyToSymbol( hefty_gpu_constantTable,
|
cudaMemcpyToSymbol( hefty_gpu_constantTable,
|
||||||
hefty_cpu_constantTable,
|
hefty_cpu_constantTable,
|
||||||
sizeof(uint32_t) * 64 );
|
sizeof(uint32_t) * 64 );
|
||||||
|
|
||||||
// Speicher für alle Hefty1 hashes belegen
|
// Speicher für alle Hefty1 hashes belegen
|
||||||
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
|
CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
|
__host__
|
||||||
|
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
|
||||||
// data muss 80/84-Byte haben!
|
// data muss 80/84-Byte haben!
|
||||||
{
|
{
|
||||||
// Nachricht expandieren und setzen
|
// Nachricht expandieren und setzen
|
||||||
uint32_t msgBlock[32];
|
uint32_t msgBlock[32];
|
||||||
|
|
||||||
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
memset(msgBlock, 0, sizeof(msgBlock));
|
||||||
memcpy(&msgBlock[0], data, len);
|
memcpy(&msgBlock[0], data, len);
|
||||||
if (len == 84) {
|
if (len == 84) {
|
||||||
msgBlock[21] |= 0x80;
|
msgBlock[21] |= 0x80;
|
||||||
@ -319,17 +335,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
|
|||||||
msgBlock[20] |= 0x80;
|
msgBlock[20] |= 0x80;
|
||||||
msgBlock[31] = 640; // bitlen
|
msgBlock[31] = 640; // bitlen
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int i=0;i<31;i++) // Byteorder drehen
|
for(int i=0;i<31;i++) // Byteorder drehen
|
||||||
msgBlock[i] = SWAB32(msgBlock[i]);
|
msgBlock[i] = SWAB32(msgBlock[i]);
|
||||||
|
|
||||||
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
||||||
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
||||||
// übergeben
|
// übergeben
|
||||||
|
|
||||||
// Erstelle expandierten Block W
|
// Erstelle expandierten Block W
|
||||||
uint32_t W[64];
|
uint32_t W[64];
|
||||||
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
||||||
for(int j=16;j<64;j++)
|
for(int j=16;j<64;j++)
|
||||||
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
||||||
|
|
||||||
@ -344,7 +360,7 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
|
|||||||
{
|
{
|
||||||
regs[k] = hefty_cpu_hashTable[k];
|
regs[k] = hefty_cpu_hashTable[k];
|
||||||
hash[k] = regs[k];
|
hash[k] = regs[k];
|
||||||
}
|
}
|
||||||
|
|
||||||
// 1. Runde
|
// 1. Runde
|
||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
@ -366,39 +382,30 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
|
|||||||
hash[k] += regs[k];
|
hash[k] += regs[k];
|
||||||
|
|
||||||
// sponge speichern
|
// sponge speichern
|
||||||
|
cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16);
|
||||||
cudaMemcpyToSymbol( hefty_gpu_sponge,
|
|
||||||
sponge,
|
|
||||||
sizeof(uint32_t) * 4 );
|
|
||||||
// hash speichern
|
// hash speichern
|
||||||
cudaMemcpyToSymbol( hefty_gpu_register,
|
cudaMemcpyToSymbol(hefty_gpu_register, hash, 32);
|
||||||
hash,
|
|
||||||
sizeof(uint32_t) * 8 );
|
|
||||||
|
|
||||||
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
|
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
|
||||||
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64));
|
||||||
&msgBlock[16],
|
|
||||||
64);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
|
__host__
|
||||||
|
void hefty_cpu_hash(int thr_id, int threads, int startNounce)
|
||||||
{
|
{
|
||||||
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
|
int threadsperblock = 256;
|
||||||
// alle anderen mit 512 Threads.
|
|
||||||
int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
|
|
||||||
|
|
||||||
// berechne wie viele Thread Blocks wir brauchen
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
size_t shared_size = 8 * 64 * sizeof(uint32_t);
|
int shared_size = 8 * 64 * sizeof(uint32_t);
|
||||||
#else
|
#else
|
||||||
size_t shared_size = 0;
|
int shared_size = 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
|
hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]);
|
||||||
|
|
||||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||||
MyStreamSynchronize(NULL, 0, thr_id);
|
MyStreamSynchronize(NULL, 0, thr_id);
|
||||||
|
@ -1,8 +0,0 @@
|
|||||||
#ifndef _CUDA_HEFTY1_H
|
|
||||||
#define _CUDA_HEFTY1_H
|
|
||||||
|
|
||||||
void hefty_cpu_hash(int thr_id, int threads, int startNounce);
|
|
||||||
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
|
|
||||||
void hefty_cpu_init(int thr_id, int threads);
|
|
||||||
|
|
||||||
#endif
|
|
@ -3,11 +3,11 @@
|
|||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
// globaler Speicher für alle HeftyHashes aller Threads
|
// globaler Speicher für alle HeftyHashes aller Threads
|
||||||
extern uint32_t *d_heftyHashes[8];
|
extern uint32_t *heavy_heftyHashes[8];
|
||||||
extern uint32_t *d_nonceVector[8];
|
extern uint32_t *heavy_nonceVector[8];
|
||||||
|
|
||||||
// globaler Speicher für unsere Ergebnisse
|
// globaler Speicher für unsere Ergebnisse
|
||||||
uint32_t *d_hash3output[8];
|
uint32_t *d_hash3output[8];
|
||||||
extern uint32_t *d_hash4output[8];
|
extern uint32_t *d_hash4output[8];
|
||||||
extern uint32_t *d_hash5output[8];
|
extern uint32_t *d_hash5output[8];
|
||||||
@ -15,13 +15,11 @@ extern uint32_t *d_hash5output[8];
|
|||||||
// der Keccak512 State nach der ersten Runde (72 Bytes)
|
// der Keccak512 State nach der ersten Runde (72 Bytes)
|
||||||
__constant__ uint64_t c_State[25];
|
__constant__ uint64_t c_State[25];
|
||||||
|
|
||||||
// die Message (72 Bytes) für die zweite Runde auf der GPU
|
// die Message (72 Bytes) für die zweite Runde auf der GPU
|
||||||
__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding
|
__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding
|
||||||
|
|
||||||
// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
|
// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
|
||||||
|
|
||||||
#define U32TO64_LE(p) \
|
#define U32TO64_LE(p) \
|
||||||
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
|
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
|
||||||
|
|
||||||
@ -144,7 +142,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
|
|||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
// bestimme den aktuellen Zähler
|
// bestimme den aktuellen Zähler
|
||||||
//uint32_t nounce = startNounce + thread;
|
//uint32_t nounce = startNounce + thread;
|
||||||
uint32_t nounce = nonceVector[thread];
|
uint32_t nounce = nonceVector[thread];
|
||||||
|
|
||||||
@ -156,7 +154,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
|
|||||||
#pragma unroll 25
|
#pragma unroll 25
|
||||||
for (int i=0; i < 25; ++i)
|
for (int i=0; i < 25; ++i)
|
||||||
keccak_gpu_state[i] = c_State[i];
|
keccak_gpu_state[i] = c_State[i];
|
||||||
|
|
||||||
// Message2 in den Puffer holen
|
// Message2 in den Puffer holen
|
||||||
uint32_t msgBlock[18];
|
uint32_t msgBlock[18];
|
||||||
mycpy72(msgBlock, c_PaddedMessage2);
|
mycpy72(msgBlock, c_PaddedMessage2);
|
||||||
@ -167,7 +165,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
|
|||||||
// den individuellen Hefty1 Hash einsetzen
|
// den individuellen Hefty1 Hash einsetzen
|
||||||
mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
|
mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
|
||||||
|
|
||||||
// den Block einmal gut durchschütteln
|
// den Block einmal gut durchschütteln
|
||||||
keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
|
keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
|
||||||
|
|
||||||
// das Hash erzeugen
|
// das Hash erzeugen
|
||||||
@ -187,8 +185,8 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
|
|||||||
|
|
||||||
// ---------------------------- END CUDA keccak512 functions ------------------------------------
|
// ---------------------------- END CUDA keccak512 functions ------------------------------------
|
||||||
|
|
||||||
// Setup-Funktionen
|
__host__
|
||||||
__host__ void keccak512_cpu_init(int thr_id, int threads)
|
void keccak512_cpu_init(int thr_id, int threads)
|
||||||
{
|
{
|
||||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||||
cudaMemcpyToSymbol( c_keccak_round_constants,
|
cudaMemcpyToSymbol( c_keccak_round_constants,
|
||||||
@ -196,7 +194,7 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
|
|||||||
sizeof(host_keccak_round_constants),
|
sizeof(host_keccak_round_constants),
|
||||||
0, cudaMemcpyHostToDevice);
|
0, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
// Speicher für alle Ergebnisse belegen
|
// Speicher für alle Ergebnisse belegen
|
||||||
cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
|
cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -212,23 +210,24 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
|
|||||||
|
|
||||||
static int BLOCKSIZE = 84;
|
static int BLOCKSIZE = 84;
|
||||||
|
|
||||||
__host__ void keccak512_cpu_setBlock(void *data, int len)
|
__host__
|
||||||
|
void keccak512_cpu_setBlock(void *data, int len)
|
||||||
// data muss 80 oder 84-Byte haben!
|
// data muss 80 oder 84-Byte haben!
|
||||||
// heftyHash hat 32-Byte
|
// heftyHash hat 32-Byte
|
||||||
{
|
{
|
||||||
// CH
|
// CH
|
||||||
// state init
|
// state init
|
||||||
uint64_t keccak_cpu_state[25];
|
uint64_t keccak_cpu_state[25];
|
||||||
memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state));
|
memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state));
|
||||||
|
|
||||||
// erste Runde
|
// erste Runde
|
||||||
keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
|
keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
|
||||||
|
|
||||||
// state kopieren
|
// state kopieren
|
||||||
cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
|
// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
|
||||||
// zu jeweils
|
// zu jeweils
|
||||||
uint32_t msgBlock[18];
|
uint32_t msgBlock[18];
|
||||||
memset(msgBlock, 0, 18 * sizeof(uint32_t));
|
memset(msgBlock, 0, 18 * sizeof(uint32_t));
|
||||||
|
|
||||||
@ -238,29 +237,31 @@ __host__ void keccak512_cpu_setBlock(void *data, int len)
|
|||||||
else if (len == 80)
|
else if (len == 80)
|
||||||
memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
|
memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
|
||||||
|
|
||||||
// Nachricht abschließen
|
// Nachricht abschließen
|
||||||
if (len == 84)
|
if (len == 84)
|
||||||
msgBlock[11] = 0x01;
|
msgBlock[11] = 0x01;
|
||||||
else if (len == 80)
|
else if (len == 80)
|
||||||
msgBlock[10] = 0x01;
|
msgBlock[10] = 0x01;
|
||||||
msgBlock[17] = 0x80000000;
|
msgBlock[17] = 0x80000000;
|
||||||
|
|
||||||
// Message 2 ins Constant Memory kopieren (die variable Nonce und
|
// Message 2 ins Constant Memory kopieren (die variable Nonce und
|
||||||
// der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden)
|
// der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden)
|
||||||
cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice );
|
cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice );
|
||||||
|
|
||||||
BLOCKSIZE = len;
|
BLOCKSIZE = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
|
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
|
||||||
{
|
{
|
||||||
// Hefty1 Hashes kopieren
|
// Hefty1 Hashes kopieren
|
||||||
if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
|
if (copy)
|
||||||
|
CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
|
||||||
//else cudaThreadSynchronize();
|
//else cudaThreadSynchronize();
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
__host__
|
||||||
|
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||||
{
|
{
|
||||||
const int threadsperblock = 128;
|
const int threadsperblock = 128;
|
||||||
|
|
||||||
@ -268,11 +269,11 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
if (BLOCKSIZE==84)
|
if (BLOCKSIZE==84)
|
||||||
keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
else if (BLOCKSIZE==80)
|
else if (BLOCKSIZE==80)
|
||||||
keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
}
|
}
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
#ifndef _CUDA_KECCAK512_H
|
|
||||||
#define _CUDA_KECCAK512_H
|
|
||||||
|
|
||||||
void keccak512_cpu_init(int thr_id, int threads);
|
|
||||||
void keccak512_cpu_setBlock(void *data, int len);
|
|
||||||
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
|
|
||||||
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
|
|
||||||
|
|
||||||
#endif
|
|
@ -3,11 +3,11 @@
|
|||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
// globaler Speicher für alle HeftyHashes aller Threads
|
// globaler Speicher für alle HeftyHashes aller Threads
|
||||||
extern uint32_t *d_heftyHashes[8];
|
extern uint32_t *heavy_heftyHashes[8];
|
||||||
extern uint32_t *d_nonceVector[8];
|
extern uint32_t *heavy_nonceVector[8];
|
||||||
|
|
||||||
// globaler Speicher für unsere Ergebnisse
|
// globaler Speicher für unsere Ergebnisse
|
||||||
uint32_t *d_hash2output[8];
|
uint32_t *d_hash2output[8];
|
||||||
|
|
||||||
|
|
||||||
@ -47,10 +47,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
|
|||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
// bestimme den aktuellen Zähler
|
// bestimme den aktuellen Zähler
|
||||||
uint32_t nounce = startNounce + thread;
|
uint32_t nounce = startNounce + thread;
|
||||||
nonceVector[thread] = nounce;
|
nonceVector[thread] = nounce;
|
||||||
|
|
||||||
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
||||||
uint32_t W1[16];
|
uint32_t W1[16];
|
||||||
uint32_t W2[16];
|
uint32_t W2[16];
|
||||||
@ -66,10 +66,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
|
|||||||
regs[k] = sha256_gpu_register[k];
|
regs[k] = sha256_gpu_register[k];
|
||||||
hash[k] = regs[k];
|
hash[k] = regs[k];
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. Runde
|
// 2. Runde
|
||||||
//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
|
//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
|
||||||
//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
|
//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int k=0;k<16;k++)
|
for(int k=0;k<16;k++)
|
||||||
W1[k] = sha256_gpu_blockHeader[k];
|
W1[k] = sha256_gpu_blockHeader[k];
|
||||||
@ -90,7 +90,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
|
|||||||
uint32_t T1, T2;
|
uint32_t T1, T2;
|
||||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j];
|
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j];
|
||||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||||
|
|
||||||
#pragma unroll 7
|
#pragma unroll 7
|
||||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||||
regs[0] = T1 + T2;
|
regs[0] = T1 + T2;
|
||||||
@ -121,7 +121,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
|
|||||||
uint32_t T1, T2;
|
uint32_t T1, T2;
|
||||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
|
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
|
||||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||||
|
|
||||||
#pragma unroll 7
|
#pragma unroll 7
|
||||||
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
|
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
|
||||||
regs[0] = T1 + T2;
|
regs[0] = T1 + T2;
|
||||||
@ -136,14 +136,14 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
|
|||||||
/*
|
/*
|
||||||
for(int j=16;j<64;j++)
|
for(int j=16;j<64;j++)
|
||||||
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
||||||
|
|
||||||
#pragma unroll 64
|
#pragma unroll 64
|
||||||
for(int j=0;j<64;j++)
|
for(int j=0;j<64;j++)
|
||||||
{
|
{
|
||||||
uint32_t T1, T2;
|
uint32_t T1, T2;
|
||||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j];
|
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j];
|
||||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||||
|
|
||||||
#pragma unroll 7
|
#pragma unroll 7
|
||||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||||
regs[0] = T1 + T2;
|
regs[0] = T1 + T2;
|
||||||
@ -168,7 +168,7 @@ __host__ void sha256_cpu_init(int thr_id, int threads)
|
|||||||
sha256_cpu_constantTable,
|
sha256_cpu_constantTable,
|
||||||
sizeof(uint32_t) * 64 );
|
sizeof(uint32_t) * 64 );
|
||||||
|
|
||||||
// Speicher für alle Ergebnisse belegen
|
// Speicher für alle Ergebnisse belegen
|
||||||
cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
|
cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -184,25 +184,25 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
|
|||||||
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
||||||
memcpy(&msgBlock[0], data, len);
|
memcpy(&msgBlock[0], data, len);
|
||||||
if (len == 84) {
|
if (len == 84) {
|
||||||
memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
|
memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
|
||||||
msgBlock[29] |= 0x80;
|
msgBlock[29] |= 0x80;
|
||||||
msgBlock[31] = 928; // bitlen
|
msgBlock[31] = 928; // bitlen
|
||||||
} else if (len == 80) {
|
} else if (len == 80) {
|
||||||
memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
|
memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
|
||||||
msgBlock[28] |= 0x80;
|
msgBlock[28] |= 0x80;
|
||||||
msgBlock[31] = 896; // bitlen
|
msgBlock[31] = 896; // bitlen
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int i=0;i<31;i++) // Byteorder drehen
|
for(int i=0;i<31;i++) // Byteorder drehen
|
||||||
msgBlock[i] = SWAB32(msgBlock[i]);
|
msgBlock[i] = SWAB32(msgBlock[i]);
|
||||||
|
|
||||||
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
||||||
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
||||||
// übergeben
|
// übergeben
|
||||||
uint32_t W[64];
|
uint32_t W[64];
|
||||||
|
|
||||||
// Erstelle expandierten Block W
|
// Erstelle expandierten Block W
|
||||||
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
||||||
for(int j=16;j<64;j++)
|
for(int j=16;j<64;j++)
|
||||||
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
||||||
|
|
||||||
@ -223,7 +223,7 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
|
|||||||
uint32_t T1, T2;
|
uint32_t T1, T2;
|
||||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j];
|
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j];
|
||||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||||
|
|
||||||
//#pragma unroll 7
|
//#pragma unroll 7
|
||||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||||
// sollte mal noch durch memmov ersetzt werden!
|
// sollte mal noch durch memmov ersetzt werden!
|
||||||
@ -251,7 +251,8 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
|
|||||||
__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
|
__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
|
||||||
{
|
{
|
||||||
// Hefty1 Hashes kopieren
|
// Hefty1 Hashes kopieren
|
||||||
if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
|
if (copy)
|
||||||
|
CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
|
||||||
//else cudaThreadSynchronize();
|
//else cudaThreadSynchronize();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -263,12 +264,12 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
if (BLOCKSIZE == 84)
|
if (BLOCKSIZE == 84)
|
||||||
sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
else if (BLOCKSIZE == 80) {
|
else if (BLOCKSIZE == 80) {
|
||||||
sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,8 +0,0 @@
|
|||||||
#ifndef _CUDA_SHA256_H
|
|
||||||
#define _CUDA_SHA256_H
|
|
||||||
|
|
||||||
void sha256_cpu_init(int thr_id, int threads);
|
|
||||||
void sha256_cpu_setBlock(void *data, int len);
|
|
||||||
void sha256_cpu_hash(int thr_id, int threads, int startNounce);
|
|
||||||
void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
|
|
||||||
#endif
|
|
152
heavy/heavy.cu
152
heavy/heavy.cu
@ -1,35 +1,19 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include <map>
|
|
||||||
|
|
||||||
#include <openssl/sha.h>
|
#include <openssl/sha.h>
|
||||||
|
#include <map>
|
||||||
#ifndef _WIN32
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// include thrust
|
// include thrust
|
||||||
#include <thrust/version.h>
|
|
||||||
#include <thrust/remove.h>
|
#include <thrust/remove.h>
|
||||||
#include <thrust/device_vector.h>
|
#include <thrust/device_vector.h>
|
||||||
#include <thrust/iterator/constant_iterator.h>
|
|
||||||
|
|
||||||
#include "miner.h"
|
#include "miner.h"
|
||||||
|
|
||||||
#include "hefty1.h"
|
extern "C" {
|
||||||
#include "sph/sph_keccak.h"
|
#include "sph/sph_keccak.h"
|
||||||
#include "sph/sph_blake.h"
|
#include "sph/sph_blake.h"
|
||||||
#include "sph/sph_groestl.h"
|
#include "sph/sph_groestl.h"
|
||||||
|
}
|
||||||
#include "heavy/cuda_hefty1.h"
|
#include "hefty1.h"
|
||||||
#include "heavy/cuda_sha256.h"
|
#include "heavy/heavy.h"
|
||||||
#include "heavy/cuda_keccak512.h"
|
|
||||||
#include "heavy/cuda_groestl512.h"
|
|
||||||
#include "heavy/cuda_blake512.h"
|
|
||||||
#include "heavy/cuda_combine.h"
|
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
extern uint32_t *d_hash2output[8];
|
extern uint32_t *d_hash2output[8];
|
||||||
@ -37,11 +21,13 @@ extern uint32_t *d_hash3output[8];
|
|||||||
extern uint32_t *d_hash4output[8];
|
extern uint32_t *d_hash4output[8];
|
||||||
extern uint32_t *d_hash5output[8];
|
extern uint32_t *d_hash5output[8];
|
||||||
|
|
||||||
#define HEAVYCOIN_BLKHDR_SZ 84
|
#define HEAVYCOIN_BLKHDR_SZ 84
|
||||||
#define MNR_BLKHDR_SZ 80
|
#define MNR_BLKHDR_SZ 80
|
||||||
|
|
||||||
// nonce-array für die threads
|
// nonce-array für die threads
|
||||||
uint32_t *d_nonceVector[8];
|
uint32_t *heavy_nonceVector[8];
|
||||||
|
|
||||||
|
extern uint32_t *heavy_heftyHashes[8];
|
||||||
|
|
||||||
/* Combines top 64-bits from each hash into a single hash */
|
/* Combines top 64-bits from each hash into a single hash */
|
||||||
static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
|
static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
|
||||||
@ -71,9 +57,9 @@ static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t
|
|||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
static uint32_t __inline bitsset( uint32_t x )
|
static uint32_t __inline bitsset( uint32_t x )
|
||||||
{
|
{
|
||||||
DWORD r = 0;
|
DWORD r = 0;
|
||||||
_BitScanReverse(&r, x);
|
_BitScanReverse(&r, x);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static uint32_t bitsset( uint32_t x )
|
static uint32_t bitsset( uint32_t x )
|
||||||
@ -91,21 +77,21 @@ static int findhighbit(const uint32_t *ptarget, int words)
|
|||||||
{
|
{
|
||||||
if (ptarget[i] != 0) {
|
if (ptarget[i] != 0) {
|
||||||
highbit = i*32 + bitsset(ptarget[i])+1;
|
highbit = i*32 + bitsset(ptarget[i])+1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return highbit;
|
return highbit;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generiere ein Multiword-Integer das die Zahl
|
// Generiere ein Multiword-Integer das die Zahl
|
||||||
// (2 << highbit) - 1 repräsentiert.
|
// (2 << highbit) - 1 repräsentiert.
|
||||||
static void genmask(uint32_t *ptarget, int words, int highbit)
|
static void genmask(uint32_t *ptarget, int words, int highbit)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
for (i=words-1; i >= 0; --i)
|
for (i=words-1; i >= 0; --i)
|
||||||
{
|
{
|
||||||
if ((i+1)*32 <= highbit)
|
if ((i+1)*32 <= highbit)
|
||||||
ptarget[i] = 0xffffffff;
|
ptarget[i] = UINT32_MAX;
|
||||||
else if (i*32 > highbit)
|
else if (i*32 > highbit)
|
||||||
ptarget[i] = 0x00000000;
|
ptarget[i] = 0x00000000;
|
||||||
else
|
else
|
||||||
@ -114,13 +100,18 @@ static void genmask(uint32_t *ptarget, int words, int highbit)
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct check_nonce_for_remove
|
struct check_nonce_for_remove
|
||||||
{
|
{
|
||||||
check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) :
|
check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) :
|
||||||
m_target(target),
|
m_target(target),
|
||||||
m_hashes(hashes),
|
m_hashes(hashes),
|
||||||
m_hashlen(hashlen),
|
m_hashlen(hashlen),
|
||||||
m_startNonce(startNonce) { }
|
m_startNonce(startNonce) { }
|
||||||
|
|
||||||
|
uint64_t m_target;
|
||||||
|
uint32_t *m_hashes;
|
||||||
|
uint32_t m_hashlen;
|
||||||
|
uint32_t m_startNonce;
|
||||||
|
|
||||||
__device__
|
__device__
|
||||||
bool operator()(const uint32_t x)
|
bool operator()(const uint32_t x)
|
||||||
{
|
{
|
||||||
@ -129,53 +120,39 @@ struct check_nonce_for_remove
|
|||||||
// Wert des Hashes (als uint64_t) auslesen.
|
// Wert des Hashes (als uint64_t) auslesen.
|
||||||
// Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits)
|
// Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits)
|
||||||
uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
|
uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
|
||||||
// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
|
bool res = (hashValue & m_target) != hashValue;
|
||||||
return (hashValue & m_target) != hashValue;
|
//printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target);
|
||||||
|
// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t m_target;
|
|
||||||
uint32_t *m_hashes;
|
|
||||||
uint32_t m_hashlen;
|
|
||||||
uint32_t m_startNonce;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
static bool init[8] = {0,0,0,0,0,0,0,0};
|
||||||
const uint32_t *ptarget, uint32_t max_nonce,
|
|
||||||
unsigned long *hashes_done, uint32_t maxvote, int blocklen);
|
|
||||||
|
|
||||||
extern "C"
|
__host__
|
||||||
int scanhash_heavy(int thr_id, uint32_t *pdata,
|
int scanhash_heavy(int thr_id, uint32_t *pdata,
|
||||||
const uint32_t *ptarget, uint32_t max_nonce,
|
const uint32_t *ptarget, uint32_t max_nonce,
|
||||||
unsigned long *hashes_done, uint32_t maxvote, int blocklen)
|
unsigned long *hashes_done, uint32_t maxvote, int blocklen)
|
||||||
{
|
|
||||||
return scanhash_heavy_cpp(thr_id, pdata,
|
|
||||||
ptarget, max_nonce, hashes_done, maxvote, blocklen);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|
||||||
const uint32_t *ptarget, uint32_t max_nonce,
|
|
||||||
unsigned long *hashes_done, uint32_t maxvote, int blocklen)
|
|
||||||
{
|
{
|
||||||
const uint32_t first_nonce = pdata[19]; /* to check */
|
const uint32_t first_nonce = pdata[19]; /* to check */
|
||||||
// CUDA will process thousands of threads.
|
// CUDA will process thousands of threads.
|
||||||
int throughput = opt_work_size ? opt_work_size : (1 << 19); // 128*4096
|
int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*2048
|
||||||
throughput = min(throughput, (int)(max_nonce - first_nonce));
|
throughput = min(throughput, (int)(max_nonce - first_nonce));
|
||||||
|
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
uint32_t *hash = NULL;
|
uint32_t *hash = NULL;
|
||||||
cudaMallocHost(&hash, throughput*8*sizeof(uint32_t));
|
|
||||||
uint32_t *cpu_nonceVector = NULL;
|
uint32_t *cpu_nonceVector = NULL;
|
||||||
cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t));
|
CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)));
|
||||||
|
CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)));
|
||||||
|
|
||||||
int nrmCalls[6];
|
int nrmCalls[6];
|
||||||
memset(nrmCalls, 0, sizeof(int) * 6);
|
memset(nrmCalls, 0, sizeof(int) * 6);
|
||||||
|
|
||||||
if (opt_benchmark)
|
if (opt_benchmark)
|
||||||
((uint32_t*)ptarget)[7] = 0x000000ff;
|
((uint32_t*)ptarget)[7] = 0x00ff;
|
||||||
|
|
||||||
// für jeden Hash ein individuelles Target erstellen basierend
|
// für jeden Hash ein individuelles Target erstellen basierend
|
||||||
// auf dem höchsten Bit, das in ptarget gesetzt ist.
|
// auf dem höchsten Bit, das in ptarget gesetzt ist.
|
||||||
int highbit = findhighbit(ptarget, 8);
|
int highbit = findhighbit(ptarget, 8);
|
||||||
uint32_t target2[2], target3[2], target4[2], target5[2];
|
uint32_t target2[2], target3[2], target4[2], target5[2];
|
||||||
genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
|
genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
|
||||||
@ -183,7 +160,6 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|||||||
genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
|
genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
|
||||||
genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512
|
genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512
|
||||||
|
|
||||||
static bool init[8] = {0,0,0,0,0,0,0,0};
|
|
||||||
if (!init[thr_id])
|
if (!init[thr_id])
|
||||||
{
|
{
|
||||||
hefty_cpu_init(thr_id, throughput);
|
hefty_cpu_init(thr_id, throughput);
|
||||||
@ -192,8 +168,10 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|||||||
groestl512_cpu_init(thr_id, throughput);
|
groestl512_cpu_init(thr_id, throughput);
|
||||||
blake512_cpu_init(thr_id, throughput);
|
blake512_cpu_init(thr_id, throughput);
|
||||||
combine_cpu_init(thr_id, throughput);
|
combine_cpu_init(thr_id, throughput);
|
||||||
|
|
||||||
|
CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput));
|
||||||
|
|
||||||
init[thr_id] = true;
|
init[thr_id] = true;
|
||||||
cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (blocklen == HEAVYCOIN_BLKHDR_SZ)
|
if (blocklen == HEAVYCOIN_BLKHDR_SZ)
|
||||||
@ -201,13 +179,13 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|||||||
uint16_t *ext = (uint16_t *)&pdata[20];
|
uint16_t *ext = (uint16_t *)&pdata[20];
|
||||||
|
|
||||||
if (opt_vote > maxvote) {
|
if (opt_vote > maxvote) {
|
||||||
printf("Warning: Your block reward vote (%hu) exceeds "
|
applog(LOG_WARNING, "Your block reward vote (%hu) exceeds "
|
||||||
"the maxvote reported by the pool (%hu).\n",
|
"the maxvote reported by the pool (%hu).",
|
||||||
opt_vote, maxvote);
|
opt_vote, maxvote);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opt_trust_pool && opt_vote > maxvote) {
|
if (opt_trust_pool && opt_vote > maxvote) {
|
||||||
printf("Warning: Capping block reward vote to maxvote reported by pool.\n");
|
applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool.");
|
||||||
ext[0] = maxvote;
|
ext[0] = maxvote;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -222,32 +200,34 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|||||||
blake512_cpu_setBlock(pdata, blocklen);
|
blake512_cpu_setBlock(pdata, blocklen);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
uint32_t i;
|
|
||||||
|
|
||||||
////// Compaction init
|
////// Compaction init
|
||||||
thrust::device_ptr<uint32_t> devNoncePtr(d_nonceVector[thr_id]);
|
thrust::device_ptr<uint32_t> devNoncePtr(heavy_nonceVector[thr_id]);
|
||||||
thrust::device_ptr<uint32_t> devNoncePtrEnd((d_nonceVector[thr_id]) + throughput);
|
thrust::device_ptr<uint32_t> devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput);
|
||||||
uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;
|
uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;
|
||||||
|
uint64_t *t;
|
||||||
|
|
||||||
hefty_cpu_hash(thr_id, throughput, pdata[19]);
|
hefty_cpu_hash(thr_id, throughput, pdata[19]);
|
||||||
//cudaThreadSynchronize();
|
//cudaThreadSynchronize();
|
||||||
sha256_cpu_hash(thr_id, throughput, pdata[19]);
|
sha256_cpu_hash(thr_id, throughput, pdata[19]);
|
||||||
//cudaThreadSynchronize();
|
//cudaThreadSynchronize();
|
||||||
|
|
||||||
// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
|
// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
|
||||||
MyStreamSynchronize(NULL, 1, thr_id);
|
MyStreamSynchronize(NULL, 1, thr_id);
|
||||||
|
|
||||||
////// Compaction
|
////// Compaction
|
||||||
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19]));
|
t = (uint64_t*) target2;
|
||||||
|
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19]));
|
||||||
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
||||||
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
||||||
goto emptyNonceVector;
|
goto emptyNonceVector;
|
||||||
|
|
||||||
keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
|
keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
|
||||||
//cudaThreadSynchronize();
|
//cudaThreadSynchronize();
|
||||||
|
|
||||||
////// Compaction
|
////// Compaction
|
||||||
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19]));
|
t = (uint64_t*) target3;
|
||||||
|
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19]));
|
||||||
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
||||||
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
||||||
goto emptyNonceVector;
|
goto emptyNonceVector;
|
||||||
@ -256,7 +236,8 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|||||||
//cudaThreadSynchronize();
|
//cudaThreadSynchronize();
|
||||||
|
|
||||||
////// Compaction
|
////// Compaction
|
||||||
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19]));
|
t = (uint64_t*) target5;
|
||||||
|
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19]));
|
||||||
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
||||||
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
||||||
goto emptyNonceVector;
|
goto emptyNonceVector;
|
||||||
@ -265,25 +246,31 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|||||||
//cudaThreadSynchronize();
|
//cudaThreadSynchronize();
|
||||||
|
|
||||||
////// Compaction
|
////// Compaction
|
||||||
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19]));
|
t = (uint64_t*) target4;
|
||||||
|
devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19]));
|
||||||
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
|
||||||
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
if(actualNumberOfValuesInNonceVectorGPU == 0)
|
||||||
goto emptyNonceVector;
|
goto emptyNonceVector;
|
||||||
|
|
||||||
// combine
|
// combine
|
||||||
combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);
|
combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);
|
||||||
|
|
||||||
|
if (opt_tracegpu) {
|
||||||
|
applog(LOG_BLUE, "heavy GPU hash:");
|
||||||
|
applog_hash((uchar*)hash);
|
||||||
|
}
|
||||||
|
|
||||||
// Ergebnisse kopieren
|
// Ergebnisse kopieren
|
||||||
if(actualNumberOfValuesInNonceVectorGPU > 0)
|
if(actualNumberOfValuesInNonceVectorGPU > 0)
|
||||||
{
|
{
|
||||||
cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost);
|
size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
|
||||||
|
CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost));
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
|
||||||
for (i=0; i<actualNumberOfValuesInNonceVectorGPU;++i)
|
for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++)
|
||||||
{
|
{
|
||||||
uint32_t nonce = cpu_nonceVector[i];
|
uint32_t nonce = cpu_nonceVector[i];
|
||||||
//uint32_t index = nonce - pdata[19];
|
uint32_t *foundhash = &hash[8*i];
|
||||||
uint32_t index = i;
|
|
||||||
uint32_t *foundhash = &hash[8*index];
|
|
||||||
if (foundhash[7] <= ptarget[7]) {
|
if (foundhash[7] <= ptarget[7]) {
|
||||||
if (fulltest(foundhash, ptarget)) {
|
if (fulltest(foundhash, ptarget)) {
|
||||||
uint32_t verification[8];
|
uint32_t verification[8];
|
||||||
@ -291,9 +278,7 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
|
|||||||
heavycoin_hash((uchar*)verification, (uchar*)pdata, blocklen);
|
heavycoin_hash((uchar*)verification, (uchar*)pdata, blocklen);
|
||||||
if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) {
|
if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) {
|
||||||
applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce);
|
applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce);
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
*hashes_done = pdata[19] - first_nonce;
|
*hashes_done = pdata[19] - first_nonce;
|
||||||
rc = 1;
|
rc = 1;
|
||||||
goto exit;
|
goto exit;
|
||||||
@ -316,6 +301,7 @@ exit:
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
void heavycoin_hash(uchar* output, const uchar* input, int len)
|
void heavycoin_hash(uchar* output, const uchar* input, int len)
|
||||||
{
|
{
|
||||||
unsigned char hash1[32];
|
unsigned char hash1[32];
|
||||||
|
30
heavy/heavy.h
Normal file
30
heavy/heavy.h
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#ifndef _CUDA_HEAVY_H
|
||||||
|
#define _CUDA_HEAVY_H
|
||||||
|
|
||||||
|
void blake512_cpu_init(int thr_id, int threads);
|
||||||
|
void blake512_cpu_setBlock(void *pdata, int len);
|
||||||
|
void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
|
||||||
|
|
||||||
|
void groestl512_cpu_init(int thr_id, int threads);
|
||||||
|
void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
|
||||||
|
void groestl512_cpu_setBlock(void *data, int len);
|
||||||
|
void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
|
||||||
|
|
||||||
|
void hefty_cpu_hash(int thr_id, int threads, int startNounce);
|
||||||
|
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
|
||||||
|
void hefty_cpu_init(int thr_id, int threads);
|
||||||
|
|
||||||
|
void keccak512_cpu_init(int thr_id, int threads);
|
||||||
|
void keccak512_cpu_setBlock(void *data, int len);
|
||||||
|
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
|
||||||
|
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
|
||||||
|
|
||||||
|
void sha256_cpu_init(int thr_id, int threads);
|
||||||
|
void sha256_cpu_setBlock(void *data, int len);
|
||||||
|
void sha256_cpu_hash(int thr_id, int threads, int startNounce);
|
||||||
|
void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
|
||||||
|
|
||||||
|
void combine_cpu_init(int thr_id, int threads);
|
||||||
|
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
|
||||||
|
|
||||||
|
#endif
|
@ -37,7 +37,7 @@ void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
memset(&data, 0, sizeof(data));
|
memset(&data, 0, sizeof(data));
|
||||||
data.uid = uid;
|
data.uid = (uint32_t) uid;
|
||||||
data.gpu_id = gpu;
|
data.gpu_id = gpu;
|
||||||
data.thr_id = (uint8_t)thr_id;
|
data.thr_id = (uint8_t)thr_id;
|
||||||
data.tm_stat = (uint32_t) time(NULL);
|
data.tm_stat = (uint32_t) time(NULL);
|
||||||
|
18
util.cpp
18
util.cpp
@ -102,10 +102,8 @@ void applog(int prio, const char *fmt, ...)
|
|||||||
const char* color = "";
|
const char* color = "";
|
||||||
char *f;
|
char *f;
|
||||||
int len;
|
int len;
|
||||||
time_t now;
|
|
||||||
struct tm tm, *tm_p;
|
struct tm tm, *tm_p;
|
||||||
|
time_t now = time(NULL);
|
||||||
time(&now);
|
|
||||||
|
|
||||||
pthread_mutex_lock(&applog_lock);
|
pthread_mutex_lock(&applog_lock);
|
||||||
tm_p = localtime(&now);
|
tm_p = localtime(&now);
|
||||||
@ -735,9 +733,7 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
|
|||||||
|
|
||||||
if (!strstr(sctx->sockbuf, "\n")) {
|
if (!strstr(sctx->sockbuf, "\n")) {
|
||||||
bool ret = true;
|
bool ret = true;
|
||||||
time_t rstart;
|
time_t rstart = time(NULL);
|
||||||
|
|
||||||
time(&rstart);
|
|
||||||
if (!socket_full(sctx->sock, 60)) {
|
if (!socket_full(sctx->sock, 60)) {
|
||||||
applog(LOG_ERR, "stratum_recv_line timed out");
|
applog(LOG_ERR, "stratum_recv_line timed out");
|
||||||
goto out;
|
goto out;
|
||||||
@ -1578,16 +1574,18 @@ void do_gpu_tests(void)
|
|||||||
uchar buf[128];
|
uchar buf[128];
|
||||||
uint32_t tgt[8] = { 0 };
|
uint32_t tgt[8] = { 0 };
|
||||||
|
|
||||||
memset(buf, 0, sizeof buf);
|
|
||||||
buf[0] = 1; buf[64] = 2;
|
|
||||||
|
|
||||||
opt_tracegpu = true;
|
opt_tracegpu = true;
|
||||||
work_restart = (struct work_restart*) malloc(sizeof(struct work_restart));
|
work_restart = (struct work_restart*) malloc(sizeof(struct work_restart));
|
||||||
work_restart[0].restart = 1;
|
work_restart[0].restart = 1;
|
||||||
tgt[6] = 0xffff;
|
tgt[7] = 0xffff;
|
||||||
|
|
||||||
|
memset(buf, 0, sizeof buf);
|
||||||
|
// buf[0] = 1; buf[64] = 2; // for endian tests
|
||||||
scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
|
scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
|
||||||
|
|
||||||
|
memset(buf, 0, sizeof buf);
|
||||||
|
scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
|
||||||
|
|
||||||
free(work_restart);
|
free(work_restart);
|
||||||
work_restart = NULL;
|
work_restart = NULL;
|
||||||
opt_tracegpu = false;
|
opt_tracegpu = false;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user