mirror of
https://github.com/GOSTSec/ccminer
synced 2025-02-02 18:04:22 +00:00
keccak: not compatible with second nonces (was broken)
Use djm34 new uint2 method to get a +40% boost (115 to 153MH/s)
This commit is contained in:
parent
78f2dd6ffd
commit
ef8a73d6aa
@ -7,6 +7,12 @@ extern "C" {
|
|||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define UINT2(x,y) { x, y }
|
||||||
|
#else
|
||||||
|
#define UINT2(x,y) (uint2) { x, y }
|
||||||
|
#endif
|
||||||
|
|
||||||
static const uint64_t host_keccak_round_constants[24] = {
|
static const uint64_t host_keccak_round_constants[24] = {
|
||||||
0x0000000000000001ull, 0x0000000000008082ull,
|
0x0000000000000001ull, 0x0000000000008082ull,
|
||||||
0x800000000000808aull, 0x8000000080008000ull,
|
0x800000000000808aull, 0x8000000080008000ull,
|
||||||
@ -97,7 +103,7 @@ static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
|
|||||||
s[0] ^= vectorize(keccak_round_constants[i]);
|
s[0] ^= vectorize(keccak_round_constants[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
|
static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
|
||||||
@ -167,43 +173,52 @@ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
|
|||||||
s[0] ^= keccak_round_constants[i];
|
s[0] ^= keccak_round_constants[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
__global__
|
__global__ __launch_bounds__(128,5)
|
||||||
void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
|
void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint32_t nounce = startNounce + thread;
|
uint32_t nounce = startNounce + thread;
|
||||||
uint64_t keccak_gpu_state[25];
|
|
||||||
|
|
||||||
//#pragma unroll 25
|
#if __CUDA_ARCH__ >= 350
|
||||||
|
uint2 keccak_gpu_state[25];
|
||||||
|
#pragma unroll 25
|
||||||
for (int i=0; i<25; i++) {
|
for (int i=0; i<25; i++) {
|
||||||
if (i < 9)
|
if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]);
|
||||||
keccak_gpu_state[i] = c_PaddedMessage80[i];
|
else keccak_gpu_state[i] = UINT2(0, 0);
|
||||||
else
|
}
|
||||||
keccak_gpu_state[i] = 0;
|
|
||||||
|
keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]);
|
||||||
|
keccak_gpu_state[9].y = cuda_swab32(nounce);
|
||||||
|
keccak_gpu_state[10] = UINT2(1, 0);
|
||||||
|
keccak_gpu_state[16] = UINT2(0, 0x80000000);
|
||||||
|
|
||||||
|
keccak_blockv35(keccak_gpu_state,keccak_round_constants);
|
||||||
|
if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;}
|
||||||
|
#else
|
||||||
|
uint64_t keccak_gpu_state[25];
|
||||||
|
#pragma unroll 25
|
||||||
|
for (int i=0; i<25; i++) {
|
||||||
|
if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
|
||||||
|
else keccak_gpu_state[i] = 0;
|
||||||
}
|
}
|
||||||
keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
|
keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
|
||||||
keccak_gpu_state[10] = 0x0000000000000001;
|
keccak_gpu_state[10] = 0x0000000000000001;
|
||||||
keccak_gpu_state[16] = 0x8000000000000000;
|
keccak_gpu_state[16] = 0x8000000000000000;
|
||||||
|
|
||||||
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
|
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
|
||||||
|
if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
|
||||||
bool rc = false;
|
#endif
|
||||||
if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;}
|
|
||||||
|
|
||||||
if (rc == true) {
|
|
||||||
if(resNounce[0] > nounce)
|
|
||||||
resNounce[0] = nounce;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
|
uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
|
||||||
{
|
{
|
||||||
uint32_t result = 0xffffffff;
|
uint32_t result = UINT32_MAX;
|
||||||
cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
|
cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
|
||||||
const int threadsperblock = 128;
|
const int threadsperblock = 128;
|
||||||
|
|
||||||
@ -222,12 +237,6 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define UINT2(a, b) { a, b }
|
|
||||||
#else
|
|
||||||
#define UINT2(a, b) (uint2) { a, b }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
__global__ __launch_bounds__(256,3)
|
__global__ __launch_bounds__(256,3)
|
||||||
void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
|
void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
|
||||||
{
|
{
|
||||||
@ -238,18 +247,16 @@ void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHa
|
|||||||
uint2 keccak_gpu_state[25];
|
uint2 keccak_gpu_state[25];
|
||||||
#pragma unroll 25
|
#pragma unroll 25
|
||||||
for (int i = 0; i<25; i++) {
|
for (int i = 0; i<25; i++) {
|
||||||
if (i < 4)
|
if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
|
||||||
keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
|
else keccak_gpu_state[i] = UINT2(0, 0);
|
||||||
else
|
|
||||||
keccak_gpu_state[i] = UINT2(0, 0);
|
|
||||||
}
|
}
|
||||||
keccak_gpu_state[4] = UINT2(1, 0);
|
keccak_gpu_state[4] = UINT2(1, 0);
|
||||||
keccak_gpu_state[16] = UINT2(0, 0x80000000);
|
keccak_gpu_state[16] = UINT2(0, 0x80000000);
|
||||||
keccak_blockv35(keccak_gpu_state, keccak_round_constants);
|
keccak_blockv35(keccak_gpu_state, keccak_round_constants);
|
||||||
|
|
||||||
#pragma unroll 4
|
#pragma unroll 4
|
||||||
for (int i=0; i<4;i++)
|
for (int i=0; i<4; i++)
|
||||||
outputHash[i*threads+thread]=devectorize(keccak_gpu_state[i]);
|
outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
|
||||||
#else
|
#else
|
||||||
uint64_t keccak_gpu_state[25];
|
uint64_t keccak_gpu_state[25];
|
||||||
#pragma unroll 25
|
#pragma unroll 25
|
||||||
|
@ -41,13 +41,12 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
|
|||||||
unsigned long *hashes_done)
|
unsigned long *hashes_done)
|
||||||
{
|
{
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
int throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4
|
||||||
|
throughput = min(throughput, (int)(max_nonce - first_nonce));
|
||||||
|
|
||||||
if (opt_benchmark)
|
if (opt_benchmark)
|
||||||
((uint32_t*)ptarget)[7] = 0x0005;
|
((uint32_t*)ptarget)[7] = 0x0005;
|
||||||
|
|
||||||
int throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4
|
|
||||||
throughput = min(throughput, (int)(max_nonce - first_nonce));
|
|
||||||
|
|
||||||
if (!init[thr_id]) {
|
if (!init[thr_id]) {
|
||||||
cudaSetDevice(device_map[thr_id]);
|
cudaSetDevice(device_map[thr_id]);
|
||||||
|
|
||||||
@ -75,15 +74,9 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
|
|||||||
keccak256_hash(vhash64, endiandata);
|
keccak256_hash(vhash64, endiandata);
|
||||||
|
|
||||||
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
||||||
int res = 1;
|
*hashes_done = foundNonce - first_nonce + 1;
|
||||||
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
|
|
||||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
|
||||||
if (secNonce != 0) {
|
|
||||||
pdata[21] = secNonce;
|
|
||||||
res++;
|
|
||||||
}
|
|
||||||
pdata[19] = foundNonce;
|
pdata[19] = foundNonce;
|
||||||
return res;
|
return 1;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
|
applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
|
||||||
|
@ -79,15 +79,12 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
|
|||||||
.cu.o:
|
.cu.o:
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $<
|
||||||
|
|
||||||
blake32.o: blake32.cu
|
Algo256/blake256.o: Algo256/blake256.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
|
||||||
|
|
||||||
heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
|
heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
|
||||||
|
|
||||||
keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
|
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
|
|
||||||
|
|
||||||
qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
|
qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
|
||||||
|
|
||||||
|
12
ccminer.cpp
12
ccminer.cpp
@ -1100,9 +1100,11 @@ static void *miner_thread(void *userdata)
|
|||||||
case ALGO_BLAKE:
|
case ALGO_BLAKE:
|
||||||
minmax = 0x80000000U;
|
minmax = 0x80000000U;
|
||||||
break;
|
break;
|
||||||
|
case ALGO_KECCAK:
|
||||||
|
minmax = 0x40000000U;
|
||||||
|
break;
|
||||||
case ALGO_DOOM:
|
case ALGO_DOOM:
|
||||||
case ALGO_JACKPOT:
|
case ALGO_JACKPOT:
|
||||||
case ALGO_KECCAK:
|
|
||||||
case ALGO_LUFFA_DOOM:
|
case ALGO_LUFFA_DOOM:
|
||||||
minmax = 0x2000000;
|
minmax = 0x2000000;
|
||||||
break;
|
break;
|
||||||
@ -1157,6 +1159,9 @@ static void *miner_thread(void *userdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
if (opt_algo == ALGO_KECCAK && max64 == UINT32_MAX) {
|
||||||
|
max64 = 0x7FFFFFFFUL;
|
||||||
|
}
|
||||||
/* never let small ranges at end */
|
/* never let small ranges at end */
|
||||||
if (end_nonce >= UINT32_MAX - 256)
|
if (end_nonce >= UINT32_MAX - 256)
|
||||||
end_nonce = UINT32_MAX;
|
end_nonce = UINT32_MAX;
|
||||||
@ -1406,9 +1411,10 @@ out:
|
|||||||
|
|
||||||
static void restart_threads(void)
|
static void restart_threads(void)
|
||||||
{
|
{
|
||||||
int i;
|
if (opt_debug)
|
||||||
|
applog(LOG_DEBUG,"%s", __FUNCTION__);
|
||||||
|
|
||||||
for (i = 0; i < opt_n_threads; i++)
|
for (int i = 0; i < opt_n_threads; i++)
|
||||||
work_restart[i].restart = 1;
|
work_restart[i].restart = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user