mirror of
https://github.com/GOSTSec/ccminer
synced 2025-02-02 01:44:32 +00:00
quark: add support for SM 2 devices
todo: use nonce vectors for the second branch GPU #0: Gigabyte GTX 460, 261.26 kH/s accepted: 2/2 (diff 0.046), 254.36 kH/s yay!!! Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
This commit is contained in:
parent
21115b7fc6
commit
2247605d23
@ -105,7 +105,6 @@ bool bench_algo_switch_next(int thr_id)
|
|||||||
if (algo == ALGO_JACKPOT) algo++; // compact shuffle
|
if (algo == ALGO_JACKPOT) algo++; // compact shuffle
|
||||||
if (algo == ALGO_LYRA2v2) algo++;
|
if (algo == ALGO_LYRA2v2) algo++;
|
||||||
if (algo == ALGO_NEOSCRYPT) algo++;
|
if (algo == ALGO_NEOSCRYPT) algo++;
|
||||||
if (algo == ALGO_QUARK) algo++; // todo
|
|
||||||
if (algo == ALGO_WHIRLPOOLX) algo++;
|
if (algo == ALGO_WHIRLPOOLX) algo++;
|
||||||
}
|
}
|
||||||
// and unwanted ones...
|
// and unwanted ones...
|
||||||
|
@ -333,8 +333,8 @@ struct option options[] = {
|
|||||||
{ "no-stratum", 0, NULL, 1007 },
|
{ "no-stratum", 0, NULL, 1007 },
|
||||||
{ "no-autotune", 0, NULL, 1004 }, // scrypt
|
{ "no-autotune", 0, NULL, 1004 }, // scrypt
|
||||||
{ "interactive", 1, NULL, 1050 }, // scrypt
|
{ "interactive", 1, NULL, 1050 }, // scrypt
|
||||||
{ "launch-config", 0, NULL, 'l' }, // scrypt
|
{ "launch-config", 1, NULL, 'l' }, // scrypt
|
||||||
{ "lookup-gap", 0, NULL, 'L' }, // scrypt
|
{ "lookup-gap", 1, NULL, 'L' }, // scrypt
|
||||||
{ "texture-cache", 1, NULL, 1051 },// scrypt
|
{ "texture-cache", 1, NULL, 1051 },// scrypt
|
||||||
{ "max-temp", 1, NULL, 1060 },
|
{ "max-temp", 1, NULL, 1060 },
|
||||||
{ "max-diff", 1, NULL, 1061 },
|
{ "max-diff", 1, NULL, 1061 },
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
|
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
|
||||||
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
||||||
|
|
||||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500)
|
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500) || defined(_DEBUG)
|
||||||
__constant__ uint64_t d_constMem[16] = {
|
__constant__ uint64_t d_constMem[16] = {
|
||||||
SPH_C64(0x8081828384858687),
|
SPH_C64(0x8081828384858687),
|
||||||
SPH_C64(0x88898A8B8C8D8E8F),
|
SPH_C64(0x88898A8B8C8D8E8F),
|
||||||
@ -42,7 +42,7 @@ __constant__ uint64_t d_constMem[16] = {
|
|||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 500
|
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 500 || defined(_DEBUG))
|
||||||
|
|
||||||
__device__
|
__device__
|
||||||
void Compression512_30(uint64_t *msg, uint64_t *hash)
|
void Compression512_30(uint64_t *msg, uint64_t *hash)
|
||||||
|
@ -8,10 +8,8 @@
|
|||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
#include <sm_30_intrinsics.h>
|
#include <sm_30_intrinsics.h>
|
||||||
|
|
||||||
static uint32_t *d_tempBranch1Nonces[MAX_GPUS];
|
|
||||||
static uint32_t *d_numValid[MAX_GPUS];
|
|
||||||
static uint32_t *h_numValid[MAX_GPUS];
|
static uint32_t *h_numValid[MAX_GPUS];
|
||||||
|
static uint32_t *d_tempBranch1Nonces[MAX_GPUS];
|
||||||
static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen
|
static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen
|
||||||
|
|
||||||
#if __CUDA_ARCH__ < 300
|
#if __CUDA_ARCH__ < 300
|
||||||
@ -43,32 +41,39 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[M
|
|||||||
__host__
|
__host__
|
||||||
void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
|
void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
|
||||||
{
|
{
|
||||||
|
int dev_id = device_map[thr_id];
|
||||||
|
cuda_get_arch(thr_id);
|
||||||
|
|
||||||
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
|
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
|
||||||
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
|
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
|
||||||
|
|
||||||
// wir brauchen auch Speicherplatz auf dem Device
|
if (cuda_arch[dev_id] >= 300) {
|
||||||
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
|
uint32_t s1 = (threads / 256) * 2;
|
||||||
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
|
CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2));
|
||||||
|
CUDA_SAFE_CALL(cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block)
|
||||||
|
CUDA_SAFE_CALL(cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block)
|
||||||
|
} else {
|
||||||
|
CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads));
|
||||||
|
}
|
||||||
|
|
||||||
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
|
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
|
||||||
|
|
||||||
uint32_t s1;
|
|
||||||
s1 = (threads / 256) * 2;
|
|
||||||
|
|
||||||
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
|
||||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Because all alloc should have a free...
|
// Because all alloc should have a free...
|
||||||
__host__
|
__host__
|
||||||
void quark_compactTest_cpu_free(int thr_id)
|
void quark_compactTest_cpu_free(int thr_id)
|
||||||
{
|
{
|
||||||
cudaFree(d_tempBranch1Nonces[thr_id]);
|
int dev_id = device_map[thr_id];
|
||||||
cudaFree(d_numValid[thr_id]);
|
|
||||||
|
|
||||||
cudaFree(d_partSum[0][thr_id]);
|
|
||||||
cudaFree(d_partSum[1][thr_id]);
|
|
||||||
|
|
||||||
cudaFreeHost(h_numValid[thr_id]);
|
cudaFreeHost(h_numValid[thr_id]);
|
||||||
|
|
||||||
|
if (cuda_arch[dev_id] >= 300) {
|
||||||
|
cudaFree(d_tempBranch1Nonces[thr_id]);
|
||||||
|
cudaFree(d_partSum[0][thr_id]);
|
||||||
|
cudaFree(d_partSum[1][thr_id]);
|
||||||
|
} else {
|
||||||
|
cudaFree(d_tempBranch1Nonces[thr_id]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
@ -124,7 +129,6 @@ void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *parti
|
|||||||
for (int i=1; i<=width; i*=2)
|
for (int i=1; i<=width; i*=2)
|
||||||
{
|
{
|
||||||
uint32_t n = __shfl_up((int)value, i, width);
|
uint32_t n = __shfl_up((int)value, i, width);
|
||||||
|
|
||||||
if (lane_id >= i) value += n;
|
if (lane_id >= i) value += n;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -207,14 +211,12 @@ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTe
|
|||||||
uint32_t value;
|
uint32_t value;
|
||||||
if (id < threads)
|
if (id < threads)
|
||||||
{
|
{
|
||||||
// uint32_t nounce = startNounce + id;
|
|
||||||
uint32_t *inpHash;
|
uint32_t *inpHash;
|
||||||
if(d_validNonceTable == NULL)
|
if(d_validNonceTable == NULL)
|
||||||
{
|
{
|
||||||
// keine Nonce-Liste
|
// keine Nonce-Liste
|
||||||
inpHash = &inpHashes[id<<4];
|
inpHash = &inpHashes[id<<4];
|
||||||
}else
|
} else {
|
||||||
{
|
|
||||||
// Nonce-Liste verfügbar
|
// Nonce-Liste verfügbar
|
||||||
int nonce = d_validNonceTable[id] - startNounce;
|
int nonce = d_validNonceTable[id] - startNounce;
|
||||||
actNounce = nonce;
|
actNounce = nonce;
|
||||||
@ -222,13 +224,11 @@ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTe
|
|||||||
}
|
}
|
||||||
|
|
||||||
value = (*testFunc)(inpHash);
|
value = (*testFunc)(inpHash);
|
||||||
}else
|
} else {
|
||||||
{
|
|
||||||
value = 0;
|
value = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( value )
|
if (value) {
|
||||||
{
|
|
||||||
int idx = sum[id];
|
int idx = sum[id];
|
||||||
if(idx > 0)
|
if(idx > 0)
|
||||||
outp[idx-1] = startNounce + actNounce;
|
outp[idx-1] = startNounce + actNounce;
|
||||||
@ -271,12 +271,10 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32
|
|||||||
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
|
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
|
||||||
|
|
||||||
// weitere Scans
|
// weitere Scans
|
||||||
if(callThrid)
|
if(callThrid) {
|
||||||
{
|
|
||||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
|
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
|
||||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
|
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
|
||||||
}else
|
} else {
|
||||||
{
|
|
||||||
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
|
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -290,8 +288,7 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32
|
|||||||
|
|
||||||
|
|
||||||
// Addieren
|
// Addieren
|
||||||
if(callThrid)
|
if(callThrid) {
|
||||||
{
|
|
||||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
|
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
|
||||||
}
|
}
|
||||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
|
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
|
||||||
@ -304,6 +301,68 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32
|
|||||||
cudaStreamSynchronize(NULL);
|
cudaStreamSynchronize(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef __INTELLISENSE__
|
||||||
|
#define atomicAdd(x,n) ( *(x)+=n )
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 8)
|
||||||
|
void quark_filter_gpu_sm2(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch, uint32_t &count)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
|
||||||
|
uint4 *psrc = (uint4*) (&d_hash[offset]);
|
||||||
|
d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 0x8;
|
||||||
|
if (d_NonceBranch[thread]) return;
|
||||||
|
//uint32_t off_br = atomicAdd(&count, 1) * 16U;
|
||||||
|
// uint4 = 4x uint32_t = 16 bytes
|
||||||
|
uint4 *pdst = (uint4*) (&d_branch2[offset]);
|
||||||
|
pdst[0] = psrc[0];
|
||||||
|
pdst[1] = psrc[1];
|
||||||
|
pdst[2] = psrc[2];
|
||||||
|
pdst[3] = psrc[3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 8)
|
||||||
|
void quark_merge_gpu_sm2(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads && !d_NonceBranch[thread])
|
||||||
|
{
|
||||||
|
const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
|
||||||
|
uint4 *pdst = (uint4*) (&d_hash[offset]);
|
||||||
|
uint4 *psrc = (uint4*) (&d_branch2[offset]);
|
||||||
|
pdst[0] = psrc[0];
|
||||||
|
pdst[1] = psrc[1];
|
||||||
|
pdst[2] = psrc[2];
|
||||||
|
pdst[3] = psrc[3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2)
|
||||||
|
{
|
||||||
|
uint32_t branch2_nonces = 0;
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
// copy all hashes in the right branch buffer
|
||||||
|
quark_filter_gpu_sm2 <<<grid, block>>> (threads, inpHashes, d_branch2, d_tempBranch1Nonces[thr_id], branch2_nonces);
|
||||||
|
return branch2_nonces;
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
// copy second branch hashes to d_hash
|
||||||
|
quark_merge_gpu_sm2 <<<grid, block>>> (threads, outpHashes, d_branch2, d_tempBranch1Nonces[thr_id]);
|
||||||
|
}
|
||||||
|
|
||||||
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
|
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
|
||||||
__host__
|
__host__
|
||||||
void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
|
void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
|
||||||
@ -311,37 +370,6 @@ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t
|
|||||||
{
|
{
|
||||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||||
|
|
||||||
/*
|
|
||||||
// threadsPerBlock ausrechnen
|
|
||||||
int blockSize = 256;
|
|
||||||
int thr1 = threads / blockSize;
|
|
||||||
int thr2 = threads / (blockSize*blockSize);
|
|
||||||
|
|
||||||
// 1
|
|
||||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
|
|
||||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
|
|
||||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
|
|
||||||
cudaStreamSynchronize(NULL);
|
|
||||||
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
|
||||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
|
|
||||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
|
|
||||||
|
|
||||||
// 2
|
|
||||||
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
|
|
||||||
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
|
|
||||||
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
|
|
||||||
cudaStreamSynchronize(NULL);
|
|
||||||
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
|
||||||
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
|
|
||||||
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
|
|
||||||
|
|
||||||
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
|
|
||||||
// Schritt 3: Scatter
|
|
||||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
|
|
||||||
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
|
|
||||||
cudaStreamSynchronize(NULL);
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
@ -369,6 +397,6 @@ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, ui
|
|||||||
|
|
||||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||||
|
|
||||||
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
|
cudaStreamSynchronize(NULL);
|
||||||
*nrm1 = h_numValid[thr_id][0];
|
*nrm1 = h_numValid[thr_id][0];
|
||||||
}
|
}
|
@ -52,7 +52,7 @@ extern uint32_t T2dn_cpu[];
|
|||||||
extern uint32_t T3up_cpu[];
|
extern uint32_t T3up_cpu[];
|
||||||
extern uint32_t T3dn_cpu[];
|
extern uint32_t T3dn_cpu[];
|
||||||
|
|
||||||
#if __CUDA_ARCH__ < 300
|
#if __CUDA_ARCH__ < 300 || defined(_DEBUG)
|
||||||
|
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
|
void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
|
||||||
@ -206,7 +206,7 @@ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs)
|
|||||||
__global__
|
__global__
|
||||||
void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
||||||
{
|
{
|
||||||
#if __CUDA_ARCH__ < 300
|
#if __CUDA_ARCH__ < 300 || defined(_DEBUG)
|
||||||
extern __shared__ char mixtabs[];
|
extern __shared__ char mixtabs[];
|
||||||
|
|
||||||
if (threadIdx.x < 256)
|
if (threadIdx.x < 256)
|
||||||
|
@ -13,7 +13,13 @@ extern "C"
|
|||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
#include "cuda_quark.h"
|
#include "cuda_quark.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
extern uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2);
|
||||||
|
extern void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2);
|
||||||
|
|
||||||
static uint32_t *d_hash[MAX_GPUS];
|
static uint32_t *d_hash[MAX_GPUS];
|
||||||
|
static uint32_t* d_hash_br2[MAX_GPUS]; // SM 2
|
||||||
|
|
||||||
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
|
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
|
||||||
static uint32_t *d_branch1Nonces[MAX_GPUS];
|
static uint32_t *d_branch1Nonces[MAX_GPUS];
|
||||||
@ -102,10 +108,10 @@ extern "C" void quarkhash(void *state, const void *input)
|
|||||||
#define TRACE(algo) { \
|
#define TRACE(algo) { \
|
||||||
if (max_nonce == 1 && pdata[19] <= 1) { \
|
if (max_nonce == 1 && pdata[19] <= 1) { \
|
||||||
uint32_t* debugbuf = NULL; \
|
uint32_t* debugbuf = NULL; \
|
||||||
cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
|
cudaMallocHost(&debugbuf, 32); \
|
||||||
cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
|
cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \
|
||||||
printf("quark %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
|
printf("quark %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
|
||||||
swab32(debugbuf[2]), swab32(debugbuf[3])); \
|
swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \
|
||||||
cudaFreeHost(debugbuf); \
|
cudaFreeHost(debugbuf); \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
@ -121,9 +127,10 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
int dev_id = device_map[thr_id];
|
|
||||||
|
|
||||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
|
int dev_id = device_map[thr_id];
|
||||||
|
uint32_t def_thr = 1U << 20; // 256*4096
|
||||||
|
uint32_t throughput = cuda_default_throughput(thr_id, def_thr);
|
||||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||||
|
|
||||||
if (opt_benchmark)
|
if (opt_benchmark)
|
||||||
@ -131,7 +138,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
|
|
||||||
if (!init[thr_id])
|
if (!init[thr_id])
|
||||||
{
|
{
|
||||||
cudaSetDevice(device_map[thr_id]);
|
cudaSetDevice(dev_id);
|
||||||
|
|
||||||
cudaGetLastError();
|
cudaGetLastError();
|
||||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
||||||
@ -142,20 +149,19 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
quark_bmw512_cpu_init(thr_id, throughput);
|
quark_bmw512_cpu_init(thr_id, throughput);
|
||||||
quark_keccak512_cpu_init(thr_id, throughput);
|
quark_keccak512_cpu_init(thr_id, throughput);
|
||||||
quark_jh512_cpu_init(thr_id, throughput);
|
quark_jh512_cpu_init(thr_id, throughput);
|
||||||
cuda_check_cpu_init(thr_id, throughput);
|
|
||||||
quark_compactTest_cpu_init(thr_id, throughput);
|
quark_compactTest_cpu_init(thr_id, throughput);
|
||||||
|
|
||||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
|
if (cuda_arch[dev_id] >= 300) {
|
||||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
|
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||||
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
|
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||||
CUDA_SAFE_CALL(cudaGetLastError());
|
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||||
|
} else {
|
||||||
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
|
cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput);
|
||||||
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
|
|
||||||
proper_exit(1);
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cuda_check_cpu_init(thr_id, throughput);
|
||||||
|
CUDA_SAFE_CALL(cudaGetLastError());
|
||||||
|
|
||||||
init[thr_id] = true;
|
init[thr_id] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -167,58 +173,95 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
|||||||
|
|
||||||
do {
|
do {
|
||||||
int order = 0;
|
int order = 0;
|
||||||
|
uint32_t foundNonce;
|
||||||
uint32_t nrm1=0, nrm2=0, nrm3=0;
|
uint32_t nrm1=0, nrm2=0, nrm3=0;
|
||||||
|
|
||||||
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
|
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
|
||||||
TRACE("blake :");
|
TRACE("blake :");
|
||||||
|
|
||||||
// das ist der unbedingte Branch für BMW512
|
|
||||||
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
TRACE("bmw :");
|
||||||
|
|
||||||
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
|
if (cuda_arch[dev_id] >= 300) {
|
||||||
d_branch3Nonces[thr_id], &nrm3,
|
|
||||||
|
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
|
||||||
|
d_branch3Nonces[thr_id], &nrm3, order++);
|
||||||
|
|
||||||
|
// nur den Skein Branch weiterverfolgen
|
||||||
|
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für Groestl512
|
||||||
|
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für JH512
|
||||||
|
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
||||||
|
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||||
|
d_branch1Nonces[thr_id], &nrm1,
|
||||||
|
d_branch2Nonces[thr_id], &nrm2,
|
||||||
order++);
|
order++);
|
||||||
|
|
||||||
// nur den Skein Branch weiterverfolgen
|
// das ist der bedingte Branch für Blake512
|
||||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
|
|
||||||
// das ist der unbedingte Branch für Groestl512
|
// das ist der bedingte Branch für Bmw512
|
||||||
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
|
|
||||||
// das ist der unbedingte Branch für JH512
|
// das ist der unbedingte Branch für Keccak512
|
||||||
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
|
|
||||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
// das ist der unbedingte Branch für Skein512
|
||||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
d_branch1Nonces[thr_id], &nrm1,
|
|
||||||
d_branch2Nonces[thr_id], &nrm2,
|
|
||||||
order++);
|
|
||||||
|
|
||||||
// das ist der bedingte Branch für Blake512
|
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
||||||
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||||
|
d_branch1Nonces[thr_id], &nrm1,
|
||||||
|
d_branch2Nonces[thr_id], &nrm2,
|
||||||
|
order++);
|
||||||
|
|
||||||
// das ist der bedingte Branch für Bmw512
|
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
|
|
||||||
// das ist der unbedingte Branch für Keccak512
|
foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||||
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
|
||||||
|
|
||||||
// das ist der unbedingte Branch für Skein512
|
} else {
|
||||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
/* algo permutations are made with 2 different buffers */
|
||||||
|
|
||||||
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
|
quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
|
||||||
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
d_branch1Nonces[thr_id], &nrm1,
|
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
|
||||||
d_branch2Nonces[thr_id], &nrm2,
|
quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
|
||||||
order++);
|
TRACE("perm1 :");
|
||||||
|
|
||||||
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
TRACE("groestl:");
|
||||||
|
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
TRACE("jh512 :");
|
||||||
|
|
||||||
|
quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
|
||||||
|
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
|
||||||
|
quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
|
||||||
|
TRACE("perm2 :");
|
||||||
|
|
||||||
|
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
TRACE("keccak :");
|
||||||
|
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
TRACE("skein :");
|
||||||
|
|
||||||
|
quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
|
||||||
|
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
|
||||||
|
quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
|
||||||
|
TRACE("perm3 :");
|
||||||
|
|
||||||
|
CUDA_LOG_ERROR();
|
||||||
|
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||||
|
}
|
||||||
|
|
||||||
*hashes_done = pdata[19] - first_nonce + 1;
|
*hashes_done = pdata[19] - first_nonce + 1;
|
||||||
|
|
||||||
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
if (foundNonce != UINT32_MAX)
|
||||||
if (foundNonce != UINT32_MAX)
|
|
||||||
{
|
{
|
||||||
uint32_t vhash[8];
|
uint32_t vhash[8];
|
||||||
be32enc(&endiandata[19], foundNonce);
|
be32enc(&endiandata[19], foundNonce);
|
||||||
|
3
util.cpp
3
util.cpp
@ -1852,7 +1852,6 @@ void do_gpu_tests(void)
|
|||||||
#ifdef _DEBUG
|
#ifdef _DEBUG
|
||||||
unsigned long done;
|
unsigned long done;
|
||||||
char s[128] = { '\0' };
|
char s[128] = { '\0' };
|
||||||
uchar buf[160];
|
|
||||||
struct work work;
|
struct work work;
|
||||||
memset(&work, 0, sizeof(work));
|
memset(&work, 0, sizeof(work));
|
||||||
|
|
||||||
@ -1866,7 +1865,7 @@ void do_gpu_tests(void)
|
|||||||
//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
|
//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
|
||||||
|
|
||||||
memset(work.data, 0, sizeof(work.data));
|
memset(work.data, 0, sizeof(work.data));
|
||||||
scanhash_lyra2(0, &work, 1, &done);
|
scanhash_quark(0, &work, 1, &done);
|
||||||
|
|
||||||
free(work_restart);
|
free(work_restart);
|
||||||
work_restart = NULL;
|
work_restart = NULL;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user