mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-22 04:24:29 +00:00
various changes, cleanup for the release
small fixes to handle better the multi thread per gpu explicitly report than quark is not compatible with SM 2.1 (compact shuffle)
This commit is contained in:
parent
1e3db41a8d
commit
e50556b637
@ -724,14 +724,13 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
|
||||
__host__
|
||||
void fugue256_cpu_init(int thr_id, uint32_t threads)
|
||||
{
|
||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||
// Link the hash tables in the GPU
|
||||
texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
|
||||
texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
|
||||
texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
|
||||
texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
|
||||
|
||||
// Speicher für alle Ergebnisse belegen
|
||||
cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads));
|
||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||
}
|
||||
|
||||
@ -741,11 +740,6 @@ void fugue256_cpu_free(int thr_id)
|
||||
cudaFree(d_fugue256_hashoutput[thr_id]);
|
||||
cudaFree(d_resultNonce[thr_id]);
|
||||
|
||||
cudaUnbindTexture(mixTab0Tex);
|
||||
cudaUnbindTexture(mixTab1Tex);
|
||||
cudaUnbindTexture(mixTab2Tex);
|
||||
cudaUnbindTexture(mixTab3Tex);
|
||||
|
||||
for (int i=0; i<4; i++)
|
||||
cudaFree(d_textures[thr_id][i]);
|
||||
}
|
||||
@ -753,20 +747,18 @@ void fugue256_cpu_free(int thr_id)
|
||||
__host__
|
||||
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
|
||||
{
|
||||
// CPU-Vorbereitungen treffen
|
||||
sph_fugue256_context ctx_fugue_const;
|
||||
sph_fugue256_init(&ctx_fugue_const);
|
||||
sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern
|
||||
|
||||
sph_fugue256 (&ctx_fugue_const, data, 80);
|
||||
cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);
|
||||
|
||||
cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8);
|
||||
cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
|
||||
|
||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
}
|
||||
|
||||
__host__
|
||||
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
|
||||
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
|
||||
{
|
||||
#if USE_SHARED
|
||||
const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
|
||||
|
@ -283,15 +283,6 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
|
||||
__host__
|
||||
void groestl256_cpu_free(int thr_id)
|
||||
{
|
||||
cudaUnbindTexture(t0up2);
|
||||
cudaUnbindTexture(t0dn2);
|
||||
cudaUnbindTexture(t1up2);
|
||||
cudaUnbindTexture(t1dn2);
|
||||
cudaUnbindTexture(t2up2);
|
||||
cudaUnbindTexture(t2dn2);
|
||||
cudaUnbindTexture(t3up2);
|
||||
cudaUnbindTexture(t3dn2);
|
||||
|
||||
for (int i=0; i<8; i++)
|
||||
cudaFree(d_textures[thr_id][i]);
|
||||
|
||||
|
@ -228,7 +228,7 @@ features.
|
||||
|
||||
>>> RELEASE HISTORY <<<
|
||||
|
||||
Nov. 02nd 2015 v1.7
|
||||
Nov. 05th 2015 v1.7
|
||||
Improve old devices compatibility (x11, lyra2, qubit...)
|
||||
Add windows support for SM 2.1 and drop SM 3.5 (x86)
|
||||
Improve lyra2 (v1/v2) cuda implementations
|
||||
|
17
ccminer.cpp
17
ccminer.cpp
@ -2793,12 +2793,13 @@ void parse_arg(int key, char *arg)
|
||||
if (p) d *= 1e9;
|
||||
opt_max_rate = d;
|
||||
break;
|
||||
case 'd': // CB
|
||||
case 'd': // --device
|
||||
{
|
||||
int device_thr[MAX_GPUS] = { 0 };
|
||||
int ngpus = cuda_num_devices();
|
||||
char * pch = strtok (arg,",");
|
||||
opt_n_threads = 0;
|
||||
while (pch != NULL) {
|
||||
while (pch != NULL && opt_n_threads < MAX_GPUS) {
|
||||
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
|
||||
{
|
||||
if (atoi(pch) < ngpus)
|
||||
@ -2818,6 +2819,14 @@ void parse_arg(int key, char *arg)
|
||||
}
|
||||
pch = strtok (NULL, ",");
|
||||
}
|
||||
// count threads per gpu
|
||||
for (int n=0; n < opt_n_threads; n++) {
|
||||
int device = device_map[n];
|
||||
device_thr[device]++;
|
||||
}
|
||||
for (int n=0; n < ngpus; n++) {
|
||||
gpu_threads = max(gpu_threads, device_thr[n]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@ -3177,8 +3186,8 @@ int main(int argc, char *argv[])
|
||||
else if (active_gpus > opt_n_threads)
|
||||
active_gpus = opt_n_threads;
|
||||
|
||||
// generally doesn't work... let 1
|
||||
gpu_threads = opt_n_threads / active_gpus;
|
||||
// generally doesn't work well...
|
||||
gpu_threads = max(gpu_threads, opt_n_threads / active_gpus);
|
||||
|
||||
if (opt_benchmark && opt_algo == ALGO_AUTO) {
|
||||
bench_init(opt_n_threads);
|
||||
|
@ -11,23 +11,27 @@
|
||||
__constant__ uint32_t pTarget[8]; // 32 bytes
|
||||
|
||||
// store MAX_GPUS device arrays of 8 nonces
|
||||
static uint32_t* h_resNonces[MAX_GPUS];
|
||||
static uint32_t* d_resNonces[MAX_GPUS];
|
||||
static bool init_done = false;
|
||||
static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
|
||||
static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
|
||||
static __thread bool init_done = false;
|
||||
|
||||
__host__
|
||||
void cuda_check_cpu_init(int thr_id, uint32_t threads)
|
||||
{
|
||||
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 32));
|
||||
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
|
||||
CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
|
||||
init_done = true;
|
||||
}
|
||||
|
||||
__host__
|
||||
void cuda_check_cpu_free(int thr_id)
|
||||
{
|
||||
if (!init_done) return;
|
||||
cudaFree(d_resNonces[thr_id]);
|
||||
cudaFreeHost(h_resNonces[thr_id]);
|
||||
d_resNonces[thr_id] = NULL;
|
||||
h_resNonces[thr_id] = NULL;
|
||||
init_done = false;
|
||||
}
|
||||
|
||||
// Target Difficulty
|
||||
@ -198,7 +202,7 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc
|
||||
cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
|
||||
rescnt = h_resNonces[thr_id][0];
|
||||
if (rescnt > numNonce) {
|
||||
if (numNonce <= rescnt) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
#ifndef _CUDA_FUGUE512_H
|
||||
#define _CUDA_FUGUE512_H
|
||||
#ifndef _CUDA_FUGUE256_H
|
||||
#define _CUDA_FUGUE256_H
|
||||
|
||||
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
|
||||
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
|
||||
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
|
||||
void fugue256_cpu_init(int thr_id, uint32_t threads);
|
||||
void fugue256_cpu_free(int thr_id);
|
||||
|
@ -8,14 +8,6 @@
|
||||
|
||||
#include "cuda_fugue256.h"
|
||||
|
||||
extern "C" void my_fugue256_init(void *cc);
|
||||
extern "C" void my_fugue256(void *cc, const void *data, size_t len);
|
||||
extern "C" void my_fugue256_close(void *cc, void *dst);
|
||||
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
// vorbereitete Kontexte nach den ersten 80 Bytes
|
||||
// sph_fugue256_context ctx_fugue_const[MAX_GPUS];
|
||||
|
||||
#define SWAP32(x) \
|
||||
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
|
||||
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
|
||||
@ -38,11 +30,11 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t start_nonce = pdata[19]++;
|
||||
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 256*256*8
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0xf;
|
||||
ptarget[7] = 0xf;
|
||||
|
||||
// init
|
||||
if(!init[thr_id])
|
||||
@ -57,7 +49,6 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
|
||||
for (int kk=0; kk < 20; kk++)
|
||||
be32enc(&endiandata[kk], pdata[kk]);
|
||||
|
||||
// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
|
||||
fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
|
||||
|
||||
do {
|
||||
|
@ -114,11 +114,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
|
||||
|
||||
if (device_sm[dev_id] < 300) {
|
||||
applog(LOG_ERR, "Device SM 3.0 or more recent required!");
|
||||
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
|
||||
proper_exit(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
api_set_throughput(thr_id, throughput);
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
|
@ -1,3 +1,7 @@
|
||||
/*
|
||||
* REQUIRE SM 3.0 arch!
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
@ -10,6 +14,14 @@ static uint32_t *h_numValid[MAX_GPUS];
|
||||
|
||||
static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
|
||||
*/
|
||||
#undef __shfl_up
|
||||
#define __shfl_up(var, delta, width) (0)
|
||||
#endif
|
||||
|
||||
// True/False tester
|
||||
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
|
||||
|
||||
@ -28,7 +40,8 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua
|
||||
cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];
|
||||
|
||||
// Setup/Alloc Function
|
||||
__host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
|
||||
__host__
|
||||
void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
|
||||
{
|
||||
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
|
||||
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
|
||||
@ -46,7 +59,8 @@ __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
|
||||
}
|
||||
|
||||
// Because all alloc should have a free...
|
||||
__host__ void quark_compactTest_cpu_free(int thr_id)
|
||||
__host__
|
||||
void quark_compactTest_cpu_free(int thr_id)
|
||||
{
|
||||
cudaFree(d_tempBranch1Nonces[thr_id]);
|
||||
cudaFree(d_numValid[thr_id]);
|
||||
@ -57,16 +71,9 @@ __host__ void quark_compactTest_cpu_free(int thr_id)
|
||||
cudaFreeHost(h_numValid[thr_id]);
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
|
||||
*/
|
||||
#undef __shfl_up
|
||||
#define __shfl_up(var, delta, width) (0)
|
||||
#endif
|
||||
|
||||
// Die Summenfunktion (vom NVIDIA SDK)
|
||||
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
__global__
|
||||
void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL,
|
||||
uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
extern __shared__ uint32_t sums[];
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
@ -91,19 +98,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
|
||||
{
|
||||
// keine Nonce-Liste
|
||||
inpHash = &inpHashes[id<<4];
|
||||
}else
|
||||
{
|
||||
} else {
|
||||
// Nonce-Liste verfügbar
|
||||
int nonce = d_validNonceTable[id] - startNounce;
|
||||
inpHash = &inpHashes[nonce<<4];
|
||||
}
|
||||
value = (*testFunc)(inpHash);
|
||||
}else
|
||||
{
|
||||
} else {
|
||||
value = 0;
|
||||
}
|
||||
}else
|
||||
{
|
||||
} else {
|
||||
value = data[id];
|
||||
}
|
||||
|
||||
@ -115,8 +119,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
|
||||
// those threads where the thread 'i' away would have
|
||||
// been out of bounds of the warp are unaffected. This
|
||||
// creates the scan sum.
|
||||
#pragma unroll
|
||||
|
||||
#pragma unroll
|
||||
for (int i=1; i<=width; i*=2)
|
||||
{
|
||||
uint32_t n = __shfl_up((int)value, i, width);
|
||||
@ -147,8 +151,7 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
|
||||
for (int i=1; i<=width; i*=2)
|
||||
{
|
||||
uint32_t n = __shfl_up((int)warp_sum, i, width);
|
||||
|
||||
if (lane_id >= i) warp_sum += n;
|
||||
if (lane_id >= i) warp_sum += n;
|
||||
}
|
||||
|
||||
sums[lane_id] = warp_sum;
|
||||
@ -178,7 +181,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
|
||||
}
|
||||
|
||||
// Uniform add: add partial sums array
|
||||
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
|
||||
__global__
|
||||
void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
|
||||
{
|
||||
__shared__ uint32_t buf;
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
@ -195,7 +199,8 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums
|
||||
}
|
||||
|
||||
// Der Scatter
|
||||
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
__global__
|
||||
void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
|
||||
uint32_t actNounce = id;
|
||||
@ -244,9 +249,9 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
|
||||
return mask;
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
|
||||
uint32_t *d_nonces1, cuda_compactTestFunction_t function,
|
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
__host__
|
||||
void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,uint32_t *d_nonces1,
|
||||
cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
{
|
||||
int orgThreads = threads;
|
||||
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
|
||||
@ -300,9 +305,9 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread
|
||||
}
|
||||
|
||||
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
|
||||
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
|
||||
uint32_t *d_nonces1, uint32_t *d_nonces2,
|
||||
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
__host__
|
||||
void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
|
||||
uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
|
||||
{
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
|
||||
@ -339,10 +344,9 @@ __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads,
|
||||
*/
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, uint32_t *nrm1,
|
||||
uint32_t *d_nonces2, uint32_t *nrm2,
|
||||
int order)
|
||||
__host__
|
||||
void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||
uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order)
|
||||
{
|
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
|
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
|
||||
@ -356,9 +360,9 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32
|
||||
*nrm2 = h_numValid[thr_id][1];
|
||||
}
|
||||
|
||||
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
|
||||
uint32_t *d_nonces1, uint32_t *nrm1,
|
||||
int order)
|
||||
__host__
|
||||
void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||
uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, int order)
|
||||
{
|
||||
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
|
||||
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
|
||||
|
@ -121,6 +121,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int dev_id = device_map[thr_id];
|
||||
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
@ -132,6 +133,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
cudaGetLastError();
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
||||
|
||||
quark_blake512_cpu_init(thr_id, throughput);
|
||||
@ -146,6 +148,13 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
|
||||
CUDA_SAFE_CALL(cudaGetLastError());
|
||||
|
||||
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
|
||||
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
|
||||
proper_exit(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
@ -685,11 +685,6 @@ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
|
||||
__host__
|
||||
void x13_fugue512_cpu_free(int thr_id)
|
||||
{
|
||||
cudaUnbindTexture(mixTab0Tex);
|
||||
cudaUnbindTexture(mixTab1Tex);
|
||||
cudaUnbindTexture(mixTab2Tex);
|
||||
cudaUnbindTexture(mixTab3Tex);
|
||||
|
||||
for (int i=0; i<4; i++)
|
||||
cudaFree(d_textures[thr_id][i]);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user