Browse Source

various changes, cleanup for the release

small fixes to better handle multiple threads per GPU

explicitly report that quark is not compatible with SM 2.1 (compact shuffle)
2upstream
Tanguy Pruvot 9 years ago
parent
commit
e50556b637
  1. 18
      Algo256/cuda_fugue256.cu
  2. 9
      Algo256/cuda_groestl256.cu
  3. 2
      README.txt
  4. 17
      ccminer.cpp
  5. 14
      cuda_checkhash.cu
  6. 6
      cuda_fugue256.h
  7. 13
      fuguecoin.cpp
  8. 3
      lyra2/lyra2REv2.cu
  9. 76
      quark/cuda_quark_compactionTest.cu
  10. 9
      quark/quarkcoin.cu
  11. 5
      x13/cuda_x13_fugue512.cu

18
Algo256/cuda_fugue256.cu

@@ -724,14 +724,13 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
__host__
void fugue256_cpu_init(int thr_id, uint32_t threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
// Link the hash tables in the GPU
texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
// Speicher für alle Ergebnisse belegen
cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads));
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}
@@ -741,11 +740,6 @@ void fugue256_cpu_free(int thr_id)
cudaFree(d_fugue256_hashoutput[thr_id]);
cudaFree(d_resultNonce[thr_id]);
cudaUnbindTexture(mixTab0Tex);
cudaUnbindTexture(mixTab1Tex);
cudaUnbindTexture(mixTab2Tex);
cudaUnbindTexture(mixTab3Tex);
for (int i=0; i<4; i++)
cudaFree(d_textures[thr_id][i]);
}
@@ -753,20 +747,18 @@ void fugue256_cpu_free(int thr_id)
__host__
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
{
// CPU-Vorbereitungen treffen
sph_fugue256_context ctx_fugue_const;
sph_fugue256_init(&ctx_fugue_const);
sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern
sph_fugue256 (&ctx_fugue_const, data, 80);
cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);
cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8);
cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
}
__host__
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{
#if USE_SHARED
const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN

9
Algo256/cuda_groestl256.cu

@@ -283,15 +283,6 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
__host__
void groestl256_cpu_free(int thr_id)
{
cudaUnbindTexture(t0up2);
cudaUnbindTexture(t0dn2);
cudaUnbindTexture(t1up2);
cudaUnbindTexture(t1dn2);
cudaUnbindTexture(t2up2);
cudaUnbindTexture(t2dn2);
cudaUnbindTexture(t3up2);
cudaUnbindTexture(t3dn2);
for (int i=0; i<8; i++)
cudaFree(d_textures[thr_id][i]);

2
README.txt

@@ -228,7 +228,7 @@ features.
>>> RELEASE HISTORY <<<
Nov. 02nd 2015 v1.7
Nov. 05th 2015 v1.7
Improve old devices compatibility (x11, lyra2, qubit...)
Add windows support for SM 2.1 and drop SM 3.5 (x86)
Improve lyra2 (v1/v2) cuda implementations

17
ccminer.cpp

@@ -2793,12 +2793,13 @@ void parse_arg(int key, char *arg)
if (p) d *= 1e9;
opt_max_rate = d;
break;
case 'd': // CB
case 'd': // --device
{
int device_thr[MAX_GPUS] = { 0 };
int ngpus = cuda_num_devices();
char * pch = strtok (arg,",");
opt_n_threads = 0;
while (pch != NULL) {
while (pch != NULL && opt_n_threads < MAX_GPUS) {
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
{
if (atoi(pch) < ngpus)
@@ -2818,6 +2819,14 @@ void parse_arg(int key, char *arg)
}
pch = strtok (NULL, ",");
}
// count threads per gpu
for (int n=0; n < opt_n_threads; n++) {
int device = device_map[n];
device_thr[device]++;
}
for (int n=0; n < ngpus; n++) {
gpu_threads = max(gpu_threads, device_thr[n]);
}
}
break;
@@ -3177,8 +3186,8 @@ int main(int argc, char *argv[])
else if (active_gpus > opt_n_threads)
active_gpus = opt_n_threads;
// generally doesn't work... let 1
gpu_threads = opt_n_threads / active_gpus;
// generally doesn't work well...
gpu_threads = max(gpu_threads, opt_n_threads / active_gpus);
if (opt_benchmark && opt_algo == ALGO_AUTO) {
bench_init(opt_n_threads);

14
cuda_checkhash.cu

@@ -11,23 +11,27 @@
__constant__ uint32_t pTarget[8]; // 32 bytes
// store MAX_GPUS device arrays of 8 nonces
static uint32_t* h_resNonces[MAX_GPUS];
static uint32_t* d_resNonces[MAX_GPUS];
static bool init_done = false;
static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
static __thread bool init_done = false;
__host__
void cuda_check_cpu_init(int thr_id, uint32_t threads)
{
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 32));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
init_done = true;
}
__host__
void cuda_check_cpu_free(int thr_id)
{
if (!init_done) return;
cudaFree(d_resNonces[thr_id]);
cudaFreeHost(h_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
h_resNonces[thr_id] = NULL;
init_done = false;
}
// Target Difficulty
@@ -198,7 +202,7 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc
cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
rescnt = h_resNonces[thr_id][0];
if (rescnt > numNonce) {
if (numNonce <= rescnt) {

6
cuda_fugue256.h

@@ -1,7 +1,7 @@
#ifndef _CUDA_FUGUE512_H
#define _CUDA_FUGUE512_H
#ifndef _CUDA_FUGUE256_H
#define _CUDA_FUGUE256_H
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void fugue256_cpu_init(int thr_id, uint32_t threads);
void fugue256_cpu_free(int thr_id);

13
fuguecoin.cpp

@@ -8,14 +8,6 @@
#include "cuda_fugue256.h"
extern "C" void my_fugue256_init(void *cc);
extern "C" void my_fugue256(void *cc, const void *data, size_t len);
extern "C" void my_fugue256_close(void *cc, void *dst);
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
// vorbereitete Kontexte nach den ersten 80 Bytes
// sph_fugue256_context ctx_fugue_const[MAX_GPUS];
#define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
@@ -38,11 +30,11 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
uint32_t *ptarget = work->target;
uint32_t start_nonce = pdata[19]++;
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 256*256*8
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0xf;
ptarget[7] = 0xf;
// init
if(!init[thr_id])
@@ -57,7 +49,6 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
for (int kk=0; kk < 20; kk++)
be32enc(&endiandata[kk], pdata[kk]);
// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
do {

3
lyra2/lyra2REv2.cu

@@ -114,11 +114,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
if (device_sm[dev_id] < 300) {
applog(LOG_ERR, "Device SM 3.0 or more recent required!");
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
proper_exit(1);
return -1;
}
api_set_throughput(thr_id, throughput);
init[thr_id] = true;
}

76
quark/cuda_quark_compactionTest.cu

@@ -1,3 +1,7 @@
/*
* REQUIRE SM 3.0 arch!
*/
#include <stdio.h>
#include <memory.h>
@@ -10,6 +14,14 @@ static uint32_t *h_numValid[MAX_GPUS];
static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen
#if __CUDA_ARCH__ < 300
/**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// True/False tester
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
@@ -28,7 +40,8 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua
cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];
// Setup/Alloc Function
__host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
__host__
void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
{
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
@@ -46,7 +59,8 @@ __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
}
// Because all alloc should have a free...
__host__ void quark_compactTest_cpu_free(int thr_id)
__host__
void quark_compactTest_cpu_free(int thr_id)
{
cudaFree(d_tempBranch1Nonces[thr_id]);
cudaFree(d_numValid[thr_id]);
@@ -57,16 +71,9 @@ __host__ void quark_compactTest_cpu_free(int thr_id)
cudaFreeHost(h_numValid[thr_id]);
}
#if __CUDA_ARCH__ < 300
/**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// Die Summenfunktion (vom NVIDIA SDK)
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
__global__
void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL,
uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
extern __shared__ uint32_t sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@@ -91,19 +98,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
{
// keine Nonce-Liste
inpHash = &inpHashes[id<<4];
}else
{
} else {
// Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce;
inpHash = &inpHashes[nonce<<4];
}
value = (*testFunc)(inpHash);
}else
{
} else {
value = 0;
}
}else
{
} else {
value = data[id];
}
@@ -115,8 +119,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
// those threads where the thread 'i' away would have
// been out of bounds of the warp are unaffected. This
// creates the scan sum.
#pragma unroll
#pragma unroll
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)value, i, width);
@@ -147,8 +151,7 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)warp_sum, i, width);
if (lane_id >= i) warp_sum += n;
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
@@ -178,7 +181,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
}
// Uniform add: add partial sums array
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
__global__
void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{
__shared__ uint32_t buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@@ -195,7 +199,8 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums
}
// Der Scatter
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
__global__
void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
uint32_t actNounce = id;
@@ -244,9 +249,9 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
return mask;
}
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
uint32_t *d_nonces1, cuda_compactTestFunction_t function,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
__host__
void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,uint32_t *d_nonces1,
cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
int orgThreads = threads;
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
@@ -300,9 +305,9 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread
}
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
uint32_t *d_nonces1, uint32_t *d_nonces2,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
__host__
void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
@@ -339,10 +344,9 @@ __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads,
*/
}
__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, uint32_t *nrm1,
uint32_t *d_nonces2, uint32_t *nrm2,
int order)
__host__
void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
@@ -356,9 +360,9 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32
*nrm2 = h_numValid[thr_id][1];
}
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, uint32_t *nrm1,
int order)
__host__
void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!

9
quark/quarkcoin.cu

@@ -121,6 +121,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@@ -132,6 +133,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
{
cudaSetDevice(device_map[thr_id]);
cudaGetLastError();
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
quark_blake512_cpu_init(thr_id, throughput);
@@ -146,6 +148,13 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
CUDA_SAFE_CALL(cudaGetLastError());
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
proper_exit(1);
return -1;
}
init[thr_id] = true;
}

5
x13/cuda_x13_fugue512.cu

@@ -685,11 +685,6 @@ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
__host__
void x13_fugue512_cpu_free(int thr_id)
{
cudaUnbindTexture(mixTab0Tex);
cudaUnbindTexture(mixTab1Tex);
cudaUnbindTexture(mixTab2Tex);
cudaUnbindTexture(mixTab3Tex);
for (int i=0; i<4; i++)
cudaFree(d_textures[thr_id][i]);
}

Loading…
Cancel
Save