Browse Source

various changes, cleanup for the release

small fixes to better handle multiple threads per gpu

explicitly report that quark is not compatible with SM 2.1 (compact shuffle)
2upstream
Tanguy Pruvot 9 years ago
parent
commit
e50556b637
  1. 18
      Algo256/cuda_fugue256.cu
  2. 9
      Algo256/cuda_groestl256.cu
  3. 2
      README.txt
  4. 17
      ccminer.cpp
  5. 14
      cuda_checkhash.cu
  6. 6
      cuda_fugue256.h
  7. 13
      fuguecoin.cpp
  8. 3
      lyra2/lyra2REv2.cu
  9. 74
      quark/cuda_quark_compactionTest.cu
  10. 9
      quark/quarkcoin.cu
  11. 5
      x13/cuda_x13_fugue512.cu

18
Algo256/cuda_fugue256.cu

@ -724,14 +724,13 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
__host__ __host__
void fugue256_cpu_init(int thr_id, uint32_t threads) void fugue256_cpu_init(int thr_id, uint32_t threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Link the hash tables in the GPU
texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
// Speicher für alle Ergebnisse belegen CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads));
cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
} }
@ -741,11 +740,6 @@ void fugue256_cpu_free(int thr_id)
cudaFree(d_fugue256_hashoutput[thr_id]); cudaFree(d_fugue256_hashoutput[thr_id]);
cudaFree(d_resultNonce[thr_id]); cudaFree(d_resultNonce[thr_id]);
cudaUnbindTexture(mixTab0Tex);
cudaUnbindTexture(mixTab1Tex);
cudaUnbindTexture(mixTab2Tex);
cudaUnbindTexture(mixTab3Tex);
for (int i=0; i<4; i++) for (int i=0; i<4; i++)
cudaFree(d_textures[thr_id][i]); cudaFree(d_textures[thr_id][i]);
} }
@ -753,20 +747,18 @@ void fugue256_cpu_free(int thr_id)
__host__ __host__
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
{ {
// CPU-Vorbereitungen treffen
sph_fugue256_context ctx_fugue_const; sph_fugue256_context ctx_fugue_const;
sph_fugue256_init(&ctx_fugue_const); sph_fugue256_init(&ctx_fugue_const);
sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern sph_fugue256 (&ctx_fugue_const, data, 80);
cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30); cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);
cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8); cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
} }
__host__ __host__
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce) void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{ {
#if USE_SHARED #if USE_SHARED
const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN

9
Algo256/cuda_groestl256.cu

@ -283,15 +283,6 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
__host__ __host__
void groestl256_cpu_free(int thr_id) void groestl256_cpu_free(int thr_id)
{ {
cudaUnbindTexture(t0up2);
cudaUnbindTexture(t0dn2);
cudaUnbindTexture(t1up2);
cudaUnbindTexture(t1dn2);
cudaUnbindTexture(t2up2);
cudaUnbindTexture(t2dn2);
cudaUnbindTexture(t3up2);
cudaUnbindTexture(t3dn2);
for (int i=0; i<8; i++) for (int i=0; i<8; i++)
cudaFree(d_textures[thr_id][i]); cudaFree(d_textures[thr_id][i]);

2
README.txt

@ -228,7 +228,7 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
Nov. 02nd 2015 v1.7 Nov. 05th 2015 v1.7
Improve old devices compatibility (x11, lyra2, qubit...) Improve old devices compatibility (x11, lyra2, qubit...)
Add windows support for SM 2.1 and drop SM 3.5 (x86) Add windows support for SM 2.1 and drop SM 3.5 (x86)
Improve lyra2 (v1/v2) cuda implementations Improve lyra2 (v1/v2) cuda implementations

17
ccminer.cpp

@ -2793,12 +2793,13 @@ void parse_arg(int key, char *arg)
if (p) d *= 1e9; if (p) d *= 1e9;
opt_max_rate = d; opt_max_rate = d;
break; break;
case 'd': // CB case 'd': // --device
{ {
int device_thr[MAX_GPUS] = { 0 };
int ngpus = cuda_num_devices(); int ngpus = cuda_num_devices();
char * pch = strtok (arg,","); char * pch = strtok (arg,",");
opt_n_threads = 0; opt_n_threads = 0;
while (pch != NULL) { while (pch != NULL && opt_n_threads < MAX_GPUS) {
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
{ {
if (atoi(pch) < ngpus) if (atoi(pch) < ngpus)
@ -2818,6 +2819,14 @@ void parse_arg(int key, char *arg)
} }
pch = strtok (NULL, ","); pch = strtok (NULL, ",");
} }
// count threads per gpu
for (int n=0; n < opt_n_threads; n++) {
int device = device_map[n];
device_thr[device]++;
}
for (int n=0; n < ngpus; n++) {
gpu_threads = max(gpu_threads, device_thr[n]);
}
} }
break; break;
@ -3177,8 +3186,8 @@ int main(int argc, char *argv[])
else if (active_gpus > opt_n_threads) else if (active_gpus > opt_n_threads)
active_gpus = opt_n_threads; active_gpus = opt_n_threads;
// generally doesn't work... let 1 // generally doesn't work well...
gpu_threads = opt_n_threads / active_gpus; gpu_threads = max(gpu_threads, opt_n_threads / active_gpus);
if (opt_benchmark && opt_algo == ALGO_AUTO) { if (opt_benchmark && opt_algo == ALGO_AUTO) {
bench_init(opt_n_threads); bench_init(opt_n_threads);

14
cuda_checkhash.cu

@ -11,23 +11,27 @@
__constant__ uint32_t pTarget[8]; // 32 bytes __constant__ uint32_t pTarget[8]; // 32 bytes
// store MAX_GPUS device arrays of 8 nonces // store MAX_GPUS device arrays of 8 nonces
static uint32_t* h_resNonces[MAX_GPUS]; static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
static uint32_t* d_resNonces[MAX_GPUS]; static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
static bool init_done = false; static __thread bool init_done = false;
__host__ __host__
void cuda_check_cpu_init(int thr_id, uint32_t threads) void cuda_check_cpu_init(int thr_id, uint32_t threads)
{ {
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 32));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32)); CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
init_done = true; init_done = true;
} }
__host__ __host__
void cuda_check_cpu_free(int thr_id) void cuda_check_cpu_free(int thr_id)
{ {
if (!init_done) return;
cudaFree(d_resNonces[thr_id]); cudaFree(d_resNonces[thr_id]);
cudaFreeHost(h_resNonces[thr_id]); cudaFreeHost(h_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
h_resNonces[thr_id] = NULL;
init_done = false;
} }
// Target Difficulty // Target Difficulty
@ -198,7 +202,7 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc
cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]); cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize(); cudaThreadSynchronize();
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
rescnt = h_resNonces[thr_id][0]; rescnt = h_resNonces[thr_id][0];
if (rescnt > numNonce) { if (rescnt > numNonce) {
if (numNonce <= rescnt) { if (numNonce <= rescnt) {

6
cuda_fugue256.h

@ -1,7 +1,7 @@
#ifndef _CUDA_FUGUE512_H #ifndef _CUDA_FUGUE256_H
#define _CUDA_FUGUE512_H #define _CUDA_FUGUE256_H
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce); void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn); void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void fugue256_cpu_init(int thr_id, uint32_t threads); void fugue256_cpu_init(int thr_id, uint32_t threads);
void fugue256_cpu_free(int thr_id); void fugue256_cpu_free(int thr_id);

13
fuguecoin.cpp

@ -8,14 +8,6 @@
#include "cuda_fugue256.h" #include "cuda_fugue256.h"
extern "C" void my_fugue256_init(void *cc);
extern "C" void my_fugue256(void *cc, const void *data, size_t len);
extern "C" void my_fugue256_close(void *cc, void *dst);
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
// vorbereitete Kontexte nach den ersten 80 Bytes
// sph_fugue256_context ctx_fugue_const[MAX_GPUS];
#define SWAP32(x) \ #define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
@ -38,11 +30,11 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
uint32_t start_nonce = pdata[19]++; uint32_t start_nonce = pdata[19]++;
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19; int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 256*256*8 uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce); if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0xf; ptarget[7] = 0xf;
// init // init
if(!init[thr_id]) if(!init[thr_id])
@ -57,7 +49,6 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
for (int kk=0; kk < 20; kk++) for (int kk=0; kk < 20; kk++)
be32enc(&endiandata[kk], pdata[kk]); be32enc(&endiandata[kk], pdata[kk]);
// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget); fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
do { do {

3
lyra2/lyra2REv2.cu

@ -114,11 +114,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
if (device_sm[dev_id] < 300) { if (device_sm[dev_id] < 300) {
applog(LOG_ERR, "Device SM 3.0 or more recent required!"); gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
proper_exit(1); proper_exit(1);
return -1; return -1;
} }
api_set_throughput(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }

74
quark/cuda_quark_compactionTest.cu

@ -1,3 +1,7 @@
/*
* REQUIRE SM 3.0 arch!
*/
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
@ -10,6 +14,14 @@ static uint32_t *h_numValid[MAX_GPUS];
static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen
#if __CUDA_ARCH__ < 300
/**
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// True/False tester // True/False tester
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
@ -28,7 +40,8 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua
cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS]; cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];
// Setup/Alloc Function // Setup/Alloc Function
__host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads) __host__
void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
@ -46,7 +59,8 @@ __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
} }
// Because all alloc should have a free... // Because all alloc should have a free...
__host__ void quark_compactTest_cpu_free(int thr_id) __host__
void quark_compactTest_cpu_free(int thr_id)
{ {
cudaFree(d_tempBranch1Nonces[thr_id]); cudaFree(d_tempBranch1Nonces[thr_id]);
cudaFree(d_numValid[thr_id]); cudaFree(d_numValid[thr_id]);
@ -57,16 +71,9 @@ __host__ void quark_compactTest_cpu_free(int thr_id)
cudaFreeHost(h_numValid[thr_id]); cudaFreeHost(h_numValid[thr_id]);
} }
#if __CUDA_ARCH__ < 300 __global__
/** void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL,
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// Die Summenfunktion (vom NVIDIA SDK)
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {
extern __shared__ uint32_t sums[]; extern __shared__ uint32_t sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@ -91,19 +98,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
{ {
// keine Nonce-Liste // keine Nonce-Liste
inpHash = &inpHashes[id<<4]; inpHash = &inpHashes[id<<4];
}else } else {
{
// Nonce-Liste verfügbar // Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce; int nonce = d_validNonceTable[id] - startNounce;
inpHash = &inpHashes[nonce<<4]; inpHash = &inpHashes[nonce<<4];
} }
value = (*testFunc)(inpHash); value = (*testFunc)(inpHash);
}else } else {
{
value = 0; value = 0;
} }
}else } else {
{
value = data[id]; value = data[id];
} }
@ -115,8 +119,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
// those threads where the thread 'i' away would have // those threads where the thread 'i' away would have
// been out of bounds of the warp are unaffected. This // been out of bounds of the warp are unaffected. This
// creates the scan sum. // creates the scan sum.
#pragma unroll
#pragma unroll
for (int i=1; i<=width; i*=2) for (int i=1; i<=width; i*=2)
{ {
uint32_t n = __shfl_up((int)value, i, width); uint32_t n = __shfl_up((int)value, i, width);
@ -147,7 +151,6 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
for (int i=1; i<=width; i*=2) for (int i=1; i<=width; i*=2)
{ {
uint32_t n = __shfl_up((int)warp_sum, i, width); uint32_t n = __shfl_up((int)warp_sum, i, width);
if (lane_id >= i) warp_sum += n; if (lane_id >= i) warp_sum += n;
} }
@ -178,7 +181,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
} }
// Uniform add: add partial sums array // Uniform add: add partial sums array
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) __global__
void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{ {
__shared__ uint32_t buf; __shared__ uint32_t buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@ -195,7 +199,8 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums
} }
// Der Scatter // Der Scatter
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) __global__
void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
uint32_t actNounce = id; uint32_t actNounce = id;
@ -244,9 +249,9 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
return mask; return mask;
} }
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm, __host__
uint32_t *d_nonces1, cuda_compactTestFunction_t function, void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,uint32_t *d_nonces1,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{ {
int orgThreads = threads; int orgThreads = threads;
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
@ -300,9 +305,9 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread
} }
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, __host__
uint32_t *d_nonces1, uint32_t *d_nonces2, void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{ {
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
@ -339,10 +344,9 @@ __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads,
*/ */
} }
__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, __host__
uint32_t *d_nonces1, uint32_t *nrm1, void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_nonces2, uint32_t *nrm2, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order)
int order)
{ {
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
@ -356,9 +360,9 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32
*nrm2 = h_numValid[thr_id][1]; *nrm2 = h_numValid[thr_id][1];
} }
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, __host__
uint32_t *d_nonces1, uint32_t *nrm1, void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
int order) uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, int order)
{ {
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!

9
quark/quarkcoin.cu

@ -121,6 +121,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096 uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -132,6 +133,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
cudaGetLastError();
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
quark_blake512_cpu_init(thr_id, throughput); quark_blake512_cpu_init(thr_id, throughput);
@ -146,6 +148,13 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
CUDA_SAFE_CALL(cudaGetLastError());
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
proper_exit(1);
return -1;
}
init[thr_id] = true; init[thr_id] = true;
} }

5
x13/cuda_x13_fugue512.cu

@ -685,11 +685,6 @@ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
__host__ __host__
void x13_fugue512_cpu_free(int thr_id) void x13_fugue512_cpu_free(int thr_id)
{ {
cudaUnbindTexture(mixTab0Tex);
cudaUnbindTexture(mixTab1Tex);
cudaUnbindTexture(mixTab2Tex);
cudaUnbindTexture(mixTab3Tex);
for (int i=0; i<4; i++) for (int i=0; i<4; i++)
cudaFree(d_textures[thr_id][i]); cudaFree(d_textures[thr_id][i]);
} }

Loading…
Cancel
Save