various changes, cleanup for the release

small fixes to handle better the multi thread per gpu explicitly report than quark is not compatible with SM 2.1 (compact shuffle)
9 years ago · e50556b637
11 changed files with 84 additions and 88 deletions
--- a/Algo256/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@ -724,14 +724,13 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
 __host__
 void fugue256_cpu_init(int thr_id, uint32_t threads)
 {
-	// Kopiere die Hash-Tabellen in den GPU-Speicher
+	// Link the hash tables in the GPU
 	texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
 	texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
 	texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
 	texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
-	// Speicher für alle Ergebnisse belegen
+	CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads));
 	cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
@ -741,11 +740,6 @@ void fugue256_cpu_free(int thr_id)
 	cudaFree(d_fugue256_hashoutput[thr_id]);
 	cudaFree(d_resultNonce[thr_id]);
 	cudaUnbindTexture(mixTab0Tex);
 	cudaUnbindTexture(mixTab1Tex);
 	cudaUnbindTexture(mixTab2Tex);
 	cudaUnbindTexture(mixTab3Tex);
 	for (int i=0; i<4; i++)
 		cudaFree(d_textures[thr_id][i]);
 }
@ -753,20 +747,18 @@ void fugue256_cpu_free(int thr_id)
 __host__
 void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
 	// CPU-Vorbereitungen treffen
 	sph_fugue256_context ctx_fugue_const;
 	sph_fugue256_init(&ctx_fugue_const);
-	sph_fugue256 (&ctx_fugue_const, data, 80);	// State speichern
+	sph_fugue256 (&ctx_fugue_const, data, 80);
 	cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);
-	cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8);
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
 }
 __host__
-void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
+void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
 #if USE_SHARED
 	const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@ -283,15 +283,6 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 __host__
 void groestl256_cpu_free(int thr_id)
 {
 	cudaUnbindTexture(t0up2);
 	cudaUnbindTexture(t0dn2);
 	cudaUnbindTexture(t1up2);
 	cudaUnbindTexture(t1dn2);
 	cudaUnbindTexture(t2up2);
 	cudaUnbindTexture(t2dn2);
 	cudaUnbindTexture(t3up2);
 	cudaUnbindTexture(t3dn2);
 	for (int i=0; i<8; i++)
 		cudaFree(d_textures[thr_id][i]);
--- a/README.txt
+++ b/README.txt
@ -228,7 +228,7 @@ features.
 >>> RELEASE HISTORY <<<
-  Nov. 02nd 2015  v1.7
+  Nov. 05th 2015  v1.7
                  Improve old devices compatibility (x11, lyra2, qubit...)
                  Add windows support for SM 2.1 and drop SM 3.5 (x86)
                  Improve lyra2 (v1/v2) cuda implementations
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -2793,12 +2793,13 @@ void parse_arg(int key, char *arg)
 		if (p) d *= 1e9;
 		opt_max_rate = d;
 		break;
-	case 'd': // CB
+	case 'd': // --device
 		{
 			int device_thr[MAX_GPUS] = { 0 };
 			int ngpus = cuda_num_devices();
 			char * pch = strtok (arg,",");
 			opt_n_threads = 0;
-			while (pch != NULL) {
+			while (pch != NULL && opt_n_threads < MAX_GPUS) {
 				if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
 				{
 					if (atoi(pch) < ngpus)
@ -2818,6 +2819,14 @@ void parse_arg(int key, char *arg)
 				}
 				pch = strtok (NULL, ",");
 			}
 			// count threads per gpu
 			for (int n=0; n < opt_n_threads; n++) {
 				int device = device_map[n];
 				device_thr[device]++;
 			}
 			for (int n=0; n < ngpus; n++) {
 				gpu_threads = max(gpu_threads, device_thr[n]);
 			}
 		}
 		break;
@ -3177,8 +3186,8 @@ int main(int argc, char *argv[])
 	else if (active_gpus > opt_n_threads)
 		active_gpus = opt_n_threads;
-	// generally doesn't work... let 1
+	// generally doesn't work well...
-	gpu_threads = opt_n_threads / active_gpus;
+	gpu_threads = max(gpu_threads, opt_n_threads / active_gpus);
 	if (opt_benchmark && opt_algo == ALGO_AUTO) {
 		bench_init(opt_n_threads);
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@ -11,23 +11,27 @@
 __constant__ uint32_t pTarget[8]; // 32 bytes
 // store MAX_GPUS device arrays of 8 nonces
-static uint32_t* h_resNonces[MAX_GPUS];
+static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
-static uint32_t* d_resNonces[MAX_GPUS];
+static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
-static bool init_done = false;
+static __thread bool init_done = false;
 __host__
 void cuda_check_cpu_init(int thr_id, uint32_t threads)
 {
    CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 32));
    CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
    CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
    init_done = true;
 }
 __host__
 void cuda_check_cpu_free(int thr_id)
 {
 	if (!init_done) return;
 	cudaFree(d_resNonces[thr_id]);
 	cudaFreeHost(h_resNonces[thr_id]);
 	d_resNonces[thr_id] = NULL;
 	h_resNonces[thr_id] = NULL;
 	init_done = false;
 }
 // Target Difficulty
@ -198,7 +202,7 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc
 	cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
 	cudaThreadSynchronize();
-	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
 	rescnt = h_resNonces[thr_id][0];
 	if (rescnt > numNonce) {
 		if (numNonce <= rescnt) {
--- a/cuda_fugue256.h
+++ b/cuda_fugue256.h
@ -1,7 +1,7 @@
-#ifndef _CUDA_FUGUE512_H
+#ifndef _CUDA_FUGUE256_H
-#define _CUDA_FUGUE512_H
+#define _CUDA_FUGUE256_H
-void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
+void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
 void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
 void fugue256_cpu_init(int thr_id, uint32_t threads);
 void fugue256_cpu_free(int thr_id);
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@ -8,14 +8,6 @@
 #include "cuda_fugue256.h"
 extern "C" void my_fugue256_init(void *cc);
 extern "C" void my_fugue256(void *cc, const void *data, size_t len);
 extern "C" void my_fugue256_close(void *cc, void *dst);
 extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
 // vorbereitete Kontexte nach den ersten 80 Bytes
 // sph_fugue256_context  ctx_fugue_const[MAX_GPUS];
 #define SWAP32(x) \
    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u)   | \
      (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
@ -38,11 +30,11 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 	uint32_t *ptarget = work->target;
 	uint32_t start_nonce = pdata[19]++;
 	int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
-	uint32_t throughput =  cuda_default_throughput(thr_id, 1U << intensity); // 256*256*8
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0xf;
+		ptarget[7] = 0xf;
 	// init
 	if(!init[thr_id])
@ -57,7 +49,6 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 	for (int kk=0; kk < 20; kk++)
 		be32enc(&endiandata[kk], pdata[kk]);
 	// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
 	fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
 	do {
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@ -114,11 +114,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
 		if (device_sm[dev_id] < 300) {
-			applog(LOG_ERR, "Device SM 3.0 or more recent required!");
+			gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
 			proper_exit(1);
 			return -1;
 		}
 		api_set_throughput(thr_id, throughput);
 		init[thr_id] = true;
 	}
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@ -1,3 +1,7 @@
 /*
 * REQUIRE SM 3.0 arch!
 */
 #include <stdio.h>
 #include <memory.h>
@ -10,6 +14,14 @@ static uint32_t *h_numValid[MAX_GPUS];
 static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen
 #if __CUDA_ARCH__ < 300
 /**
 * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
 */
 #undef __shfl_up
 #define __shfl_up(var, delta, width) (0)
 #endif
 // True/False tester
 typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
@ -28,7 +40,8 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua
 cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];
 // Setup/Alloc Function
-__host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
+__host__
 void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
@ -46,7 +59,8 @@ __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
 }
 // Because all alloc should have a free...
-__host__ void quark_compactTest_cpu_free(int thr_id)
+__host__
 void quark_compactTest_cpu_free(int thr_id)
 {
 	cudaFree(d_tempBranch1Nonces[thr_id]);
 	cudaFree(d_numValid[thr_id]);
@ -57,16 +71,9 @@ __host__ void quark_compactTest_cpu_free(int thr_id)
 	cudaFreeHost(h_numValid[thr_id]);
 }
-#if __CUDA_ARCH__ < 300
+__global__
-/**
+void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL,
- * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
+	uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 */
 #undef __shfl_up
 #define __shfl_up(var, delta, width) (0)
 #endif
 // Die Summenfunktion (vom NVIDIA SDK)
 __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	extern __shared__ uint32_t sums[];
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@ -91,19 +98,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 			{
 				// keine Nonce-Liste
 				inpHash = &inpHashes[id<<4];
-			}else
+			} else {
 			{
 				// Nonce-Liste verfügbar
 				int nonce = d_validNonceTable[id] - startNounce;
 				inpHash = &inpHashes[nonce<<4];
 			}			
 			value = (*testFunc)(inpHash);
-		}else
+		} else {
 		{
 			value = 0;
 		}
-	}else
+	} else {
 	{
 		value = data[id];
 	}
@ -115,8 +119,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 	// those threads where the thread 'i' away would have
 	// been out of bounds of the warp are unaffected.  This
 	// creates the scan sum.
 #pragma unroll
 	#pragma unroll
 	for (int i=1; i<=width; i*=2)
 	{
 		uint32_t n = __shfl_up((int)value, i, width);
@ -147,8 +151,7 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 		for (int i=1; i<=width; i*=2)
 		{
 			uint32_t n = __shfl_up((int)warp_sum, i, width);
-
+			if (lane_id >= i) warp_sum += n;
 		if (lane_id >= i) warp_sum += n;
 		}
 		sums[lane_id] = warp_sum;
@ -178,7 +181,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 }
 // Uniform add: add partial sums array
-__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
+__global__
 void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
 {
 	__shared__ uint32_t buf;
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@ -195,7 +199,8 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums
 }
 // Der Scatter
-__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+__global__
 void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
 	uint32_t actNounce = id;
@ -244,9 +249,9 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
 	return mask;
 }
-__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
+__host__
-														uint32_t *d_nonces1, cuda_compactTestFunction_t function,
+void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,uint32_t *d_nonces1,
-														uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+	cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
 	int orgThreads = threads;
 	threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
@ -300,9 +305,9 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread
 }
 ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
-__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
+__host__
-													 uint32_t *d_nonces1, uint32_t *d_nonces2,
+void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
-													 uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+	 uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
 	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
 	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
@ -339,10 +344,9 @@ __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads,
 	*/
 }
-__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+__host__
-											uint32_t *d_nonces1, uint32_t *nrm1,
+void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											uint32_t *d_nonces2, uint32_t *nrm2,
+	uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order)
 											int order)
 {
 	// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
 	// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
@ -356,9 +360,9 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32
 	*nrm2 = h_numValid[thr_id][1];
 }
-__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+__host__
-											uint32_t *d_nonces1, uint32_t *nrm1,
+void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											int order)
+	uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, int order)
 {
 	// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
 	// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -121,6 +121,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	int dev_id = device_map[thr_id];
 	uint32_t throughput =  cuda_default_throughput(thr_id, 1U << 20); // 256*4096
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -132,6 +133,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 	{
 		cudaSetDevice(device_map[thr_id]);
 		cudaGetLastError();
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 		quark_blake512_cpu_init(thr_id, throughput);
@ -146,6 +148,13 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
 		CUDA_SAFE_CALL(cudaGetLastError());
 		if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
 			gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
 			proper_exit(1);
 			return -1;
 		}
 		init[thr_id] = true;
 	}
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@ -685,11 +685,6 @@ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
 __host__
 void x13_fugue512_cpu_free(int thr_id)
 {
 	cudaUnbindTexture(mixTab0Tex);
 	cudaUnbindTexture(mixTab1Tex);
 	cudaUnbindTexture(mixTab2Tex);
 	cudaUnbindTexture(mixTab3Tex);
 	for (int i=0; i<4; i++)
 		cudaFree(d_textures[thr_id][i]);
 }