From c2214091aee94a2afff763f6e1259b603e7eea19 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 10 Oct 2015 02:12:43 +0200
Subject: [PATCH] benchmark: free last memory leaks on algo switch

remains my original lyra2 implementation to fix... (cuda_lyra2.cu)

I guess some kind of memory overflow force the driver to allocate
memory... but was unable to free it without device reset.
---
 Algo256/cuda_fugue256.cu   | 35 +++++++++++++++++++++++------------
 Algo256/cuda_groestl256.cu | 25 +++++++++++++++++--------
 ccminer.cpp                |  1 +
 scrypt-jane.cpp            |  3 +++
 scrypt.cpp                 |  3 +++
 skein.cu                   |  6 ++++--
 util.cpp                   | 27 +++++++--------------------
 x13/cuda_x13_fugue512.cu   | 13 +++++++++----
 x15/cuda_x15_whirlpool.cu  |  4 ++--
 9 files changed, 69 insertions(+), 48 deletions(-)
diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu
index 66d1c35..a5b6628 100644
--- a/Algo256/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@@ -10,14 +10,15 @@
 
 uint32_t *d_fugue256_hashoutput[MAX_GPUS];
 static uint32_t *d_resultNonce[MAX_GPUS];
+static unsigned int* d_textures[MAX_GPUS][8];
 
 __constant__ uint32_t GPUstate[30]; // Single GPU
 __constant__ uint32_t pTarget[8]; // Single GPU
 
-texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
-texture<unsigned int, 1, cudaReadModeElementType> mixTab1Tex;
-texture<unsigned int, 1, cudaReadModeElementType> mixTab2Tex;
-texture<unsigned int, 1, cudaReadModeElementType> mixTab3Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab1Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab2Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab3Tex;
 
 #if USE_SHARED
 #define mixtab0(x) (*((uint32_t*)mixtabs + (    (x))))
@@ -707,28 +708,30 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
 	}
 }
 
-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
-
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
 
 __host__
 void fugue256_cpu_init(int thr_id, uint32_t threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
-	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
+	texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
+	texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
+	texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
+	texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
 
 	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
 
@@ -737,6 +740,14 @@ void fugue256_cpu_free(int thr_id)
 {
 	cudaFree(d_fugue256_hashoutput[thr_id]);
 	cudaFree(d_resultNonce[thr_id]);
+
+	cudaUnbindTexture(mixTab0Tex);
+	cudaUnbindTexture(mixTab1Tex);
+	cudaUnbindTexture(mixTab2Tex);
+	cudaUnbindTexture(mixTab3Tex);
+
+	for (int i=0; i<4; i++)
+		cudaFree(d_textures[thr_id][i]);
 }
 
 __host__
diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu
index b14a0f2..f65b5c2 100644
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@@ -67,14 +67,14 @@ __constant__ uint32_t pTarget[8];
 	#define T3dn(x) tex1Dfetch(t3dn2, x)
 #endif
 
-texture<unsigned int, 1, cudaReadModeElementType> t0up2;
-texture<unsigned int, 1, cudaReadModeElementType> t0dn2;
-texture<unsigned int, 1, cudaReadModeElementType> t1up2;
-texture<unsigned int, 1, cudaReadModeElementType> t1dn2;
-texture<unsigned int, 1, cudaReadModeElementType> t2up2;
-texture<unsigned int, 1, cudaReadModeElementType> t2dn2;
-texture<unsigned int, 1, cudaReadModeElementType> t3up2;
-texture<unsigned int, 1, cudaReadModeElementType> t3dn2;
+static texture<unsigned int, 1, cudaReadModeElementType> t0up2;
+static texture<unsigned int, 1, cudaReadModeElementType> t0dn2;
+static texture<unsigned int, 1, cudaReadModeElementType> t1up2;
+static texture<unsigned int, 1, cudaReadModeElementType> t1dn2;
+static texture<unsigned int, 1, cudaReadModeElementType> t2up2;
+static texture<unsigned int, 1, cudaReadModeElementType> t2dn2;
+static texture<unsigned int, 1, cudaReadModeElementType> t3up2;
+static texture<unsigned int, 1, cudaReadModeElementType> t3dn2;
 
 #define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
 	t[d0] = T0up(B32_0(a[b0])) \
@@ -283,6 +283,15 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 __host__
 void groestl256_cpu_free(int thr_id)
 {
+	cudaUnbindTexture(t0up2);
+	cudaUnbindTexture(t0dn2);
+	cudaUnbindTexture(t1up2);
+	cudaUnbindTexture(t1dn2);
+	cudaUnbindTexture(t2up2);
+	cudaUnbindTexture(t2dn2);
+	cudaUnbindTexture(t3up2);
+	cudaUnbindTexture(t3dn2);
+
 	for (int i=0; i<8; i++)
 		cudaFree(d_textures[thr_id][i]);
 
diff --git a/ccminer.cpp b/ccminer.cpp
index eda3a35..86b0f68 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -1632,6 +1632,7 @@ bool algo_switch_next(int thr_id)
 
 	stats_purge_all();
 	global_hashrate = 0;
+	thr_hashrates[thr_id] = 0; // reset for minmax64
 
 	if (thr_id == 0)
 		applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);
diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
index 8023fe9..4620fdf 100644
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -433,6 +433,9 @@ void free_scrypt_jane(int thr_id)
 {
 	int dev_id = device_map[thr_id];
 
+	if (!init[thr_id])
+		return;
+
 	cudaSetDevice(dev_id);
 	cudaDeviceSynchronize();
 	cudaDeviceReset(); // well, simple way to free ;)
diff --git a/scrypt.cpp b/scrypt.cpp
index e447583..9483cd7 100644
--- a/scrypt.cpp
+++ b/scrypt.cpp
@@ -692,6 +692,9 @@ void free_scrypt(int thr_id)
 {
 	int dev_id = device_map[thr_id];
 
+	if (!init[thr_id])
+		return;
+
 	// trivial way to free all...
 	cudaSetDevice(dev_id);
 	cudaDeviceSynchronize();
diff --git a/skein.cu b/skein.cu
index 8577e2f..b7b46ec 100644
--- a/skein.cu
+++ b/skein.cu
@@ -376,7 +376,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 		if (sm5) {
 			skeincoin_init(thr_id);
 		} else {
-			cudaMalloc(&d_hash[thr_id], throughput * 64U);
+			cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);
 			quark_skein512_cpu_init(thr_id, throughput);
 			cuda_check_cpu_init(thr_id, throughput);
 			CUDA_SAFE_CALL(cudaDeviceSynchronize());
@@ -475,7 +475,9 @@ extern "C" void free_skeincoin(int thr_id)
 
 	cudaSetDevice(device_map[thr_id]);
 
-	if (!sm5) {
+	if (sm5)
+		skeincoin_free(thr_id);
+	else {
 		cudaFree(d_hash[thr_id]);
 		cuda_check_cpu_free(thr_id);
 	}
diff --git a/util.cpp b/util.cpp
index f721f24..13d1735 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1829,33 +1829,20 @@ void do_gpu_tests(void)
 	unsigned long done;
 	char s[128] = { '\0' };
 	uchar buf[160];
-	uint32_t tgt[8] = { 0 };
+	struct work work;
+	memset(&work, 0, sizeof(work));
 
 	opt_tracegpu = true;
 	work_restart = (struct work_restart*) malloc(sizeof(struct work_restart));
 	work_restart[0].restart = 1;
-	tgt[7] = 0xffff;
-
-	//memset(buf, 0, sizeof buf);
-	//scanhash_skeincoin(0, (uint32_t*)buf, tgt, 1, &done);
-
-	//memset(buf, 0, sizeof buf);
-	//memcpy(buf, zrtest, 80);
-	//scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);
+	work.target[7] = 0xffff;
 
 	//struct timeval tv;
-	//memset(buf, 0, sizeof buf);
-	//scanhash_scrypt_jane(0, (uint32_t*)buf, tgt, NULL, 1, &done, &tv, &tv);
-
-	memset(buf, 0, sizeof buf);
-	scanhash_lyra2(0, (uint32_t*)buf, tgt, 1, &done);
-
-	//memset(buf, 0, sizeof buf);
-	// buf[0] = 1; buf[64] = 2; // for endian tests
-	//scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
+	//memset(work.data, 0, sizeof(work.data));
+	//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
 
-	//memset(buf, 0, sizeof buf);
-	//scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
+	memset(work.data, 0, sizeof(work.data));
+	scanhash_lyra2(0, &work, 1, &done);
 
 	free(work_restart);
 	work_restart = NULL;
diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu
index d243935..1f3cbcb 100644
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@@ -46,10 +46,10 @@ static unsigned int* d_textures[MAX_GPUS][4];
 #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
 #define mixtab3(x) (*((uint32_t*)mixtabs + (768+(x))))
 
-texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
-texture<unsigned int, 1, cudaReadModeElementType> mixTab1Tex;
-texture<unsigned int, 1, cudaReadModeElementType> mixTab2Tex;
-texture<unsigned int, 1, cudaReadModeElementType> mixTab3Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab1Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab2Tex;
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab3Tex;
 
 static const uint32_t mixtab0_cpu[] = {
 	SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7),
@@ -685,6 +685,11 @@ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
 __host__
 void x13_fugue512_cpu_free(int thr_id)
 {
+	cudaUnbindTexture(mixTab0Tex);
+	cudaUnbindTexture(mixTab1Tex);
+	cudaUnbindTexture(mixTab2Tex);
+	cudaUnbindTexture(mixTab3Tex);
+
 	for (int i=0; i<4; i++)
 		cudaFree(d_textures[thr_id][i]);
 }
diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu
index c6c98fd..49f6867 100644
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@@ -14,8 +14,8 @@
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 __constant__ uint32_t pTarget[8];
 
-uint32_t *d_wnounce[MAX_GPUS];
-uint32_t *d_WNonce[MAX_GPUS];
+static uint32_t *d_wnounce[MAX_GPUS];
+static uint32_t *d_WNonce[MAX_GPUS];
 
 #define USE_ALL_TABLES 1