From 922c2a5cd7cf1a153995b4d7b3bdec0ed8a5bb3a Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 8 Oct 2015 21:31:16 +0200
Subject: [PATCH] algos: free allocated mem for algo switch

All can be freed propertly now, except script (reset) and lyra2 (leak)
---
 Algo256/bmw.cu                      |  4 ++-
 Algo256/cuda_bmw.cu                 |  6 ++++
 Algo256/cuda_groestl256.cu          | 27 +++++++++++-------
 JHA/cuda_jha_compactionTest.cu      | 17 ++++++++++--
 JHA/jackpotcoin.cu                  |  6 +++-
 cuda_nist5.cu                       |  2 ++
 cuda_skeincoin.cu                   |  7 ++++-
 heavy/cuda_blake512.cu              | 16 ++++++++---
 heavy/cuda_combine.cu               |  8 +++++-
 heavy/cuda_groestl512.cu            | 43 +++++++++++++++++++----------
 heavy/cuda_hefty1.cu                | 11 ++++++--
 heavy/cuda_keccak512.cu             | 10 +++++--
 heavy/cuda_sha256.cu                | 13 +++++++--
 heavy/heavy.cu                      | 10 +++++--
 heavy/heavy.h                       |  6 ++++
 lyra2/lyra2RE.cu                    |  3 ++
 lyra2/lyra2REv2.cu                  |  3 ++
 neoscrypt/cuda_neoscrypt.cu         |  2 +-
 pentablake.cu                       |  4 +--
 quark/cuda_quark_groestl512.cu      |  8 ++++++
 quark/cuda_quark_groestl512_sm20.cu | 32 ++++++++++++++-------
 quark/quarkcoin.cu                  |  2 ++
 qubit/qubit.cu                      |  9 +++---
 scrypt-jane.cpp                     | 24 ++++++++++++++--
 scrypt.cpp                          | 27 +++++++++++++++---
 scrypt/blake.cu                     | 14 ++++++++--
 scrypt/salsa_kernel.cu              |  5 ++--
 skein.cu                            | 10 +++++--
 skein2.cpp                          | 12 ++++----
 x11/c11.cu                          |  4 +++
 x11/cuda_x11_simd512.cu             |  7 +++++
 x11/fresh.cu                        |  2 ++
 x11/s3.cu                           |  2 ++
 x11/x11.cu                          |  5 ++++
 x13/cuda_x13_fugue512.cu            | 31 +++++++++++++++------
 x13/x13.cu                          |  8 +++++-
 x15/x14.cu                          | 14 ++++++++--
 x15/x15.cu                          |  7 +++++
 x17/x17.cu                          |  7 +++++
 zr5.cu                              |  2 ++
 40 files changed, 337 insertions(+), 93 deletions(-)

diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu
index 2951482..9b91252 100644
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@@ -13,6 +13,7 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void bmw256_midstate_init(int thr_id, uint32_t threads);
+extern void bmw256_midstate_free(int thr_id);
 extern void bmw256_setBlock_80(int thr_id, void *pdata);
 extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap);
 
@@ -111,8 +112,9 @@ extern "C" void free_bmw(int thr_id)
 	cudaSetDevice(device_map[thr_id]);
 
 	cudaFree(d_hash[thr_id]);
-
+	bmw256_midstate_free(thr_id);
 	cuda_check_cpu_free(thr_id);
+
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
diff --git a/Algo256/cuda_bmw.cu b/Algo256/cuda_bmw.cu
index b7a4f99..c4638b9 100644
--- a/Algo256/cuda_bmw.cu
+++ b/Algo256/cuda_bmw.cu
@@ -372,3 +372,9 @@ void bmw256_midstate_init(int thr_id, uint32_t threads)
 {
 	cudaMalloc(&d_midstate[thr_id], sizeof(sph_bmw256_context));
 }
+
+__host__
+void bmw256_midstate_free(int thr_id)
+{
+	cudaFree(d_midstate[thr_id]);
+}
diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu
index b1ffa4f..b14a0f2 100644
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@@ -7,6 +7,7 @@
 
 static uint32_t *h_GNonces[MAX_GPUS];
 static uint32_t *d_GNonces[MAX_GPUS];
+static unsigned int* d_textures[MAX_GPUS][8];
 
 __constant__ uint32_t pTarget[8];
 
@@ -249,28 +250,31 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *out
 	}
 }
 
-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
 
 __host__
 void groestl256_cpu_init(int thr_id, uint32_t threads)
 {
 	// Texturen mit obigem Makro initialisieren
-	texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
-	texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
-	texDef(t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
-	texDef(t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
-	texDef(t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
-	texDef(t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
-	texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
-	texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
+	texDef(0, t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
+	texDef(1, t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
+	texDef(2, t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
+	texDef(3, t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
+	texDef(4, t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
+	texDef(5, t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
+	texDef(6, t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
+	texDef(7, t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
 
 	cudaMalloc(&d_GNonces[thr_id], 2*sizeof(uint32_t));
 	cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t));
@@ -279,6 +283,9 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 __host__
 void groestl256_cpu_free(int thr_id)
 {
+	for (int i=0; i<8; i++)
+		cudaFree(d_textures[thr_id][i]);
+
 	cudaFree(d_GNonces[thr_id]);
 	cudaFreeHost(h_GNonces[thr_id]);
 }
diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu
index a1d1ec3..35a13b7 100644
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@@ -33,8 +33,9 @@ __device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d
 
 cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFunction[MAX_GPUS];
 
-// Setup-Funktionen
-__host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
+// Setup-Function
+__host__
+void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
@@ -51,6 +52,18 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }
 
+__host__
+void jackpot_compactTest_cpu_free(int thr_id)
+{
+	cudaFree(d_tempBranch1Nonces[thr_id]);
+	cudaFree(d_numValid[thr_id]);
+
+	cudaFree(d_partSum[0][thr_id]);
+	cudaFree(d_partSum[1][thr_id]);
+
+	cudaFreeHost(h_numValid[thr_id]);
+}
+
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
  * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 674a388..558233e 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -21,6 +21,7 @@ extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -29,6 +30,7 @@ extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads);
+extern void jackpot_compactTest_cpu_free(int thr_id);
 extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, 
 											uint32_t *d_nonces1, uint32_t *nrm1,
 											uint32_t *d_nonces2, uint32_t *nrm2,
@@ -264,9 +266,11 @@ extern "C" void free_jackpot(int thr_id)
 	cudaFree(d_branch1Nonces[thr_id]);
 	cudaFree(d_branch2Nonces[thr_id]);
 	cudaFree(d_branch3Nonces[thr_id]);
-
 	cudaFree(d_jackpotNonces[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
+	jackpot_compactTest_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index c4c390d..44bfe60 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -19,6 +19,7 @@ extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t st
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -173,6 +174,7 @@ extern "C" void free_nist5(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
diff --git a/cuda_skeincoin.cu b/cuda_skeincoin.cu
index 5a6e8a0..8c91c5e 100644
--- a/cuda_skeincoin.cu
+++ b/cuda_skeincoin.cu
@@ -708,6 +708,12 @@ __host__
 void skeincoin_init(int thr_id)
 {
 	cuda_get_arch(thr_id);
+	CUDA_SAFE_CALL(cudaMalloc(&d_found[thr_id], 2 * sizeof(uint32_t)));
+}
+
+__host__
+void skeincoin_free(int thr_id) {
+	cudaFree(d_found[thr_id]);
 }
 
 __host__
@@ -716,7 +722,6 @@ void skeincoin_setBlock_80(int thr_id, void *pdata)
 	uint64_t message[16];
 	memcpy(&message[0], pdata, 80);
 
-	CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t)));
 	cudaMemcpyToSymbol(c_message16, &message[8], 16, 0, cudaMemcpyHostToDevice);
 
 	precalc(message);
diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu
index 22f413b..f3d22fa 100644
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@@ -191,8 +191,9 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(uint32_t threads, uin
 
 // ---------------------------- END CUDA blake512 functions ------------------------------------
 
-// Setup-Funktionen
-__host__ void blake512_cpu_init(int thr_id, uint32_t threads)
+// Setup Function
+__host__
+void blake512_cpu_init(int thr_id, uint32_t threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	cudaMemcpyToSymbol( c_sigma,
@@ -211,12 +212,19 @@ __host__ void blake512_cpu_init(int thr_id, uint32_t threads)
 						0, cudaMemcpyHostToDevice);
 
 	// Speicher für alle Ergebnisse belegen
-	CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
+	CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], (size_t) 64 * threads));
+}
+
+__host__
+void blake512_cpu_free(int thr_id)
+{
+	cudaFree(d_hash5output[thr_id]);
 }
 
 static int BLOCKSIZE = 84;
 
-__host__ void blake512_cpu_setBlock(void *pdata, int len)
+__host__
+void blake512_cpu_setBlock(void *pdata, int len)
 	// data muss 84-Byte haben!
 	// heftyHash hat 32-Byte
 {
diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu
index 3365cf1..64a9561 100644
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@@ -124,7 +124,13 @@ __host__
 void combine_cpu_init(int thr_id, uint32_t threads)
 {
 	// Speicher für alle Ergebnisse belegen
-	CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
+	CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], (size_t) 32 * threads));
+}
+
+__host__
+void combine_cpu_free(int thr_id)
+{
+	cudaFree(d_hashoutput[thr_id]);
 }
 
 __host__
diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu
index 0e503d3..ef4c2c1 100644
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@@ -9,6 +9,7 @@
 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *heavy_heftyHashes[MAX_GPUS];
 extern uint32_t *heavy_nonceVector[MAX_GPUS];
+static unsigned int *d_textures[MAX_GPUS][8];
 
 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash4output[MAX_GPUS];
@@ -730,36 +731,50 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(uint32_t threads, u
 	}
 }
 
-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
 
-// Setup-Funktionen
-__host__ void groestl512_cpu_init(int thr_id, uint32_t threads)
+// Setup Function
+__host__
+void groestl512_cpu_init(int thr_id, uint32_t threads)
 {
 	// Texturen mit obigem Makro initialisieren
-	texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
-	texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
-	texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
-	texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
-	texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
-	texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
-	texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
-	texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+	texDef(0, t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
+	texDef(1, t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
+	texDef(2, t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
+	texDef(3, t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
+	texDef(4, t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
+	texDef(5, t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
+	texDef(6, t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
+	texDef(7, t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 
 	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_hash4output[thr_id], (size_t) 64 * threads);
+}
+
+__host__
+void groestl512_cpu_free(int thr_id)
+{
+	for (int i=0; i <8; i++)
+		cudaFree(d_textures[thr_id][i]);
+
+	cudaFree(d_hash4output[thr_id]);
 }
 
 static int BLOCKSIZE = 84;
 
-__host__ void groestl512_cpu_setBlock(void *data, int len)
+__host__
+void groestl512_cpu_setBlock(void *data, int len)
 	// data muss 80/84-Byte haben!
 	// heftyHash hat 32-Byte
 {
diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu
index d8a65a8..4c83441 100644
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@@ -31,7 +31,8 @@ uint32_t hefty_cpu_hashTable[] = {
     0x510e527fUL,
     0x9b05688cUL,
     0x1f83d9abUL,
-    0x5be0cd19UL };
+    0x5be0cd19UL
+};
 
 uint32_t hefty_cpu_constantTable[] = {
     0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
@@ -316,7 +317,13 @@ void hefty_cpu_init(int thr_id, uint32_t threads)
                         sizeof(uint32_t) * 64 );
 
     // Speicher für alle Hefty1 hashes belegen
-    CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));
+    CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], (size_t) 32 * threads));
+}
+
+__host__
+void hefty_cpu_free(int thr_id)
+{
+    cudaFree(heavy_heftyHashes[thr_id]);
 }
 
 __host__
diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu
index 7415848..3e94f16 100644
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@@ -185,7 +185,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(uint32_t threads, ui
 
 // ---------------------------- END CUDA keccak512 functions ------------------------------------
 
-__host__ 
+__host__
 void keccak512_cpu_init(int thr_id, uint32_t threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
@@ -195,7 +195,13 @@ void keccak512_cpu_init(int thr_id, uint32_t threads)
 						0, cudaMemcpyHostToDevice);
 
 	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_hash3output[thr_id], (size_t) 64 * threads);
+}
+
+__host__
+void keccak512_cpu_free(int thr_id)
+{
+	cudaFree(d_hash3output[thr_id]);
 }
 
 // ----------------BEGIN keccak512 CPU version from scrypt-jane code --------------------
diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu
index d0305da..7441621 100644
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@@ -160,8 +160,9 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(uint32_t threads, uint3
 	}
 }
 
-// Setup-Funktionen
-__host__ void sha256_cpu_init(int thr_id, uint32_t threads)
+// Setup Function
+__host__
+void sha256_cpu_init(int thr_id, uint32_t threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	cudaMemcpyToSymbol(	sha256_gpu_constantTable,
@@ -169,7 +170,13 @@ __host__ void sha256_cpu_init(int thr_id, uint32_t threads)
 						sizeof(uint32_t) * 64 );
 
 	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_hash2output[thr_id], (size_t) 8 * sizeof(uint32_t) * threads);
+}
+
+__host__
+void sha256_cpu_free(int thr_id)
+{
+	cudaFree(d_hash2output[thr_id]);
 }
 
 static int BLOCKSIZE = 84;
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index 605c81f..37eb8b8 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -178,7 +178,7 @@ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned l
     {
         uint16_t *ext = (uint16_t *)&pdata[20];
 
-        if (opt_vote > maxvote) {
+        if (opt_vote > maxvote && !opt_benchmark) {
             applog(LOG_WARNING, "Your block reward vote (%hu) exceeds "
                     "the maxvote reported by the pool (%hu).",
                     opt_vote, maxvote);
@@ -310,12 +310,18 @@ extern "C" void free_heavy(int thr_id)
 
     cudaFree(heavy_nonceVector[thr_id]);
 
-    // todo: free sub algos vectors
+    blake512_cpu_free(thr_id);
+    groestl512_cpu_free(thr_id);
+    hefty_cpu_free(thr_id);
+    keccak512_cpu_free(thr_id);
+    sha256_cpu_free(thr_id);
+    combine_cpu_free(thr_id);
 
     init[thr_id] = false;
 
     cudaDeviceSynchronize();
 }
+
 __host__
 void heavycoin_hash(uchar* output, const uchar* input, int len)
 {
diff --git a/heavy/heavy.h b/heavy/heavy.h
index 59f3913..50affc4 100644
--- a/heavy/heavy.h
+++ b/heavy/heavy.h
@@ -4,27 +4,33 @@
 void blake512_cpu_init(int thr_id, uint32_t threads);
 void blake512_cpu_setBlock(void *pdata, int len);
 void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
+void blake512_cpu_free(int thr_id);
 
 void groestl512_cpu_init(int thr_id, uint32_t threads);
 void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
 void groestl512_cpu_setBlock(void *data, int len);
 void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
+void groestl512_cpu_free(int thr_id);
 
 void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce);
 void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len);
 void hefty_cpu_init(int thr_id, uint32_t threads);
+void hefty_cpu_free(int thr_id);
 
 void keccak512_cpu_init(int thr_id, uint32_t threads);
 void keccak512_cpu_setBlock(void *data, int len);
 void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
 void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
+void keccak512_cpu_free(int thr_id);
 
 void sha256_cpu_init(int thr_id, uint32_t threads);
 void sha256_cpu_setBlock(void *data, int len);
 void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce);
 void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
+void sha256_cpu_free(int thr_id);
 
 void combine_cpu_init(int thr_id, uint32_t threads);
 void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash);
+void combine_cpu_free(int thr_id);
 
 #endif
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index 8475eb4..5a9cb71 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -17,6 +17,7 @@ extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
 extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_cpu_free(int thr_id);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 
@@ -174,7 +175,9 @@ extern "C" void free_lyra2(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	keccak256_cpu_free(thr_id);
 	groestl256_cpu_free(thr_id);
+
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index f327b46..15327ec 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -19,6 +19,7 @@ extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
 extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_cpu_free(int thr_id);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
@@ -183,6 +184,8 @@ extern "C" void free_lyra2v2(int thr_id)
 	cudaFree(d_matrix[thr_id]);
 
 	bmw256_cpu_free(thr_id);
+	keccak256_cpu_free(thr_id);
+
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu
index 79d14d6..e7f4b21 100644
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@@ -730,7 +730,7 @@ void neoscrypt_cpu_init(int thr_id, uint32_t threads)
 {
 	cuda_get_arch(thr_id);
 	cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t));
-	CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], threads * 256 * SHIFT));
+	CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], (size_t) 256 * SHIFT * threads));
 	cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
 }
diff --git a/pentablake.cu b/pentablake.cu
index ca909b5..5d19a9c 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -381,7 +381,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 		if (active_gpus > 1) {
 			cudaSetDevice(device_map[thr_id]);
 		}
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
 		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
 
@@ -452,7 +452,7 @@ void free_pentablake(int thr_id)
 	cudaSetDevice(device_map[thr_id]);
 
 	cudaFree(d_hash[thr_id]);
-	cudaFree(h_resNounce[thr_id]);
+	cudaFreeHost(h_resNounce[thr_id]);
 	cudaFree(d_resNounce[thr_id]);
 
 	init[thr_id] = false;
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index 2a946b9..456b009 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -141,6 +141,14 @@ void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
         quark_groestl512_sm20_init(thr_id, threads);
 }
 
+__host__
+void quark_groestl512_cpu_free(int thr_id)
+{
+    int dev_id = device_map[thr_id];
+    if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300)
+        quark_groestl512_sm20_free(thr_id);
+}
+
 __host__
 void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
diff --git a/quark/cuda_quark_groestl512_sm20.cu b/quark/cuda_quark_groestl512_sm20.cu
index ea710c8..f368594 100644
--- a/quark/cuda_quark_groestl512_sm20.cu
+++ b/quark/cuda_quark_groestl512_sm20.cu
@@ -5,6 +5,8 @@
 #define MAXWELL_OR_FERMI 0
 #define USE_SHARED 1
 
+static unsigned int *d_textures[MAX_GPUS][8];
+
 // #define SPH_C32(x)    ((uint32_t)(x ## U))
 // #define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
 
@@ -274,28 +276,38 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32
 #endif
 }
 
-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
 
 __host__
 void quark_groestl512_sm20_init(int thr_id, uint32_t threads)
 {
 	// Texturen mit obigem Makro initialisieren
-	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
-	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
-	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
-	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
-	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
-	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
-	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
-	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+	texDef(0, t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
+	texDef(1, t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
+	texDef(2, t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
+	texDef(3, t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
+	texDef(4, t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
+	texDef(5, t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
+	texDef(6, t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
+	texDef(7, t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+}
+
+__host__
+void quark_groestl512_sm20_free(int thr_id)
+{
+	for (int i=0; i<8; i++)
+		cudaFree(d_textures[thr_id][i]);
 }
 
 __host__
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index e03b51b..772fd65 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -30,6 +30,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -270,6 +271,7 @@ extern "C" void free_quark(int thr_id)
 	cudaFree(d_branch2Nonces[thr_id]);
 	cudaFree(d_branch3Nonces[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
 	quark_compactTest_cpu_free(thr_id);
 
 	cuda_check_cpu_free(thr_id);
diff --git a/qubit/qubit.cu b/qubit/qubit.cu
index 3d40ded..7837892 100644
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@@ -28,14 +28,11 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
-											int order);
 
 extern "C" void qubithash(void *state, const void *input)
 {
@@ -96,7 +93,7 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce,
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
 
 		cuda_check_cpu_init(thr_id, throughput);
 
@@ -167,6 +164,8 @@ extern "C" void free_qubit(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	x11_simd512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
index f65388e..099501a 100644
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -426,10 +426,17 @@ unsigned char GetNfactor(unsigned int nTimestamp)
 	return Nfactor;
 }
 
+static bool init[MAX_GPUS] = { 0 };
+
 // cleanup
 void free_scrypt_jane(int thr_id)
 {
-	// todo ?
+	int dev_id = device_map[thr_id];
+
+	cudaSetDevice(dev_id);
+	cudaDeviceReset(); // well, simple way to free ;)
+
+	init[thr_id] = false;
 }
 
 #define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
@@ -467,7 +474,18 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi
 		s_Nfactor = Nfactor;
 	}
 
-	int throughput = cuda_throughput(thr_id);
+	static __thread int throughput = 0;
+	if(!init[thr_id]) {
+		int dev_id = device_map[thr_id];
+
+		cudaSetDevice(dev_id);
+		cudaDeviceReset();
+		cudaSetDevice(dev_id);
+		throughput = cuda_throughput(thr_id);
+		applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput);
+
+		init[thr_id] = true;
+	}
 
 	if(throughput == 0)
 		return -1;
@@ -602,7 +620,7 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi
 
 				if (memcmp(thash, &hash[cur][8*i], 32) == 0)
 				{
-					bn_store_hash_target_ratio(thash, ptarget, work);
+					work_set_target_ratio(work, thash);
 					*hashes_done = n - pdata[19];
 					pdata[19] = tmp_nonce;
 					scrypt_free(&Vbuf);
diff --git a/scrypt.cpp b/scrypt.cpp
index d13b0bd..62c5a58 100644
--- a/scrypt.cpp
+++ b/scrypt.cpp
@@ -685,10 +685,18 @@ static int lastFactor = 0;
 
 static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad);
 
+static bool init[MAX_GPUS] = { 0 };
+
 // cleanup
 void free_scrypt(int thr_id)
 {
-	// todo ?
+	int dev_id = device_map[thr_id];
+
+	// trivial way to free all...
+	cudaSetDevice(dev_id);
+	cudaDeviceReset();
+
+	init[thr_id] = false;
 }
 
 // Scrypt proof of work algorithm
@@ -701,9 +709,20 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 	int result = 0;
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
-	int throughput = cuda_throughput(thr_id);
+	static __thread int throughput = 0;
+
+	if (!init[thr_id]) {
+		int dev_id = device_map[thr_id];
+		cudaSetDevice(dev_id);
+		cudaDeviceReset();
+		cudaSetDevice(dev_id);
+		throughput = cuda_throughput(thr_id);
+		applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput);
+
+		init[thr_id] = true;
+	}
 
-	if(throughput == 0)
+	if (throughput == 0)
 		return -1;
 
 	gettimeofday(tv_start, NULL);
@@ -912,7 +931,7 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 							device_map[thr_id], device_name[thr_id], i, cur);
 					} else {
 						*hashes_done = n - pdata[19];
-						bn_store_hash_target_ratio(refhash, ptarget, work);
+						work_set_target_ratio(work, refhash);
 						pdata[19] = nonce[cur] + i;
 						result = 1;
 						goto byebye;
diff --git a/scrypt/blake.cu b/scrypt/blake.cu
index 09ed4e6..5e8443a 100644
--- a/scrypt/blake.cu
+++ b/scrypt/blake.cu
@@ -404,10 +404,10 @@ void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool
 
 static std::map<int, uint32_t *> context_good[2];
 
+static bool init[MAX_GPUS] = { 0 };
+
 bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
-	static bool init[MAX_GPUS] = { 0 };
-
 	if (!init[thr_id])
 	{
 		// allocate pinned host memory for good hashes
@@ -441,3 +441,13 @@ void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 }
+
+void default_free_blake256(int thr_id)
+{
+	if (init[thr_id]) {
+		cudaFree(context_good[0][thr_id]);
+		cudaFree(context_good[1][thr_id]);
+		init[thr_id] = false;
+	}
+}
+
diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu
index a22a863..9fb3c44 100644
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@@ -144,9 +144,8 @@ int cuda_throughput(int thr_id)
 		cuCtxCreate( &ctx, CU_CTX_SCHED_YIELD, device_map[thr_id] );
 		cuCtxSetCurrent(ctx);
 #else
-		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield));
 		checkCudaErrors(cudaSetDevice(device_map[thr_id]));
-		// checkCudaErrors(cudaFree(0));
+		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield));
 #endif
 
 		KernelInterface *kernel;
@@ -256,7 +255,7 @@ inline int _ConvertSMVer2Cores(int major, int minor)
 	}
 
 	// If we don't find the values, we default use the previous one to run properly
-    applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128);
+	applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128);
 	return 128;
 }
 
diff --git a/skein.cu b/skein.cu
index cfeef37..8577e2f 100644
--- a/skein.cu
+++ b/skein.cu
@@ -11,12 +11,14 @@
 #include <openssl/sha.h>
 
 static uint32_t *d_hash[MAX_GPUS];
+static __thread bool sm5 = true;
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 
 extern void skeincoin_init(int thr_id);
+extern void skeincoin_free(int thr_id);
 extern void skeincoin_setBlock_80(int thr_id, void *pdata);
 extern uint32_t skeincoin_hash_sm5(int thr_id, uint32_t threads, uint32_t startNounce, int swap, uint64_t target64, uint32_t *secNonce);
 
@@ -355,7 +357,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 	const uint32_t first_nonce = pdata[19];
 	const int swap = 1;
 
-	bool sm5 = (device_sm[device_map[thr_id]] >= 500);
+	sm5 = (device_sm[device_map[thr_id]] >= 500);
 	bool checkSecnonce = (have_stratum || have_longpoll) && !sm5;
 
 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20);
@@ -473,9 +475,11 @@ extern "C" void free_skeincoin(int thr_id)
 
 	cudaSetDevice(device_map[thr_id]);
 
-	cudaFree(d_hash[thr_id]);
+	if (!sm5) {
+		cudaFree(d_hash[thr_id]);
+		cuda_check_cpu_free(thr_id);
+	}
 
-	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
diff --git a/skein2.cpp b/skein2.cpp
index d2974dd..ef0c930 100644
--- a/skein2.cpp
+++ b/skein2.cpp
@@ -16,6 +16,7 @@ extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 
+extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 void skein2hash(void *output, const void *input)
@@ -38,6 +39,7 @@ static bool init[MAX_GPUS] = { 0 };
 
 int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
+	int dev_id = device_map[thr_id];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
@@ -50,9 +52,9 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 
 	if (!init[thr_id])
 	{
-		cudaSetDevice(device_map[thr_id]);
+		cudaSetDevice(dev_id);
 
-		cudaMalloc(&d_hash[thr_id], throughput * 64U);
+		cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);
 
 		quark_skein512_cpu_init(thr_id, throughput);
 		cuda_check_cpu_init(thr_id, throughput);
@@ -92,7 +94,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 				work_set_target_ratio(work, vhash64);
 				if (secNonce != 0) {
 					if (!opt_quiet)
-						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", device_map[thr_id], swab32(secNonce));
+						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));
 
 					endiandata[19] = secNonce;
 					skein2hash(vhash64, endiandata);
@@ -104,7 +106,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 				pdata[19] = swab32(foundNonce);
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", dev_id, foundNonce);
 			}
 		}
 
@@ -135,4 +137,4 @@ void free_skein2(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/x11/c11.cu b/x11/c11.cu
index c59c594..c3a0848 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -32,6 +32,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -50,6 +51,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -255,6 +257,8 @@ extern "C" void free_c11(int thr_id)
 	cudaSetDevice(device_map[thr_id]);
 
 	cudaFree(d_hash[thr_id]);
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
 
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu
index 326ed12..797fa49 100644
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@@ -672,6 +672,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
 	return 0;
 }
 
+__host__
+void x11_simd512_cpu_free(int thr_id)
+{
+	cudaFree(d_temp4[thr_id]);
+	cudaFree(d_state[thr_id]);
+}
+
 __host__
 void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
diff --git a/x11/fresh.cu b/x11/fresh.cu
index 81ecac9..49af70b 100644
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@@ -21,6 +21,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -163,6 +164,7 @@ extern "C" void free_fresh(int thr_id)
 	cudaSetDevice(device_map[thr_id]);
 
 	cudaFree(d_hash[thr_id]);
+	x11_simd512_cpu_free(thr_id);
 
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
diff --git a/x11/s3.cu b/x11/s3.cu
index c3fc8d3..e8e522c 100644
--- a/x11/s3.cu
+++ b/x11/s3.cu
@@ -21,6 +21,7 @@ extern void x11_shavite512_setBlock_80(void *pdata);
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -146,6 +147,7 @@ extern "C" void free_s3(int thr_id)
 	cudaSetDevice(device_map[thr_id]);
 
 	cudaFree(d_hash[thr_id]);
+	x11_simd512_cpu_free(thr_id);
 
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
diff --git a/x11/x11.cu b/x11/x11.cu
index 3ac4158..ba061e0 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -32,6 +32,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -50,6 +51,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -254,6 +256,9 @@ extern "C" void free_x11(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu
index ea26f17..d243935 100644
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@@ -38,6 +38,9 @@
  * @author   phm <phm@inbox.com>
  */
 
+// store allocated textures device addresses
+static unsigned int* d_textures[MAX_GPUS][4];
+
 #define mixtab0(x) (*((uint32_t*)mixtabs + (    (x))))
 #define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@@ -657,25 +660,37 @@ __global__ void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce,
 	}
 }
 
-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
+
+__host__
+void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
+{
+	texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
+	texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
+	texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
+	texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
+}
 
-__host__ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
+__host__
+void x13_fugue512_cpu_free(int thr_id)
 {
-	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
+	for (int i=0; i<4; i++)
+		cudaFree(d_textures[thr_id][i]);
 }
 
-__host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
 	const uint32_t threadsperblock = 256;
 
diff --git a/x13/x13.cu b/x13/x13.cu
index a2727c2..5833e71 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -34,6 +34,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -52,6 +53,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -61,7 +63,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
+extern void x13_fugue512_cpu_free(int thr_id);
 
 // X13 CPU Hash
 extern "C" void x13hash(void *output, const void *input)
@@ -256,6 +258,10 @@ extern "C" void free_x13(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
diff --git a/x15/x14.cu b/x15/x14.cu
index 81b6183..195ace9 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -38,6 +38,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -56,6 +57,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -65,6 +67,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
 
 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -185,9 +188,10 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
-
 		cuda_check_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+
 		init[thr_id] = true;
 	}
 
@@ -260,8 +264,12 @@ extern "C" void free_x14(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/x15/x15.cu b/x15/x15.cu
index 92b6efc..5808770 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -39,6 +39,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -57,6 +58,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -66,6 +68,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
 
 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -275,6 +278,10 @@ extern "C" void free_x15(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
diff --git a/x17/x17.cu b/x17/x17.cu
index 086b5e8..ee42522 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -42,6 +42,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -60,6 +61,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);
 
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -69,6 +71,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
 
 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -297,6 +300,10 @@ extern "C" void free_x17(int thr_id)
 
 	cudaFree(d_hash[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 
diff --git a/zr5.cu b/zr5.cu
index 5ee62a0..93fe4c7 100644
--- a/zr5.cu
+++ b/zr5.cu
@@ -319,6 +319,7 @@ extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);
 
 extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -491,6 +492,7 @@ extern "C" void free_zr5(int thr_id)
 
 	cudaFree(d_txs[thr_id]);
 
+	quark_groestl512_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;