algos: free allocated mem for algo switch

All can be freed properly now, except scrypt (reset) and lyra2 (leak)
2025-03-13 05:51:19 +00:00 · 2015-10-08 21:31:16 +02:00 · 2015-10-08 21:31:16 +02:00 · 922c2a5cd7
commit 922c2a5cd7
parent ee93927fac
40 changed files with 339 additions and 95 deletions
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@ -13,6 +13,7 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];

 extern void bmw256_midstate_init(int thr_id, uint32_t threads);
+extern void bmw256_midstate_free(int thr_id);
 extern void bmw256_setBlock_80(int thr_id, void *pdata);
 extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap);

@ -111,8 +112,9 @@ extern "C" void free_bmw(int thr_id)
 	cudaSetDevice(device_map[thr_id]);

 	cudaFree(d_hash[thr_id]);
-
+	bmw256_midstate_free(thr_id);
 	cuda_check_cpu_free(thr_id);
+
 	init[thr_id] = false;

 	cudaDeviceSynchronize();
--- a/Algo256/cuda_bmw.cu
+++ b/Algo256/cuda_bmw.cu
@ -372,3 +372,9 @@ void bmw256_midstate_init(int thr_id, uint32_t threads)
 {
 	cudaMalloc(&d_midstate[thr_id], sizeof(sph_bmw256_context));
 }
+
+__host__
+void bmw256_midstate_free(int thr_id)
+{
+	cudaFree(d_midstate[thr_id]);
+}
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@ -7,6 +7,7 @@

 static uint32_t *h_GNonces[MAX_GPUS];
 static uint32_t *d_GNonces[MAX_GPUS];
+static unsigned int* d_textures[MAX_GPUS][8];

 __constant__ uint32_t pTarget[8];

@ -249,28 +250,31 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *out
 	}
 }

-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}

 __host__
 void groestl256_cpu_init(int thr_id, uint32_t threads)
 {
 	// Texturen mit obigem Makro initialisieren
-	texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
-	texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
-	texDef(t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
-	texDef(t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
-	texDef(t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
-	texDef(t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
-	texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
-	texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
+	texDef(0, t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
+	texDef(1, t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
+	texDef(2, t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
+	texDef(3, t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
+	texDef(4, t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
+	texDef(5, t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
+	texDef(6, t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
+	texDef(7, t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);

 	cudaMalloc(&d_GNonces[thr_id], 2*sizeof(uint32_t));
 	cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t));
@ -279,6 +283,9 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 __host__
 void groestl256_cpu_free(int thr_id)
 {
+	for (int i=0; i<8; i++)
+		cudaFree(d_textures[thr_id][i]);
+
 	cudaFree(d_GNonces[thr_id]);
 	cudaFreeHost(h_GNonces[thr_id]);
 }
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@ -33,8 +33,9 @@ __device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d

 cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFunction[MAX_GPUS];

-// Setup-Funktionen
-__host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
+// Setup-Function
+__host__
+void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
@ -51,6 +52,18 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }

+__host__
+void jackpot_compactTest_cpu_free(int thr_id)
+{
+	cudaFree(d_tempBranch1Nonces[thr_id]);
+	cudaFree(d_numValid[thr_id]);
+
+	cudaFree(d_partSum[0][thr_id]);
+	cudaFree(d_partSum[1][thr_id]);
+
+	cudaFreeHost(h_numValid[thr_id]);
+}
+
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
 * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -21,6 +21,7 @@ extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -29,6 +30,7 @@ extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads);
+extern void jackpot_compactTest_cpu_free(int thr_id);
 extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, 
 											uint32_t *d_nonces1, uint32_t *nrm1,
 											uint32_t *d_nonces2, uint32_t *nrm2,
@ -264,9 +266,11 @@ extern "C" void free_jackpot(int thr_id)
 	cudaFree(d_branch1Nonces[thr_id]);
 	cudaFree(d_branch2Nonces[thr_id]);
 	cudaFree(d_branch3Nonces[thr_id]);
-
 	cudaFree(d_jackpotNonces[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
+	jackpot_compactTest_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -19,6 +19,7 @@ extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t st

 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -173,6 +174,7 @@ extern "C" void free_nist5(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

--- a/cuda_skeincoin.cu
+++ b/cuda_skeincoin.cu
@ -708,6 +708,12 @@ __host__
 void skeincoin_init(int thr_id)
 {
 	cuda_get_arch(thr_id);
+	CUDA_SAFE_CALL(cudaMalloc(&d_found[thr_id], 2 * sizeof(uint32_t)));
+}
+
+__host__
+void skeincoin_free(int thr_id) {
+	cudaFree(d_found[thr_id]);
 }

 __host__
@ -716,7 +722,6 @@ void skeincoin_setBlock_80(int thr_id, void *pdata)
 	uint64_t message[16];
 	memcpy(&message[0], pdata, 80);

-	CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t)));
 	cudaMemcpyToSymbol(c_message16, &message[8], 16, 0, cudaMemcpyHostToDevice);

 	precalc(message);
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@ -191,8 +191,9 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(uint32_t threads, uin

 // ---------------------------- END CUDA blake512 functions ------------------------------------

-// Setup-Funktionen
-__host__ void blake512_cpu_init(int thr_id, uint32_t threads)
+// Setup Function
+__host__
+void blake512_cpu_init(int thr_id, uint32_t threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	cudaMemcpyToSymbol( c_sigma,
@ -211,12 +212,19 @@ __host__ void blake512_cpu_init(int thr_id, uint32_t threads)
 						0, cudaMemcpyHostToDevice);

 	// Speicher für alle Ergebnisse belegen
-	CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
+	CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], (size_t) 64 * threads));
+}
+
+__host__
+void blake512_cpu_free(int thr_id)
+{
+	cudaFree(d_hash5output[thr_id]);
 }

 static int BLOCKSIZE = 84;

-__host__ void blake512_cpu_setBlock(void *pdata, int len)
+__host__
+void blake512_cpu_setBlock(void *pdata, int len)
 	// data muss 84-Byte haben!
 	// heftyHash hat 32-Byte
 {
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@ -124,7 +124,13 @@ __host__
 void combine_cpu_init(int thr_id, uint32_t threads)
 {
 	// Speicher für alle Ergebnisse belegen
-	CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
+	CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], (size_t) 32 * threads));
+}
+
+__host__
+void combine_cpu_free(int thr_id)
+{
+	cudaFree(d_hashoutput[thr_id]);
 }

 __host__
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@ -9,6 +9,7 @@
 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *heavy_heftyHashes[MAX_GPUS];
 extern uint32_t *heavy_nonceVector[MAX_GPUS];
+static unsigned int *d_textures[MAX_GPUS][8];

 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash4output[MAX_GPUS];
@ -730,36 +731,50 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(uint32_t threads, u
 	}
 }

-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}

-// Setup-Funktionen
-__host__ void groestl512_cpu_init(int thr_id, uint32_t threads)
+// Setup Function
+__host__
+void groestl512_cpu_init(int thr_id, uint32_t threads)
 {
 	// Texturen mit obigem Makro initialisieren
-	texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
-	texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
-	texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
-	texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
-	texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
-	texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
-	texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
-	texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+	texDef(0, t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
+	texDef(1, t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
+	texDef(2, t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
+	texDef(3, t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
+	texDef(4, t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
+	texDef(5, t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
+	texDef(6, t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
+	texDef(7, t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);

 	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_hash4output[thr_id], (size_t) 64 * threads);
+}
+
+__host__
+void groestl512_cpu_free(int thr_id)
+{
+	for (int i=0; i <8; i++)
+		cudaFree(d_textures[thr_id][i]);
+
+	cudaFree(d_hash4output[thr_id]);
 }

 static int BLOCKSIZE = 84;

-__host__ void groestl512_cpu_setBlock(void *data, int len)
+__host__
+void groestl512_cpu_setBlock(void *data, int len)
 	// data muss 80/84-Byte haben!
 	// heftyHash hat 32-Byte
 {
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@ -31,7 +31,8 @@ uint32_t hefty_cpu_hashTable[] = {
    0x510e527fUL,
    0x9b05688cUL,
    0x1f83d9abUL,
-    0x5be0cd19UL };
+    0x5be0cd19UL
+};

 uint32_t hefty_cpu_constantTable[] = {
    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
@ -316,7 +317,13 @@ void hefty_cpu_init(int thr_id, uint32_t threads)
                        sizeof(uint32_t) * 64 );

    // Speicher für alle Hefty1 hashes belegen
-    CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));
+    CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], (size_t) 32 * threads));
+}
+
+__host__
+void hefty_cpu_free(int thr_id)
+{
+    cudaFree(heavy_heftyHashes[thr_id]);
 }

 __host__
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@ -185,7 +185,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(uint32_t threads, ui

 // ---------------------------- END CUDA keccak512 functions ------------------------------------

-__host__ 
+__host__
 void keccak512_cpu_init(int thr_id, uint32_t threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
@ -195,7 +195,13 @@ void keccak512_cpu_init(int thr_id, uint32_t threads)
 						0, cudaMemcpyHostToDevice);

 	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_hash3output[thr_id], (size_t) 64 * threads);
+}
+
+__host__
+void keccak512_cpu_free(int thr_id)
+{
+	cudaFree(d_hash3output[thr_id]);
 }

 // ----------------BEGIN keccak512 CPU version from scrypt-jane code --------------------
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@ -160,8 +160,9 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(uint32_t threads, uint3
 	}
 }

-// Setup-Funktionen
-__host__ void sha256_cpu_init(int thr_id, uint32_t threads)
+// Setup Function
+__host__
+void sha256_cpu_init(int thr_id, uint32_t threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	cudaMemcpyToSymbol(	sha256_gpu_constantTable,
@ -169,7 +170,13 @@ __host__ void sha256_cpu_init(int thr_id, uint32_t threads)
 						sizeof(uint32_t) * 64 );

 	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
+	cudaMalloc(&d_hash2output[thr_id], (size_t) 8 * sizeof(uint32_t) * threads);
+}
+
+__host__
+void sha256_cpu_free(int thr_id)
+{
+	cudaFree(d_hash2output[thr_id]);
 }

 static int BLOCKSIZE = 84;
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@ -178,7 +178,7 @@ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned l
    {
        uint16_t *ext = (uint16_t *)&pdata[20];

-        if (opt_vote > maxvote) {
+        if (opt_vote > maxvote && !opt_benchmark) {
            applog(LOG_WARNING, "Your block reward vote (%hu) exceeds "
                    "the maxvote reported by the pool (%hu).",
                    opt_vote, maxvote);
@ -310,12 +310,18 @@ extern "C" void free_heavy(int thr_id)

    cudaFree(heavy_nonceVector[thr_id]);

-    // todo: free sub algos vectors
+    blake512_cpu_free(thr_id);
+    groestl512_cpu_free(thr_id);
+    hefty_cpu_free(thr_id);
+    keccak512_cpu_free(thr_id);
+    sha256_cpu_free(thr_id);
+    combine_cpu_free(thr_id);

    init[thr_id] = false;

    cudaDeviceSynchronize();
 }
+
 __host__
 void heavycoin_hash(uchar* output, const uchar* input, int len)
 {
--- a/heavy/heavy.h
+++ b/heavy/heavy.h
@ -4,27 +4,33 @@
 void blake512_cpu_init(int thr_id, uint32_t threads);
 void blake512_cpu_setBlock(void *pdata, int len);
 void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
+void blake512_cpu_free(int thr_id);

 void groestl512_cpu_init(int thr_id, uint32_t threads);
 void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
 void groestl512_cpu_setBlock(void *data, int len);
 void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
+void groestl512_cpu_free(int thr_id);

 void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce);
 void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len);
 void hefty_cpu_init(int thr_id, uint32_t threads);
+void hefty_cpu_free(int thr_id);

 void keccak512_cpu_init(int thr_id, uint32_t threads);
 void keccak512_cpu_setBlock(void *data, int len);
 void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
 void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
+void keccak512_cpu_free(int thr_id);

 void sha256_cpu_init(int thr_id, uint32_t threads);
 void sha256_cpu_setBlock(void *data, int len);
 void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce);
 void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
+void sha256_cpu_free(int thr_id);

 void combine_cpu_init(int thr_id, uint32_t threads);
 void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash);
+void combine_cpu_free(int thr_id);

 #endif
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@ -17,6 +17,7 @@ extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
 extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_cpu_free(int thr_id);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);

@ -174,7 +175,9 @@ extern "C" void free_lyra2(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	keccak256_cpu_free(thr_id);
 	groestl256_cpu_free(thr_id);
+
 	init[thr_id] = false;

 	cudaDeviceSynchronize();
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@ -19,6 +19,7 @@ extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
 extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_cpu_free(int thr_id);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
@ -183,6 +184,8 @@ extern "C" void free_lyra2v2(int thr_id)
 	cudaFree(d_matrix[thr_id]);

 	bmw256_cpu_free(thr_id);
+	keccak256_cpu_free(thr_id);
+
 	init[thr_id] = false;

 	cudaDeviceSynchronize();
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@ -730,7 +730,7 @@ void neoscrypt_cpu_init(int thr_id, uint32_t threads)
 {
 	cuda_get_arch(thr_id);
 	cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t));
-	CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], threads * 256 * SHIFT));
+	CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], (size_t) 256 * SHIFT * threads));
 	cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
 }
--- a/pentablake.cu
+++ b/pentablake.cu
@ -381,7 +381,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 		if (active_gpus > 1) {
 			cudaSetDevice(device_map[thr_id]);
 		}
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
 		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));

@ -452,7 +452,7 @@ void free_pentablake(int thr_id)
 	cudaSetDevice(device_map[thr_id]);

 	cudaFree(d_hash[thr_id]);
-	cudaFree(h_resNounce[thr_id]);
+	cudaFreeHost(h_resNounce[thr_id]);
 	cudaFree(d_resNounce[thr_id]);

 	init[thr_id] = false;
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -141,6 +141,14 @@ void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
        quark_groestl512_sm20_init(thr_id, threads);
 }

+__host__
+void quark_groestl512_cpu_free(int thr_id)
+{
+    int dev_id = device_map[thr_id];
+    if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300)
+        quark_groestl512_sm20_free(thr_id);
+}
+
 __host__
 void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
--- a/quark/cuda_quark_groestl512_sm20.cu
+++ b/quark/cuda_quark_groestl512_sm20.cu
@ -5,6 +5,8 @@
 #define MAXWELL_OR_FERMI 0
 #define USE_SHARED 1

+static unsigned int *d_textures[MAX_GPUS][8];
+
 // #define SPH_C32(x)    ((uint32_t)(x ## U))
 // #define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))

@ -274,28 +276,38 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32
 #endif
 }

-#define texDef(texname, texmem, texsource, texsize) \
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}

 __host__
 void quark_groestl512_sm20_init(int thr_id, uint32_t threads)
 {
 	// Texturen mit obigem Makro initialisieren
-	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
-	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
-	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
-	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
-	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
-	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
-	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
-	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+	texDef(0, t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
+	texDef(1, t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
+	texDef(2, t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
+	texDef(3, t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
+	texDef(4, t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
+	texDef(5, t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
+	texDef(6, t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
+	texDef(7, t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+}
+
+__host__
+void quark_groestl512_sm20_free(int thr_id)
+{
+	for (int i=0; i<8; i++)
+		cudaFree(d_textures[thr_id][i]);
 }

 __host__
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -30,6 +30,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -270,6 +271,7 @@ extern "C" void free_quark(int thr_id)
 	cudaFree(d_branch2Nonces[thr_id]);
 	cudaFree(d_branch3Nonces[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
 	quark_compactTest_cpu_free(thr_id);

 	cuda_check_cpu_free(thr_id);
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -28,14 +28,11 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
-											int order);

 extern "C" void qubithash(void *state, const void *input)
 {
@ -96,7 +93,7 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce,
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);

-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);

 		cuda_check_cpu_init(thr_id, throughput);

@ -167,6 +164,8 @@ extern "C" void free_qubit(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	x11_simd512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@ -426,10 +426,17 @@ unsigned char GetNfactor(unsigned int nTimestamp)
 	return Nfactor;
 }

+static bool init[MAX_GPUS] = { 0 };
+
 // cleanup
 void free_scrypt_jane(int thr_id)
 {
-	// todo ?
+	int dev_id = device_map[thr_id];
+
+	cudaSetDevice(dev_id);
+	cudaDeviceReset(); // well, simple way to free ;)
+
+	init[thr_id] = false;
 }

 #define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
@ -467,7 +474,18 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi
 		s_Nfactor = Nfactor;
 	}

-	int throughput = cuda_throughput(thr_id);
+	static __thread int throughput = 0;
+	if(!init[thr_id]) {
+		int dev_id = device_map[thr_id];
+
+		cudaSetDevice(dev_id);
+		cudaDeviceReset();
+		cudaSetDevice(dev_id);
+		throughput = cuda_throughput(thr_id);
+		applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput);
+
+		init[thr_id] = true;
+	}

 	if(throughput == 0)
 		return -1;
@ -602,7 +620,7 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi

 				if (memcmp(thash, &hash[cur][8*i], 32) == 0)
 				{
-					bn_store_hash_target_ratio(thash, ptarget, work);
+					work_set_target_ratio(work, thash);
 					*hashes_done = n - pdata[19];
 					pdata[19] = tmp_nonce;
 					scrypt_free(&Vbuf);
--- a/scrypt.cpp
+++ b/scrypt.cpp
@ -685,10 +685,18 @@ static int lastFactor = 0;

 static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad);

+static bool init[MAX_GPUS] = { 0 };
+
 // cleanup
 void free_scrypt(int thr_id)
 {
-	// todo ?
+	int dev_id = device_map[thr_id];
+
+	// trivial way to free all...
+	cudaSetDevice(dev_id);
+	cudaDeviceReset();
+
+	init[thr_id] = false;
 }

 // Scrypt proof of work algorithm
@ -701,9 +709,20 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 	int result = 0;
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
-	int throughput = cuda_throughput(thr_id);
+	static __thread int throughput = 0;

-	if(throughput == 0)
+	if (!init[thr_id]) {
+		int dev_id = device_map[thr_id];
+		cudaSetDevice(dev_id);
+		cudaDeviceReset();
+		cudaSetDevice(dev_id);
+		throughput = cuda_throughput(thr_id);
+		applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	if (throughput == 0)
 		return -1;

 	gettimeofday(tv_start, NULL);
@ -912,7 +931,7 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 							device_map[thr_id], device_name[thr_id], i, cur);
 					} else {
 						*hashes_done = n - pdata[19];
-						bn_store_hash_target_ratio(refhash, ptarget, work);
+						work_set_target_ratio(work, refhash);
 						pdata[19] = nonce[cur] + i;
 						result = 1;
 						goto byebye;
--- a/scrypt/blake.cu
+++ b/scrypt/blake.cu
@ -404,10 +404,10 @@ void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool

 static std::map<int, uint32_t *> context_good[2];

+static bool init[MAX_GPUS] = { 0 };
+
 bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
-	static bool init[MAX_GPUS] = { 0 };
-
 	if (!init[thr_id])
 	{
 		// allocate pinned host memory for good hashes
@ -441,3 +441,13 @@ void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 }
+
+void default_free_blake256(int thr_id)
+{
+	if (init[thr_id]) {
+		cudaFree(context_good[0][thr_id]);
+		cudaFree(context_good[1][thr_id]);
+		init[thr_id] = false;
+	}
+}
+
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@ -144,9 +144,8 @@ int cuda_throughput(int thr_id)
 		cuCtxCreate( &ctx, CU_CTX_SCHED_YIELD, device_map[thr_id] );
 		cuCtxSetCurrent(ctx);
 #else
-		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield));
 		checkCudaErrors(cudaSetDevice(device_map[thr_id]));
-		// checkCudaErrors(cudaFree(0));
+		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield));
 #endif

 		KernelInterface *kernel;
@ -256,7 +255,7 @@ inline int _ConvertSMVer2Cores(int major, int minor)
 	}

 	// If we don't find the values, we default use the previous one to run properly
-    applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128);
+	applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128);
 	return 128;
 }

--- a/skein.cu
+++ b/skein.cu
@ -11,12 +11,14 @@
 #include <openssl/sha.h>

 static uint32_t *d_hash[MAX_GPUS];
+static __thread bool sm5 = true;

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);

 extern void skeincoin_init(int thr_id);
+extern void skeincoin_free(int thr_id);
 extern void skeincoin_setBlock_80(int thr_id, void *pdata);
 extern uint32_t skeincoin_hash_sm5(int thr_id, uint32_t threads, uint32_t startNounce, int swap, uint64_t target64, uint32_t *secNonce);

@ -355,7 +357,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 	const uint32_t first_nonce = pdata[19];
 	const int swap = 1;

-	bool sm5 = (device_sm[device_map[thr_id]] >= 500);
+	sm5 = (device_sm[device_map[thr_id]] >= 500);
 	bool checkSecnonce = (have_stratum || have_longpoll) && !sm5;

 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20);
@ -473,9 +475,11 @@ extern "C" void free_skeincoin(int thr_id)

 	cudaSetDevice(device_map[thr_id]);

-	cudaFree(d_hash[thr_id]);
+	if (!sm5) {
+		cudaFree(d_hash[thr_id]);
+		cuda_check_cpu_free(thr_id);
+	}

-	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

 	cudaDeviceSynchronize();
--- a/skein2.cpp
+++ b/skein2.cpp
@ -16,6 +16,7 @@ extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);

+extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 void skein2hash(void *output, const void *input)
@ -38,6 +39,7 @@ static bool init[MAX_GPUS] = { 0 };

 int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
+	int dev_id = device_map[thr_id];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
@ -50,9 +52,9 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned

 	if (!init[thr_id])
 	{
-		cudaSetDevice(device_map[thr_id]);
+		cudaSetDevice(dev_id);

-		cudaMalloc(&d_hash[thr_id], throughput * 64U);
+		cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);

 		quark_skein512_cpu_init(thr_id, throughput);
 		cuda_check_cpu_init(thr_id, throughput);
@ -92,7 +94,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 				work_set_target_ratio(work, vhash64);
 				if (secNonce != 0) {
 					if (!opt_quiet)
-						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", device_map[thr_id], swab32(secNonce));
+						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));

 					endiandata[19] = secNonce;
 					skein2hash(vhash64, endiandata);
@ -104,7 +106,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 				pdata[19] = swab32(foundNonce);
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", dev_id, foundNonce);
 			}
 		}

@ -135,4 +137,4 @@ void free_skein2(int thr_id)
 	init[thr_id] = false;

 	cudaDeviceSynchronize();
-}
+}
--- a/x11/c11.cu
+++ b/x11/c11.cu
@ -32,6 +32,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -50,6 +51,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -255,6 +257,8 @@ extern "C" void free_c11(int thr_id)
 	cudaSetDevice(device_map[thr_id]);

 	cudaFree(d_hash[thr_id]);
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);

 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@ -672,6 +672,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
 	return 0;
 }

+// Frees the two per-thread buffers d_temp4[thr_id] and d_state[thr_id]
+// with cudaFree. Return codes of cudaFree are not checked.
+// NOTE(review): presumably the counterpart of x11_simd512_cpu_init; the
+// allocations are outside this hunk - confirm both are created there.
+__host__
+void x11_simd512_cpu_free(int thr_id)
+{
+	cudaFree(d_temp4[thr_id]);
+	cudaFree(d_state[thr_id]);
+}
+
 __host__
 void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -21,6 +21,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -163,6 +164,7 @@ extern "C" void free_fresh(int thr_id)
 	cudaSetDevice(device_map[thr_id]);

 	cudaFree(d_hash[thr_id]);
+	x11_simd512_cpu_free(thr_id);

 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -21,6 +21,7 @@ extern void x11_shavite512_setBlock_80(void *pdata);

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -146,6 +147,7 @@ extern "C" void free_s3(int thr_id)
 	cudaSetDevice(device_map[thr_id]);

 	cudaFree(d_hash[thr_id]);
+	x11_simd512_cpu_free(thr_id);

 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -32,6 +32,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -50,6 +51,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -254,6 +256,9 @@ extern "C" void free_x11(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@ -38,6 +38,9 @@
 * @author   phm <phm@inbox.com>
 */

+// device addresses of the buffers allocated for the textures, kept per thread so they can be freed
+static unsigned int* d_textures[MAX_GPUS][4];
+
 #define mixtab0(x) (*((uint32_t*)mixtabs + (    (x))))
 #define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@ -657,25 +660,37 @@ __global__ void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce,
 	}
 }

-#define texDef(texname, texmem, texsource, texsize) \
+// texDef(id, texname, texmem, texsource, texsize):
+//  - cudaMalloc's texsize bytes and records the pointer in d_textures[thr_id][id],
+//  - copies texsource into that buffer (cudaMemcpyHostToDevice),
+//  - configures texname: not normalized, point filtering, clamp addressing,
+//  - binds texname to the buffer using an unsigned int channel descriptor.
+// The expansion references thr_id, so thr_id must be in scope where the macro is used.
+// The whole expansion is wrapped in braces, so the texmem pointer is local to each use.
+#define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
-	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
-
-__host__ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
-{
-	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
-	texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
 }

-__host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+// Sets up the four Fugue mix-table textures (mixTab0Tex..mixTab3Tex) from the
+// host tables mixtab0_cpu..mixtab3_cpu, each sizeof(uint32_t)*256 bytes.
+// Each texDef use stores its buffer pointer in d_textures[thr_id][0..3].
+// The threads parameter is not referenced in this body.
+__host__
+void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
+{
+	texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
+	texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
+	texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
+	texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
+}
+
+// Frees the four buffers recorded in d_textures[thr_id][0..3] with cudaFree.
+// Return codes of cudaFree are not checked.
+// NOTE(review): the textures bound in x13_fugue512_cpu_init are not unbound
+// here before their backing memory is freed - confirm whether a
+// cudaUnbindTexture is needed.
+__host__
+void x13_fugue512_cpu_free(int thr_id)
+{
+	for (int i=0; i<4; i++)
+		cudaFree(d_textures[thr_id][i]);
+}
+
+__host__
+void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
 	const uint32_t threadsperblock = 256;

--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -34,6 +34,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -52,6 +53,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -61,7 +63,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
+extern void x13_fugue512_cpu_free(int thr_id);

 // X13 CPU Hash
 extern "C" void x13hash(void *output, const void *input)
@ -256,6 +258,10 @@ extern "C" void free_x13(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -38,6 +38,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -56,6 +57,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -65,6 +67,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);

 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -185,9 +188,10 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);

-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
-
 		cuda_check_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+
 		init[thr_id] = true;
 	}

@ -260,8 +264,12 @@ extern "C" void free_x14(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

 	cudaDeviceSynchronize();
-}
+}
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -39,6 +39,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -57,6 +58,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -66,6 +68,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);

 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -275,6 +278,10 @@ extern "C" void free_x15(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -42,6 +42,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -60,6 +61,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_simd512_cpu_free(int thr_id);

 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -69,6 +71,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star

 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);

 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -297,6 +300,10 @@ extern "C" void free_x17(int thr_id)

 	cudaFree(d_hash[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;

--- a/zr5.cu
+++ b/zr5.cu
@ -319,6 +319,7 @@ extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st

 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_free(int thr_id);

 extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -491,6 +492,7 @@ extern "C" void free_zr5(int thr_id)

 	cudaFree(d_txs[thr_id]);

+	quark_groestl512_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;