algos: add functions to free allocated resources

Will be used later for algo switching not really tested yet...
2015-09-25 07:51:09 +02:00 · 2015-09-25 07:51:09 +02:00 · e1c4b3042c
commit e1c4b3042c
parent 8ec5dd7d8e
46 changed files with 681 additions and 38 deletions
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@ -489,3 +489,19 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non

 	return rc;
 }
+
+// cleanup
+extern "C" void free_blake256(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFreeHost(h_resNonce[thr_id]);
+	cudaFree(d_resNonce[thr_id]);
+
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@ -101,3 +101,19 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_bmw(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/Algo256/cuda_bmw256.cu
+++ b/Algo256/cuda_bmw256.cu
@ -269,6 +269,13 @@ void bmw256_cpu_init(int thr_id, uint32_t threads)
 	cudaMallocHost(&d_gnounce[thr_id], 2 * sizeof(uint32_t));
 }

+__host__
+void bmw256_cpu_free(int thr_id)
+{
+	cudaFree(d_GNonce[thr_id]);
+	cudaFreeHost(d_gnounce[thr_id]);
+}
+
 __host__
 void bmw256_setTarget(const void *pTargetIn)
 {
--- a/Algo256/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@ -732,6 +732,13 @@ void fugue256_cpu_init(int thr_id, uint32_t threads)
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }

+__host__
+void fugue256_cpu_free(int thr_id)
+{
+	cudaFree(d_fugue256_hashoutput[thr_id]);
+	cudaFree(d_resultNonce[thr_id]);
+}
+
 __host__
 void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@ -262,7 +262,6 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *out
 __host__
 void groestl256_cpu_init(int thr_id, uint32_t threads)
 {
-
 	// Texturen mit obigem Makro initialisieren
 	texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
 	texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
@ -277,6 +276,13 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 	cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t));
 }

+__host__
+void groestl256_cpu_free(int thr_id)
+{
+	cudaFree(d_GNonces[thr_id]);
+	cudaFreeHost(h_GNonces[thr_id]);
+}
+
 __host__
 uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
--- a/Algo256/cuda_keccak256.cu
+++ b/Algo256/cuda_keccak256.cu
@ -300,3 +300,10 @@ void keccak256_cpu_init(int thr_id, uint32_t threads)
 	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
 }
+
+__host__
+void keccak256_cpu_free(int thr_id)
+{
+	cudaFree(d_KNonce[thr_id]);
+	cudaFreeHost(d_nounce[thr_id]);
+}
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@ -17,6 +17,7 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];

 extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_cpu_free(int thr_id);
 extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
 extern uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);

@ -93,3 +94,19 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no

 	return 0;
 }
+
+// cleanup
+extern "C" void free_keccak256(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	keccak256_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -250,3 +250,25 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc

 	return 0;
 }
+
+// cleanup
+extern "C" void free_jackpot(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cudaFree(d_branch1Nonces[thr_id]);
+	cudaFree(d_branch2Nonces[thr_id]);
+	cudaFree(d_branch3Nonces[thr_id]);
+
+	cudaFree(d_jackpotNonces[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -1503,6 +1503,45 @@ static bool wanna_mine(int thr_id)
 	return state;
 }

+// required to switch algos
+void miner_free_device(int thr_id)
+{
+	// todo: some kind of algo "registration"
+	// to call automatically if needed
+	free_blake256(thr_id);
+	free_bmw(thr_id);
+	free_c11(thr_id);
+	free_deep(thr_id);
+	free_keccak256(thr_id);
+	free_fresh(thr_id);
+	free_fugue256(thr_id);
+	free_groestlcoin(thr_id);
+	free_heavy(thr_id);
+	free_jackpot(thr_id);
+	free_luffa(thr_id);
+	free_lyra2(thr_id);
+	free_lyra2v2(thr_id);
+	free_myriad(thr_id);
+	free_neoscrypt(thr_id);
+	free_nist5(thr_id);
+	free_pentablake(thr_id);
+	free_quark(thr_id);
+	free_qubit(thr_id);
+	free_skeincoin(thr_id);
+	free_skein2(thr_id);
+	free_s3(thr_id);
+	free_whirlx(thr_id);
+	free_x11(thr_id);
+	free_x13(thr_id);
+	free_x14(thr_id);
+	free_x15(thr_id);
+	free_x17(thr_id);
+	free_zr5(thr_id);
+	//free_sha256d(thr_id);
+	free_scrypt(thr_id);
+	free_scrypt_jane(thr_id);
+}
+
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = (struct thr_info *)userdata;
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@ -23,6 +23,13 @@ void cuda_check_cpu_init(int thr_id, uint32_t threads)
    init_done = true;
 }

+__host__
+void cuda_check_cpu_free(int thr_id)
+{
+	cudaFree(d_resNonces[thr_id]);
+	cudaFreeHost(h_resNonces[thr_id]);
+}
+
 // Target Difficulty
 __host__
 void cuda_check_cpu_setTarget(const void *ptarget)
--- a/cuda_fugue256.h
+++ b/cuda_fugue256.h
@ -4,5 +4,6 @@
 void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
 void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
 void fugue256_cpu_init(int thr_id, uint32_t threads);
+void fugue256_cpu_free(int thr_id);

 #endif
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -102,6 +102,12 @@ void groestlcoin_cpu_init(int thr_id, uint32_t threads)
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }

+__host__
+void groestlcoin_cpu_free(int thr_id)
+{
+	cudaFree(d_resultNonce[thr_id]);
+}
+
 __host__
 void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
--- a/cuda_groestlcoin.h
+++ b/cuda_groestlcoin.h
@ -2,6 +2,7 @@
 #define _CUDA_GROESTLCOIN_H

 void groestlcoin_cpu_init(int thr_id, uint32_t threads);
+void groestlcoin_cpu_free(int thr_id);
 void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
 void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);

--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -31,6 +31,7 @@ extern int cuda_arch[MAX_GPUS];
 extern int cuda_get_arch(int thr_id);
 extern void cuda_reset_device(int thr_id, bool *init);
 extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
+extern void cuda_check_cpu_free(int thr_id);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
 extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
 extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@ -327,6 +327,13 @@ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
    cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads);
 }

+__host__
+void myriadgroestl_cpu_free(int thr_id)
+{
+    cudaFree(d_resultNonce[thr_id]);
+    cudaFree(d_outputHashes[thr_id]);
+}
+
 __host__
 void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -72,6 +72,7 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
+	int res = 0;

 	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 20); // 256*256*16
 	throughput = min(throughput,  (max_nonce - first_nonce));
@ -83,19 +84,26 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 	{
 		cudaSetDevice(device_map[thr_id]);

-		// Konstanten kopieren, Speicher belegen
+		// Constants copy/init (no device alloc in these algos)
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);

-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		// char[64] work space for hashes results
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput));

 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

+#ifdef USE_STREAMS
+	cudaStream_t stream[5];
+	for (int i = 0; i < 5; i++)
+		cudaStreamCreate(&stream[i]);
+#endif
+
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);

@ -123,7 +131,7 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 			nist5hash(vhash64, endiandata);

 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
+				res = 1;
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				bn_store_hash_target_ratio(vhash64, ptarget, work);
 				if (secNonce != 0) {
@ -135,7 +143,7 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 					res++;
 				}
 				pdata[19] = foundNonce;
-				return res;
+				goto out;
 			}
 			else {
 				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
@ -146,6 +154,27 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,

 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
+out:
+#ifdef USE_STREAMS
+	for (int i = 0; i < 5; i++)
+		cudaStreamDestroy(stream[i]);
+#endif
+
+	return res;
 }
+
+// ressources cleanup
+extern "C" void free_nist5(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@ -20,6 +20,15 @@ extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n,
    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u)   | \
      (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))

+void fugue256_hash(unsigned char* output, const unsigned char* input, int len)
+{
+	sph_fugue256_context ctx;
+
+	sph_fugue256_init(&ctx);
+	sph_fugue256(&ctx, input, len);
+	sph_fugue256_close(&ctx, (void *)output);
+}
+
 static bool init[MAX_GPUS] = { 0 };

 int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
@ -91,11 +100,17 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 	return 0;
 }

-void fugue256_hash(unsigned char* output, const unsigned char* input, int len)
+// cleanup
+void free_fugue256(int thr_id)
 {
-	sph_fugue256_context ctx;
+	if (!init[thr_id])
+		return;

-	sph_fugue256_init(&ctx);
-	sph_fugue256(&ctx, input, len);
-	sph_fugue256_close(&ctx, (void *)output);
-}
+	cudaSetDevice(device_map[thr_id]);
+
+	fugue256_cpu_free(thr_id);
+
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@ -91,3 +91,16 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	return 0;
 }

+// cleanup
+void free_groestlcoin(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	groestlcoin_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@ -300,6 +300,22 @@ exit:
    return rc;
 }

+// cleanup
+extern "C" void free_heavy(int thr_id)
+{
+    if (!init[thr_id])
+        return;
+
+    cudaSetDevice(device_map[thr_id]);
+
+    cudaFree(heavy_nonceVector[thr_id]);
+
+    // todo: free sub algos vectors
+
+    init[thr_id] = false;
+
+    cudaDeviceSynchronize();
+}
 __host__
 void heavycoin_hash(uchar* output, const uchar* input, int len)
 {
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@ -24,6 +24,7 @@ extern void skein256_cpu_init(int thr_id, uint32_t threads);
 extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);

 extern void groestl256_cpu_init(int thr_id, uint32_t threads);
+extern void groestl256_cpu_free(int thr_id);
 extern void groestl256_setTarget(const void *ptarget);
 extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
 extern uint32_t groestl256_getSecNonce(int thr_id, int num);
@ -162,3 +163,19 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,

 	return 0;
 }
+
+// cleanup
+extern "C" void free_lyra2(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	groestl256_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@ -28,6 +28,7 @@ extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* matrix);

 extern void bmw256_setTarget(const void *ptarget);
 extern void bmw256_cpu_init(int thr_id, uint32_t threads);
+extern void bmw256_cpu_free(int thr_id);
 extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);

 void lyra2v2_hash(void *state, const void *input)
@ -169,3 +170,20 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 	MyStreamSynchronize(NULL, 0, device_map[thr_id]);
 	return 0;
 }
+
+// cleanup
+extern "C" void free_lyra2v2(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+	cudaFree(d_hash2[thr_id]);
+
+	bmw256_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/miner.h
+++ b/miner.h
@ -300,6 +300,42 @@ extern int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, un
 extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done,
 	unsigned char *scratchbuf, struct timeval *tv_start, struct timeval *tv_end);

+/* free device allocated memory per algo */
+void miner_free_device(int thr_id);
+
+extern void free_blake256(int thr_id);
+extern void free_bmw(int thr_id);
+extern void free_c11(int thr_id);
+extern void free_deep(int thr_id);
+extern void free_keccak256(int thr_id);
+extern void free_fresh(int thr_id);
+extern void free_fugue256(int thr_id);
+extern void free_groestlcoin(int thr_id);
+extern void free_heavy(int thr_id);
+extern void free_jackpot(int thr_id);
+extern void free_luffa(int thr_id);
+extern void free_lyra2(int thr_id);
+extern void free_lyra2v2(int thr_id);
+extern void free_myriad(int thr_id);
+extern void free_neoscrypt(int thr_id);
+extern void free_nist5(int thr_id);
+extern void free_pentablake(int thr_id);
+extern void free_quark(int thr_id);
+extern void free_qubit(int thr_id);
+extern void free_skeincoin(int thr_id);
+extern void free_skein2(int thr_id);
+extern void free_s3(int thr_id);
+extern void free_whirlx(int thr_id);
+extern void free_x11(int thr_id);
+extern void free_x13(int thr_id);
+extern void free_x14(int thr_id);
+extern void free_x15(int thr_id);
+extern void free_x17(int thr_id);
+extern void free_zr5(int thr_id);
+//extern void free_sha256d(int thr_id);
+extern void free_scrypt(int thr_id);
+extern void free_scrypt_jane(int thr_id);
+
 /* api related */
 void *api_thread(void *userdata);
 void api_set_throughput(int thr_id, uint32_t throughput);
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@ -8,6 +8,7 @@
 #include "miner.h"

 void myriadgroestl_cpu_init(int thr_id, uint32_t threads);
+void myriadgroestl_cpu_free(int thr_id);
 void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
 void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);

@ -95,3 +96,16 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 	return 0;
 }

+// cleanup
+void free_myriad(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	myriadgroestl_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@ -735,6 +735,13 @@ void neoscrypt_cpu_init(int thr_id, uint32_t threads)
 	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
 }

+__host__
+void neoscrypt_cpu_free(int thr_id)
+{
+	cudaFree(d_NNonce[thr_id]);
+	cudaFree(d_buffer[thr_id]);
+}
+
 __host__
 uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order)
 {
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@ -4,6 +4,7 @@

 extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget);
 extern void neoscrypt_cpu_init(int thr_id, uint32_t threads);
+extern void neoscrypt_cpu_free(int thr_id);
 extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order);

 static bool init[MAX_GPUS] = { 0 };
@ -81,3 +82,17 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+void free_neoscrypt(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	neoscrypt_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/pentablake.cu
+++ b/pentablake.cu
@ -442,3 +442,20 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return rc;
 }
+
+// cleanup
+void free_pentablake(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+	cudaFree(h_resNounce[thr_id]);
+	cudaFree(d_resNounce[thr_id]);
+
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@ -27,7 +27,7 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua

 cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];

-// Setup-Funktionen
+// Setup/Alloc Function
 __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
@ -45,6 +45,18 @@ __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }

+// Because all alloc should have a free...
+__host__ void quark_compactTest_cpu_free(int thr_id)
+{
+	cudaFree(d_tempBranch1Nonces[thr_id]);
+	cudaFree(d_numValid[thr_id]);
+
+	cudaFree(d_partSum[0][thr_id]);
+	cudaFree(d_partSum[1][thr_id]);
+
+	cudaFreeHost(h_numValid[thr_id]);
+}
+
 #if __CUDA_ARCH__ < 300
 /**
 * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -41,6 +41,7 @@ extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
+extern void quark_compactTest_cpu_free(int thr_id);
 extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 											uint32_t *d_nonces1, uint32_t *nrm1,
 											uint32_t *d_nonces2, uint32_t *nrm2,
@ -51,6 +52,7 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t thre

 extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

+
 // Original Quarkhash Funktion aus einem miner Quelltext
 extern "C" void quarkhash(void *state, const void *input)
 {
@ -252,3 +254,26 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_quark(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+	cudaDeviceSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+
+	cudaFree(d_branch1Nonces[thr_id]);
+	cudaFree(d_branch2Nonces[thr_id]);
+	cudaFree(d_branch3Nonces[thr_id]);
+
+	quark_compactTest_cpu_free(thr_id);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@ -128,3 +128,19 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_deep(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/qubit/luffa.cu
+++ b/qubit/luffa.cu
@ -96,3 +96,19 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_luffa(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -156,3 +156,19 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_qubit(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@ -426,6 +426,12 @@ unsigned char GetNfactor(unsigned int nTimestamp)
 	return Nfactor;
 }

+// cleanup
+void free_scrypt_jane(int thr_id)
+{
+	// todo ?
+}
+
 #define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
 					 | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
 static int s_Nfactor = 0;
--- a/scrypt.cpp
+++ b/scrypt.cpp
@ -685,6 +685,12 @@ static int lastFactor = 0;

 static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad);

+// cleanup
+void free_scrypt(int thr_id)
+{
+	// todo ?
+}
+
 // Scrypt proof of work algorithm
 // using SSE2 vectorized HMAC SHA256 on CPU and
 // a salsa core implementation on GPU with CUDA
--- a/skein.cu
+++ b/skein.cu
@ -464,3 +464,19 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no

 	return 0;
 }
+
+// cleanup
+extern "C" void free_skeincoin(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/skein2.cpp
+++ b/skein2.cpp
@ -120,3 +120,19 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned

 	return 0;
 }
+
+// cleanup
+void free_skein2(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x11/c11.cu
+++ b/x11/c11.cu
@ -54,9 +54,6 @@ extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t start
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-                                          uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);

 // Flax/C11 CPU Hash
 extern "C" void c11hash(void *output, const void *input)
@ -248,3 +245,19 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_c11(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -25,10 +25,6 @@ extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t start
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
-											int order);

 // CPU Hash
 extern "C" void fresh_hash(void *state, const void *input)
@ -157,3 +153,19 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_fresh(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -136,3 +136,19 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_s3(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -54,10 +54,6 @@ extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t start
 extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-                                          uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
-
 // X11 CPU Hash
 extern "C" void x11hash(void *output, const void *input)
 {
@ -247,3 +243,19 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_x11(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -62,9 +62,6 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, 
-                                          uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);

 // X13 CPU Hash
 extern "C" void x13hash(void *output, const void *input)
@ -248,3 +245,19 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_x13(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x15/cuda_whirlpoolx.cu
+++ b/x15/cuda_whirlpoolx.cu
@ -544,6 +544,15 @@ extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads)
 	CUDA_SAFE_CALL(cudaMalloc(&d_tmp[thr_id], 8 * 9 * sizeof(uint64_t))); // d_tmp[threadIdx.x+64] (7+64)
 }

+__host__
+extern void whirlpoolx_cpu_free(int thr_id)
+{
+	cudaFree(d_WXNonce[thr_id]);
+	cudaFreeHost(h_wxnounce[thr_id]);
+	cudaFree(d_xtra[thr_id]);
+	cudaFree(d_tmp[thr_id]);
+}
+
 __host__
 void whirlpoolx_setBlock_80(void *pdata, const void *ptarget)
 {
--- a/x15/whirlpoolx.cu
+++ b/x15/whirlpoolx.cu
@ -12,6 +12,7 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];

 extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads);
+extern void whirlpoolx_cpu_free(int thr_id);
 extern void whirlpoolx_setBlock_80(void *pdata, const void *ptarget);
 extern uint32_t whirlpoolx_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
 extern void whirlpoolx_precompute(int thr_id);
@ -99,3 +100,19 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc

 	return 0;
 }
+
+// cleanup
+extern "C" void free_whirlx(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	whirlpoolx_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -69,9 +69,6 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star
 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);

 // X14 CPU Hash function
 extern "C" void x14hash(void *output, const void *input)
@ -252,3 +249,19 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_x14(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -74,9 +74,6 @@ extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x15_whirlpool_cpu_free(int thr_id);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);

 // X15 CPU Hash function
 extern "C" void x15hash(void *output, const void *input)
@ -267,3 +264,19 @@ extern "C" int scanhash_x15(int thr_id,  struct work* work, uint32_t max_nonce,
 	x15_whirlpool_cpu_free(thr_id);
 	return 0;
 }
+
+// cleanup
+extern "C" void free_x15(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -82,10 +82,6 @@ extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startN
 extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
 extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
-											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
-											int order);

 // X17 Hashfunktion
 extern "C" void x17hash(void *output, const void *input)
@ -290,3 +286,19 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_x17(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/zr5.cu
+++ b/zr5.cu
@ -469,3 +469,30 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+
+// cleanup
+extern "C" void free_zr5(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	cudaFree(d_hash[thr_id]);
+
+	cudaFree(d_poks[thr_id]);
+	cudaFree(d_permut[thr_id]);
+	cudaFree(d_buffers[thr_id]);
+
+	cudaFree(d_blake[thr_id]);
+	cudaFree(d_groes[thr_id]);
+	cudaFree(d_jh512[thr_id]);
+	cudaFree(d_skein[thr_id]);
+
+	cudaFree(d_txs[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}