diff --git a/api.cpp b/api.cpp
index 0e8b5b3..482fcc4 100644
--- a/api.cpp
+++ b/api.cpp
@@ -101,14 +101,11 @@ static char *buffer = NULL;
 static time_t startup = 0;
 static int bye = 0;
 
-extern int opt_intensity;
-extern int opt_n_threads;
 extern char *opt_api_allow;
 extern int opt_api_listen; /* port */
 extern uint64_t global_hashrate;
 extern uint32_t accepted_count;
 extern uint32_t rejected_count;
-extern int num_processors;
 extern int device_map[8];
 extern char *device_name[8];
 
diff --git a/api/index.php b/api/index.php
index fb57b9e..de15c8d 100644
--- a/api/index.php
+++ b/api/index.php
@@ -52,6 +52,7 @@ function translateField($key)
 
 	$intl['H'] = 'Bloc height';
 	$intl['I'] = 'Intensity';
+	$intl['HWF'] = 'Failures';
 
 	$intl['TEMP'] = 'TÂ°c';
 	$intl['FAN'] = 'Fan %';
diff --git a/blake32.cu b/blake32.cu
index 65df026..87bd1ca 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -17,8 +17,6 @@ extern "C" {
 /* threads per block and throughput (intensity) */
 #define TPB 128
 
-extern int num_processors;
-
 /* added in sph_blake.c */
 extern "C" int blake256_rounds = 14;
 
@@ -428,9 +426,9 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 
 	if (!init[thr_id]) {
 		if (num_processors > 1)
-			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)));
-		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)));
+			cudaSetDevice(device_map[thr_id]);
+		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), 0);
 		init[thr_id] = true;
 	}
 
diff --git a/ccminer.cpp b/ccminer.cpp
index 0a7f003..9ae3cb7 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -1104,9 +1104,9 @@ static void *miner_thread(void *userdata)
 		if (max64 < minmax) {
 			switch (opt_algo) {
 			case ALGO_BLAKECOIN:
-				minmax = 0x4000000;
-				break;
 			case ALGO_BLAKE:
+				minmax = 0x80000000U;
+				break;
 			case ALGO_DOOM:
 			case ALGO_JACKPOT:
 			case ALGO_KECCAK:
@@ -1158,6 +1158,7 @@ static void *miner_thread(void *userdata)
 						applog(LOG_DEBUG, "job done, wait for a new one...");
 					work_restart[thr_id].restart = 1;
 					hashlog_purge_old();
+					stats_purge_old();
 					// wait a bit for a new job...
 					usleep(500*1000);
 					(*nonceptr) = end_nonce + 1;
diff --git a/cuda.cpp b/cuda.cpp
index 1a7fc38..780b648 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -1,7 +1,6 @@
 ï»¿#include <stdio.h>
 #include <memory.h>
 #include <string.h>
-
 #include <map>
 
 #ifndef _WIN32
@@ -22,6 +21,10 @@
 
 #include "cuda_runtime.h"
 
+#ifdef WIN32
+#include "compat.h" // sleep
+#endif
+
 extern char *device_name[8];
 extern int device_map[8];
 extern int device_sm[8];
@@ -145,3 +148,11 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
 		result = cudaStreamSynchronize(stream);
 	return result;
 }
+
+void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
+{
+	struct cgpu_info *gpu = &thr_info[thr_id].gpu;
+	gpu->hw_errors++;
+	applog(LOG_ERR, "GPU #%d: %s %s", device_map[thr_id], func, cudaGetErrorString(err));
+	sleep(1);
+}
diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index e129de8..e20f81d 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -43,11 +43,11 @@ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonce
 __host__
 void cuda_check_cpu_init(int thr_id, int threads)
 {
-    cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
-    cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
+    CUDA_CALL_OR_RET(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)));
+    CUDA_CALL_OR_RET(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)));
 }
 
-// Target Difficulty setzen
+// Target Difficulty
 __host__
 void cuda_check_cpu_setTarget(const void *ptarget)
 {
diff --git a/cuda_helper.h b/cuda_helper.h
index 2d5af22..249599f 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -22,6 +22,7 @@ extern void cuda_check_cpu_setTarget(const void *ptarget);
 extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
 
 extern __device__ __device_builtin__ void __syncthreads(void);
 
@@ -122,17 +123,34 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
 #endif
 
 /*********************************************************************/
-// Macro to catch CUDA errors in CUDA runtime calls
+// Macros to catch CUDA errors in CUDA runtime calls
+
 #define CUDA_SAFE_CALL(call)                                          \
 do {                                                                  \
 	cudaError_t err = call;                                           \
 	if (cudaSuccess != err) {                                         \
-		fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
-		         __FILE__, __LINE__, cudaGetErrorString(err) );       \
+		fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
+		         __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
 		exit(EXIT_FAILURE);                                           \
 	}                                                                 \
 } while (0)
 
+#define CUDA_CALL_OR_RET(call) do {                                   \
+	cudaError_t err = call;                                           \
+	if (cudaSuccess != err) {                                         \
+		cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
+		return;                                                       \
+	}                                                                 \
+} while (0)
+
+#define CUDA_CALL_OR_RET_X(call, ret) do {                            \
+	cudaError_t err = call;                                           \
+	if (cudaSuccess != err) {                                         \
+		cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
+		return ret;                                                   \
+	}                                                                 \
+} while (0)
+
 /*********************************************************************/
 #ifdef _WIN64
 #define USE_XOR_ASM_OPTS 0
@@ -185,7 +203,7 @@ uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t
 	return result;
 }
 #else
-#define xor8(a,b,c,d,e,f,g,h) (a^b^c^d^e^f^g^h)
+#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h))
 #endif
 
 // device asm for x17
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index 9e4f9e7..3a07c38 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -75,7 +75,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		((uint32_t*)ptarget)[7] = 0x00FF;
 
 	int throughput = opt_work_size ? opt_work_size : (1 << 20); // 256*4096
-	throughput = min(throughput, max_nonce - first_nonce);
+	throughput = min(throughput, (int) (max_nonce - first_nonce));
 
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
diff --git a/miner.h b/miner.h
index 2280071..4c84c87 100644
--- a/miner.h
+++ b/miner.h
@@ -415,6 +415,9 @@ extern bool opt_debug;
 extern bool opt_quiet;
 extern bool opt_protocol;
 extern bool opt_tracegpu;
+extern int opt_intensity;
+extern int opt_n_threads;
+extern int num_processors;
 extern int opt_timeout;
 extern bool want_longpoll;
 extern bool have_longpoll;
diff --git a/pentablake.cu b/pentablake.cu
index aaa6a9b..726ba5f 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -45,9 +45,6 @@ extern "C" void pentablakehash(void *output, const void *input)
 
 #define MAXU 0xffffffffU
 
-// in cpu-miner.c
-extern int opt_n_threads;
-
 __constant__
 static uint32_t __align__(32) c_Target[8];
 
@@ -514,8 +511,8 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *
 		((uint32_t*)ptarget)[7] = 0x000F;
 
 	if (!init[thr_id]) {
-		if (opt_n_threads > 1) {
-			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+		if (num_processors > 1) {
+			cudaSetDevice(device_map[thr_id]);
 		}
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
 		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index 01363ee..64bd15c 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -239,10 +239,10 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 __host__ void quark_blake512_cpu_init(int thr_id, int threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
-	cudaMemcpyToSymbol( c_sigma,
+	CUDA_CALL_OR_RET( cudaMemcpyToSymbol(c_sigma,
 						host_sigma,
 						sizeof(host_sigma),
-						0, cudaMemcpyHostToDevice);
+						0, cudaMemcpyHostToDevice));
 }
 
 // Blake512 für 80 Byte grosse Eingangsdaten
diff --git a/qubit/qubit.cu b/qubit/qubit.cu
index df2f886..31df2f9 100644
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@@ -26,7 +26,7 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void x11_echo512_cpu_init(int thr_id, int threads);
@@ -89,14 +89,14 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
 	{
 		cudaSetDevice(device_map[thr_id]);
 
-		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
-
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
 
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
diff --git a/stats.cpp b/stats.cpp
index fef5c30..b4673dd 100644
--- a/stats.cpp
+++ b/stats.cpp
@@ -18,7 +18,6 @@ static uint64_t uid = 0;
 #define STATS_PURGE_TIMEOUT 120*60 /* 120 mn */
 
 extern uint64_t global_hashrate;
-extern int opt_n_threads;
 extern int opt_statsavg;
 extern int device_map[8];
 
diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu
index 51f407c..43ec8c3 100644
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@@ -275,27 +275,27 @@ static __constant__ uint32_t d_AES1[256];
 static __constant__ uint32_t d_AES2[256];
 static __constant__ uint32_t d_AES3[256];
 
-static void aes_cpu_init()
+static void aes_cpu_init(int thr_id)
 {
-	cudaMemcpyToSymbol( d_AES0,
+	CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES0,
                         h_AES0,
                         sizeof(h_AES0),
-                        0, cudaMemcpyHostToDevice);
+                        0, cudaMemcpyHostToDevice));
 
-	cudaMemcpyToSymbol( d_AES1,
+	CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES1,
                         h_AES1,
                         sizeof(h_AES1),
-                        0, cudaMemcpyHostToDevice);
+                        0, cudaMemcpyHostToDevice));
 
-	cudaMemcpyToSymbol( d_AES2,
+	CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES2,
                         h_AES2,
                         sizeof(h_AES2),
-                        0, cudaMemcpyHostToDevice);
+                        0, cudaMemcpyHostToDevice));
 
-	cudaMemcpyToSymbol( d_AES3,
+	CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES3,
                         h_AES3,
                         sizeof(h_AES3),
-                        0, cudaMemcpyHostToDevice);
+                        0, cudaMemcpyHostToDevice));
 }
 
 __device__ __forceinline__
diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index 5fc2a0f..c82e4da 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -303,7 +303,7 @@ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash
 __host__
 void x11_echo512_cpu_init(int thr_id, int threads)
 {
-	aes_cpu_init();
+	aes_cpu_init(thr_id);
 }
 
 __host__
diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu
index c6dd1dc..50d0bde 100644
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@@ -356,11 +356,12 @@ __global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint
 }
 
 
-// Setup-Funktionen
-__host__ void x11_luffa512_cpu_init(int thr_id, int threads)
+// Setup Function
+__host__
+void x11_luffa512_cpu_init(int thr_id, int threads)
 {
-    cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice);
-    cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice);
+    CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice));
+    CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice));
 }
 
 __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu
index dfe6a1e..0131b6c 100644
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@@ -1404,7 +1404,7 @@ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputH
 
 __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 {
-	aes_cpu_init();
+	aes_cpu_init(thr_id);
 }
 
 __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu
index df6e659..49bd1bc 100644
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@@ -10,9 +10,6 @@
 #include "cuda_helper.h"
 #include <stdio.h>
 
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 int *d_state[8];
 uint4 *d_temp4[8];
 
@@ -624,10 +621,10 @@ x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, ui
 }
 
 __host__
-void x11_simd512_cpu_init(int thr_id, int threads)
+int x11_simd512_cpu_init(int thr_id, int threads)
 {
-	CUDA_SAFE_CALL(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads));
-	CUDA_SAFE_CALL(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads));
+	CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
+	CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
 
 	cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
@@ -644,7 +641,9 @@ void x11_simd512_cpu_init(int thr_id, int threads)
 	texRef1D_128.normalized = 0;
 	texRef1D_128.filterMode = cudaFilterModePoint;
 	texRef1D_128.addressMode[0] = cudaAddressModeClamp;
-	CUDA_SAFE_CALL(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads));
+	CUDA_CALL_OR_RET_X(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads), (int) err);
+
+	return 0;
 }
 
 __host__
diff --git a/x11/fresh.cu b/x11/fresh.cu
index dc1a5a5..8ea3233 100644
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@@ -19,7 +19,7 @@ extern void x11_shavite512_setBlock_80(void *pdata);
 extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int  x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void x11_echo512_cpu_init(int thr_id, int threads);
@@ -84,14 +84,14 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
 
 	if (!init[thr_id])
 	{
-		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4));
+		cudaSetDevice(device_map[thr_id]);
 
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
 
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
diff --git a/x11/s3.cu b/x11/s3.cu
index 2c7ac5e..dae3d1b 100644
--- a/x11/s3.cu
+++ b/x11/s3.cu
@@ -19,7 +19,7 @@ extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
 extern void x11_shavite512_setBlock_80(void *pdata);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int  x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void quark_skein512_cpu_init(int thr_id, int threads);
@@ -56,11 +56,10 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	static bool init[8] = { 0 };
-#ifdef WIN32
-	// reduce a bit the intensity on windows
-	int intensity = 19; // 256*256*8;
-#else
 	int intensity = 20; // 256*256*8*2;
+#ifdef WIN32
+	// reduce by one the intensity on windows
+	intensity--;
 #endif
 	int throughput = opt_work_size ? opt_work_size : (1 << intensity);
 	throughput = min(throughput, (int)(max_nonce - first_nonce));
@@ -70,14 +69,14 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
 
 	if (!init[thr_id])
 	{
-		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		cudaSetDevice(device_map[thr_id]);
 
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
diff --git a/x11/x11.cu b/x11/x11.cu
index 121d416..3b8b059 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -51,7 +51,7 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int  x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void x11_echo512_cpu_init(int thr_id, int threads);
@@ -153,8 +153,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 
 	if (!init[thr_id])
 	{
-		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		cudaSetDevice(device_map[thr_id]);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
@@ -165,8 +164,12 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_luffa512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
 		x11_shavite512_cpu_init(thr_id, throughput);
-		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
+		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
+			return 0;
+		}
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
diff --git a/x13/x13.cu b/x13/x13.cu
index 0dd72ce..4b59ed3 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -54,7 +54,7 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int  x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void x11_echo512_cpu_init(int thr_id, int threads);
@@ -162,8 +162,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 
 	if (!init[thr_id])
 	{
-		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 2 * 32 * throughput));
+		cudaSetDevice(device_map[thr_id]);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
@@ -174,10 +173,15 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x11_luffa512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
 		x11_shavite512_cpu_init(thr_id, throughput);
-		x11_simd512_cpu_init(thr_id, throughput);
+		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
+			return 0;
+		}
 		x11_echo512_cpu_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
diff --git a/x15/x14.cu b/x15/x14.cu
index b772af0..8ed930f 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -57,7 +57,7 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int  x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void x11_echo512_cpu_init(int thr_id, int threads);
@@ -174,8 +174,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 
 	if (!init[thr_id])
 	{
-		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		cudaSetDevice(device_map[thr_id]);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
@@ -192,6 +191,8 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
 
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
diff --git a/x15/x15.cu b/x15/x15.cu
index 589cc43..578daf3 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -58,7 +58,7 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int  x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void x11_echo512_cpu_init(int thr_id, int threads);
@@ -184,8 +184,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 
 	if (!init[thr_id])
 	{
-		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		cudaSetDevice(device_map[thr_id]);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
@@ -203,6 +202,8 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_init(thr_id, throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput, 0);
 
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
diff --git a/x17/x17.cu b/x17/x17.cu
index 8ae0ceb..33b53f2 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -61,7 +61,7 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern int  x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void x11_echo512_cpu_init(int thr_id, int threads);
@@ -203,7 +203,6 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
-		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
@@ -223,6 +222,8 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 		x17_sha512_cpu_init(thr_id, throughput);
 		x17_haval256_cpu_init(thr_id, throughput);
 
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;