Handle a maximum of 16 GPUs (previously 8)

Some cards have 2 GPUs on board, so a single rig can exceed the old limit of 8 devices.
2015-01-22 04:34:30 +01:00 · 2015-01-22 04:34:30 +01:00 · cafd4477d7
commit cafd4477d7
parent 65cefa4a7d
44 changed files with 136 additions and 124 deletions
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@ -49,8 +49,8 @@ extern "C" uint32_t crc32_u32t(const uint32_t *buf, size_t size);
 #endif

 /* 8 adapters max */
-static uint32_t *d_resNonce[8];
-static uint32_t *h_resNonce[8];
+static uint32_t *d_resNonce[MAX_GPUS];
+static uint32_t *h_resNonce[MAX_GPUS];

 /* max count of found nonces in one call */
 #define NBN 2
@ -377,7 +377,7 @@ void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, cons
 }
 #endif

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds=14)
--- a/Algo256/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@ -8,8 +8,8 @@

 #define USE_SHARED 1

-uint32_t *d_fugue256_hashoutput[8];
-uint32_t *d_resultNonce[8];
+uint32_t *d_fugue256_hashoutput[MAX_GPUS];
+uint32_t *d_resultNonce[MAX_GPUS];

 __constant__ uint32_t GPUstate[30]; // Single GPU
 __constant__ uint32_t pTarget[8]; // Single GPU
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@ -2,8 +2,8 @@

 #include "cuda_helper.h"

-uint32_t *d_gnounce[8];
-uint32_t *d_GNonce[8];
+uint32_t *d_gnounce[MAX_GPUS];
+uint32_t *d_GNonce[MAX_GPUS];

 __constant__ uint32_t pTarget[8];

--- a/Algo256/cuda_keccak256.cu
+++ b/Algo256/cuda_keccak256.cu
@ -22,8 +22,8 @@ static const uint64_t host_keccak_round_constants[24] = {
 	0x0000000080000001ull, 0x8000000080008008ull
 };

-uint32_t *d_nounce[8];
-uint32_t *d_KNonce[8];
+uint32_t *d_nounce[MAX_GPUS];
+uint32_t *d_KNonce[MAX_GPUS];

 __constant__ uint32_t pTarget[8];
 __constant__ uint64_t keccak_round_constants[24];
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@ -14,7 +14,7 @@ extern "C"

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void keccak256_cpu_init(int thr_id, int threads);
 extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
@ -34,7 +34,7 @@ extern "C" void keccak256_hash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@ -4,11 +4,11 @@
 #include "cuda_helper.h"
 #include <sm_30_intrinsics.h>

-static uint32_t *d_tempBranch1Nonces[8];
-static uint32_t *d_numValid[8];
-static uint32_t *h_numValid[8];
+static uint32_t *d_tempBranch1Nonces[MAX_GPUS];
+static uint32_t *d_numValid[MAX_GPUS];
+static uint32_t *h_numValid[MAX_GPUS];

-static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
+static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen

 // True/False tester
 typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
@ -26,7 +26,8 @@ __device__ uint32_t JackpotFalseTest(uint32_t *inpHash)
 }

 __device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d_JackpotFalseFunction = JackpotFalseTest;
-cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
+
+cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFunction[MAX_GPUS];

 // Setup-Funktionen
 __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -10,7 +10,7 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void jackpot_keccak512_cpu_init(int thr_id, int threads);
 extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
@ -37,10 +37,10 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
 extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
-static uint32_t *d_jackpotNonces[8];
-static uint32_t *d_branch1Nonces[8];
-static uint32_t *d_branch2Nonces[8];
-static uint32_t *d_branch3Nonces[8];
+static uint32_t *d_jackpotNonces[MAX_GPUS];
+static uint32_t *d_branch1Nonces[MAX_GPUS];
+static uint32_t *d_branch2Nonces[MAX_GPUS];
+static uint32_t *d_branch3Nonces[MAX_GPUS];

 // Original jackpothash Funktion aus einem miner Quelltext
 extern "C" unsigned int jackpothash(void *state, const void *input)
@ -85,7 +85,7 @@ extern "C" unsigned int jackpothash(void *state, const void *input)
    return round;
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -214,9 +214,9 @@ bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
 int num_cpus;
 int active_gpus;
-char * device_name[8];
-short device_map[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-long  device_sm[8] = { 0 };
+char * device_name[MAX_GPUS];
+short device_map[MAX_GPUS] = { 0 };
+long  device_sm[MAX_GPUS] = { 0 };
 char *rpc_user = NULL;
 static char *rpc_url;
 static char *rpc_userpass;
@ -2223,6 +2223,11 @@ int main(int argc, char *argv[])
 	if (num_cpus < 1)
 		num_cpus = 1;

+	// default thread to device map
+	for (i = 0; i < MAX_GPUS; i++) {
+		device_map[i] = i;
+	}
+
 	// number of gpus
 	active_gpus = cuda_num_devices();
 	cuda_devicenames();
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@ -10,9 +10,9 @@

 __constant__ uint32_t pTarget[8]; // 32 bytes

-// store 8 device arrays of 8 nonces
-static uint32_t* h_resNonces[8];
-static uint32_t* d_resNonces[8];
+// store MAX_GPUS device arrays of 8 nonces
+static uint32_t* h_resNonces[MAX_GPUS];
+static uint32_t* d_resNonces[MAX_GPUS];

 __host__
 void cuda_check_cpu_init(int thr_id, int threads)
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -8,7 +8,7 @@

 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
-extern uint32_t *d_resultNonce[8];
+extern uint32_t *d_resultNonce[MAX_GPUS];

 __constant__ uint32_t groestlcoin_gpu_msg[32];

--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -13,8 +13,12 @@

 #include <stdint.h>

-extern "C" short device_map[8];
-extern "C"  long device_sm[8];
+#ifndef MAX_GPUS
+#define MAX_GPUS 16
+#endif
+
+extern "C" short device_map[MAX_GPUS];
+extern "C"  long device_sm[MAX_GPUS];

 // common functions
 extern void cuda_check_cpu_init(int thr_id, int threads);
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@ -15,8 +15,8 @@

 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
-uint32_t *d_outputHashes[8];
-extern uint32_t *d_resultNonce[8];
+uint32_t *d_outputHashes[MAX_GPUS];
+extern uint32_t *d_resultNonce[MAX_GPUS];

 __constant__ uint32_t myriadgroestl_gpu_msg[32];

--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -11,7 +11,7 @@ extern "C"

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -64,7 +64,7 @@ extern "C" void nist5hash(void *state, const void *input)
    memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@ -14,13 +14,13 @@ extern "C" void my_fugue256_close(void *cc, void *dst);
 extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);

 // vorbereitete Kontexte nach den ersten 80 Bytes
-sph_fugue256_context  ctx_fugue_const[8];
+// sph_fugue256_context  ctx_fugue_const[MAX_GPUS];

 #define SWAP32(x) \
    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u)   | \
      (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@ -58,7 +58,7 @@ extern "C" void groestlhash(void *state, const void *input)
    memcpy(state, hashB, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
    uint32_t max_nonce, unsigned long *hashes_done)
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@ -4,11 +4,11 @@
 #include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *heavy_heftyHashes[8];
-extern uint32_t *heavy_nonceVector[8];
+extern uint32_t *heavy_heftyHashes[MAX_GPUS];
+extern uint32_t *heavy_nonceVector[MAX_GPUS];

 // globaler Speicher für unsere Ergebnisse
-uint32_t *d_hash5output[8];
+uint32_t *d_hash5output[MAX_GPUS];

 // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@ -3,13 +3,13 @@
 #include "cuda_helper.h"

 // globaler Speicher für unsere Ergebnisse
-static uint32_t *d_hashoutput[8];
-extern uint32_t *d_hash2output[8];
-extern uint32_t *d_hash3output[8];
-extern uint32_t *d_hash4output[8];
-extern uint32_t *d_hash5output[8];
+static uint32_t *d_hashoutput[MAX_GPUS];
+extern uint32_t *d_hash2output[MAX_GPUS];
+extern uint32_t *d_hash3output[MAX_GPUS];
+extern uint32_t *d_hash4output[MAX_GPUS];
+extern uint32_t *d_hash5output[MAX_GPUS];

-extern uint32_t *heavy_nonceVector[8];
+extern uint32_t *heavy_nonceVector[MAX_GPUS];

 /* Combines top 64-bits from each hash into a single hash */
 __device__
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@ -4,11 +4,11 @@
 #include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *heavy_heftyHashes[8];
-extern uint32_t *heavy_nonceVector[8];
+extern uint32_t *heavy_heftyHashes[MAX_GPUS];
+extern uint32_t *heavy_nonceVector[MAX_GPUS];

 // globaler Speicher für unsere Ergebnisse
-uint32_t *d_hash4output[8];
+uint32_t *d_hash4output[MAX_GPUS];

 __constant__ uint32_t groestl_gpu_state[32];
 __constant__ uint32_t groestl_gpu_msg[32];
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@ -8,7 +8,7 @@
 #define USE_SHARED 1

 // globaler Speicher für alle HeftyHashes aller Threads
-uint32_t *heavy_heftyHashes[8];
+uint32_t *heavy_heftyHashes[MAX_GPUS];

 /* Hash-Tabellen */
 __constant__ uint32_t hefty_gpu_constantTable[64];
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@ -4,13 +4,13 @@
 #include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *heavy_heftyHashes[8];
-extern uint32_t *heavy_nonceVector[8];
+extern uint32_t *heavy_heftyHashes[MAX_GPUS];
+extern uint32_t *heavy_nonceVector[MAX_GPUS];

 // globaler Speicher für unsere Ergebnisse
-uint32_t *d_hash3output[8];
-extern uint32_t *d_hash4output[8];
-extern uint32_t *d_hash5output[8];
+uint32_t *d_hash3output[MAX_GPUS];
+extern uint32_t *d_hash4output[MAX_GPUS];
+extern uint32_t *d_hash5output[MAX_GPUS];

 // der Keccak512 State nach der ersten Runde (72 Bytes)
 __constant__ uint64_t c_State[25];
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@ -4,11 +4,11 @@
 #include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *heavy_heftyHashes[8];
-extern uint32_t *heavy_nonceVector[8];
+extern uint32_t *heavy_heftyHashes[MAX_GPUS];
+extern uint32_t *heavy_nonceVector[MAX_GPUS];

 // globaler Speicher für unsere Ergebnisse
-uint32_t *d_hash2output[8];
+uint32_t *d_hash2output[MAX_GPUS];


 /* Hash-Tabellen */
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@ -16,18 +16,18 @@ extern "C" {
 #include "heavy/heavy.h"
 #include "cuda_helper.h"

-extern uint32_t *d_hash2output[8];
-extern uint32_t *d_hash3output[8];
-extern uint32_t *d_hash4output[8];
-extern uint32_t *d_hash5output[8];
+extern uint32_t *d_hash2output[MAX_GPUS];
+extern uint32_t *d_hash3output[MAX_GPUS];
+extern uint32_t *d_hash4output[MAX_GPUS];
+extern uint32_t *d_hash5output[MAX_GPUS];

 #define HEAVYCOIN_BLKHDR_SZ 84
 #define MNR_BLKHDR_SZ       80

 // nonce-array für die threads
-uint32_t *heavy_nonceVector[8];
+uint32_t *heavy_nonceVector[MAX_GPUS];

-extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_heftyHashes[MAX_GPUS];

 /* Combines top 64-bits from each hash into a single hash */
 static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
@ -127,7 +127,7 @@ struct check_nonce_for_remove
    }
 };

-static bool init[8] = {0,0,0,0,0,0,0,0};
+static bool init[MAX_GPUS] = { 0 };

 __host__
 int scanhash_heavy(int thr_id, uint32_t *pdata,
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@ -9,7 +9,7 @@ extern "C" {
 #include "miner.h"
 #include "cuda_helper.h"

-static _ALIGN(64) uint64_t *d_hash[8];
+static _ALIGN(64) uint64_t *d_hash[MAX_GPUS];

 extern void blake256_cpu_init(int thr_id, int threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
@ -55,7 +55,7 @@ extern "C" void lyra2_hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/miner.h
+++ b/miner.h
@ -485,9 +485,10 @@ extern uint32_t opt_work_size;
 extern uint64_t global_hashrate;
 extern double   global_diff;

-extern char* device_name[8];
-extern short device_map[8];
-extern long  device_sm[8];
+#define MAX_GPUS 16
+extern char* device_name[MAX_GPUS];
+extern short device_map[MAX_GPUS];
+extern long  device_sm[MAX_GPUS];

 #define CL_N    "\x1B[0m"
 #define CL_RED  "\x1B[31m"
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@ -32,7 +32,7 @@ extern "C" void myriadhash(void *state, const void *input)
    memcpy(state, hashB, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
--- a/nvml.cpp
+++ b/nvml.cpp
@ -34,7 +34,7 @@ int cuda_num_devices();
 extern nvml_handle *hnvml;
 extern char driver_version[32];

-static uint32_t device_bus_ids[8] = { 0 };
+static uint32_t device_bus_ids[MAX_GPUS] = { 0 };

 /*
 * Wrappers to emulate dlopen() on other systems like Windows
@ -451,7 +451,7 @@ int nvml_destroy(nvml_handle *nvmlh)
 #ifdef WIN32
 #include "nvapi/nvapi_ccminer.h"

-static int nvapi_dev_map[8] = { 0 };
+static int nvapi_dev_map[MAX_GPUS] = { 0 };
 static NvDisplayHandle hDisplay_a[NVAPI_MAX_PHYSICAL_GPUS * 2] = { 0 };
 static NvPhysicalGpuHandle phys[NVAPI_MAX_PHYSICAL_GPUS] = { 0 };
 static NvU32 nvapi_dev_cnt = 0;
--- a/pentablake.cu
+++ b/pentablake.cu
@ -49,9 +49,9 @@ static uint32_t __align__(32) c_Target[8];
 __constant__
 static uint64_t __align__(32) c_data[32];

-static uint32_t *d_hash[8];
-static uint32_t *d_resNounce[8];
-static uint32_t *h_resNounce[8];
+static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_resNounce[MAX_GPUS];
+static uint32_t *h_resNounce[MAX_GPUS];
 static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };

 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
@ -362,7 +362,7 @@ void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -10,13 +10,13 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
-static uint32_t *d_animeNonces[8];
-static uint32_t *d_branch1Nonces[8];
-static uint32_t *d_branch2Nonces[8];
-static uint32_t *d_branch3Nonces[8];
+static uint32_t *d_animeNonces[MAX_GPUS];
+static uint32_t *d_branch1Nonces[MAX_GPUS];
+static uint32_t *d_branch2Nonces[MAX_GPUS];
+static uint32_t *d_branch3Nonces[MAX_GPUS];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -160,7 +160,7 @@ struct HashPredicate
 };
 */

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@ -4,11 +4,11 @@
 #include "cuda_helper.h"
 #include <sm_30_intrinsics.h>

-static uint32_t *d_tempBranch1Nonces[8];
-static uint32_t *d_numValid[8];
-static uint32_t *h_numValid[8];
+static uint32_t *d_tempBranch1Nonces[MAX_GPUS];
+static uint32_t *d_numValid[MAX_GPUS];
+static uint32_t *h_numValid[MAX_GPUS];

-static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
+static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen

 // True/False tester
 typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
@ -24,7 +24,8 @@ __device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
 }

 __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
-cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
+
+cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];

 // Setup-Funktionen
 __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -12,13 +12,13 @@ extern "C"

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
-static uint32_t *d_quarkNonces[8];
-static uint32_t *d_branch1Nonces[8];
-static uint32_t *d_branch2Nonces[8];
-static uint32_t *d_branch3Nonces[8];
+static uint32_t *d_quarkNonces[MAX_GPUS];
+static uint32_t *d_branch1Nonces[MAX_GPUS];
+static uint32_t *d_branch2Nonces[MAX_GPUS];
+static uint32_t *d_branch3Nonces[MAX_GPUS];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -130,7 +130,7 @@ extern "C" void quarkhash(void *state, const void *input)
    memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
@ -149,7 +149,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		cudaSetDevice(device_map[thr_id]);

 		// Konstanten kopieren, Speicher belegen
-		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));

 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@ -14,7 +14,7 @@ extern "C" {

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
 extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
@ -52,7 +52,7 @@ extern "C" void deephash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/qubit/doom.cu
+++ b/qubit/doom.cu
@ -10,7 +10,7 @@ extern "C" {

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
 extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
@ -32,7 +32,7 @@ extern void doomhash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -14,7 +14,7 @@ extern "C" {

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
 extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
@ -72,7 +72,7 @@ extern "C" void qubithash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/qubit/qubit_luffa512.cu
+++ b/qubit/qubit_luffa512.cu
@ -32,8 +32,8 @@ typedef unsigned char BitSequence;
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 __constant__ uint32_t c_Target[8];

-static uint32_t *h_resNounce[8];
-static uint32_t *d_resNounce[8];
+static uint32_t *h_resNounce[MAX_GPUS];
+static uint32_t *d_resNounce[MAX_GPUS];

 #define NBN 1 /* max results, could be 2, see blake32.cu */
 #if NBN > 1
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@ -10,8 +10,8 @@
 #include "miner.h"
 #include "cuda_helper.h"

-uint32_t *d_state[8];
-uint4 *d_temp4[8];
+uint32_t *d_state[MAX_GPUS];
+uint4 *d_temp4[MAX_GPUS];

 // texture bound to d_temp4[thr_id], for read access in Compaction kernel
 texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -12,7 +12,7 @@ extern "C" {
 // to test gpu hash on a null buffer
 #define NULLTEST 0

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_setBlock_80(void *pdata);
@ -68,7 +68,7 @@ extern "C" void fresh_hash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -13,7 +13,7 @@ extern "C" {

 #include <stdint.h>

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
@ -49,7 +49,7 @@ extern "C" void s3hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 /* Main S3 entry point */
 extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -20,7 +20,7 @@ extern "C"
 #include <stdio.h>
 #include <memory.h>

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -125,7 +125,7 @@ extern "C" void x11hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -23,7 +23,7 @@ extern "C"

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];


 extern void quark_blake512_cpu_init(int thr_id, int threads);
@ -144,7 +144,7 @@ extern "C" void x13hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@ -14,8 +14,8 @@
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 __constant__ uint32_t pTarget[8];

-uint32_t *d_wnounce[8];
-uint32_t *d_WNonce[8];
+uint32_t *d_wnounce[MAX_GPUS];
+uint32_t *d_WNonce[MAX_GPUS];

 #define USE_ALL_TABLES 1

--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@ -9,7 +9,7 @@ extern "C"

 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -49,7 +49,7 @@ extern "C" void wcoinhash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_whc(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -27,7 +27,7 @@ extern "C" {
 #include "cuda_helper.h"

 // Memory for the hash functions
-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -155,7 +155,7 @@ extern "C" void x14hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -28,7 +28,7 @@ extern "C" {
 #include "cuda_helper.h"

 // Memory for the hash functions
-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -165,7 +165,7 @@ extern "C" void x15hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -31,7 +31,7 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"

-static uint32_t *d_hash[8];
+static uint32_t *d_hash[MAX_GPUS];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -184,7 +184,7 @@ extern "C" void x17hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-static bool init[8] = { 0 };
+static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,