From b128312efbf7af21877fef3d14c816a744b314e0 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 11 Nov 2014 18:54:56 +0100 Subject: [PATCH] cuda: store device SM in a global var sample usage made for blake and fugue (higher intensity for SM5.2) add these to cuda_helper and clean unused code --- JHA/cuda_jha_compactionTest.cu | 8 -------- JHA/jackpotcoin.cu | 2 -- blake32.cu | 9 ++++----- cpu-miner.c | 1 + cuda.cu | 2 +- cuda_fugue256.cu | 8 +------- cuda_groestlcoin.cu | 11 ----------- cuda_helper.h | 3 +++ cuda_myriadgroestl.cu | 11 ----------- cuda_nist5.cu | 3 --- fuguecoin.cpp | 6 +++++- heavy/cuda_hefty1.cu | 14 +------------- keccak/keccak256.cu | 3 --- pentablake.cu | 4 +--- quark/animecoin.cu | 2 -- quark/cuda_quark_compactionTest.cu | 8 -------- quark/cuda_quark_groestl512.cu | 10 ---------- quark/cuda_skein512.cu | 3 --- quark/quarkcoin.cu | 2 -- qubit/deep.cu | 2 -- qubit/doom.cu | 2 -- qubit/qubit.cu | 2 -- x11/fresh.cu | 2 -- x11/s3.cu | 2 -- x11/x11.cu | 3 --- x13/x13.cu | 2 -- x15/whirlpool.cu | 4 +--- x15/x14.cu | 3 --- x15/x15.cu | 3 --- x17/x17.cu | 3 --- 30 files changed, 18 insertions(+), 120 deletions(-) diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu index 4f15edb..aab71b6 100644 --- a/JHA/cuda_jha_compactionTest.cu +++ b/JHA/cuda_jha_compactionTest.cu @@ -4,12 +4,6 @@ #include "cuda_helper.h" #include -// aus cpu-miner.c -extern int device_map[8]; - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; - static uint32_t *d_tempBranch1Nonces[8]; static uint32_t *d_numValid[8]; static uint32_t *h_numValid[8]; @@ -40,8 +34,6 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8]; // Setup-Funktionen __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) { - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t)); diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index faa7f93..4e1073d 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -10,8 +10,6 @@ extern "C" #include "miner.h" #include "cuda_helper.h" -extern int device_map[8]; - static uint32_t *d_hash[8]; extern void jackpot_keccak512_cpu_init(int thr_id, int threads); diff --git a/blake32.cu b/blake32.cu index c5a2197..2dc7235 100644 --- a/blake32.cu +++ b/blake32.cu @@ -17,6 +17,8 @@ extern "C" { /* threads per block and throughput (intensity) */ #define TPB 128 +extern int opt_n_threads; + /* added in sph_blake.c */ extern "C" int blake256_rounds = 14; @@ -39,10 +41,6 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14 #define MAXU 0xffffffffU -// in cpu-miner.c -extern bool opt_n_threads; -extern int device_map[8]; - #if PRECALC64 __constant__ uint32_t _ALIGN(32) d_data[12]; #else @@ -399,7 +397,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt #else uint32_t crcsum; #endif - uint32_t throughput = opt_work_size ? opt_work_size : (1 << 20); // 1048576 nonces per call + int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20; + uint32_t throughput = opt_work_size ? opt_work_size : (1 << intensity); throughput = min(throughput, max_nonce - first_nonce); int rc = 0; diff --git a/cpu-miner.c b/cpu-miner.c index e6f89c9..ba0b8a9 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -213,6 +213,7 @@ uint16_t opt_vote = 9999; static int num_processors; int device_map[8] = {0,1,2,3,4,5,6,7}; // CB char *device_name[8]; // CB +int device_sm[8]; static char *rpc_url; static char *rpc_userpass; static char *rpc_user, *rpc_pass; diff --git a/cuda.cu b/cuda.cu index 3758335..eb76a7e 100644 --- a/cuda.cu +++ b/cuda.cu @@ -19,7 +19,6 @@ #include "cuda_helper.h" extern char *device_name[8]; -extern int device_map[8]; // CUDA Devices on the System extern "C" int cuda_num_devices() @@ -66,6 +65,7 @@ extern "C" void cuda_devicenames() cudaGetDeviceProperties(&props, device_map[i]); device_name[i] = strdup(props.name); + device_sm[i] = props.major * 100 + props.minor * 10; } } diff --git a/cuda_fugue256.cu b/cuda_fugue256.cu index 0c5302e..9a3b6cc 100644 --- a/cuda_fugue256.cu +++ b/cuda_fugue256.cu @@ -8,12 +8,6 @@ #define USE_SHARED 1 -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - uint32_t *d_fugue256_hashoutput[8]; uint32_t *d_resultNonce[8]; @@ -726,7 +720,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas void fugue256_cpu_init(int thr_id, int threads) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(device_map[thr_id]); // Kopiere die Hash-Tabellen in den GPU-Speicher texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index 851ef67..e2fd59d 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -6,15 +6,6 @@ #include "cuda_helper.h" #include -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; - // globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU extern uint32_t *d_resultNonce[8]; @@ -102,8 +93,6 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads) { cudaSetDevice(device_map[thr_id]); - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - // Speicher für Gewinner-Nonce belegen cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); } diff --git a/cuda_helper.h b/cuda_helper.h index 2e1a311..39b4354 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -13,6 +13,9 @@ #include +extern int device_map[8]; +extern int device_sm[8]; + // common functions extern void cuda_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_setTarget(const void *ptarget); diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index 66c3074..388bf3e 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -5,15 +5,6 @@ #include "cuda_helper.h" -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; - // globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU uint32_t *d_outputHashes[8]; @@ -324,8 +315,6 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads) temp, sizeof(uint32_t) * 64 ); - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - // Speicher für Gewinner-Nonce belegen cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 5a41b0d..9e4f9e7 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -11,9 +11,6 @@ extern "C" #include "cuda_helper.h" -// in cpu-miner.c -extern int device_map[8]; - // Speicher für Input/Output der verketteten Hashfunktionen static uint32_t *d_hash[8]; diff --git a/fuguecoin.cpp b/fuguecoin.cpp index 5d1d59c..32135e4 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -13,6 +13,9 @@ extern "C" void my_fugue256(void *cc, const void *data, size_t len); extern "C" void my_fugue256_close(void *cc, void *dst); extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); +extern int device_map[8]; +extern int device_sm[8]; + #ifdef _MSC_VER #define MIN min #else @@ -30,7 +33,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt uint32_t max_nonce, unsigned long *hashes_done) { uint32_t start_nonce = pdata[19]++; - uint32_t throughPut = opt_work_size ? opt_work_size : (1 << 19); + int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19; + uint32_t throughPut = opt_work_size ? opt_work_size : (1 << intensity); throughPut = MIN(throughPut, max_nonce - start_nonce); if (opt_benchmark) diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu index dcbc4d4..6d2b324 100644 --- a/heavy/cuda_hefty1.cu +++ b/heavy/cuda_hefty1.cu @@ -2,19 +2,9 @@ #include #include "cuda_helper.h" -#include #define USE_SHARED 1 -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; - // globaler Speicher für alle HeftyHashes aller Threads uint32_t *d_heftyHashes[8]; @@ -305,8 +295,6 @@ __host__ void hefty_cpu_init(int thr_id, int threads) { cudaSetDevice(device_map[thr_id]); - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( hefty_gpu_constantTable, hefty_cpu_constantTable, @@ -397,7 +385,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) { // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, // alle anderen mit 512 Threads. - int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512; + int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); diff --git a/keccak/keccak256.cu b/keccak/keccak256.cu index b95e984..b539725 100644 --- a/keccak/keccak256.cu +++ b/keccak/keccak256.cu @@ -14,9 +14,6 @@ extern "C" #include "cuda_helper.h" -// in cpu-miner.c -extern int device_map[8]; - static uint32_t *d_hash[8]; extern void keccak256_cpu_init(int thr_id, int threads); diff --git a/pentablake.cu b/pentablake.cu index 1059f5a..ae87172 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -46,9 +46,7 @@ extern "C" void pentablakehash(void *output, const void *input) #define MAXU 0xffffffffU // in cpu-miner.c -extern bool opt_n_threads; -extern bool opt_benchmark; -extern int device_map[8]; +extern int opt_n_threads; __constant__ static uint32_t __align__(32) c_Target[8]; diff --git a/quark/animecoin.cu b/quark/animecoin.cu index dc24bb3..0733008 100644 --- a/quark/animecoin.cu +++ b/quark/animecoin.cu @@ -10,8 +10,6 @@ extern "C" #include "miner.h" #include "cuda_helper.h" -extern int device_map[8]; - static uint32_t *d_hash[8]; // Speicher zur Generierung der Noncevektoren für die bedingten Hashes diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu index 0a771db..6a0ec5f 100644 --- a/quark/cuda_quark_compactionTest.cu +++ b/quark/cuda_quark_compactionTest.cu @@ -4,12 +4,6 @@ #include "cuda_helper.h" #include -// aus cpu-miner.c -extern int device_map[8]; - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; - static uint32_t *d_tempBranch1Nonces[8]; static uint32_t *d_numValid[8]; static uint32_t *h_numValid[8]; @@ -38,8 +32,6 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8]; // Setup-Funktionen __host__ void quark_compactTest_cpu_init(int thr_id, int threads) { - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 9f39517..a52b2a4 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -8,15 +8,6 @@ #define TPB 256 #define THF 4 -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; - // 64 Register Variante für Compute 3.0 #include "groestl_functions_quad.cu" #include "bitslice_transformations_quad.cu" @@ -127,7 +118,6 @@ __global__ void __launch_bounds__(TPB, THF) // Setup-Funktionen __host__ void quark_groestl512_cpu_init(int thr_id, int threads) { - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); } __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index eedc1a1..129e1ea 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -4,9 +4,6 @@ #include "cuda_helper.h" -// aus cpu-miner.c -extern int device_map[8]; - // Take a look at: https://www.schneier.com/skein1.3.pdf #define SHL(x, n) ((x) << (n)) diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 636b68e..2f05b3b 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -12,8 +12,6 @@ extern "C" #include "cuda_helper.h" -extern int device_map[8]; - static uint32_t *d_hash[8]; // Speicher zur Generierung der Noncevektoren für die bedingten Hashes diff --git a/qubit/deep.cu b/qubit/deep.cu index 44cff7c..b68fbb7 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -14,8 +14,6 @@ extern "C" { #include "cuda_helper.h" -extern int device_map[8]; - static uint32_t *d_hash[8]; extern void qubit_luffa512_cpu_init(int thr_id, int threads); diff --git a/qubit/doom.cu b/qubit/doom.cu index 4e40626..ba950cb 100644 --- a/qubit/doom.cu +++ b/qubit/doom.cu @@ -10,8 +10,6 @@ extern "C" { #include "cuda_helper.h" -extern int device_map[8]; - static uint32_t *d_hash[8]; extern void qubit_luffa512_cpu_init(int thr_id, int threads); diff --git a/qubit/qubit.cu b/qubit/qubit.cu index 45b0d14..92db8a7 100644 --- a/qubit/qubit.cu +++ b/qubit/qubit.cu @@ -14,8 +14,6 @@ extern "C" { #include "cuda_helper.h" -extern int device_map[8]; - static uint32_t *d_hash[8]; extern void qubit_luffa512_cpu_init(int thr_id, int threads); diff --git a/x11/fresh.cu b/x11/fresh.cu index 8aa1214..4745f0b 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -14,8 +14,6 @@ extern "C" { static uint32_t *d_hash[8]; -extern int device_map[8]; - extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_setBlock_80(void *pdata); extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); diff --git a/x11/s3.cu b/x11/s3.cu index 24e73bb..81b8624 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -13,8 +13,6 @@ extern "C" { #include -extern int device_map[8]; - static uint32_t *d_hash[8]; extern void x11_shavite512_cpu_init(int thr_id, int threads); diff --git a/x11/x11.cu b/x11/x11.cu index 6dad14c..81e1776 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -20,9 +20,6 @@ extern "C" #include #include -// in cpu-miner.c -extern int device_map[8]; - static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); diff --git a/x13/x13.cu b/x13/x13.cu index 747ec92..4918e3a 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -23,8 +23,6 @@ extern "C" #include "cuda_helper.h" -extern int device_map[8]; - static uint32_t *d_hash[8]; diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu index c9575ed..c2ae9bc 100644 --- a/x15/whirlpool.cu +++ b/x15/whirlpool.cu @@ -7,10 +7,8 @@ extern "C" #include "miner.h" } -// from cpu-miner.c -extern int device_map[8]; +#include "cuda_helper.h" -// Speicher für Input/Output der verketteten Hashfunktionen static uint32_t *d_hash[8]; extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode); diff --git a/x15/x14.cu b/x15/x14.cu index b24a61d..a0adc2f 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -26,9 +26,6 @@ extern "C" { #include "cuda_helper.h" -// from cpu-miner.c -extern int device_map[8]; - // Memory for the hash functions static uint32_t *d_hash[8]; diff --git a/x15/x15.cu b/x15/x15.cu index ec73805..a424834 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -27,9 +27,6 @@ extern "C" { #include "cuda_helper.h" -// from cpu-miner.c -extern int device_map[8]; - // Memory for the hash functions static uint32_t *d_hash[8]; diff --git a/x17/x17.cu b/x17/x17.cu index e4573a5..28a382f 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -33,9 +33,6 @@ extern "C" static uint32_t *d_hash[8]; -// in cpu-miner.c -extern int device_map[8]; - extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);