cuda: store device SM in a global var

sample usage made for blake and fugue (higher intensity for SM5.2) add these to cuda_helper and clean unused code
10 years ago · b128312efb
30 changed files with 18 additions and 120 deletions
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@ -4,12 +4,6 @@
 #include "cuda_helper.h"
 #include <sm_30_intrinsics.h>
 // aus cpu-miner.c
 extern int device_map[8];
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
 static uint32_t *d_tempBranch1Nonces[8];
 static uint32_t *d_numValid[8];
 static uint32_t *h_numValid[8];
@ -40,8 +34,6 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
 // Setup-Funktionen
 __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 {
 	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 	cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -10,8 +10,6 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void jackpot_keccak512_cpu_init(int thr_id, int threads);
--- a/blake32.cu
+++ b/blake32.cu
@ -17,6 +17,8 @@ extern "C" {
 /* threads per block and throughput (intensity) */
 #define TPB 128
 extern int opt_n_threads;
 /* added in sph_blake.c */
 extern "C" int blake256_rounds = 14;
@ -39,10 +41,6 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14
 #define MAXU 0xffffffffU
 // in cpu-miner.c
 extern bool opt_n_threads;
 extern int device_map[8];
 #if PRECALC64
 __constant__ uint32_t _ALIGN(32) d_data[12];
 #else
@ -399,7 +397,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 #else
 	uint32_t crcsum;
 #endif
-	uint32_t throughput = opt_work_size ? opt_work_size : (1 << 20); // 1048576 nonces per call
+	int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
 	uint32_t throughput = opt_work_size ? opt_work_size : (1 << intensity);
 	throughput = min(throughput, max_nonce - first_nonce);
 	int rc = 0;
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -213,6 +213,7 @@ uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 char *device_name[8]; // CB
 int device_sm[8];
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
--- a/cuda.cu
+++ b/cuda.cu
@ -19,7 +19,6 @@
 #include "cuda_helper.h"
 extern char *device_name[8];
 extern int device_map[8];
 // CUDA Devices on the System
 extern "C" int cuda_num_devices()
@ -66,6 +65,7 @@ extern "C" void cuda_devicenames()
 		cudaGetDeviceProperties(&props, device_map[i]);
 		device_name[i] = strdup(props.name);
 		device_sm[i] = props.major * 100 + props.minor * 10;
 	}
 }
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@ -8,12 +8,6 @@
 #define USE_SHARED 1
 // aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 uint32_t *d_fugue256_hashoutput[8];
 uint32_t *d_resultNonce[8];
@ -726,7 +720,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 void fugue256_cpu_init(int thr_id, int threads)
 {
-    cudaSetDevice(device_map[thr_id]);
+	cudaSetDevice(device_map[thr_id]);
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -6,15 +6,6 @@
 #include "cuda_helper.h"
 #include <host_defines.h>
 // aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
@ -102,8 +93,6 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
    cudaSetDevice(device_map[thr_id]);
    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
    // Speicher für Gewinner-Nonce belegen
    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 }
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -13,6 +13,9 @@
 #include <stdint.h>
 extern int device_map[8];
 extern int device_sm[8];
 // common functions
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@ -5,15 +5,6 @@
 #include "cuda_helper.h"
 // aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 uint32_t *d_outputHashes[8];
@ -324,8 +315,6 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
                        temp,
                        sizeof(uint32_t) * 64 );
    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
    // Speicher für Gewinner-Nonce belegen
    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -11,9 +11,6 @@ extern "C"
 #include "cuda_helper.h"
 // in cpu-miner.c
 extern int device_map[8];
 // Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@ -13,6 +13,9 @@ extern "C" void my_fugue256(void *cc, const void *data, size_t len);
 extern "C" void my_fugue256_close(void *cc, void *dst);
 extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
 extern int device_map[8];
 extern int device_sm[8];
 #ifdef _MSC_VER
 #define MIN min
 #else
@ -30,7 +33,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	uint32_t max_nonce, unsigned long *hashes_done)
 {	
 	uint32_t start_nonce = pdata[19]++;
-	uint32_t throughPut = opt_work_size ? opt_work_size : (1 << 19);
+	int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
 	uint32_t throughPut = opt_work_size ? opt_work_size : (1 << intensity);
 	throughPut = MIN(throughPut, max_nonce - start_nonce);
 	if (opt_benchmark)
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@ -2,19 +2,9 @@
 #include <memory.h>
 #include "cuda_helper.h"
 #include <device_functions.h>
 #define USE_SHARED 1
 // aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
 // globaler Speicher für alle HeftyHashes aller Threads
 uint32_t *d_heftyHashes[8];
@ -305,8 +295,6 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
 {
    cudaSetDevice(device_map[thr_id]);
    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
    // Kopiere die Hash-Tabellen in den GPU-Speicher
    cudaMemcpyToSymbol(    hefty_gpu_constantTable,
                        hefty_cpu_constantTable,
@ -397,7 +385,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
    // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
    // alle anderen mit 512 Threads.
-    int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
+    int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid((threads + threadsperblock-1)/threadsperblock);
--- a/keccak/keccak256.cu
+++ b/keccak/keccak256.cu
@ -14,9 +14,6 @@ extern "C"
 #include "cuda_helper.h"
 // in cpu-miner.c
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void keccak256_cpu_init(int thr_id, int threads);
--- a/pentablake.cu
+++ b/pentablake.cu
@ -46,9 +46,7 @@ extern "C" void pentablakehash(void *output, const void *input)
 #define MAXU 0xffffffffU
 // in cpu-miner.c
-extern bool opt_n_threads;
+extern int opt_n_threads;
 extern bool opt_benchmark;
 extern int device_map[8];
 __constant__
 static uint32_t __align__(32) c_Target[8];
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -10,8 +10,6 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"
 extern int device_map[8];
 static uint32_t *d_hash[8];
 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@ -4,12 +4,6 @@
 #include "cuda_helper.h"
 #include <sm_30_intrinsics.h>
 // aus cpu-miner.c
 extern int device_map[8];
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
 static uint32_t *d_tempBranch1Nonces[8];
 static uint32_t *d_numValid[8];
 static uint32_t *h_numValid[8];
@ -38,8 +32,6 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
 // Setup-Funktionen
 __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
 {
 	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -8,15 +8,6 @@
 #define TPB 256
 #define THF 4
 // aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
 // 64 Register Variante für Compute 3.0
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"
@ -127,7 +118,6 @@ __global__ void __launch_bounds__(TPB, THF)
 // Setup-Funktionen
 __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
 {
    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 }
 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -4,9 +4,6 @@
 #include "cuda_helper.h"
 // aus cpu-miner.c
 extern int device_map[8];
 // Take a look at: https://www.schneier.com/skein1.3.pdf
 #define SHL(x, n)			((x) << (n))
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -12,8 +12,6 @@ extern "C"
 #include "cuda_helper.h"
 extern int device_map[8];
 static uint32_t *d_hash[8];
 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@ -14,8 +14,6 @@ extern "C" {
 #include "cuda_helper.h"
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
--- a/qubit/doom.cu
+++ b/qubit/doom.cu
@ -10,8 +10,6 @@ extern "C" {
 #include "cuda_helper.h"
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -14,8 +14,6 @@ extern "C" {
 #include "cuda_helper.h"
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -14,8 +14,6 @@ extern "C" {
 static uint32_t *d_hash[8];
 extern int device_map[8];
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_setBlock_80(void *pdata);
 extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -13,8 +13,6 @@ extern "C" {
 #include <stdint.h>
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -20,9 +20,6 @@ extern "C"
 #include <stdio.h>
 #include <memory.h>
 // in cpu-miner.c
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void quark_blake512_cpu_init(int thr_id, int threads);
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -23,8 +23,6 @@ extern "C"
 #include "cuda_helper.h"
 extern int device_map[8];
 static uint32_t *d_hash[8];
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@ -7,10 +7,8 @@ extern "C"
 #include "miner.h"
 }
-// from cpu-miner.c
+#include "cuda_helper.h"
 extern int device_map[8];
 // Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
 extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -26,9 +26,6 @@ extern "C" {
 #include "cuda_helper.h"
 // from cpu-miner.c
 extern int device_map[8];
 // Memory for the hash functions
 static uint32_t *d_hash[8];
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -27,9 +27,6 @@ extern "C" {
 #include "cuda_helper.h"
 // from cpu-miner.c
 extern int device_map[8];
 // Memory for the hash functions
 static uint32_t *d_hash[8];
--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -33,9 +33,6 @@ extern "C"
 static uint32_t *d_hash[8];
 // in cpu-miner.c
 extern int device_map[8];
 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);