From b128312efbf7af21877fef3d14c816a744b314e0 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 11 Nov 2014 18:54:56 +0100
Subject: [PATCH] cuda: store device SM in a global var

sample usage made for blake and fugue (higher intensity for SM5.2)

add these to cuda_helper and clean unused code
---
 JHA/cuda_jha_compactionTest.cu     |  8 --------
 JHA/jackpotcoin.cu                 |  2 --
 blake32.cu                         |  9 ++++-----
 cpu-miner.c                        |  1 +
 cuda.cu                            |  2 +-
 cuda_fugue256.cu                   |  8 +-------
 cuda_groestlcoin.cu                | 11 -----------
 cuda_helper.h                      |  3 +++
 cuda_myriadgroestl.cu              | 11 -----------
 cuda_nist5.cu                      |  3 ---
 fuguecoin.cpp                      |  6 +++++-
 heavy/cuda_hefty1.cu               | 14 +-------------
 keccak/keccak256.cu                |  3 ---
 pentablake.cu                      |  4 +---
 quark/animecoin.cu                 |  2 --
 quark/cuda_quark_compactionTest.cu |  8 --------
 quark/cuda_quark_groestl512.cu     | 10 ----------
 quark/cuda_skein512.cu             |  3 ---
 quark/quarkcoin.cu                 |  2 --
 qubit/deep.cu                      |  2 --
 qubit/doom.cu                      |  2 --
 qubit/qubit.cu                     |  2 --
 x11/fresh.cu                       |  2 --
 x11/s3.cu                          |  2 --
 x11/x11.cu                         |  3 ---
 x13/x13.cu                         |  2 --
 x15/whirlpool.cu                   |  4 +---
 x15/x14.cu                         |  3 ---
 x15/x15.cu                         |  3 ---
 x17/x17.cu                         |  3 ---
 30 files changed, 18 insertions(+), 120 deletions(-)

diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu
index 4f15edb..aab71b6 100644
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@@ -4,12 +4,6 @@
 #include "cuda_helper.h"
 #include <sm_30_intrinsics.h>
 
-// aus cpu-miner.c
-extern int device_map[8];
-
-// diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props[8];
-
 static uint32_t *d_tempBranch1Nonces[8];
 static uint32_t *d_numValid[8];
 static uint32_t *h_numValid[8];
@@ -40,8 +34,6 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
 // Setup-Funktionen
 __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 {
-	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
 	cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
 
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index faa7f93..4e1073d 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -10,8 +10,6 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 extern void jackpot_keccak512_cpu_init(int thr_id, int threads);
diff --git a/blake32.cu b/blake32.cu
index c5a2197..2dc7235 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -17,6 +17,8 @@ extern "C" {
 /* threads per block and throughput (intensity) */
 #define TPB 128
 
+extern int opt_n_threads;
+
 /* added in sph_blake.c */
 extern "C" int blake256_rounds = 14;
 
@@ -39,10 +41,6 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14
 
 #define MAXU 0xffffffffU
 
-// in cpu-miner.c
-extern bool opt_n_threads;
-extern int device_map[8];
-
 #if PRECALC64
 __constant__ uint32_t _ALIGN(32) d_data[12];
 #else
@@ -399,7 +397,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 #else
 	uint32_t crcsum;
 #endif
-	uint32_t throughput = opt_work_size ? opt_work_size : (1 << 20); // 1048576 nonces per call
+	int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
+	uint32_t throughput = opt_work_size ? opt_work_size : (1 << intensity);
 	throughput = min(throughput, max_nonce - first_nonce);
 
 	int rc = 0;
diff --git a/cpu-miner.c b/cpu-miner.c
index e6f89c9..ba0b8a9 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -213,6 +213,7 @@ uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 char *device_name[8]; // CB
+int device_sm[8];
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
diff --git a/cuda.cu b/cuda.cu
index 3758335..eb76a7e 100644
--- a/cuda.cu
+++ b/cuda.cu
@@ -19,7 +19,6 @@
 #include "cuda_helper.h"
 
 extern char *device_name[8];
-extern int device_map[8];
 
 // CUDA Devices on the System
 extern "C" int cuda_num_devices()
@@ -66,6 +65,7 @@ extern "C" void cuda_devicenames()
 		cudaGetDeviceProperties(&props, device_map[i]);
 
 		device_name[i] = strdup(props.name);
+		device_sm[i] = props.major * 100 + props.minor * 10;
 	}
 }
 
diff --git a/cuda_fugue256.cu b/cuda_fugue256.cu
index 0c5302e..9a3b6cc 100644
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@@ -8,12 +8,6 @@
 
 #define USE_SHARED 1
 
-// aus cpu-miner.c
-extern int device_map[8];
-
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 uint32_t *d_fugue256_hashoutput[8];
 uint32_t *d_resultNonce[8];
 
@@ -726,7 +720,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 
 void fugue256_cpu_init(int thr_id, int threads)
 {
-    cudaSetDevice(device_map[thr_id]);
+	cudaSetDevice(device_map[thr_id]);
 
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index 851ef67..e2fd59d 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -6,15 +6,6 @@
 #include "cuda_helper.h"
 #include <host_defines.h>
 
-// aus cpu-miner.c
-extern int device_map[8];
-
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
-// diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props[8];
-
 // globaler Speicher fÃ¼r alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
@@ -102,8 +93,6 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
     cudaSetDevice(device_map[thr_id]);
 
-    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
     // Speicher fÃ¼r Gewinner-Nonce belegen
     cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 }
diff --git a/cuda_helper.h b/cuda_helper.h
index 2e1a311..39b4354 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -13,6 +13,9 @@
 
 #include <stdint.h>
 
+extern int device_map[8];
+extern int device_sm[8];
+
 // common functions
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu
index 66c3074..388bf3e 100644
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@@ -5,15 +5,6 @@
 
 #include "cuda_helper.h"
 
-// aus cpu-miner.c
-extern int device_map[8];
-
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
-// diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props[8];
-
 // globaler Speicher fÃ¼r alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 uint32_t *d_outputHashes[8];
@@ -324,8 +315,6 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
                         temp,
                         sizeof(uint32_t) * 64 );
 
-    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
     // Speicher fÃ¼r Gewinner-Nonce belegen
     cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index 5a41b0d..9e4f9e7 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -11,9 +11,6 @@ extern "C"
 
 #include "cuda_helper.h"
 
-// in cpu-miner.c
-extern int device_map[8];
-
 // Speicher fÃ¼r Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
 
diff --git a/fuguecoin.cpp b/fuguecoin.cpp
index 5d1d59c..32135e4 100644
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@@ -13,6 +13,9 @@ extern "C" void my_fugue256(void *cc, const void *data, size_t len);
 extern "C" void my_fugue256_close(void *cc, void *dst);
 extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
 
+extern int device_map[8];
+extern int device_sm[8];
+
 #ifdef _MSC_VER
 #define MIN min
 #else
@@ -30,7 +33,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	uint32_t max_nonce, unsigned long *hashes_done)
 {	
 	uint32_t start_nonce = pdata[19]++;
-	uint32_t throughPut = opt_work_size ? opt_work_size : (1 << 19);
+	int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
+	uint32_t throughPut = opt_work_size ? opt_work_size : (1 << intensity);
 	throughPut = MIN(throughPut, max_nonce - start_nonce);
 
 	if (opt_benchmark)
diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu
index dcbc4d4..6d2b324 100644
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@@ -2,19 +2,9 @@
 #include <memory.h>
 
 #include "cuda_helper.h"
-#include <device_functions.h>
 
 #define USE_SHARED 1
 
-// aus cpu-miner.c
-extern int device_map[8];
-
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
-// diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props[8];
-
 // globaler Speicher für alle HeftyHashes aller Threads
 uint32_t *d_heftyHashes[8];
 
@@ -305,8 +295,6 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
 {
     cudaSetDevice(device_map[thr_id]);
 
-    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
     // Kopiere die Hash-Tabellen in den GPU-Speicher
     cudaMemcpyToSymbol(    hefty_gpu_constantTable,
                         hefty_cpu_constantTable,
@@ -397,7 +385,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
     // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
     // alle anderen mit 512 Threads.
-    int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
+    int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
 
     // berechne wie viele Thread Blocks wir brauchen
     dim3 grid((threads + threadsperblock-1)/threadsperblock);
diff --git a/keccak/keccak256.cu b/keccak/keccak256.cu
index b95e984..b539725 100644
--- a/keccak/keccak256.cu
+++ b/keccak/keccak256.cu
@@ -14,9 +14,6 @@ extern "C"
 
 #include "cuda_helper.h"
 
-// in cpu-miner.c
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 extern void keccak256_cpu_init(int thr_id, int threads);
diff --git a/pentablake.cu b/pentablake.cu
index 1059f5a..ae87172 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -46,9 +46,7 @@ extern "C" void pentablakehash(void *output, const void *input)
 #define MAXU 0xffffffffU
 
 // in cpu-miner.c
-extern bool opt_n_threads;
-extern bool opt_benchmark;
-extern int device_map[8];
+extern int opt_n_threads;
 
 __constant__
 static uint32_t __align__(32) c_Target[8];
diff --git a/quark/animecoin.cu b/quark/animecoin.cu
index dc24bb3..0733008 100644
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@@ -10,8 +10,6 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu
index 0a771db..6a0ec5f 100644
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@@ -4,12 +4,6 @@
 #include "cuda_helper.h"
 #include <sm_30_intrinsics.h>
 
-// aus cpu-miner.c
-extern int device_map[8];
-
-// diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props[8];
-
 static uint32_t *d_tempBranch1Nonces[8];
 static uint32_t *d_numValid[8];
 static uint32_t *h_numValid[8];
@@ -38,8 +32,6 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
 // Setup-Funktionen
 __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
 {
-	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
 	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
 
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index 9f39517..a52b2a4 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -8,15 +8,6 @@
 #define TPB 256
 #define THF 4
 
-// aus cpu-miner.c
-extern int device_map[8];
-
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
-// diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props[8];
-
 // 64 Register Variante für Compute 3.0
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"
@@ -127,7 +118,6 @@ __global__ void __launch_bounds__(TPB, THF)
 // Setup-Funktionen
 __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
 {
-    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 }
 
 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index eedc1a1..129e1ea 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -4,9 +4,6 @@
 
 #include "cuda_helper.h"
 
-// aus cpu-miner.c
-extern int device_map[8];
-
 // Take a look at: https://www.schneier.com/skein1.3.pdf
 
 #define SHL(x, n)			((x) << (n))
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index 636b68e..2f05b3b 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -12,8 +12,6 @@ extern "C"
 
 #include "cuda_helper.h"
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 // Speicher zur Generierung der Noncevektoren fÃ¼r die bedingten Hashes
diff --git a/qubit/deep.cu b/qubit/deep.cu
index 44cff7c..b68fbb7 100644
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@@ -14,8 +14,6 @@ extern "C" {
 
 #include "cuda_helper.h"
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
diff --git a/qubit/doom.cu b/qubit/doom.cu
index 4e40626..ba950cb 100644
--- a/qubit/doom.cu
+++ b/qubit/doom.cu
@@ -10,8 +10,6 @@ extern "C" {
 
 #include "cuda_helper.h"
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
diff --git a/qubit/qubit.cu b/qubit/qubit.cu
index 45b0d14..92db8a7 100644
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@@ -14,8 +14,6 @@ extern "C" {
 
 #include "cuda_helper.h"
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 extern void qubit_luffa512_cpu_init(int thr_id, int threads);
diff --git a/x11/fresh.cu b/x11/fresh.cu
index 8aa1214..4745f0b 100644
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@@ -14,8 +14,6 @@ extern "C" {
 
 static uint32_t *d_hash[8];
 
-extern int device_map[8];
-
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_setBlock_80(void *pdata);
 extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
diff --git a/x11/s3.cu b/x11/s3.cu
index 24e73bb..81b8624 100644
--- a/x11/s3.cu
+++ b/x11/s3.cu
@@ -13,8 +13,6 @@ extern "C" {
 
 #include <stdint.h>
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
diff --git a/x11/x11.cu b/x11/x11.cu
index 6dad14c..81e1776 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -20,9 +20,6 @@ extern "C"
 #include <stdio.h>
 #include <memory.h>
 
-// in cpu-miner.c
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 extern void quark_blake512_cpu_init(int thr_id, int threads);
diff --git a/x13/x13.cu b/x13/x13.cu
index 747ec92..4918e3a 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -23,8 +23,6 @@ extern "C"
 
 #include "cuda_helper.h"
 
-extern int device_map[8];
-
 static uint32_t *d_hash[8];
 
 
diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu
index c9575ed..c2ae9bc 100644
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@@ -7,10 +7,8 @@ extern "C"
 #include "miner.h"
 }
 
-// from cpu-miner.c
-extern int device_map[8];
+#include "cuda_helper.h"
 
-// Speicher fÃ¼r Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
 
 extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);
diff --git a/x15/x14.cu b/x15/x14.cu
index b24a61d..a0adc2f 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -26,9 +26,6 @@ extern "C" {
 
 #include "cuda_helper.h"
 
-// from cpu-miner.c
-extern int device_map[8];
-
 // Memory for the hash functions
 static uint32_t *d_hash[8];
 
diff --git a/x15/x15.cu b/x15/x15.cu
index ec73805..a424834 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -27,9 +27,6 @@ extern "C" {
 
 #include "cuda_helper.h"
 
-// from cpu-miner.c
-extern int device_map[8];
-
 // Memory for the hash functions
 static uint32_t *d_hash[8];
 
diff --git a/x17/x17.cu b/x17/x17.cu
index e4573a5..28a382f 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -33,9 +33,6 @@ extern "C"
 
 static uint32_t *d_hash[8];
 
-// in cpu-miner.c
-extern int device_map[8];
-
 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
 extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);