cuda: store device SM in a global var

sample usage added for blake and fugue (higher default intensity on SM 5.2 devices)

declare device_map and device_sm in cuda_helper and clean up unused code
This commit is contained in:
Tanguy Pruvot 2014-11-11 18:54:56 +01:00
parent 99bbe380fd
commit b128312efb
30 changed files with 18 additions and 120 deletions

View File

@ -4,12 +4,6 @@
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_numValid[8];
static uint32_t *h_numValid[8];
@ -40,8 +34,6 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
// Setup-Funktionen
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));

View File

@ -10,8 +10,6 @@ extern "C"
#include "miner.h"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void jackpot_keccak512_cpu_init(int thr_id, int threads);

View File

@ -17,6 +17,8 @@ extern "C" {
/* threads per block and throughput (intensity) */
#define TPB 128
extern int opt_n_threads;
/* added in sph_blake.c */
extern "C" int blake256_rounds = 14;
@ -39,10 +41,6 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14
#define MAXU 0xffffffffU
// in cpu-miner.c
extern bool opt_n_threads;
extern int device_map[8];
#if PRECALC64
__constant__ uint32_t _ALIGN(32) d_data[12];
#else
@ -399,7 +397,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
#else
uint32_t crcsum;
#endif
uint32_t throughput = opt_work_size ? opt_work_size : (1 << 20); // 1048576 nonces per call
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
uint32_t throughput = opt_work_size ? opt_work_size : (1 << intensity);
throughput = min(throughput, max_nonce - first_nonce);
int rc = 0;

View File

@ -213,6 +213,7 @@ uint16_t opt_vote = 9999;
static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
char *device_name[8]; // CB
int device_sm[8];
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;

View File

@ -19,7 +19,6 @@
#include "cuda_helper.h"
extern char *device_name[8];
extern int device_map[8];
// CUDA Devices on the System
extern "C" int cuda_num_devices()
@ -66,6 +65,7 @@ extern "C" void cuda_devicenames()
cudaGetDeviceProperties(&props, device_map[i]);
device_name[i] = strdup(props.name);
device_sm[i] = props.major * 100 + props.minor * 10;
}
}

View File

@ -8,12 +8,6 @@
#define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
uint32_t *d_fugue256_hashoutput[8];
uint32_t *d_resultNonce[8];
@ -726,7 +720,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
void fugue256_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);

View File

@ -6,15 +6,6 @@
#include "cuda_helper.h"
#include <host_defines.h>
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8];
@ -102,8 +93,6 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}

View File

@ -13,6 +13,9 @@
#include <stdint.h>
extern int device_map[8];
extern int device_sm[8];
// common functions
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);

View File

@ -5,15 +5,6 @@
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
uint32_t *d_outputHashes[8];
@ -324,8 +315,6 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
temp,
sizeof(uint32_t) * 64 );
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));

View File

@ -11,9 +11,6 @@ extern "C"
#include "cuda_helper.h"
// in cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];

View File

@ -13,6 +13,9 @@ extern "C" void my_fugue256(void *cc, const void *data, size_t len);
extern "C" void my_fugue256_close(void *cc, void *dst);
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
extern int device_map[8];
extern int device_sm[8];
#ifdef _MSC_VER
#define MIN min
#else
@ -30,7 +33,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t start_nonce = pdata[19]++;
uint32_t throughPut = opt_work_size ? opt_work_size : (1 << 19);
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
uint32_t throughPut = opt_work_size ? opt_work_size : (1 << intensity);
throughPut = MIN(throughPut, max_nonce - start_nonce);
if (opt_benchmark)

View File

@ -2,19 +2,9 @@
#include <memory.h>
#include "cuda_helper.h"
#include <device_functions.h>
#define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8];
@ -305,8 +295,6 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
@ -397,7 +385,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);

View File

@ -14,9 +14,6 @@ extern "C"
#include "cuda_helper.h"
// in cpu-miner.c
extern int device_map[8];
static uint32_t *d_hash[8];
extern void keccak256_cpu_init(int thr_id, int threads);

View File

@ -46,9 +46,7 @@ extern "C" void pentablakehash(void *output, const void *input)
#define MAXU 0xffffffffU
// in cpu-miner.c
extern bool opt_n_threads;
extern bool opt_benchmark;
extern int device_map[8];
extern int opt_n_threads;
__constant__
static uint32_t __align__(32) c_Target[8];

View File

@ -10,8 +10,6 @@ extern "C"
#include "miner.h"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes

View File

@ -4,12 +4,6 @@
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_numValid[8];
static uint32_t *h_numValid[8];
@ -38,8 +32,6 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
// Setup-Funktionen
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));

View File

@ -8,15 +8,6 @@
#define TPB 256
#define THF 4
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// 64 Register Variante für Compute 3.0
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
@ -127,7 +118,6 @@ __global__ void __launch_bounds__(TPB, THF)
// Setup-Funktionen
__host__ void quark_groestl512_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
}
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

View File

@ -4,9 +4,6 @@
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// Take a look at: https://www.schneier.com/skein1.3.pdf
#define SHL(x, n) ((x) << (n))

View File

@ -12,8 +12,6 @@ extern "C"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes

View File

@ -14,8 +14,6 @@ extern "C" {
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void qubit_luffa512_cpu_init(int thr_id, int threads);

View File

@ -10,8 +10,6 @@ extern "C" {
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void qubit_luffa512_cpu_init(int thr_id, int threads);

View File

@ -14,8 +14,6 @@ extern "C" {
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void qubit_luffa512_cpu_init(int thr_id, int threads);

View File

@ -14,8 +14,6 @@ extern "C" {
static uint32_t *d_hash[8];
extern int device_map[8];
extern void x11_shavite512_cpu_init(int thr_id, int threads);
extern void x11_shavite512_setBlock_80(void *pdata);
extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);

View File

@ -13,8 +13,6 @@ extern "C" {
#include <stdint.h>
extern int device_map[8];
static uint32_t *d_hash[8];
extern void x11_shavite512_cpu_init(int thr_id, int threads);

View File

@ -20,9 +20,6 @@ extern "C"
#include <stdio.h>
#include <memory.h>
// in cpu-miner.c
extern int device_map[8];
static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);

View File

@ -23,8 +23,6 @@ extern "C"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];

View File

@ -7,10 +7,8 @@ extern "C"
#include "miner.h"
}
// from cpu-miner.c
extern int device_map[8];
#include "cuda_helper.h"
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);

View File

@ -26,9 +26,6 @@ extern "C" {
#include "cuda_helper.h"
// from cpu-miner.c
extern int device_map[8];
// Memory for the hash functions
static uint32_t *d_hash[8];

View File

@ -27,9 +27,6 @@ extern "C" {
#include "cuda_helper.h"
// from cpu-miner.c
extern int device_map[8];
// Memory for the hash functions
static uint32_t *d_hash[8];

View File

@ -33,9 +33,6 @@ extern "C"
static uint32_t *d_hash[8];
// in cpu-miner.c
extern int device_map[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);