cuda: store device SM in a global var
sample usage made for blake and fugue (higher intensity for SM5.2) add these to cuda_helper and clean unused code
This commit is contained in:
parent
99bbe380fd
commit
b128312efb
@ -4,12 +4,6 @@
|
||||
#include "cuda_helper.h"
|
||||
#include <sm_30_intrinsics.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
static uint32_t *d_tempBranch1Nonces[8];
|
||||
static uint32_t *d_numValid[8];
|
||||
static uint32_t *h_numValid[8];
|
||||
@ -40,8 +34,6 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
|
||||
// Setup-Funktionen
|
||||
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
|
||||
cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
|
||||
|
||||
|
@ -10,8 +10,6 @@ extern "C"
|
||||
#include "miner.h"
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void jackpot_keccak512_cpu_init(int thr_id, int threads);
|
||||
|
@ -17,6 +17,8 @@ extern "C" {
|
||||
/* threads per block and throughput (intensity) */
|
||||
#define TPB 128
|
||||
|
||||
extern int opt_n_threads;
|
||||
|
||||
/* added in sph_blake.c */
|
||||
extern "C" int blake256_rounds = 14;
|
||||
|
||||
@ -39,10 +41,6 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14
|
||||
|
||||
#define MAXU 0xffffffffU
|
||||
|
||||
// in cpu-miner.c
|
||||
extern bool opt_n_threads;
|
||||
extern int device_map[8];
|
||||
|
||||
#if PRECALC64
|
||||
__constant__ uint32_t _ALIGN(32) d_data[12];
|
||||
#else
|
||||
@ -399,7 +397,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
|
||||
#else
|
||||
uint32_t crcsum;
|
||||
#endif
|
||||
uint32_t throughput = opt_work_size ? opt_work_size : (1 << 20); // 1048576 nonces per call
|
||||
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
|
||||
uint32_t throughput = opt_work_size ? opt_work_size : (1 << intensity);
|
||||
throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
int rc = 0;
|
||||
|
@ -213,6 +213,7 @@ uint16_t opt_vote = 9999;
|
||||
static int num_processors;
|
||||
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
|
||||
char *device_name[8]; // CB
|
||||
int device_sm[8];
|
||||
static char *rpc_url;
|
||||
static char *rpc_userpass;
|
||||
static char *rpc_user, *rpc_pass;
|
||||
|
2
cuda.cu
2
cuda.cu
@ -19,7 +19,6 @@
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern char *device_name[8];
|
||||
extern int device_map[8];
|
||||
|
||||
// CUDA Devices on the System
|
||||
extern "C" int cuda_num_devices()
|
||||
@ -66,6 +65,7 @@ extern "C" void cuda_devicenames()
|
||||
cudaGetDeviceProperties(&props, device_map[i]);
|
||||
|
||||
device_name[i] = strdup(props.name);
|
||||
device_sm[i] = props.major * 100 + props.minor * 10;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8,12 +8,6 @@
|
||||
|
||||
#define USE_SHARED 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
uint32_t *d_fugue256_hashoutput[8];
|
||||
uint32_t *d_resultNonce[8];
|
||||
|
||||
@ -726,7 +720,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
|
||||
|
||||
void fugue256_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
|
||||
|
@ -6,15 +6,6 @@
|
||||
#include "cuda_helper.h"
|
||||
#include <host_defines.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
__constant__ uint32_t pTarget[8]; // Single GPU
|
||||
extern uint32_t *d_resultNonce[8];
|
||||
@ -102,8 +93,6 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
// Speicher für Gewinner-Nonce belegen
|
||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||
}
|
||||
|
@ -13,6 +13,9 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
extern int device_map[8];
|
||||
extern int device_sm[8];
|
||||
|
||||
// common functions
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
|
@ -5,15 +5,6 @@
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
__constant__ uint32_t pTarget[8]; // Single GPU
|
||||
uint32_t *d_outputHashes[8];
|
||||
@ -324,8 +315,6 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
|
||||
temp,
|
||||
sizeof(uint32_t) * 64 );
|
||||
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
// Speicher für Gewinner-Nonce belegen
|
||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||
|
||||
|
@ -11,9 +11,6 @@ extern "C"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// in cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// Speicher für Input/Output der verketteten Hashfunktionen
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
|
@ -13,6 +13,9 @@ extern "C" void my_fugue256(void *cc, const void *data, size_t len);
|
||||
extern "C" void my_fugue256_close(void *cc, void *dst);
|
||||
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
extern int device_map[8];
|
||||
extern int device_sm[8];
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define MIN min
|
||||
#else
|
||||
@ -30,7 +33,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
|
||||
uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
uint32_t start_nonce = pdata[19]++;
|
||||
uint32_t throughPut = opt_work_size ? opt_work_size : (1 << 19);
|
||||
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
|
||||
uint32_t throughPut = opt_work_size ? opt_work_size : (1 << intensity);
|
||||
throughPut = MIN(throughPut, max_nonce - start_nonce);
|
||||
|
||||
if (opt_benchmark)
|
||||
|
@ -2,19 +2,9 @@
|
||||
#include <memory.h>
|
||||
|
||||
#include "cuda_helper.h"
|
||||
#include <device_functions.h>
|
||||
|
||||
#define USE_SHARED 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
uint32_t *d_heftyHashes[8];
|
||||
|
||||
@ -305,8 +295,6 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||
cudaMemcpyToSymbol( hefty_gpu_constantTable,
|
||||
hefty_cpu_constantTable,
|
||||
@ -397,7 +385,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
|
||||
{
|
||||
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
|
||||
// alle anderen mit 512 Threads.
|
||||
int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
|
||||
int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
|
@ -14,9 +14,6 @@ extern "C"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// in cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void keccak256_cpu_init(int thr_id, int threads);
|
||||
|
@ -46,9 +46,7 @@ extern "C" void pentablakehash(void *output, const void *input)
|
||||
#define MAXU 0xffffffffU
|
||||
|
||||
// in cpu-miner.c
|
||||
extern bool opt_n_threads;
|
||||
extern bool opt_benchmark;
|
||||
extern int device_map[8];
|
||||
extern int opt_n_threads;
|
||||
|
||||
__constant__
|
||||
static uint32_t __align__(32) c_Target[8];
|
||||
|
@ -10,8 +10,6 @@ extern "C"
|
||||
#include "miner.h"
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
|
||||
|
@ -4,12 +4,6 @@
|
||||
#include "cuda_helper.h"
|
||||
#include <sm_30_intrinsics.h>
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
static uint32_t *d_tempBranch1Nonces[8];
|
||||
static uint32_t *d_numValid[8];
|
||||
static uint32_t *h_numValid[8];
|
||||
@ -38,8 +32,6 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
|
||||
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
|
||||
|
||||
|
@ -8,15 +8,6 @@
|
||||
#define TPB 256
|
||||
#define THF 4
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
// 64 Register Variante für Compute 3.0
|
||||
#include "groestl_functions_quad.cu"
|
||||
#include "bitslice_transformations_quad.cu"
|
||||
@ -127,7 +118,6 @@ __global__ void __launch_bounds__(TPB, THF)
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_groestl512_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
}
|
||||
|
||||
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
|
@ -4,9 +4,6 @@
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// Take a look at: https://www.schneier.com/skein1.3.pdf
|
||||
|
||||
#define SHL(x, n) ((x) << (n))
|
||||
|
@ -12,8 +12,6 @@ extern "C"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
|
||||
|
@ -14,8 +14,6 @@ extern "C" {
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void qubit_luffa512_cpu_init(int thr_id, int threads);
|
||||
|
@ -10,8 +10,6 @@ extern "C" {
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void qubit_luffa512_cpu_init(int thr_id, int threads);
|
||||
|
@ -14,8 +14,6 @@ extern "C" {
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void qubit_luffa512_cpu_init(int thr_id, int threads);
|
||||
|
@ -14,8 +14,6 @@ extern "C" {
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
extern void x11_shavite512_cpu_init(int thr_id, int threads);
|
||||
extern void x11_shavite512_setBlock_80(void *pdata);
|
||||
extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||
|
@ -13,8 +13,6 @@ extern "C" {
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void x11_shavite512_cpu_init(int thr_id, int threads);
|
||||
|
@ -20,9 +20,6 @@ extern "C"
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// in cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, int threads);
|
||||
|
@ -23,8 +23,6 @@ extern "C"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
extern int device_map[8];
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
|
||||
|
@ -7,10 +7,8 @@ extern "C"
|
||||
#include "miner.h"
|
||||
}
|
||||
|
||||
// from cpu-miner.c
|
||||
extern int device_map[8];
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// Speicher für Input/Output der verketteten Hashfunktionen
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);
|
||||
|
@ -26,9 +26,6 @@ extern "C" {
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// from cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// Memory for the hash functions
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
|
@ -27,9 +27,6 @@ extern "C" {
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
// from cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// Memory for the hash functions
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
|
@ -33,9 +33,6 @@ extern "C"
|
||||
|
||||
static uint32_t *d_hash[8];
|
||||
|
||||
// in cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, int threads);
|
||||
extern void quark_blake512_cpu_setBlock_80(void *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||
|
Loading…
x
Reference in New Issue
Block a user