Browse Source

cuda: store device SM in a global var

sample usage made for blake and fugue (higher intensity for SM5.2)

add these to cuda_helper and clean unused code
2upstream
Tanguy Pruvot 10 years ago
parent
commit
b128312efb
  1. 8
      JHA/cuda_jha_compactionTest.cu
  2. 2
      JHA/jackpotcoin.cu
  3. 9
      blake32.cu
  4. 1
      cpu-miner.c
  5. 2
      cuda.cu
  6. 8
      cuda_fugue256.cu
  7. 11
      cuda_groestlcoin.cu
  8. 3
      cuda_helper.h
  9. 11
      cuda_myriadgroestl.cu
  10. 3
      cuda_nist5.cu
  11. 6
      fuguecoin.cpp
  12. 14
      heavy/cuda_hefty1.cu
  13. 3
      keccak/keccak256.cu
  14. 4
      pentablake.cu
  15. 2
      quark/animecoin.cu
  16. 8
      quark/cuda_quark_compactionTest.cu
  17. 10
      quark/cuda_quark_groestl512.cu
  18. 3
      quark/cuda_skein512.cu
  19. 2
      quark/quarkcoin.cu
  20. 2
      qubit/deep.cu
  21. 2
      qubit/doom.cu
  22. 2
      qubit/qubit.cu
  23. 2
      x11/fresh.cu
  24. 2
      x11/s3.cu
  25. 3
      x11/x11.cu
  26. 2
      x13/x13.cu
  27. 4
      x15/whirlpool.cu
  28. 3
      x15/x14.cu
  29. 3
      x15/x15.cu
  30. 3
      x17/x17.cu

8
JHA/cuda_jha_compactionTest.cu

@ -4,12 +4,6 @@ @@ -4,12 +4,6 @@
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_numValid[8];
static uint32_t *h_numValid[8];
@ -40,8 +34,6 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8]; @@ -40,8 +34,6 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
// Setup-Funktionen
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));

2
JHA/jackpotcoin.cu

@ -10,8 +10,6 @@ extern "C" @@ -10,8 +10,6 @@ extern "C"
#include "miner.h"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void jackpot_keccak512_cpu_init(int thr_id, int threads);

9
blake32.cu

@ -17,6 +17,8 @@ extern "C" { @@ -17,6 +17,8 @@ extern "C" {
/* threads per block and throughput (intensity) */
#define TPB 128
extern int opt_n_threads;
/* added in sph_blake.c */
extern "C" int blake256_rounds = 14;
@ -39,10 +41,6 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14 @@ -39,10 +41,6 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14
#define MAXU 0xffffffffU
// in cpu-miner.c
extern bool opt_n_threads;
extern int device_map[8];
#if PRECALC64
__constant__ uint32_t _ALIGN(32) d_data[12];
#else
@ -399,7 +397,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt @@ -399,7 +397,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
#else
uint32_t crcsum;
#endif
uint32_t throughput = opt_work_size ? opt_work_size : (1 << 20); // 1048576 nonces per call
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
uint32_t throughput = opt_work_size ? opt_work_size : (1 << intensity);
throughput = min(throughput, max_nonce - first_nonce);
int rc = 0;

1
cpu-miner.c

@ -213,6 +213,7 @@ uint16_t opt_vote = 9999; @@ -213,6 +213,7 @@ uint16_t opt_vote = 9999;
static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
char *device_name[8]; // CB
int device_sm[8];
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;

2
cuda.cu

@ -19,7 +19,6 @@ @@ -19,7 +19,6 @@
#include "cuda_helper.h"
extern char *device_name[8];
extern int device_map[8];
// CUDA Devices on the System
extern "C" int cuda_num_devices()
@ -66,6 +65,7 @@ extern "C" void cuda_devicenames() @@ -66,6 +65,7 @@ extern "C" void cuda_devicenames()
cudaGetDeviceProperties(&props, device_map[i]);
device_name[i] = strdup(props.name);
device_sm[i] = props.major * 100 + props.minor * 10;
}
}

8
cuda_fugue256.cu

@ -8,12 +8,6 @@ @@ -8,12 +8,6 @@
#define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
uint32_t *d_fugue256_hashoutput[8];
uint32_t *d_resultNonce[8];
@ -726,7 +720,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas @@ -726,7 +720,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
void fugue256_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);

11
cuda_groestlcoin.cu

@ -6,15 +6,6 @@ @@ -6,15 +6,6 @@
#include "cuda_helper.h"
#include <host_defines.h>
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8];
@ -102,8 +93,6 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads) @@ -102,8 +93,6 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}

3
cuda_helper.h

@ -13,6 +13,9 @@ @@ -13,6 +13,9 @@
#include <stdint.h>
extern int device_map[8];
extern int device_sm[8];
// common functions
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);

11
cuda_myriadgroestl.cu

@ -5,15 +5,6 @@ @@ -5,15 +5,6 @@
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
uint32_t *d_outputHashes[8];
@ -324,8 +315,6 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads) @@ -324,8 +315,6 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
temp,
sizeof(uint32_t) * 64 );
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));

3
cuda_nist5.cu

@ -11,9 +11,6 @@ extern "C" @@ -11,9 +11,6 @@ extern "C"
#include "cuda_helper.h"
// in cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];

6
fuguecoin.cpp

@ -13,6 +13,9 @@ extern "C" void my_fugue256(void *cc, const void *data, size_t len); @@ -13,6 +13,9 @@ extern "C" void my_fugue256(void *cc, const void *data, size_t len);
extern "C" void my_fugue256_close(void *cc, void *dst);
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
extern int device_map[8];
extern int device_sm[8];
#ifdef _MSC_VER
#define MIN min
#else
@ -30,7 +33,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt @@ -30,7 +33,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t start_nonce = pdata[19]++;
uint32_t throughPut = opt_work_size ? opt_work_size : (1 << 19);
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
uint32_t throughPut = opt_work_size ? opt_work_size : (1 << intensity);
throughPut = MIN(throughPut, max_nonce - start_nonce);
if (opt_benchmark)

14
heavy/cuda_hefty1.cu

@ -2,19 +2,9 @@ @@ -2,19 +2,9 @@
#include <memory.h>
#include "cuda_helper.h"
#include <device_functions.h>
#define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8];
@ -305,8 +295,6 @@ __host__ void hefty_cpu_init(int thr_id, int threads) @@ -305,8 +295,6 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
@ -397,7 +385,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) @@ -397,7 +385,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);

3
keccak/keccak256.cu

@ -14,9 +14,6 @@ extern "C" @@ -14,9 +14,6 @@ extern "C"
#include "cuda_helper.h"
// in cpu-miner.c
extern int device_map[8];
static uint32_t *d_hash[8];
extern void keccak256_cpu_init(int thr_id, int threads);

4
pentablake.cu

@ -46,9 +46,7 @@ extern "C" void pentablakehash(void *output, const void *input) @@ -46,9 +46,7 @@ extern "C" void pentablakehash(void *output, const void *input)
#define MAXU 0xffffffffU
// in cpu-miner.c
extern bool opt_n_threads;
extern bool opt_benchmark;
extern int device_map[8];
extern int opt_n_threads;
__constant__
static uint32_t __align__(32) c_Target[8];

2
quark/animecoin.cu

@ -10,8 +10,6 @@ extern "C" @@ -10,8 +10,6 @@ extern "C"
#include "miner.h"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes

8
quark/cuda_quark_compactionTest.cu

@ -4,12 +4,6 @@ @@ -4,12 +4,6 @@
#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
// aus cpu-miner.c
extern int device_map[8];
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_numValid[8];
static uint32_t *h_numValid[8];
@ -38,8 +32,6 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8]; @@ -38,8 +32,6 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
// Setup-Funktionen
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));

10
quark/cuda_quark_groestl512.cu

@ -8,15 +8,6 @@ @@ -8,15 +8,6 @@
#define TPB 256
#define THF 4
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
// 64 Register Variante für Compute 3.0
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
@ -127,7 +118,6 @@ __global__ void __launch_bounds__(TPB, THF) @@ -127,7 +118,6 @@ __global__ void __launch_bounds__(TPB, THF)
// Setup-Funktionen
__host__ void quark_groestl512_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
}
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

3
quark/cuda_skein512.cu

@ -4,9 +4,6 @@ @@ -4,9 +4,6 @@
#include "cuda_helper.h"
// aus cpu-miner.c
extern int device_map[8];
// Take a look at: https://www.schneier.com/skein1.3.pdf
#define SHL(x, n) ((x) << (n))

2
quark/quarkcoin.cu

@ -12,8 +12,6 @@ extern "C" @@ -12,8 +12,6 @@ extern "C"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes

2
qubit/deep.cu

@ -14,8 +14,6 @@ extern "C" { @@ -14,8 +14,6 @@ extern "C" {
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void qubit_luffa512_cpu_init(int thr_id, int threads);

2
qubit/doom.cu

@ -10,8 +10,6 @@ extern "C" { @@ -10,8 +10,6 @@ extern "C" {
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void qubit_luffa512_cpu_init(int thr_id, int threads);

2
qubit/qubit.cu

@ -14,8 +14,6 @@ extern "C" { @@ -14,8 +14,6 @@ extern "C" {
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];
extern void qubit_luffa512_cpu_init(int thr_id, int threads);

2
x11/fresh.cu

@ -14,8 +14,6 @@ extern "C" { @@ -14,8 +14,6 @@ extern "C" {
static uint32_t *d_hash[8];
extern int device_map[8];
extern void x11_shavite512_cpu_init(int thr_id, int threads);
extern void x11_shavite512_setBlock_80(void *pdata);
extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);

2
x11/s3.cu

@ -13,8 +13,6 @@ extern "C" { @@ -13,8 +13,6 @@ extern "C" {
#include <stdint.h>
extern int device_map[8];
static uint32_t *d_hash[8];
extern void x11_shavite512_cpu_init(int thr_id, int threads);

3
x11/x11.cu

@ -20,9 +20,6 @@ extern "C" @@ -20,9 +20,6 @@ extern "C"
#include <stdio.h>
#include <memory.h>
// in cpu-miner.c
extern int device_map[8];
static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);

2
x13/x13.cu

@ -23,8 +23,6 @@ extern "C" @@ -23,8 +23,6 @@ extern "C"
#include "cuda_helper.h"
extern int device_map[8];
static uint32_t *d_hash[8];

4
x15/whirlpool.cu

@ -7,10 +7,8 @@ extern "C" @@ -7,10 +7,8 @@ extern "C"
#include "miner.h"
}
// from cpu-miner.c
extern int device_map[8];
#include "cuda_helper.h"
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);

3
x15/x14.cu

@ -26,9 +26,6 @@ extern "C" { @@ -26,9 +26,6 @@ extern "C" {
#include "cuda_helper.h"
// from cpu-miner.c
extern int device_map[8];
// Memory for the hash functions
static uint32_t *d_hash[8];

3
x15/x15.cu

@ -27,9 +27,6 @@ extern "C" { @@ -27,9 +27,6 @@ extern "C" {
#include "cuda_helper.h"
// from cpu-miner.c
extern int device_map[8];
// Memory for the hash functions
static uint32_t *d_hash[8];

3
x17/x17.cu

@ -33,9 +33,6 @@ extern "C" @@ -33,9 +33,6 @@ extern "C"
static uint32_t *d_hash[8];
// in cpu-miner.c
extern int device_map[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);

Loading…
Cancel
Save