mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-10 23:08:02 +00:00
Allow test of SM 2.1/3.0 binaries on newer cards
Implementation based on klausT's work, but a bit different. This code must be placed in a common .cu file; cuda.cpp is not compiled with nvcc and doesn't allow CUDA code...
This commit is contained in:
parent
4426700e31
commit
38e6672d70
@ -295,7 +295,7 @@ void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t *
|
|||||||
__host__
|
__host__
|
||||||
void skein256_cpu_init(int thr_id, uint32_t threads)
|
void skein256_cpu_init(int thr_id, uint32_t threads)
|
||||||
{
|
{
|
||||||
//empty
|
cuda_get_arch(thr_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
@ -306,7 +306,7 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui
|
|||||||
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
if (device_sm[device_map[thr_id]] >= 320)
|
if (device_sm[device_map[thr_id]] > 300 && cuda_arch[device_map[thr_id]] > 300)
|
||||||
skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
|
skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
|
||||||
else
|
else
|
||||||
skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash);
|
skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash);
|
||||||
|
@ -196,3 +196,28 @@ uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNoun
|
|||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Function to get the compiled Shader Model version */
|
||||||
|
int cuda_arch[MAX_GPUS] = { 0 };
|
||||||
|
__global__
|
||||||
|
void nvcc_get_arch(int *d_version)
|
||||||
|
{
|
||||||
|
#ifdef __CUDA_ARCH__
|
||||||
|
*d_version = __CUDA_ARCH__;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
int cuda_get_arch(int thr_id)
|
||||||
|
{
|
||||||
|
int *d_version;
|
||||||
|
int dev_id = device_map[thr_id];
|
||||||
|
if (cuda_arch[dev_id] == 0) {
|
||||||
|
// only do it once...
|
||||||
|
cudaMalloc(&d_version, sizeof(int));
|
||||||
|
nvcc_get_arch <<< 1, 1 >>> (d_version);
|
||||||
|
cudaMemcpy(&cuda_arch[dev_id], d_version, sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(d_version);
|
||||||
|
}
|
||||||
|
return cuda_arch[dev_id];
|
||||||
|
}
|
||||||
|
@ -20,7 +20,10 @@
|
|||||||
extern "C" short device_map[MAX_GPUS];
|
extern "C" short device_map[MAX_GPUS];
|
||||||
extern "C" long device_sm[MAX_GPUS];
|
extern "C" long device_sm[MAX_GPUS];
|
||||||
|
|
||||||
|
extern int cuda_arch[MAX_GPUS];
|
||||||
|
|
||||||
// common functions
|
// common functions
|
||||||
|
extern int cuda_get_arch(int thr_id);
|
||||||
extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
|
extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
|
||||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||||
extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
|
extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
|
||||||
|
@ -133,15 +133,17 @@ __global__ void __launch_bounds__(TPB, THF)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
|
||||||
__host__ void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
|
|
||||||
{
|
{
|
||||||
if (device_sm[device_map[thr_id]] < 300)
|
int dev_id = device_map[thr_id];
|
||||||
|
cuda_get_arch(thr_id);
|
||||||
|
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300)
|
||||||
quark_groestl512_sm20_init(thr_id, threads);
|
quark_groestl512_sm20_init(thr_id, threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
__host__
|
||||||
|
void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
{
|
{
|
||||||
int threadsperblock = TPB;
|
int threadsperblock = TPB;
|
||||||
|
|
||||||
@ -153,11 +155,10 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_
|
|||||||
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
int dev_id = device_map[thr_id];
|
||||||
size_t shared_size = 0;
|
|
||||||
|
|
||||||
if (device_sm[device_map[thr_id]] >= 300)
|
if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300)
|
||||||
quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
|
quark_groestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);
|
||||||
else
|
else
|
||||||
quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
|
quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
|
||||||
|
|
||||||
@ -165,7 +166,8 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_
|
|||||||
MyStreamSynchronize(NULL, order, thr_id);
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
__host__
|
||||||
|
void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
{
|
{
|
||||||
const int factor = THF;
|
const int factor = THF;
|
||||||
int threadsperblock = TPB;
|
int threadsperblock = TPB;
|
||||||
@ -173,10 +175,10 @@ __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, u
|
|||||||
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
size_t shared_size = 0;
|
int dev_id = device_map[thr_id];
|
||||||
|
|
||||||
if (device_sm[device_map[thr_id]] >= 300)
|
if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300)
|
||||||
quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
|
quark_doublegroestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);
|
||||||
else
|
else
|
||||||
quark_doublegroestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
|
quark_doublegroestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
|
||||||
|
|
||||||
|
@ -248,7 +248,9 @@ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
if (device_sm[device_map[thr_id]] >= 320)
|
int dev_id = device_map[thr_id];
|
||||||
|
|
||||||
|
if (device_sm[dev_id] >= 320)
|
||||||
quark_keccak512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
quark_keccak512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
else
|
else
|
||||||
quark_keccak512_gpu_hash_64_v30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
quark_keccak512_gpu_hash_64_v30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
|
@ -11,7 +11,7 @@ static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes
|
|||||||
#define SHL(x, n) ((x) << (n))
|
#define SHL(x, n) ((x) << (n))
|
||||||
#define SHR(x, n) ((x) >> (n))
|
#define SHR(x, n) ((x) >> (n))
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= 320
|
#if __CUDA_ARCH__ > 300
|
||||||
__device__
|
__device__
|
||||||
uint64_t skein_rotl64(const uint64_t x, const int offset)
|
uint64_t skein_rotl64(const uint64_t x, const int offset)
|
||||||
{
|
{
|
||||||
@ -443,11 +443,11 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
|
|||||||
TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
|
TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
|
||||||
|
|
||||||
// fertig
|
// fertig
|
||||||
uint64_t *outpHash = &g_hash[8 * hashPosition];
|
uint2 *outpHash = (uint2*) (&g_hash[hashPosition * 8]);
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(int i=0; i<8; i++)
|
for(int i=0; i<8; i++)
|
||||||
outpHash[i] = devectorize(p[i]);
|
outpHash[i] = p[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -724,6 +724,8 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
|
|||||||
__host__
|
__host__
|
||||||
void quark_skein512_cpu_init(int thr_id, uint32_t threads)
|
void quark_skein512_cpu_init(int thr_id, uint32_t threads)
|
||||||
{
|
{
|
||||||
|
// store the binary SM version
|
||||||
|
cuda_get_arch(thr_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
@ -734,8 +736,10 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
int dev_id = device_map[thr_id];
|
||||||
|
|
||||||
// uint2 uint64 variants for SM 3.2+
|
// uint2 uint64 variants for SM 3.2+
|
||||||
if (device_sm[device_map[thr_id]] >= 320)
|
if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
|
||||||
quark_skein512_gpu_hash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
quark_skein512_gpu_hash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
else
|
else
|
||||||
quark_skein512_gpu_hash_64_v30 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
quark_skein512_gpu_hash_64_v30 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
|
Loading…
Reference in New Issue
Block a user