bmw512: save a few KBs, ifdef 80-bytes kernel
was only used by animecoin Also ifdef SM 3.0 compat. code to be ignored on recent archs
This commit is contained in:
parent
3b7ef923c7
commit
957d919a6a
@ -1,9 +1,13 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
|
|
||||||
|
#undef WANT_BMW512_80
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
|
#ifdef WANT_BMW512_80
|
||||||
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
|
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "cuda_bmw512_30.cu"
|
#include "cuda_bmw512_30.cu"
|
||||||
|
|
||||||
@ -21,7 +25,8 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi
|
|||||||
q[i+8] + ROL(q[i+9], 37) + q[i+10] + ROL(q[i+11], 43) + \
|
q[i+8] + ROL(q[i+9], 37) + q[i+10] + ROL(q[i+11], 43) + \
|
||||||
q[i+12] + ROL(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
q[i+12] + ROL(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
||||||
|
|
||||||
__device__ void Compression512_64_first(uint2 *msg, uint2 *hash)
|
__device__
|
||||||
|
void Compression512_64_first(uint2 *msg, uint2 *hash)
|
||||||
{
|
{
|
||||||
// Compression ref. implementation
|
// Compression ref. implementation
|
||||||
uint2 q[32];
|
uint2 q[32];
|
||||||
@ -380,6 +385,8 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef WANT_BMW512_80
|
||||||
|
|
||||||
__global__ __launch_bounds__(256, 2)
|
__global__ __launch_bounds__(256, 2)
|
||||||
void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
|
void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
|
||||||
{
|
{
|
||||||
@ -436,12 +443,6 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
|
||||||
void quark_bmw512_cpu_init(int thr_id, uint32_t threads)
|
|
||||||
{
|
|
||||||
cuda_get_arch(thr_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
void quark_bmw512_cpu_setBlock_80(void *pdata)
|
void quark_bmw512_cpu_setBlock_80(void *pdata)
|
||||||
{
|
{
|
||||||
@ -454,6 +455,28 @@ void quark_bmw512_cpu_setBlock_80(void *pdata)
|
|||||||
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
int dev_id = device_map[thr_id];
|
||||||
|
|
||||||
|
if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
|
||||||
|
quark_bmw512_gpu_hash_80<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
|
||||||
|
else
|
||||||
|
quark_bmw512_gpu_hash_80_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void quark_bmw512_cpu_init(int thr_id, uint32_t threads)
|
||||||
|
{
|
||||||
|
cuda_get_arch(thr_id);
|
||||||
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
{
|
{
|
||||||
@ -467,17 +490,3 @@ void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
|
|||||||
else
|
else
|
||||||
quark_bmw512_gpu_hash_64_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
quark_bmw512_gpu_hash_64_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
|
||||||
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
|
|
||||||
{
|
|
||||||
const uint32_t threadsperblock = 128;
|
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
|
||||||
dim3 block(threadsperblock);
|
|
||||||
|
|
||||||
int dev_id = device_map[thr_id];
|
|
||||||
if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
|
|
||||||
quark_bmw512_gpu_hash_80<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
|
|
||||||
else
|
|
||||||
quark_bmw512_gpu_hash_80_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
|
|
||||||
}
|
|
||||||
|
@ -12,7 +12,9 @@
|
|||||||
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
|
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
|
||||||
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
||||||
|
|
||||||
static __constant__ uint64_t d_constMem[16] = {
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 500
|
||||||
|
|
||||||
|
__constant__ uint64_t d_constMem[16] = {
|
||||||
SPH_C64(0x8081828384858687),
|
SPH_C64(0x8081828384858687),
|
||||||
SPH_C64(0x88898A8B8C8D8E8F),
|
SPH_C64(0x88898A8B8C8D8E8F),
|
||||||
SPH_C64(0x9091929394959697),
|
SPH_C64(0x9091929394959697),
|
||||||
@ -213,6 +215,8 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef WANT_BMW512_80
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
|
void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
|
||||||
{
|
{
|
||||||
@ -250,3 +254,11 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_
|
|||||||
outpHash[i] = ((uint2*)message)[i+8];
|
outpHash[i] = ((uint2*)message)[i+8];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else /* stripped stubs for other archs */
|
||||||
|
__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {}
|
||||||
|
__global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user