neoscrypt: remove warnings and rename host funcs

also reduce the few errors on coins using shared mem
This commit is contained in:
Tanguy Pruvot 2016-07-19 22:00:28 +02:00
parent 6abee0659e
commit be8be03eb8
2 changed files with 36 additions and 37 deletions

View File

@ -15,6 +15,7 @@ typedef uint48 uint4x2;
#ifdef __INTELLISENSE__
#define __CUDA_ARCH__ 500
#define __byte_perm(x,y,c) x
#define __shfl(x,y,c) x
#define atomicExch(p,x) x
#endif
@ -80,26 +81,6 @@ __constant__ uint32_t BLAKE2S_SIGMA[10][16] = {
#define shf_r_clamp32(out,a,b,shift) \
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift));
__device__ __forceinline__
static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
{
#if __CUDA_ARCH__ >= 320
uint32_t shift = 32U - shift2;
asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift));
asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift));
#else
// to check
shift256R(ret, vec4, shift2);
#endif
}
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
@ -165,6 +146,26 @@ __device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_
#if __CUDA_ARCH__ < 500
__device__ __forceinline__
static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
{
#if __CUDA_ARCH__ >= 320
uint32_t shift = 32U - shift2;
asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift));
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift));
asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift));
#else
// to check
shift256R(ret, vec4, shift2);
#endif
}
#define BLAKE(a, b, c, d, key1, key2) { \
a += key1; \
a += b; d = rotateL(d^a, 16); \
@ -721,7 +722,6 @@ static __forceinline__ __device__
void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data)
{
uint2x4 output[8];
uchar4 bufhelper;
uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U];
uint32_t qbuf, rbuf, bitbuf;
uint32_t input[BLAKE2S_BLOCK_SIZE / 4];
@ -787,6 +787,9 @@ void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const
for (int k = 0; k<8; k++)
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf));
#else
//#error SM 3.0 code missing here
printf("", data18, data20);
#endif
Blake2S(input, input, key);
}
@ -1095,6 +1098,7 @@ uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const sal
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf));
#else
//#error SM 3.0 code missing here
printf("", data18, data20);
#endif
for (int k = 0; k < 9; k++) {
B0[(k + qbuf) & 0x3f] = temp[k];
@ -1465,7 +1469,7 @@ static __thread uint32_t *Trans2 = NULL; // 2 streams
static __thread uint32_t *Trans3 = NULL; // 2 streams
__host__
void neoscrypt_init_2stream(int thr_id, uint32_t threads)
void neoscrypt_init(int thr_id, uint32_t threads)
{
CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads)));
@ -1480,7 +1484,7 @@ void neoscrypt_init_2stream(int thr_id, uint32_t threads)
}
__host__
void neoscrypt_free_2stream(int thr_id)
void neoscrypt_free(int thr_id)
{
cudaFree(d_NNonce[thr_id]);
@ -1491,18 +1495,15 @@ void neoscrypt_free_2stream(int thr_id)
}
__host__
void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum)
void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum)
{
CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t)));
const int threadsperblock = TPB;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
const int threadsperblock2 = TPB2;
dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2);
dim3 block2(threadsperblock2);
const int threadsperblock = TPB;
dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock);
dim3 block3(4, threadsperblock >> 2);

View File

@ -6,9 +6,9 @@
extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);
extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
extern void neoscrypt_free_2stream(int thr_id);
extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
extern void neoscrypt_init(int thr_id, uint32_t threads);
extern void neoscrypt_free(int thr_id);
extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
static bool init[MAX_GPUS] = { 0 };
@ -21,14 +21,12 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
int dev_id = device_map[thr_id];
int intensity = is_windows() ? 18 : 19;
if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; // also need more than 2GB
if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
throughput = throughput / 32; /* set for max intensity ~= 20 */
api_set_throughput(thr_id, throughput);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1);
if (opt_benchmark)
ptarget[7] = 0x00ff;
@ -49,7 +47,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
}
gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput);
neoscrypt_init_2stream(thr_id, throughput);
neoscrypt_init(thr_id, throughput);
init[thr_id] = true;
}
@ -66,7 +64,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
do {
uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum);
neoscrypt_hash_k4(thr_id, throughput, pdata[19], foundNonces, have_stratum);
*hashes_done = pdata[19] - first_nonce + throughput;
@ -111,7 +109,7 @@ void free_neoscrypt(int thr_id)
cudaThreadSynchronize();
neoscrypt_free_2stream(thr_id);
neoscrypt_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();