
lbry: Maxwell and Pascal update (up to 10% faster on Pascal)

Based on alexis78's work and sponsored by the LBRY.IO team (thanks)

Release 1.8.2; CUDA 8 is now used for the x86 (Win32) build.
upstream · 1.8.2-tpruvot
Tanguy Pruvot, 8 years ago
parent commit 9f2ed5135b
9 changed files:

  1. Makefile.am (2)
  2. README.txt (7)
  3. ccminer.cpp (2)
  4. ccminer.vcxproj (5)
  5. cuda_helper.h (2)
  6. lbry/cuda_lbry_merged.cu (1055)
  7. lbry/cuda_sha256_lbry.cu (876)
  8. lbry/cuda_sha512_lbry.cu (38)
  9. lbry/lbry.cu (61)
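At the heart of this commit is a compute-capability dispatch: ccminer keeps an SM number per device in device_sm[] (compute major*100 + minor*10), and cards at or below SM 5.0 keep the original three-kernel lbry pipeline, while second-generation Maxwell (SM 5.2) and Pascal parts take the new merged kernel. A minimal sketch of deriving such an SM number with the CUDA runtime; query_sm is a hypothetical helper, not ccminer's actual initialization code:

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical helper mirroring ccminer's device_sm[] convention
// (major*100 + minor*10), used here to pick a kernel path.
static int query_sm(int dev_id)
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, dev_id);
	return prop.major * 100 + prop.minor * 10; // e.g. 520 on a GTX 980, 610 on a GTX 1070
}

int main()
{
	const int sm = query_sm(0);
	// Same split as the dispatch added in lbry.cu below.
	printf("SM %d -> %s path\n", sm, (sm <= 500) ? "split (sha256/sha512/final)" : "merged");
	return 0;
}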

Makefile.am (2 changed lines)

@@ -52,7 +52,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 	sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
 	sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
 	sph/ripemd.c sph/sph_sha2.c \
-	lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
+	lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
 	qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
 	x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 	x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \

README.txt (7 changed lines)

@@ -1,5 +1,5 @@
-ccMiner 1.8.2 (August 2016) "Veltor algo Thor's Riddle streebog"
+ccMiner 1.8.2 (Sept 2016) "Veltor and lbry boost"
 ---------------------------------------------------------------
 ***************************************************************
@@ -245,9 +245,10 @@ features.
 >>> RELEASE HISTORY <<<
-	Aug. 20th 2016	v1.8.2
+	Sep. 22th 2016	v1.8.2
+			lbry improvements by Alexis Provos
 			Prevent Windows hibernate while mining
-			veltor algo
+			veltor algo (basic implementation)
 	Aug. 10th 2016	v1.8.1
			SIA Blake2-B Algo (getwork over stratum for Suprnova)

ccminer.cpp (2 changed lines)

@@ -2430,7 +2430,7 @@ static void *miner_thread(void *userdata)
 			pthread_mutex_lock(&stats_lock);
 			thr_hashrates[thr_id] = hashes_done / dtime;
 			thr_hashrates[thr_id] *= rate_factor;
-			if (loopcnt > 1) // ignore first (init time)
+			if (loopcnt > 2) // ignore first (init time)
 				stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
 			pthread_mutex_unlock(&stats_lock);
 		}
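Raising the threshold from 1 to 2 simply drops one more warm-up sample from the speed statistics, since the earliest scan passes still include allocation and kernel-launch overhead. The pattern in a generic, illustrative form (names are not ccminer's API):

// Illustrative benchmarking pattern: discard warm-up iterations before
// feeding samples into a running average.
static double record_rate(unsigned loopcnt, double hashes_done, double dtime,
                          double *sum, unsigned *count)
{
	const double rate = hashes_done / dtime;
	if (loopcnt > 2) { // ignore the first passes (init time), as above
		*sum += rate;
		(*count)++;
	}
	return *count ? (*sum / *count) : rate;
}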

ccminer.vcxproj (5 changed lines)

@@ -39,7 +39,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
   </ImportGroup>
   <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
@@ -429,6 +429,7 @@
     <CudaCompile Include="Algo256\cuda_skein256.cu" />
     <CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
     <CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
+    <CudaCompile Include="lbry\cuda_lbry_merged.cu" />
     <CudaCompile Include="lbry\lbry.cu" />
     <CudaCompile Include="pentablake.cu">
       <MaxRegCount>80</MaxRegCount>
@@ -534,7 +535,7 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
   </ImportGroup>
   <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />

cuda_helper.h (2 changed lines)

@@ -481,7 +481,7 @@ static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(
 static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
 static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
 	uint2 result;
 	asm("{ // uint2 a+b \n\t"
 		"add.cc.u32 %0, %2, %4; \n\t"

lbry/cuda_lbry_merged.cu (1055 changed lines)

File diff suppressed because it is too large.

lbry/cuda_sha256_lbry.cu (876 changed lines)

File diff suppressed because it is too large.

lbry/cuda_sha512_lbry.cu (38 changed lines)

@@ -1,20 +1,16 @@
 /**
  * sha-512 CUDA implementation.
- * Tanguy Pruvot and Provos Alexis - JUL 2016
+ * Tanguy Pruvot and Provos Alexis - Jul / Sep 2016
  * Sponsored by LBRY.IO team
  */
-//#define USE_ROT_ASM_OPT 0
 #include <cuda_helper.h>
 #include <cuda_vector_uint2x4.h>
-#include "miner.h"
-static __constant__
-#if __CUDA_ARCH__ > 500
-_ALIGN(16)
-#else
-_ALIGN(8)
-#endif
-uint64_t K_512[80] = {
+#include <miner.h>
+static __constant__ _ALIGN(8) uint64_t K_512[80] = {
 	0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
 	0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
 	0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
@@ -58,12 +54,12 @@ uint64_t ROR64_8(const uint64_t u64) {
 #define andor64(a,b,c) ((a & (b | c)) | (b & c))
 #define xandx64(e,f,g) (g ^ (e & (g ^ f)))
-static __device__ __forceinline__
-void sha512_step2(uint64_t* r, const uint64_t W, const uint64_t K, const int ord)
+__device__ __forceinline__
+static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, const int ord)
 {
 	const uint64_t T1 = r[(15-ord) & 7] + K + W + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]);
 	r[(15-ord) & 7] = andor64(r[(8-ord) & 7],r[(9-ord) & 7],r[(10-ord) & 7]) + bsg5_0(r[(8-ord) & 7]) + T1;
-	r[(11-ord) & 7] += T1;
+	r[(11-ord) & 7]+= T1;
 }
 /**************************************************************************************************/
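sha512_step2 avoids the eight register moves of a textbook SHA-512 round: the state stays put in r[0..7] and the (N-ord)&7 indices rename the variables each round instead. For comparison, the same update with explicit variables, as a sketch reusing the file's bsg5_0/bsg5_1, xandx64 (Ch) and andor64 (Maj) helpers:

// Reference round equivalent to sha512_step2 at ord = 0, with r[0..7] = a..h.
static __device__ __forceinline__
void sha512_round_ref(uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d,
                      uint64_t &e, uint64_t &f, uint64_t &g, uint64_t &h,
                      const uint64_t W, const uint64_t K)
{
	const uint64_t T1 = h + K + W + bsg5_1(e) + xandx64(e, f, g); // Ch(e,f,g)
	d += T1;                                // r[(11-ord) & 7] += T1
	h  = T1 + bsg5_0(a) + andor64(a, b, c); // old h is consumed, slot becomes new 'a'
}
// In sha512_step2, ord = round & 7 makes r[(15-ord)&7] play h, r[(12-ord)&7]
// play e, r[(11-ord)&7] play d and r[(8-ord)&7] play a, so the roles rotate
// through the array while the data never moves.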
@@ -76,7 +72,6 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 		0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 		0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 	};
-	uint64_t r[8];
 	uint64_t W[16];
 	if (thread < threads)
@@ -91,7 +86,7 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 		W[4] = 0x8000000000000000; // end tag
 		#pragma unroll
-		for (int i = 5; i < 15; i++) W[i] = 0;
+		for (uint32_t i = 5; i < 15; i++) W[i] = 0;
 		W[15] = 0x100; // 256 bits
@@ -100,31 +95,32 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 			sha512_step2(r, W[i], K_512[i], i&7);
 		}
-		#pragma unroll
-		for (int i = 16; i < 80; i+=16) {
+		#pragma unroll 5
+		for (uint32_t i = 16; i < 80; i+=16){
 			#pragma unroll
-			for (int j = 0; j<16; j++) {
+			for (uint32_t j = 0; j<16; j++){
 				W[(i + j) & 15] += W[((i + j) - 7) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);
 			}
 			#pragma unroll
-			for (int j = 0; j<16; j++) {
+			for (uint32_t j = 0; j<16; j++){
 				sha512_step2(r, W[j], K_512[i+j], (i+j)&7);
 			}
 		}
 		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
+		for (uint32_t i = 0; i < 8; i++)
 			r[i] = cuda_swab64(r[i] + IV512[i]);
 		*(uint2x4*)&pHash[0] = *(uint2x4*)&r[0];
 		*(uint2x4*)&pHash[4] = *(uint2x4*)&r[4];
 	}
 }
 __host__
 void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
 {
-	const uint32_t threadsperblock = 512;
+	const uint32_t threadsperblock = 256;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
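The expansion never materializes all 80 schedule words: W_t = W_{t-16} + ssg5_0(W_{t-15}) + W_{t-7} + ssg5_1(W_{t-2}) is folded into a 16-word ring buffer indexed mod 16, which is why W[(i+j) & 15] is updated in place. The launcher also halves the block size from 512 to 256 threads, easing register pressure for the unrolled rounds. A self-contained sketch of the ring-buffer expansion (SSG0/SSG1 spell out the SHA-512 sigma functions behind ssg5_0/ssg5_1):

#ifndef ROTR64
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#define SSG0(x) (ROTR64(x, 1)  ^ ROTR64(x, 8)  ^ ((x) >> 7)) // SHA-512 sigma0
#define SSG1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ ((x) >> 6)) // SHA-512 sigma1

// Rounds 16..79 with a 16-word circular schedule, as in the kernel above.
__device__ void expand_rounds(uint64_t W[16])
{
	for (uint32_t i = 16; i < 80; i += 16) {
		#pragma unroll
		for (uint32_t j = 0; j < 16; j++) {
			const uint32_t t = i + j;
			W[t & 15] += W[(t - 7) & 15] + SSG0(W[(t - 15) & 15]) + SSG1(W[(t - 2) & 15]);
		}
		// ...16 calls to sha512_step2 consume W[0..15] here...
	}
}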

lbry/lbry.cu (61 changed lines)

@@ -1,8 +1,9 @@
 /**
- * Lbry CUDA Implementation
+ * Lbry Algo (sha-256 / sha-512 / ripemd)
  *
- * by tpruvot@github - July 2016
+ * tpruvot and Provos Alexis - Jul / Sep 2016
+ *
  * Sponsored by LBRY.IO team
  */
 #include <string.h>
@@ -64,11 +65,14 @@ extern "C" void lbry_hash(void* output, const void* input)
 extern void lbry_sha256_init(int thr_id);
 extern void lbry_sha256_free(int thr_id);
-extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
+extern void lbry_sha256_setBlock_112(uint32_t *pdata);
 extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash);
 extern void lbry_sha512_init(int thr_id);
 extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern int lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce);
+extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce, const uint64_t target64);
+
+extern void lbry_sha256_setBlock_112_merged(uint32_t *pdata);
+extern void lbry_merged(int thr_id,uint32_t startNonce, uint32_t threads, uint32_t *d_resNonce, const uint64_t target64);
 static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
 	return iftrue ? swab32(val) : val;
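All the new entry points take the top 64 bits of the target, read directly as *(uint64_t*)&ptarget[6] (ptarget is eight little-endian 32-bit words, so words 6..7 are the most significant). That moves the pass/fail test onto the GPU, and only candidate nonces cross the PCIe bus. A hypothetical final-check kernel in that style; it shows the shape of the interface, not the actual lbry kernel:

// Hypothetical final stage: compare each hash's top 64 bits on the device
// and record up to two candidate thread indices (nonce offsets).
__global__ void check_target64(const uint32_t threads, const uint64_t *g_hash64,
                               uint32_t *resNonce, const uint64_t target64)
{
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads) {
		// assumed layout: 4 x uint64 per thread, element 3 = hash words 6..7
		const uint64_t high = g_hash64[thread * 4 + 3];
		if (high <= target64) {
			// slot 0 first (resNonce was memset to 0xFF); overflow to slot 1
			if (atomicCAS(&resNonce[0], 0xFFFFFFFFu, thread) != 0xFFFFFFFFu)
				atomicExch(&resNonce[1], thread);
		}
	}
}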
@@ -97,7 +101,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 	if (device_sm[dev_id] < 350) intensity = 18;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
-	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark) {
 		ptarget[7] = 0xf;
@@ -107,17 +111,16 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
-			// reduce cpu usage (linux)
+			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
-		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
+		if(device_sm[dev_id] <= 500)
+			CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 8 * sizeof(uint64_t) * throughput));
 		lbry_sha256_init(thr_id);
 		lbry_sha512_init(thr_id);
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
 		CUDA_LOG_ERROR();
 		init[thr_id] = true;
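Note that d_hash is now allocated only on SM 5.0 and below: the merged kernel carries the chained state in registers and never needs the 8 * sizeof(uint64_t) (one SHA-512 state) per-thread buffer, and free_lbry below mirrors the same condition. CUDA_SAFE_CALL guards each allocation; a typical definition of such a macro looks like this (ccminer's own version may differ):

#include <cstdio>
#include <cstdlib>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_SAFE_CALL(call) do { \
	cudaError_t err = (call); \
	if (err != cudaSuccess) { \
		fprintf(stderr, "CUDA error '%s' at %s:%d\n", \
		        cudaGetErrorString(err), __FILE__, __LINE__); \
		exit(EXIT_FAILURE); \
	} \
} while (0)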
@@ -127,29 +130,29 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}
-	lbry_sha256_setBlock_112(endiandata, ptarget);
+	if(device_sm[dev_id] <= 500)
+		lbry_sha256_setBlock_112(endiandata);
+	else
+		lbry_sha256_setBlock_112_merged(endiandata);
+	cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
 	do {
-		// Hash with CUDA
-		lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
-		lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
-		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
-		int err = lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
-		if (err) {
-			// reinit
-			free_lbry(thr_id);
-			return -1;
+		if(device_sm[dev_id] <= 500){
+			lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
+			lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
+			lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
+		}else{
+			lbry_merged(thr_id,pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
 		}
+		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 		cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+		*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
 		if (resNonces[0] != UINT32_MAX)
 		{
+			const uint32_t startNonce = pdata[LBC_NONCE_OFT32];
 			resNonces[0] += startNonce;
 			endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap);
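Since the device only checks the hash's top 64 bits, every reported nonce is still reverified on the CPU before submission, as the next hunk shows. A condensed sketch of that confirmation step (assumes lbry.cu's lbry_hash, fulltest and LBC_NONCE_OFT32; the swab32_if byte-order handling is elided):

// Recompute the full 256-bit hash on the host and apply the exact test
// that follows the GPU's 64-bit pre-filter.
static bool confirm_candidate(uint32_t nonce, uint32_t *endiandata,
                              const uint32_t *ptarget)
{
	uint32_t vhash[8];
	endiandata[LBC_NONCE_OFT32] = nonce; // candidate from d_resNonce + startNonce
	lbry_hash(vhash, endiandata);
	return vhash[7] <= ptarget[7] && fulltest(vhash, ptarget);
}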
@@ -157,12 +160,13 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				int res = 1;
-				*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
 				work->nonces[0] = swab32_if(resNonces[0], swap);
 				work_set_target_ratio(work, vhash);
 				if (resNonces[1] != UINT32_MAX) {
+					resNonces[1] += startNonce;
 					if (opt_debug)
-						gpulog(LOG_BLUE, thr_id, "found second nonce %08x", resNonces[1]);
+						gpulog(LOG_BLUE, thr_id, "Found second nonce %08x", swab32(resNonces[1]));
 					endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap);
 					lbry_hash(vhash, endiandata);
 					work->nonces[1] = swab32_if(resNonces[1], swap);
@@ -189,22 +193,23 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 	} while (!work_restart[thr_id].restart);
-	*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + 1;
+	*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce;
 	return 0;
 }

 // cleanup
-extern "C" void free_lbry(int thr_id)
+void free_lbry(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	cudaThreadSynchronize();
-	cudaFree(d_hash[thr_id]);
+	if(device_sm[device_map[thr_id]]<=500)
+		cudaFree(d_hash[thr_id]);
 	cudaFree(d_resNonce[thr_id]);
 	lbry_sha256_free(thr_id);
 	init[thr_id] = false;
