
lbry maxwell and pascal update (up to 10% on pascal)

Based on alexis78's work and sponsored by the LBRY.IO team (thanks).

Release 1.8.2, use cuda 8 for x86
Branch: upstream · Tag: 1.8.2-tpruvot
Tanguy Pruvot · 8 years ago · commit 9f2ed5135b
Changed files (9):

  1. Makefile.am (2 lines changed)
  2. README.txt (7 lines changed)
  3. ccminer.cpp (2 lines changed)
  4. ccminer.vcxproj (5 lines changed)
  5. cuda_helper.h (2 lines changed)
  6. lbry/cuda_lbry_merged.cu (1055 lines changed)
  7. lbry/cuda_sha256_lbry.cu (870 lines changed)
  8. lbry/cuda_sha512_lbry.cu (38 lines changed)
  9. lbry/lbry.cu (55 lines changed)
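
For context before the diffs: lbry's proof-of-work hashes the 112-byte block header through a fixed five-pass chain: sha256d, then sha-512, then one ripemd-160 over each 32-byte half of the sha-512 digest, then sha256d over the two concatenated ripemd outputs. The kernels below implement exactly this chain, and lbry_hash() in lbry/lbry.cu is the CPU reference used for re-verification. A rough sketch using the sph primitives this repo already ships (a sketch only; lbry_hash() is the authoritative version):

// Hedged CPU sketch of the lbry hash chain; see lbry_hash() in lbry/lbry.cu.
#include <string.h>
#include "sph/sph_sha2.h"
#include "sph/sph_ripemd.h"

static void lbry_hash_sketch(void *output, const void *input /* 112-byte header */)
{
	unsigned char hash[64], left[20], right[20];
	sph_sha256_context sha256;
	sph_sha512_context sha512;
	sph_ripemd160_context ripemd;

	sph_sha256_init(&sha256);                 // pass 1+2: sha256d(header)
	sph_sha256(&sha256, input, 112);
	sph_sha256_close(&sha256, hash);
	sph_sha256_init(&sha256);
	sph_sha256(&sha256, hash, 32);
	sph_sha256_close(&sha256, hash);

	sph_sha512_init(&sha512);                 // pass 3: sha512 of the 32-byte digest
	sph_sha512(&sha512, hash, 32);
	sph_sha512_close(&sha512, hash);          // 64 bytes out

	sph_ripemd160_init(&ripemd);              // pass 4: ripemd160 of each half
	sph_ripemd160(&ripemd, hash, 32);
	sph_ripemd160_close(&ripemd, left);
	sph_ripemd160_init(&ripemd);
	sph_ripemd160(&ripemd, hash + 32, 32);
	sph_ripemd160_close(&ripemd, right);

	sph_sha256_init(&sha256);                 // pass 5: sha256d(left || right)
	sph_sha256(&sha256, left, 20);
	sph_sha256(&sha256, right, 20);
	sph_sha256_close(&sha256, hash);
	sph_sha256_init(&sha256);
	sph_sha256(&sha256, hash, 32);
	sph_sha256_close(&sha256, hash);

	memcpy(output, hash, 32);
}

The new "merged" Pascal path fuses all five passes into a single kernel launch, which is presumably where the quoted "up to 10%" comes from; the Maxwell path keeps three kernels and stages the 64-byte sha-512 digests in the d_hash buffer.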

Makefile.am (2 lines changed)

@@ -52,7 +52,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 	sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
 	sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
 	sph/ripemd.c sph/sph_sha2.c \
-	lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
+	lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
 	qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
 	x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 	x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \

README.txt (7 lines changed)

@@ -1,5 +1,5 @@
-ccMiner 1.8.2 (August 2016) "Veltor algo Thor's Riddle streebog"
+ccMiner 1.8.2 (Sept 2016) "Veltor and lbry boost"
 ---------------------------------------------------------------
 ***************************************************************

@@ -245,9 +245,10 @@ features.
 >>> RELEASE HISTORY <<<
-	Aug. 20th 2016 v1.8.2
+	Sep. 22th 2016 v1.8.2
+			lbry improvements by Alexis Provos
 			Prevent Windows hibernate while mining
-			veltor algo
+			veltor algo (basic implementation)

 	Aug. 10th 2016 v1.8.1
 			SIA Blake2-B Algo (getwork over stratum for Suprnova)

ccminer.cpp (2 lines changed)

@@ -2430,7 +2430,7 @@ static void *miner_thread(void *userdata)
 				pthread_mutex_lock(&stats_lock);
 				thr_hashrates[thr_id] = hashes_done / dtime;
 				thr_hashrates[thr_id] *= rate_factor;
-				if (loopcnt > 1) // ignore first (init time)
+				if (loopcnt > 2) // ignore first (init time)
 					stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
 				pthread_mutex_unlock(&stats_lock);
 			}

ccminer.vcxproj (5 lines changed)

@@ -39,7 +39,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
   </ImportGroup>
   <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />

@@ -429,6 +429,7 @@
     <CudaCompile Include="Algo256\cuda_skein256.cu" />
     <CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
     <CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
+    <CudaCompile Include="lbry\cuda_lbry_merged.cu" />
     <CudaCompile Include="lbry\lbry.cu" />
     <CudaCompile Include="pentablake.cu">
       <MaxRegCount>80</MaxRegCount>

@@ -534,7 +535,7 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
   </ImportGroup>
   <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />

cuda_helper.h (2 lines changed)

@@ -481,7 +481,7 @@ static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(
 static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
 static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
 	uint2 result;
 	asm("{ // uint2 a+b \n\t"
 		"add.cc.u32 %0, %2, %4; \n\t"

lbry/cuda_lbry_merged.cu (1055 lines changed)

File diff suppressed because it is too large.

lbry/cuda_sha256_lbry.cu (870 lines changed)

File diff suppressed because it is too large.

lbry/cuda_sha512_lbry.cu (38 lines changed)

@@ -1,20 +1,16 @@
 /**
  * sha-512 CUDA implementation.
- * Tanguy Pruvot and Provos Alexis - JUL 2016
+ * Tanguy Pruvot and Provos Alexis - Jul / Sep 2016
+ * Sponsored by LBRY.IO team
  */

 //#define USE_ROT_ASM_OPT 0
 #include <cuda_helper.h>
 #include <cuda_vector_uint2x4.h>
-#include "miner.h"
+#include <miner.h>

-static __constant__
-#if __CUDA_ARCH__ > 500
-_ALIGN(16)
-#else
-_ALIGN(8)
-#endif
-uint64_t K_512[80] = {
+static __constant__ _ALIGN(8) uint64_t K_512[80] = {
 	0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
 	0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
 	0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,

@@ -58,12 +54,12 @@ uint64_t ROR64_8(const uint64_t u64) {
 #define andor64(a,b,c) ((a & (b | c)) | (b & c))
 #define xandx64(e,f,g) (g ^ (e & (g ^ f)))

-static __device__ __forceinline__
-void sha512_step2(uint64_t* r, const uint64_t W, const uint64_t K, const int ord)
+__device__ __forceinline__
+static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, const int ord)
 {
 	const uint64_t T1 = r[(15-ord) & 7] + K + W + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]);
 	r[(15-ord) & 7] = andor64(r[(8-ord) & 7],r[(9-ord) & 7],r[(10-ord) & 7]) + bsg5_0(r[(8-ord) & 7]) + T1;
-	r[(11-ord) & 7] += T1;
+	r[(11-ord) & 7]+= T1;
 }

 /**************************************************************************************************/
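
The `ord` argument is a register-renaming trick: rather than shuffling all eight state words after every round (h=g; g=f; ...; a=T1+T2), the words stay put in r[] and only the indices rotate, so incrementing `ord` renames the registers for free. The same function again, with editorial comments mapping the slots back to the textbook a..h names (code unchanged from the diff above):

__device__ __forceinline__
static void sha512_step2(uint64_t *const r, const uint64_t W, const uint64_t K, const int ord)
{
	// After `ord` rounds: a=r[(8-ord)&7], b=r[(9-ord)&7], c=r[(10-ord)&7],
	// d=r[(11-ord)&7], e=r[(12-ord)&7], f=r[(13-ord)&7], g=r[(14-ord)&7],
	// h=r[(15-ord)&7].
	// T1 = h + K[t] + W[t] + BSG1(e) + Ch(e,f,g)
	const uint64_t T1 = r[(15-ord) & 7] + K + W
		+ bsg5_1(r[(12-ord) & 7])
		+ xandx64(r[(12-ord) & 7], r[(13-ord) & 7], r[(14-ord) & 7]);
	// h's slot receives the new a = T1 + Maj(a,b,c) + BSG0(a)
	r[(15-ord) & 7] = andor64(r[(8-ord) & 7], r[(9-ord) & 7], r[(10-ord) & 7])
		+ bsg5_0(r[(8-ord) & 7]) + T1;
	// d becomes the new e = d + T1; the next call runs with ord+1
	r[(11-ord) & 7] += T1;
}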
@@ -76,7 +72,6 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 	0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 	};
-
 	uint64_t r[8];
 	uint64_t W[16];

 	if (thread < threads)

@@ -91,7 +86,7 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 		W[4] = 0x8000000000000000; // end tag

 		#pragma unroll
-		for (int i = 5; i < 15; i++) W[i] = 0;
+		for (uint32_t i = 5; i < 15; i++) W[i] = 0;

 		W[15] = 0x100; // 256 bits

@@ -100,31 +95,32 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 			sha512_step2(r, W[i], K_512[i], i&7);
 		}

-		#pragma unroll
-		for (int i = 16; i < 80; i+=16) {
+		#pragma unroll 5
+		for (uint32_t i = 16; i < 80; i+=16){
 			#pragma unroll
-			for (int j = 0; j<16; j++) {
+			for (uint32_t j = 0; j<16; j++){
 				W[(i + j) & 15] += W[((i + j) - 7) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);
 			}
 			#pragma unroll
-			for (int j = 0; j<16; j++) {
+			for (uint32_t j = 0; j<16; j++){
 				sha512_step2(r, W[j], K_512[i+j], (i+j)&7);
 			}
 		}

 		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
+		for (uint32_t i = 0; i < 8; i++)
 			r[i] = cuda_swab64(r[i] + IV512[i]);

 		*(uint2x4*)&pHash[0] = *(uint2x4*)&r[0];
 		*(uint2x4*)&pHash[4] = *(uint2x4*)&r[4];
 	}
 }

 __host__
 void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
 {
-	const uint32_t threadsperblock = 512;
+	const uint32_t threadsperblock = 256;

 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
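
Two details in this kernel are worth spelling out. First, the 80-word message schedule is never materialized: W[] is a 16-word ring updated in place, which is why every index is masked with & 15. A minimal standalone restatement of that update, assuming the standard SHA-512 small-sigma definitions behind the file's ssg5_0/ssg5_1 macros:

#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
// SHA-512 small sigmas (standard definitions; the kernel's ssg5_0/ssg5_1)
#define ssg5_0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ ((x) >> 7))
#define ssg5_1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ ((x) >> 6))

// Expand words t = i..i+15 (i = 16, 32, 48, 64) in place. W[t] only ever
// depends on the previous 16 words, so a 16-entry ring replaces the
// 80-entry array: the slot holding W[t-16] is overwritten with W[t].
static void sha512_schedule_ring(uint64_t W[16], uint32_t i)
{
	for (uint32_t j = 0; j < 16; j++)
		W[(i + j) & 15] += W[((i + j) - 7) & 15]
		                 + ssg5_0(W[((i + j) - 15) & 15])
		                 + ssg5_1(W[((i + j) - 2) & 15]);
}

Second, the threads-per-block drop from 512 to 256 presumably trades block size for per-thread registers: with the rounds fully unrolled this kernel is register-hungry, and smaller blocks give the compiler more headroom. The grid size is the usual ceiling division, (threads + tpb - 1) / tpb.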

lbry/lbry.cu (55 lines changed)

@@ -1,8 +1,9 @@
 /**
- * Lbry CUDA Implementation
+ * Lbry Algo (sha-256 / sha-512 / ripemd)
  *
- * by tpruvot@github - July 2016
+ * tpruvot and Provos Alexis - Jul / Sep 2016
  *
+ * Sponsored by LBRY.IO team
  */

 #include <string.h>

@@ -64,11 +65,14 @@ extern "C" void lbry_hash(void* output, const void* input)
 extern void lbry_sha256_init(int thr_id);
 extern void lbry_sha256_free(int thr_id);
-extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
+extern void lbry_sha256_setBlock_112(uint32_t *pdata);
 extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash);
 extern void lbry_sha512_init(int thr_id);
 extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern int lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce);
+extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce, const uint64_t target64);
+extern void lbry_sha256_setBlock_112_merged(uint32_t *pdata);
+extern void lbry_merged(int thr_id,uint32_t startNonce, uint32_t threads, uint32_t *d_resNonce, const uint64_t target64);

 static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
 	return iftrue ? swab32(val) : val;
@@ -97,7 +101,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 	if (device_sm[dev_id] < 350) intensity = 18;

 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
-	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

 	if (opt_benchmark) {
 		ptarget[7] = 0xf;

@@ -107,17 +111,16 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
-			// reduce cpu usage (linux)
+			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}

-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
+		if(device_sm[dev_id] <= 500)
+			CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 8 * sizeof(uint64_t) * throughput));
 		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
-		lbry_sha256_init(thr_id);
-		lbry_sha512_init(thr_id);
 		CUDA_LOG_ERROR();

 		init[thr_id] = true;
@@ -127,29 +130,29 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-	lbry_sha256_setBlock_112(endiandata, ptarget);
+	if(device_sm[dev_id] <= 500)
+		lbry_sha256_setBlock_112(endiandata);
+	else
+		lbry_sha256_setBlock_112_merged(endiandata);
 	cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));

 	do {
 		// Hash with CUDA
-		lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
-		lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
-
-		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
-		int err = lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
-		if (err) {
-			// reinit
-			free_lbry(thr_id);
-			return -1;
+		if(device_sm[dev_id] <= 500){
+			lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
+			lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
+			lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
+		}else{
+			lbry_merged(thr_id,pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
 		}
+		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 		cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+		*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;

 		if (resNonces[0] != UINT32_MAX)
 		{
 			const uint32_t startNonce = pdata[LBC_NONCE_OFT32];
 			resNonces[0] += startNonce;
 			endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap);

@@ -157,12 +160,13 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				int res = 1;
-				*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
 				work->nonces[0] = swab32_if(resNonces[0], swap);
 				work_set_target_ratio(work, vhash);
 				if (resNonces[1] != UINT32_MAX) {
 					resNonces[1] += startNonce;
 					if (opt_debug)
-						gpulog(LOG_BLUE, thr_id, "found second nonce %08x", resNonces[1]);
+						gpulog(LOG_BLUE, thr_id, "Found second nonce %08x", swab32(resNonces[1]));
 					endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap);
 					lbry_hash(vhash, endiandata);
 					work->nonces[1] = swab32_if(resNonces[1], swap);

@@ -189,22 +193,23 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 	} while (!work_restart[thr_id].restart);

-	*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + 1;
+	*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce;
 	return 0;
 }

 // cleanup
-extern "C" void free_lbry(int thr_id)
+void free_lbry(int thr_id)
 {
 	if (!init[thr_id])
 		return;

 	cudaThreadSynchronize();

+	if(device_sm[device_map[thr_id]]<=500)
 		cudaFree(d_hash[thr_id]);
 	cudaFree(d_resNonce[thr_id]);
-	lbry_sha256_free(thr_id);

 	init[thr_id] = false;
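
The new kernel signatures carry the structural change of this commit: the host passes the top 64 bits of the target (*(uint64_t*)&ptarget[6]) into lbry_sha256d_hash_final() / lbry_merged(), the cut-off test runs on the GPU, and only two candidate nonce slots come back, so the old err/reinit path disappears and d_hash is allocated and freed only for the SM <= 5.0 three-kernel path. The actual device code lives in the suppressed cuda_lbry_merged.cu / cuda_sha256_lbry.cu diffs; as a hedged sketch, the final per-thread check in kernels of this style typically looks something like the following (store_candidate and its slot handling are illustrative, not the commit's code):

#include <stdint.h>

// Illustrative only: device-side target test writing at most two hits.
// resNonce[0..1] are memset to 0xFF by the host before the scan loop,
// so UINT32_MAX doubles as the "empty slot" marker.
__device__ __forceinline__
void store_candidate(const uint64_t high64, const uint32_t thread,
                     uint32_t *resNonce, const uint64_t target64)
{
	if (high64 <= target64) {
		// claim slot 0; if another thread got there first, try slot 1
		if (atomicCAS(&resNonce[0], UINT32_MAX, thread) != UINT32_MAX)
			atomicCAS(&resNonce[1], UINT32_MAX, thread);
	}
}

Whatever comes back is still re-verified on the CPU: scanhash_lbry recomputes the full chain with lbry_hash() and runs fulltest() before submitting, as the hunks above show.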
