mirror of
https://github.com/GOSTSec/ccminer
synced 2025-03-09 12:01:09 +00:00
lbry maxwell and pascal update (up to 10% on pascal)
Based on alexis78's work and sponsored by the LBRY.IO team (thanks). Release 1.8.2; use CUDA 8 for x86.
This commit is contained in:
parent
f982fd70d5
commit
9f2ed5135b
@ -52,7 +52,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
||||
sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
|
||||
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
|
||||
sph/ripemd.c sph/sph_sha2.c \
|
||||
lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
|
||||
lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
|
||||
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
|
||||
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
|
||||
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
ccMiner 1.8.2 (August 2016) "Veltor algo Thor's Riddle streebog"
|
||||
ccMiner 1.8.2 (Sept 2016) "Veltor and lbry boost"
|
||||
---------------------------------------------------------------
|
||||
|
||||
***************************************************************
|
||||
@ -245,9 +245,10 @@ features.
|
||||
|
||||
>>> RELEASE HISTORY <<<
|
||||
|
||||
Aug. 20th 2016 v1.8.2
|
||||
Sep. 22nd 2016 v1.8.2
|
||||
lbry improvements by Alexis Provos
|
||||
Prevent Windows hibernate while mining
|
||||
veltor algo
|
||||
veltor algo (basic implementation)
|
||||
|
||||
Aug. 10th 2016 v1.8.1
|
||||
SIA Blake2-B Algo (getwork over stratum for Suprnova)
|
||||
|
@ -2430,7 +2430,7 @@ static void *miner_thread(void *userdata)
|
||||
pthread_mutex_lock(&stats_lock);
|
||||
thr_hashrates[thr_id] = hashes_done / dtime;
|
||||
thr_hashrates[thr_id] *= rate_factor;
|
||||
if (loopcnt > 1) // ignore first (init time)
|
||||
if (loopcnt > 2) // ignore first (init time)
|
||||
stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
|
||||
pthread_mutex_unlock(&stats_lock);
|
||||
}
|
||||
|
@ -39,7 +39,7 @@
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
|
||||
@ -429,6 +429,7 @@
|
||||
<CudaCompile Include="Algo256\cuda_skein256.cu" />
|
||||
<CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
|
||||
<CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
|
||||
<CudaCompile Include="lbry\cuda_lbry_merged.cu" />
|
||||
<CudaCompile Include="lbry\lbry.cu" />
|
||||
<CudaCompile Include="pentablake.cu">
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
@ -534,7 +535,7 @@
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
|
||||
|
@ -481,7 +481,7 @@ static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(
|
||||
static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
|
||||
|
||||
static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
|
||||
uint2 result;
|
||||
asm("{ // uint2 a+b \n\t"
|
||||
"add.cc.u32 %0, %2, %4; \n\t"
|
||||
|
1055
lbry/cuda_lbry_merged.cu
Normal file
1055
lbry/cuda_lbry_merged.cu
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,20 +1,16 @@
|
||||
/**
|
||||
* sha-512 CUDA implementation.
|
||||
* Tanguy Pruvot and Provos Alexis - JUL 2016
|
||||
* Tanguy Pruvot and Provos Alexis - Jul / Sep 2016
|
||||
* Sponsored by LBRY.IO team
|
||||
*/
|
||||
|
||||
//#define USE_ROT_ASM_OPT 0
|
||||
#include <cuda_helper.h>
|
||||
#include <cuda_vector_uint2x4.h>
|
||||
#include "miner.h"
|
||||
|
||||
static __constant__
|
||||
#if __CUDA_ARCH__ > 500
|
||||
_ALIGN(16)
|
||||
#else
|
||||
_ALIGN(8)
|
||||
#endif
|
||||
uint64_t K_512[80] = {
|
||||
#include <miner.h>
|
||||
|
||||
static __constant__ _ALIGN(8) uint64_t K_512[80] = {
|
||||
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
|
||||
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
|
||||
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
|
||||
@ -58,12 +54,12 @@ uint64_t ROR64_8(const uint64_t u64) {
|
||||
#define andor64(a,b,c) ((a & (b | c)) | (b & c))
|
||||
#define xandx64(e,f,g) (g ^ (e & (g ^ f)))
|
||||
|
||||
static __device__ __forceinline__
|
||||
void sha512_step2(uint64_t* r, const uint64_t W, const uint64_t K, const int ord)
|
||||
__device__ __forceinline__
|
||||
static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, const int ord)
|
||||
{
|
||||
const uint64_t T1 = r[(15-ord) & 7] + K + W + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]);
|
||||
r[(15-ord) & 7] = andor64(r[(8-ord) & 7],r[(9-ord) & 7],r[(10-ord) & 7]) + bsg5_0(r[(8-ord) & 7]) + T1;
|
||||
r[(11-ord) & 7] += T1;
|
||||
r[(11-ord) & 7]+= T1;
|
||||
}
|
||||
|
||||
/**************************************************************************************************/
|
||||
@ -76,7 +72,6 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
uint64_t r[8];
|
||||
uint64_t W[16];
|
||||
if (thread < threads)
|
||||
@ -91,7 +86,7 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
|
||||
W[4] = 0x8000000000000000; // end tag
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 5; i < 15; i++) W[i] = 0;
|
||||
for (uint32_t i = 5; i < 15; i++) W[i] = 0;
|
||||
|
||||
W[15] = 0x100; // 256 bits
|
||||
|
||||
@ -100,31 +95,32 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
|
||||
sha512_step2(r, W[i], K_512[i], i&7);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 16; i < 80; i+=16) {
|
||||
#pragma unroll 5
|
||||
for (uint32_t i = 16; i < 80; i+=16){
|
||||
#pragma unroll
|
||||
for (int j = 0; j<16; j++) {
|
||||
for (uint32_t j = 0; j<16; j++){
|
||||
W[(i + j) & 15] += W[((i + j) - 7) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j = 0; j<16; j++) {
|
||||
for (uint32_t j = 0; j<16; j++){
|
||||
sha512_step2(r, W[j], K_512[i+j], (i+j)&7);
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll 8
|
||||
for (int i = 0; i < 8; i++)
|
||||
for (uint32_t i = 0; i < 8; i++)
|
||||
r[i] = cuda_swab64(r[i] + IV512[i]);
|
||||
|
||||
*(uint2x4*)&pHash[0] = *(uint2x4*)&r[0];
|
||||
*(uint2x4*)&pHash[4] = *(uint2x4*)&r[4];
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
__host__
|
||||
void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
|
||||
{
|
||||
const uint32_t threadsperblock = 512;
|
||||
const uint32_t threadsperblock = 256;
|
||||
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
61
lbry/lbry.cu
61
lbry/lbry.cu
@ -1,8 +1,9 @@
|
||||
/**
|
||||
* Lbry CUDA Implementation
|
||||
* Lbry Algo (sha-256 / sha-512 / ripemd)
|
||||
*
|
||||
* by tpruvot@github - July 2016
|
||||
* tpruvot and Provos Alexis - Jul / Sep 2016
|
||||
*
|
||||
* Sponsored by LBRY.IO team
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
@ -64,11 +65,14 @@ extern "C" void lbry_hash(void* output, const void* input)
|
||||
|
||||
extern void lbry_sha256_init(int thr_id);
|
||||
extern void lbry_sha256_free(int thr_id);
|
||||
extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
|
||||
extern void lbry_sha256_setBlock_112(uint32_t *pdata);
|
||||
extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash);
|
||||
extern void lbry_sha512_init(int thr_id);
|
||||
extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash);
|
||||
extern int lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce);
|
||||
extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce, const uint64_t target64);
|
||||
|
||||
extern void lbry_sha256_setBlock_112_merged(uint32_t *pdata);
|
||||
extern void lbry_merged(int thr_id,uint32_t startNonce, uint32_t threads, uint32_t *d_resNonce, const uint64_t target64);
|
||||
|
||||
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
|
||||
return iftrue ? swab32(val) : val;
|
||||
@ -97,7 +101,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
if (device_sm[dev_id] < 350) intensity = 18;
|
||||
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
|
||||
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
if (opt_benchmark) {
|
||||
ptarget[7] = 0xf;
|
||||
@ -107,17 +111,16 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
cudaSetDevice(dev_id);
|
||||
if (opt_cudaschedule == -1 && gpu_threads == 1) {
|
||||
cudaDeviceReset();
|
||||
// reduce cpu usage (linux)
|
||||
// reduce cpu usage
|
||||
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
|
||||
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
|
||||
CUDA_LOG_ERROR();
|
||||
}
|
||||
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
|
||||
if(device_sm[dev_id] <= 500)
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 8 * sizeof(uint64_t) * throughput));
|
||||
|
||||
lbry_sha256_init(thr_id);
|
||||
lbry_sha512_init(thr_id);
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
|
||||
CUDA_LOG_ERROR();
|
||||
|
||||
init[thr_id] = true;
|
||||
@ -127,29 +130,29 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
|
||||
lbry_sha256_setBlock_112(endiandata, ptarget);
|
||||
if(device_sm[dev_id] <= 500)
|
||||
lbry_sha256_setBlock_112(endiandata);
|
||||
else
|
||||
lbry_sha256_setBlock_112_merged(endiandata);
|
||||
|
||||
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
|
||||
|
||||
do {
|
||||
// Hash with CUDA
|
||||
lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
|
||||
lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
|
||||
|
||||
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
|
||||
int err = lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
|
||||
if (err) {
|
||||
// reinit
|
||||
free_lbry(thr_id);
|
||||
return -1;
|
||||
if(device_sm[dev_id] <= 500){
|
||||
lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
|
||||
lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
|
||||
lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
|
||||
}else{
|
||||
lbry_merged(thr_id,pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
|
||||
}
|
||||
|
||||
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
|
||||
cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
|
||||
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
|
||||
|
||||
if (resNonces[0] != UINT32_MAX)
|
||||
{
|
||||
const uint32_t startNonce = pdata[LBC_NONCE_OFT32];
|
||||
|
||||
resNonces[0] += startNonce;
|
||||
|
||||
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap);
|
||||
@ -157,12 +160,13 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
|
||||
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
|
||||
int res = 1;
|
||||
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
|
||||
work->nonces[0] = swab32_if(resNonces[0], swap);
|
||||
work_set_target_ratio(work, vhash);
|
||||
if (resNonces[1] != UINT32_MAX) {
|
||||
resNonces[1] += startNonce;
|
||||
if (opt_debug)
|
||||
gpulog(LOG_BLUE, thr_id, "found second nonce %08x", resNonces[1]);
|
||||
gpulog(LOG_BLUE, thr_id, "Found second nonce %08x", swab32(resNonces[1]));
|
||||
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap);
|
||||
lbry_hash(vhash, endiandata);
|
||||
work->nonces[1] = swab32_if(resNonces[1], swap);
|
||||
@ -189,22 +193,23 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
|
||||
} while (!work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + 1;
|
||||
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// cleanup
|
||||
extern "C" void free_lbry(int thr_id)
|
||||
void free_lbry(int thr_id)
|
||||
{
|
||||
if (!init[thr_id])
|
||||
return;
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
if(device_sm[device_map[thr_id]]<=500)
|
||||
cudaFree(d_hash[thr_id]);
|
||||
|
||||
cudaFree(d_resNonce[thr_id]);
|
||||
lbry_sha256_free(thr_id);
|
||||
|
||||
init[thr_id] = false;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user