mirror of
https://github.com/GOSTSec/ccminer
synced 2025-08-30 07:42:11 +00:00
lyra2RE: link the merged blake/keccak kernel into algos
old keccak256_gpu_hash_32 kernel commented to reduce binary size compat. not yet tested on old cards
This commit is contained in:
parent
18d29914ec
commit
b70409ab5b
@ -2,6 +2,8 @@
|
||||
* Blake-256 Cuda Kernel (Tested on SM 5.0)
|
||||
*
|
||||
* Tanguy Pruvot - Nov. 2014
|
||||
*
|
||||
* + merged blake+keccak kernel for lyra2v2
|
||||
*/
|
||||
extern "C" {
|
||||
#include "sph/sph_blake.h"
|
||||
@ -14,20 +16,17 @@ extern "C" {
|
||||
#ifdef __INTELLISENSE__
|
||||
/* just for vstudio code colors */
|
||||
__device__ uint32_t __byte_perm(uint32_t a, uint32_t b, uint32_t c);
|
||||
|
||||
#endif
|
||||
|
||||
#define UINT2(x,y) make_uint2(x,y)
|
||||
|
||||
__device__ __inline__ uint2 ROR8(const uint2 a)
|
||||
{
|
||||
__device__ __inline__ uint2 ROR8(const uint2 a) {
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x0765);
|
||||
result.y = __byte_perm(a.x, a.y, 0x0765);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static __device__ uint64_t cuda_swab32ll(uint64_t x) {
|
||||
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
|
||||
}
|
||||
@ -297,9 +296,9 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
if (thread < threads)
|
||||
{
|
||||
const uint32_t nonce = startNonce + thread;
|
||||
uint32_t h[8];
|
||||
// uint32_t input[4];
|
||||
const uint32_t T0 = 640;
|
||||
|
||||
uint32_t h[8];
|
||||
#pragma unroll 8
|
||||
for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
|
||||
|
||||
@ -311,8 +310,7 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
0, 1, 0, 640
|
||||
};
|
||||
|
||||
const uint32_t u256[16] =
|
||||
{
|
||||
const uint32_t u256[16] = {
|
||||
0x243F6A88, 0x85A308D3,
|
||||
0x13198A2E, 0x03707344,
|
||||
0xA4093822, 0x299F31D0,
|
||||
@ -323,8 +321,7 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
0x3F84D5B5, 0xB5470917
|
||||
};
|
||||
|
||||
uint32_t m[16] =
|
||||
{
|
||||
uint32_t m[16] = {
|
||||
c_data[0], c_data[1], c_data[2], nonce,
|
||||
c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3],
|
||||
c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7],
|
||||
@ -380,7 +377,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
GSPREC(1, 6, 0xB, 0xC, 5, 10);
|
||||
GSPREC(2, 7, 0x8, 0xD, 4, 0);
|
||||
GSPREC(3, 4, 0x9, 0xE, 15, 8);
|
||||
|
||||
// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
GSPREC(0, 4, 0x8, 0xC, 9, 0);
|
||||
GSPREC(1, 5, 0x9, 0xD, 5, 7);
|
||||
@ -399,7 +395,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
GSPREC(1, 6, 0xB, 0xC, 7, 5);
|
||||
GSPREC(2, 7, 0x8, 0xD, 15, 14);
|
||||
GSPREC(3, 4, 0x9, 0xE, 1, 9);
|
||||
|
||||
// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
GSPREC(0, 4, 0x8, 0xC, 12, 5);
|
||||
GSPREC(1, 5, 0x9, 0xD, 1, 15);
|
||||
@ -409,7 +404,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
GSPREC(1, 6, 0xB, 0xC, 6, 3);
|
||||
GSPREC(2, 7, 0x8, 0xD, 9, 2);
|
||||
GSPREC(3, 4, 0x9, 0xE, 8, 11);
|
||||
|
||||
// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
GSPREC(0, 4, 0x8, 0xC, 13, 11);
|
||||
GSPREC(1, 5, 0x9, 0xD, 7, 14);
|
||||
@ -446,7 +440,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
GSPREC(1, 6, 0xB, 0xC, 10, 11);
|
||||
GSPREC(2, 7, 0x8, 0xD, 12, 13);
|
||||
GSPREC(3, 4, 0x9, 0xE, 14, 15);
|
||||
|
||||
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
GSPREC(0, 4, 0x8, 0xC, 14, 10);
|
||||
GSPREC(1, 5, 0x9, 0xD, 4, 8);
|
||||
@ -456,7 +449,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
GSPREC(1, 6, 0xB, 0xC, 0, 2);
|
||||
GSPREC(2, 7, 0x8, 0xD, 11, 7);
|
||||
GSPREC(3, 4, 0x9, 0xE, 5, 3);
|
||||
|
||||
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
GSPREC(0, 4, 0x8, 0xC, 11, 8);
|
||||
GSPREC(1, 5, 0x9, 0xD, 12, 0);
|
||||
@ -476,9 +468,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
GSPREC(2, 7, 0x8, 0xD, 4, 0);
|
||||
GSPREC(3, 4, 0x9, 0xE, 15, 8);
|
||||
|
||||
|
||||
|
||||
|
||||
h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]);
|
||||
h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]);
|
||||
h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]);
|
||||
@ -501,14 +490,12 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
|
||||
|
||||
keccak_gpu_state[16] = UINT2(0, 0x80000000);
|
||||
keccak_block(keccak_gpu_state);
|
||||
|
||||
uint64_t *outputHash = (uint64_t *)Hash;
|
||||
#pragma unroll 4
|
||||
for (int i = 0; i<4; i++)
|
||||
outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(256, 3)
|
||||
@ -568,6 +555,8 @@ void blake256_cpu_init(int thr_id, uint32_t threads)
|
||||
cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
/** for lyra2v2 **/
|
||||
|
||||
__host__
|
||||
void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
|
||||
{
|
||||
|
@ -356,6 +356,7 @@ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce,
|
||||
|
||||
cubehash256_gpu_hash_32 <<<grid, block >>> (threads, startNounce, (uint2*)d_hash);
|
||||
}
|
||||
|
||||
__host__
|
||||
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream)
|
||||
{
|
||||
|
@ -212,6 +212,7 @@ void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, ui
|
||||
memcpy(resNonces, h_nonces[thr_id], NBN*sizeof(uint32_t));
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if __CUDA_ARCH__ <= 500
|
||||
__global__ __launch_bounds__(TPB50, 2)
|
||||
#else
|
||||
@ -306,6 +307,7 @@ void keccak256_cpu_hash_32(const int thr_id,const uint32_t threads, uint2* d_has
|
||||
|
||||
keccak256_gpu_hash_32 <<<grid, block>>> (threads, d_hash);
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__
|
||||
void keccak256_setBlock_80(uint64_t *endiandata)
|
||||
|
@ -231,6 +231,7 @@ uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounc
|
||||
return result;
|
||||
}
|
||||
|
||||
#if 0
|
||||
__global__ __launch_bounds__(256,3)
|
||||
void keccak256_sm3_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
|
||||
{
|
||||
@ -282,6 +283,7 @@ void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, u
|
||||
keccak256_sm3_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__
|
||||
void keccak256_sm3_setBlock_80(void *pdata,const void *pTargetIn)
|
||||
|
@ -164,7 +164,7 @@
|
||||
#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "2.2.3"
|
||||
#define PACKAGE_VERSION "2.2.4"
|
||||
|
||||
/* If using the C implementation of alloca, define if you know the
|
||||
direction of stack growth for your system; otherwise it will be
|
||||
|
@ -1,4 +1,4 @@
|
||||
AC_INIT([ccminer], [2.2.3], [], [ccminer], [http://github.com/tpruvot/ccminer])
|
||||
AC_INIT([ccminer], [2.2.4], [], [ccminer], [http://github.com/tpruvot/ccminer])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
@ -13,12 +13,14 @@ static uint64_t* d_hash[MAX_GPUS];
|
||||
static uint64_t* d_matrix[MAX_GPUS];
|
||||
|
||||
extern void blake256_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
|
||||
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
|
||||
//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
|
||||
|
||||
extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
|
||||
extern void keccak256_sm3_init(int thr_id, uint32_t threads);
|
||||
extern void keccak256_sm3_free(int thr_id);
|
||||
//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
|
||||
//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
|
||||
//extern void keccak256_sm3_free(int thr_id);
|
||||
|
||||
extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
|
||||
|
||||
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
|
||||
extern void skein256_cpu_init(int thr_id, uint32_t threads);
|
||||
@ -98,10 +100,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
|
||||
|
||||
blake256_cpu_init(thr_id, throughput);
|
||||
keccak256_sm3_init(thr_id, throughput);
|
||||
//keccak256_sm3_init(thr_id, throughput);
|
||||
skein256_cpu_init(thr_id, throughput);
|
||||
groestl256_cpu_init(thr_id, throughput);
|
||||
|
||||
//cuda_get_arch(thr_id);
|
||||
if (device_sm[dev_id] >= 500)
|
||||
{
|
||||
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
|
||||
@ -124,8 +127,9 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
do {
|
||||
int order = 0;
|
||||
|
||||
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
|
||||
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
|
||||
@ -187,7 +191,7 @@ extern "C" void free_lyra2(int thr_id)
|
||||
cudaFree(d_hash[thr_id]);
|
||||
cudaFree(d_matrix[thr_id]);
|
||||
|
||||
keccak256_sm3_free(thr_id);
|
||||
//keccak256_sm3_free(thr_id);
|
||||
groestl256_cpu_free(thr_id);
|
||||
|
||||
init[thr_id] = false;
|
||||
|
@ -14,12 +14,14 @@ static uint64_t *d_hash[MAX_GPUS];
|
||||
static uint64_t* d_matrix[MAX_GPUS];
|
||||
|
||||
extern void blake256_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
|
||||
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
|
||||
//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
|
||||
|
||||
extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
|
||||
extern void keccak256_sm3_init(int thr_id, uint32_t threads);
|
||||
extern void keccak256_sm3_free(int thr_id);
|
||||
//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
|
||||
//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
|
||||
//extern void keccak256_sm3_free(int thr_id);
|
||||
|
||||
extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
|
||||
|
||||
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
|
||||
extern void skein256_cpu_init(int thr_id, uint32_t threads);
|
||||
@ -103,10 +105,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
|
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
|
||||
|
||||
blake256_cpu_init(thr_id, throughput);
|
||||
keccak256_sm3_init(thr_id,throughput);
|
||||
//keccak256_sm3_init(thr_id,throughput);
|
||||
skein256_cpu_init(thr_id, throughput);
|
||||
bmw256_cpu_init(thr_id, throughput);
|
||||
|
||||
cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256
|
||||
|
||||
// SM 3 implentation requires a bit more memory
|
||||
if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
|
||||
matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
|
||||
@ -130,8 +134,9 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
|
||||
do {
|
||||
int order = 0;
|
||||
|
||||
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
@ -196,7 +201,7 @@ extern "C" void free_lyra2v2(int thr_id)
|
||||
cudaFree(d_matrix[thr_id]);
|
||||
|
||||
bmw256_cpu_free(thr_id);
|
||||
keccak256_sm3_free(thr_id);
|
||||
//keccak256_sm3_free(thr_id);
|
||||
|
||||
init[thr_id] = false;
|
||||
|
||||
|
@ -60,6 +60,7 @@ extern "C" int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce
|
||||
CUDA_LOG_ERROR();
|
||||
}
|
||||
|
||||
cuda_get_arch(thr_id);
|
||||
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
|
||||
if (device_sm[dev_id] <= 500) intensity = 15;
|
||||
throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
|
||||
|
Loading…
x
Reference in New Issue
Block a user