
lyra2RE: link the merged blake/keccak kernel into algos

The old keccak256_gpu_hash_32 kernel is commented out to reduce binary size.

Compatibility with older cards is not yet tested.
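
The practical effect on the algo side is a condensed version of the scanhash inner loop in lyra2/lyra2RE.cu and lyra2/lyra2REv2.cu, as the diffs below show; the sketch here only excerpts those call sites, and the comments are explanatory, not part of the commit:

	int order = 0;
	// before: two launches, blake-256 over the 80-byte header, then keccak-256 over the 32-byte chain hash
	//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
	//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
	// after: one fused blake+keccak launch from Algo256/cuda_blake256.cu
	blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);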
Tanguy Pruvot committed 7 years ago · commit b70409ab5b
9 changed files (changed lines per file):
  1. Algo256/cuda_blake256.cu (71)
  2. Algo256/cuda_cubehash256.cu (11)
  3. Algo256/cuda_keccak256.cu (2)
  4. Algo256/cuda_keccak256_sm3.cu (2)
  5. compat/ccminer-config.h (2)
  6. configure.ac (2)
  7. lyra2/lyra2RE.cu (20)
  8. lyra2/lyra2REv2.cu (21)
  9. lyra2/lyra2Z.cu (1)

Algo256/cuda_blake256.cu (71)

@@ -1,8 +1,10 @@
 /**
  * Blake-256 Cuda Kernel (Tested on SM 5.0)
  *
  * Tanguy Pruvot - Nov. 2014
+ *
+ * + merged blake+keccak kernel for lyra2v2
  */
 extern "C" {
 #include "sph/sph_blake.h"
 }
@@ -14,20 +16,17 @@ extern "C" {
 #ifdef __INTELLISENSE__
 /* just for vstudio code colors */
 __device__ uint32_t __byte_perm(uint32_t a, uint32_t b, uint32_t c);
 #endif
 #define UINT2(x,y) make_uint2(x,y)
-__device__ __inline__ uint2 ROR8(const uint2 a)
-{
+__device__ __inline__ uint2 ROR8(const uint2 a) {
 	uint2 result;
 	result.x = __byte_perm(a.y, a.x, 0x0765);
 	result.y = __byte_perm(a.x, a.y, 0x0765);
 	return result;
 }
 static __device__ uint64_t cuda_swab32ll(uint64_t x) {
 	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
 }
@@ -193,12 +192,12 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3
 	m[2] = block[2];
 	m[3] = block[3];
 	#pragma unroll
 	for (int i = 4; i < 16; i++) {
 		m[i] = c_Padding[i];
 	}
 	#pragma unroll 8
 	for (int i = 0; i < 8; i++)
 		v[i] = h[i];
@@ -212,7 +211,7 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3
 	v[14] = u256[6];
 	v[15] = u256[7];
 	#pragma unroll 14
 	for (int r = 0; r < 14; r++) {
 		/* column step */
 		GS2(0, 4, 0x8, 0xC, 0x0);
@@ -226,7 +225,7 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3
 		GS2(3, 4, 0x9, 0xE, 0xE);
 	}
 	#pragma unroll 16
 	for (int i = 0; i < 16; i++) {
 		int j = i & 7;
 		h[j] ^= v[i];
@@ -238,10 +237,10 @@ static void __forceinline__ __device__ keccak_block(uint2 *s)
 	uint2 bc[5], tmpxor[5], u, v;
 	// uint2 s[25];
 	#pragma unroll 1
 	for (int i = 0; i < 24; i++)
 	{
 		#pragma unroll
 		for (uint32_t x = 0; x < 5; x++)
 			tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
@@ -297,10 +296,10 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
-		uint32_t h[8];
-		// uint32_t input[4];
 		const uint32_t T0 = 640;
-		#pragma unroll 8
+
+		uint32_t h[8];
+		#pragma unroll 8
 		for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
 		uint32_t v[16];
@@ -311,8 +310,7 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 			0, 1, 0, 640
 		};
-		const uint32_t u256[16] =
-		{
+		const uint32_t u256[16] = {
 			0x243F6A88, 0x85A308D3,
 			0x13198A2E, 0x03707344,
 			0xA4093822, 0x299F31D0,
@@ -323,15 +321,14 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 			0x3F84D5B5, 0xB5470917
 		};
-		uint32_t m[16] =
-		{
+		uint32_t m[16] = {
 			c_data[0], c_data[1], c_data[2], nonce,
 			c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3],
 			c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7],
 			c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11]
 		};
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 			v[i] = h[i];
@@ -380,7 +377,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 5, 10);
 		GSPREC(2, 7, 0x8, 0xD, 4, 0);
 		GSPREC(3, 4, 0x9, 0xE, 15, 8);
 		// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
 		GSPREC(0, 4, 0x8, 0xC, 9, 0);
 		GSPREC(1, 5, 0x9, 0xD, 5, 7);
@@ -399,7 +395,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 7, 5);
 		GSPREC(2, 7, 0x8, 0xD, 15, 14);
 		GSPREC(3, 4, 0x9, 0xE, 1, 9);
 		// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
 		GSPREC(0, 4, 0x8, 0xC, 12, 5);
 		GSPREC(1, 5, 0x9, 0xD, 1, 15);
@@ -409,7 +404,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 6, 3);
 		GSPREC(2, 7, 0x8, 0xD, 9, 2);
 		GSPREC(3, 4, 0x9, 0xE, 8, 11);
 		// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
 		GSPREC(0, 4, 0x8, 0xC, 13, 11);
 		GSPREC(1, 5, 0x9, 0xD, 7, 14);
@@ -446,7 +440,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 10, 11);
 		GSPREC(2, 7, 0x8, 0xD, 12, 13);
 		GSPREC(3, 4, 0x9, 0xE, 14, 15);
 		// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 		GSPREC(0, 4, 0x8, 0xC, 14, 10);
 		GSPREC(1, 5, 0x9, 0xD, 4, 8);
@@ -456,7 +449,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 0, 2);
 		GSPREC(2, 7, 0x8, 0xD, 11, 7);
 		GSPREC(3, 4, 0x9, 0xE, 5, 3);
 		// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 		GSPREC(0, 4, 0x8, 0xC, 11, 8);
 		GSPREC(1, 5, 0x9, 0xD, 12, 0);
@@ -476,9 +468,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(2, 7, 0x8, 0xD, 4, 0);
 		GSPREC(3, 4, 0x9, 0xE, 15, 8);
 		h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]);
 		h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]);
 		h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]);
@@ -501,14 +490,12 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		keccak_gpu_state[16] = UINT2(0, 0x80000000);
 		keccak_block(keccak_gpu_state);
 		uint64_t *outputHash = (uint64_t *)Hash;
 		#pragma unroll 4
 		for (int i = 0; i<4; i++)
 			outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]);
 	}
 }
 __global__ __launch_bounds__(256, 3)
@@ -520,16 +507,16 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
 		uint32_t h[8];
 		uint32_t input[4];
 		#pragma unroll
 		for (int i = 0; i < 8; i++) h[i] = cpu_h[i];
 		#pragma unroll
 		for (int i = 0; i < 3; ++i) input[i] = c_data[i];
 		input[3] = startNonce + thread;
 		blake256_compress2nd(h, input, 640);
 		#pragma unroll
 		for (int i = 0; i<4; i++) {
 			Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2 * i + 1]));
 		}
@@ -568,6 +555,8 @@ void blake256_cpu_init(int thr_id, uint32_t threads)
 	cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
 }
+/** for lyra2v2 **/
 __host__
 void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
 {
@@ -576,7 +565,7 @@ void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
-	blakeKeccak256_gpu_hash_80 << <grid, block >> > (threads, startNonce, (uint32_t *)Hash);
+	blakeKeccak256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, (uint32_t *)Hash);
 }
 __host__
@@ -587,5 +576,5 @@ void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
-	blakeKeccak256_gpu_hash_80 << <grid, block, 0, stream >> > (threads, startNonce, (uint32_t *)Hash);
+	blakeKeccak256_gpu_hash_80 <<<grid, block, 0, stream>>> (threads, startNonce, (uint32_t *)Hash);
 }

Algo256/cuda_cubehash256.cu (11)

@@ -267,9 +267,9 @@ void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
 }
 #if __CUDA_ARCH__ >= 500
 __global__ __launch_bounds__(TPB50, 1)
 #else
 __global__ __launch_bounds__(TPB35, 1)
 #endif
 void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
 {
@@ -354,8 +354,9 @@ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce,
 	dim3 grid((threads + tpb - 1) / tpb);
 	dim3 block(tpb);
-	cubehash256_gpu_hash_32 << <grid, block >> > (threads, startNounce, (uint2*)d_hash);
+	cubehash256_gpu_hash_32 <<<grid, block >>> (threads, startNounce, (uint2*)d_hash);
 }
 __host__
 void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream)
 {
@@ -365,5 +366,5 @@ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce,
 	dim3 grid((threads + tpb - 1) / tpb);
 	dim3 block(tpb);
-	cubehash256_gpu_hash_32 << <grid, block, 0, stream >> > (threads, startNounce, (uint2*)d_hash);
+	cubehash256_gpu_hash_32 <<<grid, block, 0, stream >>> (threads, startNounce, (uint2*)d_hash);
 }

Algo256/cuda_keccak256.cu (2)

@@ -212,6 +212,7 @@ void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, ui
 	memcpy(resNonces, h_nonces[thr_id], NBN*sizeof(uint32_t));
 }
+#if 0
 #if __CUDA_ARCH__ <= 500
 __global__ __launch_bounds__(TPB50, 2)
 #else
@@ -306,6 +307,7 @@ void keccak256_cpu_hash_32(const int thr_id,const uint32_t threads, uint2* d_has
 	keccak256_gpu_hash_32 <<<grid, block>>> (threads, d_hash);
 }
+#endif
 __host__
 void keccak256_setBlock_80(uint64_t *endiandata)

Algo256/cuda_keccak256_sm3.cu (2)

@@ -231,6 +231,7 @@ uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounc
 	return result;
 }
+#if 0
 __global__ __launch_bounds__(256,3)
 void keccak256_sm3_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
 {
@@ -282,6 +283,7 @@ void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, u
 	keccak256_sm3_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
+#endif
 __host__
 void keccak256_sm3_setBlock_80(void *pdata,const void *pTargetIn)

compat/ccminer-config.h (2)

@@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.2.3"
+#define PACKAGE_VERSION "2.2.4"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be

configure.ac (2)

@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.2.3], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2.4], [], [ccminer], [http://github.com/tpruvot/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM

lyra2/lyra2RE.cu (20)

@@ -13,12 +13,14 @@ static uint64_t* d_hash[MAX_GPUS];
 static uint64_t* d_matrix[MAX_GPUS];
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
-extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
+//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
-extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
-extern void keccak256_sm3_init(int thr_id, uint32_t threads);
-extern void keccak256_sm3_free(int thr_id);
+//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
+//extern void keccak256_sm3_free(int thr_id);
+extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
@@ -98,10 +100,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		blake256_cpu_init(thr_id, throughput);
-		keccak256_sm3_init(thr_id, throughput);
+		//keccak256_sm3_init(thr_id, throughput);
 		skein256_cpu_init(thr_id, throughput);
 		groestl256_cpu_init(thr_id, throughput);
+		//cuda_get_arch(thr_id);
 		if (device_sm[dev_id] >= 500)
 		{
 			size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
@@ -124,8 +127,9 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 	do {
 		int order = 0;
-		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@@ -187,7 +191,7 @@ extern "C" void free_lyra2(int thr_id)
 	cudaFree(d_hash[thr_id]);
 	cudaFree(d_matrix[thr_id]);
-	keccak256_sm3_free(thr_id);
+	//keccak256_sm3_free(thr_id);
 	groestl256_cpu_free(thr_id);
 	init[thr_id] = false;

lyra2/lyra2REv2.cu (21)

@@ -14,12 +14,14 @@ static uint64_t *d_hash[MAX_GPUS];
 static uint64_t* d_matrix[MAX_GPUS];
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
-extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
+//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
-extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
-extern void keccak256_sm3_init(int thr_id, uint32_t threads);
-extern void keccak256_sm3_free(int thr_id);
+//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
+//extern void keccak256_sm3_free(int thr_id);
+extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
@@ -103,10 +105,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		blake256_cpu_init(thr_id, throughput);
-		keccak256_sm3_init(thr_id,throughput);
+		//keccak256_sm3_init(thr_id,throughput);
 		skein256_cpu_init(thr_id, throughput);
 		bmw256_cpu_init(thr_id, throughput);
+		cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256
 		// SM 3 implentation requires a bit more memory
 		if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
 			matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
@@ -130,8 +134,9 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 	do {
 		int order = 0;
-		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@@ -196,7 +201,7 @@ extern "C" void free_lyra2v2(int thr_id)
 	cudaFree(d_matrix[thr_id]);
 	bmw256_cpu_free(thr_id);
-	keccak256_sm3_free(thr_id);
+	//keccak256_sm3_free(thr_id);
 	init[thr_id] = false;

lyra2/lyra2Z.cu (1)

@@ -60,6 +60,7 @@ extern "C" int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce
 		CUDA_LOG_ERROR();
 	}
+	cuda_get_arch(thr_id);
 	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
 	if (device_sm[dev_id] <= 500) intensity = 15;
 	throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
