Browse Source

import and adapt alexis optimised keccak256 for SM 5+

and increase default intensity for these recent cards
pull/2/head
Tanguy Pruvot 7 years ago
parent
commit
cf886b5907
  1. 570
      Algo256/cuda_keccak256.cu
  2. 309
      Algo256/cuda_keccak256_sm3.cu
  3. 61
      Algo256/keccak256.cu
  4. 3
      Makefile.am
  5. 7
      README.txt
  6. 3
      ccminer.cpp
  7. 3
      ccminer.vcxproj
  8. 3
      ccminer.vcxproj.filters
  9. 13
      lyra2/lyra2RE.cu
  10. 14
      lyra2/lyra2REv2.cu

570
Algo256/cuda_keccak256.cu

@ -1,309 +1,381 @@ @@ -1,309 +1,381 @@
#include "miner.h"
/**
* KECCAK-256 CUDA optimised implementation, based on ccminer-alexis code
*/
#include <miner.h>
extern "C" {
#include <stdint.h>
#include <memory.h>
}
#include "cuda_helper.h"
static const uint64_t host_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
0x8000000080008081ull, 0x8000000000008009ull,
0x000000000000008aull, 0x0000000000000088ull,
0x0000000080008009ull, 0x000000008000000aull,
0x000000008000808bull, 0x800000000000008bull,
0x8000000000008089ull, 0x8000000000008003ull,
0x8000000000008002ull, 0x8000000000000080ull,
0x000000000000800aull, 0x800000008000000aull,
0x8000000080008081ull, 0x8000000000008080ull,
0x0000000080000001ull, 0x8000000080008008ull
};
uint32_t *d_nounce[MAX_GPUS];
uint32_t *d_KNonce[MAX_GPUS];
__constant__ uint32_t pTarget[8];
__constant__ uint64_t keccak_round_constants[24];
__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
#include <cuda_helper.h>
#include <cuda_vectors.h>
#if __CUDA_ARCH__ >= 350
__device__ __forceinline__
static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint2 t[5], u[5], v, w;
#define TPB52 1024
#define TPB50 384
#define NPT 2
#define NBN 2
#pragma unroll
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROL2(t[1], 1);
u[1] = t[0] ^ ROL2(t[2], 1);
u[2] = t[1] ^ ROL2(t[3], 1);
u[3] = t[2] ^ ROL2(t[4], 1);
u[4] = t[3] ^ ROL2(t[0], 1);
static uint32_t *d_nonces[MAX_GPUS];
static uint32_t *h_nonces[MAX_GPUS];
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
__constant__ uint2 c_message48[6];
__constant__ uint2 c_mid[17];
/* rho pi: b[..] = rotl(a[..], ..) */
v = s[1];
s[1] = ROL2(s[6], 44);
s[6] = ROL2(s[9], 20);
s[9] = ROL2(s[22], 61);
s[22] = ROL2(s[14], 39);
s[14] = ROL2(s[20], 18);
s[20] = ROL2(s[2], 62);
s[2] = ROL2(s[12], 43);
s[12] = ROL2(s[13], 25);
s[13] = ROL2(s[19], 8);
s[19] = ROL2(s[23], 56);
s[23] = ROL2(s[15], 41);
s[15] = ROL2(s[4], 27);
s[4] = ROL2(s[24], 14);
s[24] = ROL2(s[21], 2);
s[21] = ROL2(s[8], 55);
s[8] = ROL2(s[16], 45);
s[16] = ROL2(s[5], 36);
s[5] = ROL2(s[3], 28);
s[3] = ROL2(s[18], 21);
s[18] = ROL2(s[17], 15);
s[17] = ROL2(s[11], 10);
s[11] = ROL2(s[7], 6);
s[7] = ROL2(s[10], 3);
s[10] = ROL2(v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w;
v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
__constant__ uint2 keccak_round_constants[24] = {
{ 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 }, { 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 },
{ 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 }, { 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 },
{ 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 }, { 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 },
{ 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 }, { 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 },
{ 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 }, { 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 },
{ 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 }
};
/* iota: a[0,0] ^= round constant */
s[0] ^= vectorize(keccak_round_constants[i]);
}
__device__ __forceinline__
uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) {
uint2 result;
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
#else
result = a^b^c;
#endif
return result;
}
__device__ __forceinline__
uint2 chi(const uint2 a,const uint2 b,const uint2 c) { // keccak chi
uint2 result;
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
#else
result = a ^ (~b) & c;
#endif
return result;
}
__device__ __forceinline__
static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
{
size_t i;
uint64_t t[5], u[5], v, w;
/* absorb input */
uint64_t result;
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(d) ,"l"(e));
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
return result;
}
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
#if __CUDA_ARCH__ <= 500
__global__ __launch_bounds__(TPB50, 2)
#else
__global__ __launch_bounds__(TPB52, 1)
#endif
void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint32_t *resNounce, const uint2 highTarget)
{
uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint2 s[25], t[5], v, w, u[5];
#if __CUDA_ARCH__ > 500
uint64_t step = gridDim.x * blockDim.x;
uint64_t maxNonce = startNonce + threads;
for(uint64_t nounce = startNonce + thread; nounce<maxNonce;nounce+=step) {
#else
uint32_t nounce = startNonce+thread;
if(thread<threads) {
#endif
s[ 9] = make_uint2(c_message48[0].x,cuda_swab32(nounce));
s[10] = keccak_round_constants[0];
t[ 4] = c_message48[1]^s[ 9];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROTL64(t[1], 1);
u[1] = t[0] ^ ROTL64(t[2], 1);
u[2] = t[1] ^ ROTL64(t[3], 1);
u[3] = t[2] ^ ROTL64(t[4], 1);
u[4] = t[3] ^ ROTL64(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
v = s[ 1];
s[ 1] = ROTL64(s[ 6], 44);
s[ 6] = ROTL64(s[ 9], 20);
s[ 9] = ROTL64(s[22], 61);
s[22] = ROTL64(s[14], 39);
s[14] = ROTL64(s[20], 18);
s[20] = ROTL64(s[ 2], 62);
s[ 2] = ROTL64(s[12], 43);
s[12] = ROTL64(s[13], 25);
s[13] = ROTL64(s[19], 8);
s[19] = ROTL64(s[23], 56);
s[23] = ROTL64(s[15], 41);
s[15] = ROTL64(s[ 4], 27);
s[ 4] = ROTL64(s[24], 14);
s[24] = ROTL64(s[21], 2);
s[21] = ROTL64(s[ 8], 55);
s[ 8] = ROTL64(s[16], 45);
s[16] = ROTL64(s[ 5], 36);
s[ 5] = ROTL64(s[ 3], 28);
s[ 3] = ROTL64(s[18], 21);
s[18] = ROTL64(s[17], 15);
s[17] = ROTL64(s[11], 10);
s[11] = ROTL64(s[ 7], 6);
s[ 7] = ROTL64(s[10], 3);
s[10] = ROTL64( v, 1);
u[ 0] = t[4] ^ c_mid[ 0];
u[ 1] = c_mid[ 1] ^ ROL2(t[4],1);
u[ 2] = c_mid[ 2];
/* thetarho pi: b[..] = rotl(a[..] ^ d[...], ..)*/
s[ 7] = ROL2(s[10]^u[0], 3);
s[10] = c_mid[ 3];
w = c_mid[ 4];
s[20] = c_mid[ 5];
s[ 6] = ROL2(s[ 9]^u[2],20);
s[ 9] = c_mid[ 6];
s[22] = c_mid[ 7];
s[14] = ROL2(u[0],18);
s[ 2] = c_mid[ 8];
s[12] = ROL2(u[1],25);
s[13] = c_mid[ 9];
s[19] = ROR8(u[1]);
s[23] = ROR2(u[0],23);
s[15] = c_mid[10];
s[ 4] = c_mid[11];
s[24] = c_mid[12];
s[21] = ROR2(c_message48[2]^u[1], 9);
s[ 8] = c_mid[13];
s[16] = ROR2(c_message48[3]^u[0],28);
s[ 5] = ROL2(c_message48[4]^u[1],28);
s[ 3] = ROL2(u[1],21);
s[18] = c_mid[14];
s[17] = c_mid[15];
s[11] = c_mid[16];
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
v = c_message48[5]^u[0];
s[ 0] = chi(v,w,s[ 2]);
s[ 1] = chi(w,s[ 2],s[ 3]);
s[ 2] = chi(s[ 2],s[ 3],s[ 4]);
s[ 3] = chi(s[ 3],s[ 4],v);
s[ 4] = chi(s[ 4],v,w);
v = s[ 5]; w = s[ 6]; s[ 5] = chi(v,w,s[ 7]); s[ 6] = chi(w,s[ 7],s[ 8]); s[ 7] = chi(s[ 7],s[ 8],s[ 9]); s[ 8] = chi(s[ 8],s[ 9],v);s[ 9] = chi(s[ 9],v,w);
v = s[10]; w = s[11]; s[10] = chi(v,w,s[12]); s[11] = chi(w,s[12],s[13]); s[12] = chi(s[12],s[13],s[14]); s[13] = chi(s[13],s[14],v);s[14] = chi(s[14],v,w);
v = s[15]; w = s[16]; s[15] = chi(v,w,s[17]); s[16] = chi(w,s[17],s[18]); s[17] = chi(s[17],s[18],s[19]); s[18] = chi(s[18],s[19],v);s[19] = chi(s[19],v,w);
v = s[20]; w = s[21]; s[20] = chi(v,w,s[22]); s[21] = chi(w,s[22],s[23]); s[22] = chi(s[22],s[23],s[24]); s[23] = chi(s[23],s[24],v);s[24] = chi(s[24],v,w);
/* iota: a[0,0] ^= round constant */
s[0] ^= keccak_round_constants[i];
}
}
#endif
__global__ __launch_bounds__(128,5)
void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
#if __CUDA_ARCH__ >= 350
uint2 keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++) {
if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]);
else keccak_gpu_state[i] = make_uint2(0, 0);
s[ 0] ^=keccak_round_constants[ 0];
#if __CUDA_ARCH__ > 500
#pragma unroll 22
#else
#pragma unroll 4
#endif
for (int i = 1; i < 23; i++) {
#pragma unroll
for(int j=0;j<5;j++) {
t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20])));
}
/*theta*/
#pragma unroll
for(int j=0;j<5;j++) {
u[j] = ROL2(t[j], 1);
}
s[ 4] = xor3x(s[ 4], t[3], u[0]);s[ 9] = xor3x(s[ 9], t[3], u[0]);s[14] = xor3x(s[14], t[3], u[0]);s[19] = xor3x(s[19], t[3], u[0]);s[24] = xor3x(s[24], t[3], u[0]);
s[ 0] = xor3x(s[ 0], t[4], u[1]);s[ 5] = xor3x(s[ 5], t[4], u[1]);s[10] = xor3x(s[10], t[4], u[1]);s[15] = xor3x(s[15], t[4], u[1]);s[20] = xor3x(s[20], t[4], u[1]);
s[ 1] = xor3x(s[ 1], t[0], u[2]);s[ 6] = xor3x(s[ 6], t[0], u[2]);s[11] = xor3x(s[11], t[0], u[2]);s[16] = xor3x(s[16], t[0], u[2]);s[21] = xor3x(s[21], t[0], u[2]);
s[ 2] = xor3x(s[ 2], t[1], u[3]);s[ 7] = xor3x(s[ 7], t[1], u[3]);s[12] = xor3x(s[12], t[1], u[3]);s[17] = xor3x(s[17], t[1], u[3]);s[22] = xor3x(s[22], t[1], u[3]);
s[ 3] = xor3x(s[ 3], t[2], u[4]);s[ 8] = xor3x(s[ 8], t[2], u[4]);s[13] = xor3x(s[13], t[2], u[4]);s[18] = xor3x(s[18], t[2], u[4]);s[23] = xor3x(s[23], t[2], u[4]);
/*rho pi: b[..] = rotl(a[..] ^ d[...], ..)*/
v = s[ 1];
s[ 1] = ROL2(s[ 6],44); s[ 6] = ROL2(s[ 9],20); s[ 9] = ROL2(s[22],61); s[22] = ROL2(s[14],39);
s[14] = ROL2(s[20],18); s[20] = ROL2(s[ 2],62); s[ 2] = ROL2(s[12],43); s[12] = ROL2(s[13],25);
s[13] = ROL8(s[19]); s[19] = ROR8(s[23]); s[23] = ROL2(s[15],41); s[15] = ROL2(s[ 4],27);
s[ 4] = ROL2(s[24],14); s[24] = ROL2(s[21], 2); s[21] = ROL2(s[ 8],55); s[ 8] = ROL2(s[16],45);
s[16] = ROL2(s[ 5],36); s[ 5] = ROL2(s[ 3],28); s[ 3] = ROL2(s[18],21); s[18] = ROL2(s[17],15);
s[17] = ROL2(s[11],10); s[11] = ROL2(s[ 7], 6); s[ 7] = ROL2(s[10], 3); s[10] = ROL2(v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
#pragma unroll
for(int j=0;j<25;j+=5) {
v=s[j];w=s[j + 1];s[j] = chi(s[j],s[j+1],s[j+2]);s[j+1] = chi(s[j+1],s[j+2],s[j+3]);s[j+2]=chi(s[j+2],s[j+3],s[j+4]);s[j+3]=chi(s[j+3],s[j+4],v);s[j+4]=chi(s[j+4],v,w);
}
/* iota: a[0,0] ^= round constant */
s[ 0] ^=keccak_round_constants[ i];
}
keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]);
keccak_gpu_state[9].y = cuda_swab32(nounce);
keccak_gpu_state[10] = make_uint2(1, 0);
keccak_gpu_state[16] = make_uint2(0, 0x80000000);
keccak_blockv35(keccak_gpu_state,keccak_round_constants);
if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;}
#else
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++) {
if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
else keccak_gpu_state[i] = 0;
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
#pragma unroll 5
for(int j=0;j<5;j++) {
t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]), s[j+15], s[j+20]);
}
s[24] = xor3x(s[24],t[3],ROL2(t[0],1));
s[18] = xor3x(s[18],t[2],ROL2(t[4],1));
s[ 0] = xor3x(s[ 0],t[4],ROL2(t[1],1));
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
s[24] = ROL2(s[24],14);
s[18] = ROL2(s[18],21);
if (devectorize(chi(s[18],s[24],s[ 0])) <= devectorize(highTarget)) {
// if(chi(s[18].x,s[24].x,s[0].x)<=highTarget.x) {
// if(chi(s[18].y,s[24].y,s[0].y)<=highTarget.y) {
const uint32_t tmp = atomicExch(&resNounce[0], nounce);
if (tmp != UINT32_MAX)
resNounce[1] = tmp;
// return;
// }
}
keccak_gpu_state[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
keccak_gpu_state[10] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
#endif
}
}
__host__
uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, const uint2 highTarget)
{
uint32_t result = UINT32_MAX;
cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
keccak256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]);
MyStreamSynchronize(NULL, order, thr_id);
cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
result = *d_nounce[thr_id];
uint32_t tpb;
dim3 grid;
if (device_sm[device_map[thr_id]] <= 500) {
tpb = TPB50;
grid.x = (threads + tpb-1)/tpb;
} else {
tpb = TPB52;
grid.x = (threads + (NPT*tpb)-1)/(NPT*tpb);
}
const dim3 block(tpb);
return result;
keccak256_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_nonces[thr_id], highTarget);
// cudaThreadSynchronize();
cudaMemcpy(h_nonces[thr_id], d_nonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
memcpy(resNonces, h_nonces[thr_id], NBN*sizeof(uint32_t));
}
__global__ __launch_bounds__(256,3)
void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
#if __CUDA_ARCH__ <= 500
__global__ __launch_bounds__(TPB50, 2)
#else
__global__ __launch_bounds__(TPB52, 1)
#endif
void keccak256_gpu_hash_32(uint32_t threads, uint2* outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
uint2 keccak_gpu_state[25];
#pragma unroll 25
for (int i = 0; i<25; i++) {
if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
else keccak_gpu_state[i] = make_uint2(0, 0);
}
keccak_gpu_state[4] = make_uint2(1, 0);
keccak_gpu_state[16] = make_uint2(0, 0x80000000);
keccak_blockv35(keccak_gpu_state, keccak_round_constants);
uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint2 s[25], t[5], v, w, u[5];
#pragma unroll 4
for (int i=0; i<4; i++)
outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
#else
uint64_t keccak_gpu_state[25];
if(thread < threads) {
#pragma unroll 25
for (int i = 0; i<25; i++) {
if (i<4)
keccak_gpu_state[i] = outputHash[i*threads+thread];
else
keccak_gpu_state[i] = 0;
if (i<4) s[i] = __ldg(&outputHash[i*threads+thread]);
else s[i] = make_uint2(0, 0);
}
s[4] = keccak_round_constants[ 0];
s[16] = make_uint2(0, 0x80000000);
#if __CUDA_ARCH__ > 500
#pragma unroll
#else
#pragma unroll 4
#endif
for (uint32_t i = 0; i < 23; i++) {
/*theta*/
#pragma unroll 5
for(int j=0; j<5; j++) {
t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20])));
}
/*theta*/
#pragma unroll 5
for(int j=0; j<5; j++) {
u[j] = ROL2(t[j], 1);
}
s[ 4] = xor3x(s[ 4], t[3], u[0]);s[ 9] = xor3x(s[ 9], t[3], u[0]);s[14] = xor3x(s[14], t[3], u[0]);s[19] = xor3x(s[19], t[3], u[0]);s[24] = xor3x(s[24], t[3], u[0]);
s[ 0] = xor3x(s[ 0], t[4], u[1]);s[ 5] = xor3x(s[ 5], t[4], u[1]);s[10] = xor3x(s[10], t[4], u[1]);s[15] = xor3x(s[15], t[4], u[1]);s[20] = xor3x(s[20], t[4], u[1]);
s[ 1] = xor3x(s[ 1], t[0], u[2]);s[ 6] = xor3x(s[ 6], t[0], u[2]);s[11] = xor3x(s[11], t[0], u[2]);s[16] = xor3x(s[16], t[0], u[2]);s[21] = xor3x(s[21], t[0], u[2]);
s[ 2] = xor3x(s[ 2], t[1], u[3]);s[ 7] = xor3x(s[ 7], t[1], u[3]);s[12] = xor3x(s[12], t[1], u[3]);s[17] = xor3x(s[17], t[1], u[3]);s[22] = xor3x(s[22], t[1], u[3]);
s[ 3] = xor3x(s[ 3], t[2], u[4]);s[ 8] = xor3x(s[ 8], t[2], u[4]);s[13] = xor3x(s[13], t[2], u[4]);s[18] = xor3x(s[18], t[2], u[4]);s[23] = xor3x(s[23], t[2], u[4]);
/*rho pi: b[..] = rotl(a[..] ^ d[...], ..)*/
v = s[ 1];
s[ 1] = ROL2(s[ 6],44); s[ 6] = ROL2(s[ 9],20); s[ 9] = ROL2(s[22],61); s[22] = ROL2(s[14],39);
s[14] = ROL2(s[20],18); s[20] = ROL2(s[ 2],62); s[ 2] = ROL2(s[12],43); s[12] = ROL2(s[13],25);
s[13] = ROL8(s[19]); s[19] = ROR8(s[23]); s[23] = ROL2(s[15],41); s[15] = ROL2(s[ 4],27);
s[ 4] = ROL2(s[24],14); s[24] = ROL2(s[21], 2); s[21] = ROL2(s[ 8],55); s[ 8] = ROL2(s[16],45);
s[16] = ROL2(s[ 5],36); s[ 5] = ROL2(s[ 3],28); s[ 3] = ROL2(s[18],21); s[18] = ROL2(s[17],15);
s[17] = ROL2(s[11],10); s[11] = ROL2(s[ 7], 6); s[ 7] = ROL2(s[10], 3); s[10] = ROL2(v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
#pragma unroll 5
for(int j=0; j<25; j+=5) {
v=s[j];w=s[j + 1]; s[j] = chi(v,w,s[j+2]); s[j+1] = chi(w,s[j+2],s[j+3]); s[j+2]=chi(s[j+2],s[j+3],s[j+4]); s[j+3]=chi(s[j+3],s[j+4],v); s[j+4]=chi(s[j+4],v,w);
}
/* iota: a[0,0] ^= round constant */
s[ 0] ^=keccak_round_constants[ i];
}
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
#pragma unroll 5
for(int j=0;j<5;j++) {
t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]), s[j+15], s[j+20]);
}
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
#pragma unroll 5
for(int j=0;j<5;j++) {
u[j] = ROL2(t[j],1);
}
keccak_gpu_state[4] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
/* thetarho pi: b[..] = rotl(a[..] ^ d[...], ..) //There's no need to perform theta and -store- the result since it's unique for each a[..]*/
s[ 4] = xor3x(s[24], t[3], u[0]);
s[ 0] = xor3x(s[ 0], t[4], u[1]);
s[ 1] = xor3x(s[ 6], t[0], u[2]);
s[ 2] = xor3x(s[12], t[1], u[3]);
s[ 3] = xor3x(s[18], t[2], u[4]);
s[ 1] = ROR2(s[ 1],20);
s[ 2] = ROR2(s[ 2],21);
s[ 3] = ROL2(s[ 3],21);
s[ 4] = ROL2(s[ 4],14);
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
#pragma unroll 4
for (int i = 0; i<4; i++)
outputHash[i*threads + thread] = keccak_gpu_state[i];
#endif
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
outputHash[0*threads+thread] = chi(s[ 0],s[ 1],s[ 2]) ^ keccak_round_constants[23];
outputHash[1*threads+thread] = chi(s[ 1],s[ 2],s[ 3]);
outputHash[2*threads+thread] = chi(s[ 2],s[ 3],s[ 4]);
outputHash[3*threads+thread] = chi(s[ 3],s[ 4],s[ 0]);
}
}
__host__
void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
void keccak256_cpu_hash_32(const int thr_id,const uint32_t threads, uint2* d_hash)
{
const uint32_t threadsperblock = 256;
uint32_t tpb = TPB52;
if (device_sm[device_map[thr_id]] == 500) tpb = TPB50;
const dim3 grid((threads + tpb-1)/tpb);
const dim3 block(tpb);
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
keccak256_gpu_hash_32 <<<grid, block>>> (threads, d_hash);
}
keccak256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id);
__host__
void keccak256_setBlock_80(uint64_t *endiandata)
{
uint64_t midstate[17], s[25];
uint64_t t[5], u[5];
s[10] = 1; //(uint64_t)make_uint2(1, 0);
s[16] = ((uint64_t)1)<<63; //(uint64_t)make_uint2(0, 0x80000000);
t[0] = endiandata[0] ^ endiandata[5] ^ s[10];
t[1] = endiandata[1] ^ endiandata[6] ^ s[16];
t[2] = endiandata[2] ^ endiandata[7];
t[3] = endiandata[3] ^ endiandata[8];
midstate[ 0] = ROTL64(t[1], 1); //u[0] -partial
u[1] = t[ 0] ^ ROTL64(t[2], 1); //u[1]
u[2] = t[ 1] ^ ROTL64(t[3], 1); //u[2]
midstate[ 1] = t[ 2]; //u[3] -partial
midstate[ 2] = t[ 3] ^ ROTL64(t[0], 1); //u[4]
midstate[ 3] = ROTL64(endiandata[1]^u[1], 1); //v
midstate[ 4] = ROTL64(endiandata[6]^u[1], 44);
midstate[ 5] = ROTL64(endiandata[2]^u[2], 62);
midstate[ 6] = ROTL64(u[2], 61);
midstate[ 7] = ROTL64(midstate[2], 39);
midstate[ 8] = ROTL64(u[2], 43);
midstate[ 9] = ROTL64(midstate[2], 8);
midstate[10] = ROTL64(endiandata[4]^midstate[ 2],27);
midstate[11] = ROTL64(midstate[2], 14);
midstate[12] = ROTL64(u[1], 2);
midstate[13] = ROTL64(s[16] ^ u[1], 45);
midstate[14] = ROTL64(u[2],15);
midstate[15] = ROTL64(u[1],10);
midstate[16] = ROTL64(endiandata[7]^u[2], 6);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_mid, midstate,17*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
// pass only what's needed
uint64_t message48[6];
message48[0] = endiandata[9];
message48[1] = endiandata[4];
message48[2] = endiandata[8];
message48[3] = endiandata[5];
message48[4] = endiandata[3];
message48[5] = endiandata[0];
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_message48, message48, 6*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
}
__host__
void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
void keccak256_cpu_init(int thr_id)
{
unsigned char PaddedMessage[80];
memcpy(PaddedMessage, pdata, 80);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(&d_nonces[thr_id], NBN*sizeof(uint32_t)));
//CUDA_SAFE_CALL(cudaMallocHost(&h_nonces[thr_id], NBN*sizeof(uint32_t)));
h_nonces[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t));
if(h_nonces[thr_id] == NULL) {
gpulog(LOG_ERR,thr_id,"Host memory allocation failed");
exit(EXIT_FAILURE);
}
}
__host__
void keccak256_cpu_init(int thr_id, uint32_t threads)
void keccak256_setOutput(int thr_id)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMemset(d_nonces[thr_id], 0xff, NBN*sizeof(uint32_t)));
}
__host__
void keccak256_cpu_free(int thr_id)
{
cudaFree(d_KNonce[thr_id]);
cudaFreeHost(d_nounce[thr_id]);
cudaFree(d_nonces[thr_id]);
//cudaFreeHost(h_nonces[thr_id]);
free(h_nonces[thr_id]);
}

309
Algo256/cuda_keccak256_sm3.cu

@ -0,0 +1,309 @@ @@ -0,0 +1,309 @@
#include "miner.h"
extern "C" {
#include <stdint.h>
#include <memory.h>
}
#include "cuda_helper.h"
static const uint64_t host_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
0x8000000080008081ull, 0x8000000000008009ull,
0x000000000000008aull, 0x0000000000000088ull,
0x0000000080008009ull, 0x000000008000000aull,
0x000000008000808bull, 0x800000000000008bull,
0x8000000000008089ull, 0x8000000000008003ull,
0x8000000000008002ull, 0x8000000000000080ull,
0x000000000000800aull, 0x800000008000000aull,
0x8000000080008081ull, 0x8000000000008080ull,
0x0000000080000001ull, 0x8000000080008008ull
};
static uint32_t *d_nounce[MAX_GPUS];
static uint32_t *d_KNonce[MAX_GPUS];
__constant__ uint32_t pTarget[8];
__constant__ uint64_t keccak_round_constants[24];
__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
#if __CUDA_ARCH__ >= 350
__device__ __forceinline__
static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint2 t[5], u[5], v, w;
#pragma unroll
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROL2(t[1], 1);
u[1] = t[0] ^ ROL2(t[2], 1);
u[2] = t[1] ^ ROL2(t[3], 1);
u[3] = t[2] ^ ROL2(t[4], 1);
u[4] = t[3] ^ ROL2(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
v = s[1];
s[1] = ROL2(s[6], 44);
s[6] = ROL2(s[9], 20);
s[9] = ROL2(s[22], 61);
s[22] = ROL2(s[14], 39);
s[14] = ROL2(s[20], 18);
s[20] = ROL2(s[2], 62);
s[2] = ROL2(s[12], 43);
s[12] = ROL2(s[13], 25);
s[13] = ROL2(s[19], 8);
s[19] = ROL2(s[23], 56);
s[23] = ROL2(s[15], 41);
s[15] = ROL2(s[4], 27);
s[4] = ROL2(s[24], 14);
s[24] = ROL2(s[21], 2);
s[21] = ROL2(s[8], 55);
s[8] = ROL2(s[16], 45);
s[16] = ROL2(s[5], 36);
s[5] = ROL2(s[3], 28);
s[3] = ROL2(s[18], 21);
s[18] = ROL2(s[17], 15);
s[17] = ROL2(s[11], 10);
s[11] = ROL2(s[7], 6);
s[7] = ROL2(s[10], 3);
s[10] = ROL2(v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w;
v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
/* iota: a[0,0] ^= round constant */
s[0] ^= vectorize(keccak_round_constants[i]);
}
}
#else
__device__ __forceinline__
static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint64_t t[5], u[5], v, w;
/* absorb input */
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROTL64(t[1], 1);
u[1] = t[0] ^ ROTL64(t[2], 1);
u[2] = t[1] ^ ROTL64(t[3], 1);
u[3] = t[2] ^ ROTL64(t[4], 1);
u[4] = t[3] ^ ROTL64(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
v = s[ 1];
s[ 1] = ROTL64(s[ 6], 44);
s[ 6] = ROTL64(s[ 9], 20);
s[ 9] = ROTL64(s[22], 61);
s[22] = ROTL64(s[14], 39);
s[14] = ROTL64(s[20], 18);
s[20] = ROTL64(s[ 2], 62);
s[ 2] = ROTL64(s[12], 43);
s[12] = ROTL64(s[13], 25);
s[13] = ROTL64(s[19], 8);
s[19] = ROTL64(s[23], 56);
s[23] = ROTL64(s[15], 41);
s[15] = ROTL64(s[ 4], 27);
s[ 4] = ROTL64(s[24], 14);
s[24] = ROTL64(s[21], 2);
s[21] = ROTL64(s[ 8], 55);
s[ 8] = ROTL64(s[16], 45);
s[16] = ROTL64(s[ 5], 36);
s[ 5] = ROTL64(s[ 3], 28);
s[ 3] = ROTL64(s[18], 21);
s[18] = ROTL64(s[17], 15);
s[17] = ROTL64(s[11], 10);
s[11] = ROTL64(s[ 7], 6);
s[ 7] = ROTL64(s[10], 3);
s[10] = ROTL64( v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
/* iota: a[0,0] ^= round constant */
s[0] ^= keccak_round_constants[i];
}
}
#endif
__global__ __launch_bounds__(128,5)
void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
#if __CUDA_ARCH__ >= 350
uint2 keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++) {
if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]);
else keccak_gpu_state[i] = make_uint2(0, 0);
}
keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]);
keccak_gpu_state[9].y = cuda_swab32(nounce);
keccak_gpu_state[10] = make_uint2(1, 0);
keccak_gpu_state[16] = make_uint2(0, 0x80000000);
keccak_blockv35(keccak_gpu_state,keccak_round_constants);
if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;}
#else
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++) {
if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
else keccak_gpu_state[i] = 0;
}
keccak_gpu_state[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
keccak_gpu_state[10] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
#endif
}
}
__host__
uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{
uint32_t result = UINT32_MAX;
cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
keccak256_sm3_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]);
MyStreamSynchronize(NULL, order, thr_id);
cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
result = *d_nounce[thr_id];
return result;
}
__global__ __launch_bounds__(256,3)
void keccak256_sm3_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
uint2 keccak_gpu_state[25];
#pragma unroll 25
for (int i = 0; i<25; i++) {
if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
else keccak_gpu_state[i] = make_uint2(0, 0);
}
keccak_gpu_state[4] = make_uint2(1, 0);
keccak_gpu_state[16] = make_uint2(0, 0x80000000);
keccak_blockv35(keccak_gpu_state, keccak_round_constants);
#pragma unroll 4
for (int i=0; i<4; i++)
outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
#else
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i = 0; i<25; i++) {
if (i<4)
keccak_gpu_state[i] = outputHash[i*threads+thread];
else
keccak_gpu_state[i] = 0;
}
keccak_gpu_state[4] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
#pragma unroll 4
for (int i = 0; i<4; i++)
outputHash[i*threads + thread] = keccak_gpu_state[i];
#endif
}
}
__host__
void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
keccak256_sm3_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id);
}
__host__
void keccak256_sm3_setBlock_80(void *pdata,const void *pTargetIn)
{
unsigned char PaddedMessage[80];
memcpy(PaddedMessage, pdata, 80);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
}
__host__
void keccak256_sm3_init(int thr_id, uint32_t threads)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
}
__host__
void keccak256_sm3_free(int thr_id)
{
cudaFree(d_KNonce[thr_id]);
cudaFreeHost(d_nounce[thr_id]);
}

61
Algo256/keccak256.cu

@ -16,10 +16,18 @@ extern "C" @@ -16,10 +16,18 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS];
extern void keccak256_cpu_init(int thr_id, uint32_t threads);
// SM5+ cuda
extern void keccak256_cpu_init(int thr_id);
extern void keccak256_cpu_free(int thr_id);
extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
extern uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, const uint2 highTarget);
extern void keccak256_setBlock_80(uint64_t *endiandata);
extern void keccak256_setOutput(int thr_id);
// compat
extern void keccak256_sm3_init(int thr_id, uint32_t threads);
extern void keccak256_sm3_free(int thr_id);
extern void keccak256_sm3_setBlock_80(void *pdata, const void *ptarget);
extern uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
// CPU Hash
extern "C" void keccak256_hash(void *state, const void *input)
@ -35,6 +43,7 @@ extern "C" void keccak256_hash(void *state, const void *input) @@ -35,6 +43,7 @@ extern "C" void keccak256_hash(void *state, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
@ -42,6 +51,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no @@ -42,6 +51,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); // 256*256*8*4
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -50,17 +60,28 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no @@ -50,17 +60,28 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if(!use_compat_kernels[thr_id]) {
uint32_t intensity = 23;
if (strstr(device_name[dev_id], "GTX 1080")) intensity = 25;
throughput = cuda_default_throughput(thr_id, 1U << intensity);
keccak256_cpu_init(thr_id);
} else {
// really useful ?
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
keccak256_sm3_init(thr_id, throughput);
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
keccak256_cpu_init(thr_id, throughput);
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
init[thr_id] = true;
}
@ -69,13 +90,25 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no @@ -69,13 +90,25 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
be32enc(&endiandata[k], pdata[k]);
}
keccak256_setBlock_80((void*)endiandata, ptarget);
const uint2 highTarget = make_uint2(ptarget[6], ptarget[7]);
if(use_compat_kernels[thr_id])
keccak256_sm3_setBlock_80((void*)endiandata, ptarget);
else {
keccak256_setBlock_80((uint64_t*)endiandata);
keccak256_setOutput(thr_id);
}
do {
int order = 0;
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
if(use_compat_kernels[thr_id])
work->nonces[0] = keccak256_sm3_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
else {
keccak256_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, highTarget);
}
if (work->nonces[0] != UINT32_MAX && bench_algo < 0)
{
const uint32_t Htarg = ptarget[7];
@ -95,6 +128,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no @@ -95,6 +128,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
//keccak256_setOutput(thr_id);
continue;
}
}
@ -120,9 +154,12 @@ extern "C" void free_keccak256(int thr_id) @@ -120,9 +154,12 @@ extern "C" void free_keccak256(int thr_id)
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
keccak256_cpu_free(thr_id);
if(!use_compat_kernels[thr_id])
keccak256_cpu_free(thr_id);
else {
cudaFree(d_hash[thr_id]);
keccak256_sm3_free(thr_id);
}
cudaDeviceSynchronize();
init[thr_id] = false;

3
Makefile.am

@ -40,7 +40,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -40,7 +40,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \
lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \
Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \
Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \
Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
Algo256/blake2s.cu sph/blake2s.c \
Algo256/bmw.cu Algo256/cuda_bmw.cu \

7
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccminer 2.2.3-dev (Nov. 2017) "polytimos algo"
ccminer 2.2.3 (Dec. 2017) "polytimos algo and keccakc (opt)"
---------------------------------------------------------------
***************************************************************
@ -93,6 +93,7 @@ its command line interface and options. @@ -93,6 +93,7 @@ its command line interface and options.
hsr use to mine Hshare
jackpot use to mine Sweepcoin
keccak use to mine Maxcoin
keccakc use to mine CreativeCoin
lbry use to mine LBRY Credits
luffa use to mine Joincoin
lyra2 use to mine CryptoCoin
@ -281,8 +282,10 @@ so we can more efficiently implement new algorithms using the latest hardware @@ -281,8 +282,10 @@ so we can more efficiently implement new algorithms using the latest hardware
features.
>>> RELEASE HISTORY <<<
Nov. 16th 2017 v2.2.3
Dec. 04th 2017 v2.2.3
Polytimos Algo
Handle keccakc variant (with refreshed sha256d merkle)
Optimised keccak for SM5+, based on alexis improvements
Oct. 09th 2017 v2.2.2
Import and clean the hsr algo (x13 + custom hash)

3
ccminer.cpp

@ -255,7 +255,8 @@ Options:\n\ @@ -255,7 +255,8 @@ Options:\n\
heavy Heavycoin\n\
hmq1725 Doubloons / Espers\n\
jackpot JHA v8\n\
keccak Keccak-256 (Maxcoin)\n\
keccak Deprecated Keccak-256\n\
keccakc Keccak-256 (CreativeCoin)\n\
lbry LBRY Credits (Sha/Ripemd)\n\
luffa Joincoin\n\
lyra2 CryptoCoin\n\

3
ccminer.vcxproj

@ -475,7 +475,8 @@ @@ -475,7 +475,8 @@
<CudaCompile Include="Algo256\cuda_cubehash256.cu" />
<CudaCompile Include="Algo256\cuda_fugue256.cu" />
<CudaCompile Include="Algo256\cuda_groestl256.cu" />
<CudaCompile Include="Algo256\cuda_keccak256.cu">
<CudaCompile Include="Algo256\cuda_keccak256.cu" />
<CudaCompile Include="Algo256\cuda_keccak256_sm3.cu">
<MaxRegCount>92</MaxRegCount>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_skein256.cu" />

3
ccminer.vcxproj.filters

@ -844,6 +844,9 @@ @@ -844,6 +844,9 @@
<CudaCompile Include="Algo256\cuda_keccak256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_keccak256_sm3.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_skein256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>

13
lyra2/lyra2RE.cu

@ -16,9 +16,10 @@ extern void blake256_cpu_init(int thr_id, uint32_t threads); @@ -16,9 +16,10 @@ extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void keccak256_cpu_free(int thr_id);
extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_sm3_init(int thr_id, uint32_t threads);
extern void keccak256_sm3_free(int thr_id);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
@ -97,7 +98,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, @@ -97,7 +98,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id, throughput);
keccak256_sm3_init(thr_id, throughput);
skein256_cpu_init(thr_id, throughput);
groestl256_cpu_init(thr_id, throughput);
@ -124,7 +125,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, @@ -124,7 +125,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
int order = 0;
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -186,7 +187,7 @@ extern "C" void free_lyra2(int thr_id) @@ -186,7 +187,7 @@ extern "C" void free_lyra2(int thr_id)
cudaFree(d_hash[thr_id]);
cudaFree(d_matrix[thr_id]);
keccak256_cpu_free(thr_id);
keccak256_sm3_free(thr_id);
groestl256_cpu_free(thr_id);
init[thr_id] = false;

14
lyra2/lyra2REv2.cu

@ -16,9 +16,11 @@ static uint64_t* d_matrix[MAX_GPUS]; @@ -16,9 +16,11 @@ static uint64_t* d_matrix[MAX_GPUS];
extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void keccak256_cpu_free(int thr_id);
extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_sm3_init(int thr_id, uint32_t threads);
extern void keccak256_sm3_free(int thr_id);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
@ -101,7 +103,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc @@ -101,7 +103,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id,throughput);
keccak256_sm3_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput);
bmw256_cpu_init(thr_id, throughput);
@ -129,7 +131,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc @@ -129,7 +131,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
int order = 0;
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -194,7 +196,7 @@ extern "C" void free_lyra2v2(int thr_id) @@ -194,7 +196,7 @@ extern "C" void free_lyra2v2(int thr_id)
cudaFree(d_matrix[thr_id]);
bmw256_cpu_free(thr_id);
keccak256_cpu_free(thr_id);
keccak256_sm3_free(thr_id);
init[thr_id] = false;

Loading…
Cancel
Save