diff --git a/blake32.cu b/Algo256/blake256.cu
similarity index 100%
rename from blake32.cu
rename to Algo256/blake256.cu
diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu
new file mode 100644
index 0000000..f7a7601
--- /dev/null
+++ b/Algo256/cuda_blake256.cu
@@ -0,0 +1,250 @@
+/**
+ * Blake-256 Cuda Kernel (Tested on SM 5.0)
+ *
+ * Tanguy Pruvot - Nov. 2014
+ */
+extern "C" {
+#include "sph/sph_blake.h"
+}
+
+#include "cuda_helper.h"
+
+#include <memory.h>
+
+static __device__ uint64_t cuda_swab32ll(uint64_t x) {
+	return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x)));
+}
+
+__constant__ static uint32_t c_data[20];
+
+__constant__ static uint32_t sigma[16][16];
+static uint32_t c_sigma[16][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+};
+
+static const uint32_t c_IV256[8] = {
+	0x6A09E667, 0xBB67AE85,
+	0x3C6EF372, 0xA54FF53A,
+	0x510E527F, 0x9B05688C,
+	0x1F83D9AB, 0x5BE0CD19
+};
+
+__device__ __constant__ static uint32_t cpu_h[8];
+
+__device__ __constant__ static uint32_t u256[16];
+static const uint32_t c_u256[16] = {
+	0x243F6A88, 0x85A308D3,
+	0x13198A2E, 0x03707344,
+	0xA4093822, 0x299F31D0,
+	0x082EFA98, 0xEC4E6C89,
+	0x452821E6, 0x38D01377,
+	0xBE5466CF, 0x34E90C6C,
+	0xC0AC29B7, 0xC97C50DD,
+	0x3F84D5B5, 0xB5470917
+};
+
+#define GS2(a,b,c,d,x) { \
+	const uint32_t idx1 = sigma[r][x]; \
+	const uint32_t idx2 = sigma[r][x+1]; \
+	v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
+	v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
+\
+	v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
+	v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
+	v[c] += v[d]; \
+	v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
+}
+
+//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#define hostGS(a,b,c,d,x) { \
+	const uint32_t idx1 = c_sigma[r][x]; \
+	const uint32_t idx2 = c_sigma[r][x+1]; \
+	v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
+	v[d] = ROTR32(v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = ROTR32(v[b] ^ v[c], 12); \
+\
+	v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \
+	v[d] = ROTR32(v[d] ^ v[a], 8); \
+	v[c] += v[d]; \
+	v[b] = ROTR32(v[b] ^ v[c], 7); \
+	}
+
+/* The second part (bytes 64-80) of the message never changes, so store it */
+__device__ __constant__ static const uint32_t c_Padding[16] = {
+	0, 0, 0, 0,
+	0x80000000, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 1, 0, 640,
+};
+
+__host__ __forceinline__
+static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0)
+{
+	uint32_t m[16];
+	uint32_t v[16];
+
+	for (int i = 0; i < 16; i++) {
+		m[i] = block[i];
+	}
+
+	for (int i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[8] = c_u256[0];
+	v[9] = c_u256[1];
+	v[10] = c_u256[2];
+	v[11] = c_u256[3];
+
+	v[12] = c_u256[4] ^ T0;
+	v[13] = c_u256[5] ^ T0;
+	v[14] = c_u256[6];
+	v[15] = c_u256[7];
+
+	for (int r = 0; r < 14; r++) {
+		/* column step */
+		hostGS(0, 4, 0x8, 0xC, 0x0);
+		hostGS(1, 5, 0x9, 0xD, 0x2);
+		hostGS(2, 6, 0xA, 0xE, 0x4);
+		hostGS(3, 7, 0xB, 0xF, 0x6);
+		/* diagonal step */
+		hostGS(0, 5, 0xA, 0xF, 0x8);
+		hostGS(1, 6, 0xB, 0xC, 0xA);
+		hostGS(2, 7, 0x8, 0xD, 0xC);
+		hostGS(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	for (int i = 0; i < 16; i++) {
+		int j = i & 7;
+		h[j] ^= v[i];
+	}
+}
+
+__device__ __forceinline__
+static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0)
+{
+	uint32_t m[16];
+	uint32_t v[16];
+
+	m[0] = block[0];
+	m[1] = block[1];
+	m[2] = block[2];
+	m[3] = block[3];
+
+	#pragma unroll
+	for (int i = 4; i < 16; i++) {
+		m[i] = c_Padding[i];
+	}
+
+	#pragma unroll 8
+	for (int i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[8] = u256[0];
+	v[9] = u256[1];
+	v[10] = u256[2];
+	v[11] = u256[3];
+
+	v[12] = u256[4] ^ T0;
+	v[13] = u256[5] ^ T0;
+	v[14] = u256[6];
+	v[15] = u256[7];
+
+	#pragma unroll 14
+	for (int r = 0; r < 14; r++) {
+		/* column step */
+		GS2(0, 4, 0x8, 0xC, 0x0);
+		GS2(1, 5, 0x9, 0xD, 0x2);
+		GS2(2, 6, 0xA, 0xE, 0x4);
+		GS2(3, 7, 0xB, 0xF, 0x6);
+		/* diagonal step */
+		GS2(0, 5, 0xA, 0xF, 0x8);
+		GS2(1, 6, 0xB, 0xC, 0xA);
+		GS2(2, 7, 0x8, 0xD, 0xC);
+		GS2(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	#pragma unroll 16
+	for (int i = 0; i < 16; i++) {
+		int j = i & 7;
+		h[j] ^= v[i];
+	}
+}
+
+__global__ __launch_bounds__(256,3)
+void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNonce + thread;
+		uint32_t h[8];
+		uint32_t input[4];
+
+		#pragma unroll 8
+		for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
+
+		#pragma unroll 3
+		for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i];
+
+		input[3] = nonce;
+		blake256_compress2nd(h, input, 640);
+
+		#pragma unroll
+		for (int i = 0; i<4; i++) {
+			Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1]));
+		}
+	}
+}
+
+__host__
+void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
+{
+	const int threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash);
+	MyStreamSynchronize(NULL, order, thr_id);
+}
+
+__host__
+void blake256_cpu_setBlock_80(uint32_t *pdata)
+{
+	uint32_t h[8];
+	uint32_t data[20];
+	memcpy(data, pdata, 80);
+	for (int i = 0; i<8; i++) {
+		h[i] = c_IV256[i];
+	}
+	blake256_compress1st(h, pdata, 512);
+
+	cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void blake256_cpu_init(int thr_id, int threads)
+{
+	cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
+}
diff --git a/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu
similarity index 99%
rename from cuda_fugue256.cu
rename to Algo256/cuda_fugue256.cu
index 9a3b6cc..cb5f4cd 100644
--- a/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@@ -571,7 +571,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 	for(int i=0;i<30;i++) sc[i] = GPUstate[i];
-	uint32_t nounce = startNounce + thread; // still needs to be determined
+	uint32_t nounce = startNounce + thread; // still needs to be determined
 
 	uint32_t q;
 
@@ -687,7 +687,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 		int i;
 		bool rc = true;
-
+
 		for (i = 7; i >= 0; i--) {
 			if (hash[i] > pTarget[i]) {
 				rc = false;
@@ -730,7 +730,7 @@ void fugue256_cpu_init(int thr_id, int threads)
 	// allocate memory for all results
 	cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
-	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
+	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
 
 __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu
new file mode 100644
index 0000000..8007f24
--- /dev/null
+++ b/Algo256/cuda_groestl256.cu
@@ -0,0 +1,309 @@
+#include
+
+#include "cuda_helper.h"
+
+uint32_t *d_gnounce[8];
+uint32_t *d_GNonce[8];
+
+__constant__ uint32_t pTarget[8];
+
+#define SPH_C32(x) ((uint32_t)(x ## U))
+#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+
+#define C32e(x) \
+	((SPH_C32(x) >> 24) \
+	| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
+	| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
+	| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+
+#define PC32up(j, r) ((uint32_t)((j) + (r)))
+#define PC32dn(j, r) 0
+#define QC32up(j, r) 0xFFFFFFFF
+#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
+
+#define B32_0(x) __byte_perm(x, 0, 0x4440)
+//((x) & 0xFF)
+#define B32_1(x) __byte_perm(x, 0, 0x4441)
+//(((x) >> 8) & 0xFF)
+#define B32_2(x) __byte_perm(x, 0, 0x4442)
+//(((x) >> 16) & 0xFF)
+#define B32_3(x) __byte_perm(x, 0, 0x4443)
+//((x) >> 24)
+
+#define MAXWELL_OR_FERMI 1
+#if MAXWELL_OR_FERMI
+	#define USE_SHARED 1
+	// Maxwell and Fermi cards get the best speed with SHARED access it seems.
+	#if USE_SHARED
+	#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
+	#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
+	#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
+	#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
+	#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
+	#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+	#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+	#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
+	#else
+	#define T0up(x) tex1Dfetch(t0up2, x)
+	#define T0dn(x) tex1Dfetch(t0dn2, x)
+	#define T1up(x) tex1Dfetch(t1up2, x)
+	#define T1dn(x) tex1Dfetch(t1dn2, x)
+	#define T2up(x) tex1Dfetch(t2up2, x)
+	#define T2dn(x) tex1Dfetch(t2dn2, x)
+	#define T3up(x) tex1Dfetch(t3up2, x)
+	#define T3dn(x) tex1Dfetch(t3dn2, x)
+	#endif
+#else
+	#define USE_SHARED 1
+	// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
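+	// (8 tables x 256 entries x 4 bytes = 8 KB if every table stays in shared memory;
+	//  interleaving four shared and four textured tables halves the per-block shared footprint)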
+	#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
+	#define T0dn(x) tex1Dfetch(t0dn2, x)
+	#define T1up(x) tex1Dfetch(t1up2, x)
+	#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
+	#define T2up(x) tex1Dfetch(t2up2, x)
+	#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+	#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+	#define T3dn(x) tex1Dfetch(t3dn2, x)
+#endif
+
+texture<unsigned int, 1, cudaReadModeElementType> t0up2;
+texture<unsigned int, 1, cudaReadModeElementType> t0dn2;
+texture<unsigned int, 1, cudaReadModeElementType> t1up2;
+texture<unsigned int, 1, cudaReadModeElementType> t1dn2;
+texture<unsigned int, 1, cudaReadModeElementType> t2up2;
+texture<unsigned int, 1, cudaReadModeElementType> t2dn2;
+texture<unsigned int, 1, cudaReadModeElementType> t3up2;
+texture<unsigned int, 1, cudaReadModeElementType> t3dn2;
+
+#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
+		t[d0] = T0up(B32_0(a[b0])) \
+			^ T1up(B32_1(a[b1])) \
+			^ T2up(B32_2(a[b2])) \
+			^ T3up(B32_3(a[b3])) \
+			^ T0dn(B32_0(a[b4])) \
+			^ T1dn(B32_1(a[b5])) \
+			^ T2dn(B32_2(a[b6])) \
+			^ T3dn(B32_3(a[b7])); \
+		t[d1] = T0dn(B32_0(a[b0])) \
+			^ T1dn(B32_1(a[b1])) \
+			^ T2dn(B32_2(a[b2])) \
+			^ T3dn(B32_3(a[b3])) \
+			^ T0up(B32_0(a[b4])) \
+			^ T1up(B32_1(a[b5])) \
+			^ T2up(B32_2(a[b6])) \
+			^ T3up(B32_3(a[b7])); \
+	} while (0)
+
+
+extern uint32_t T0up_cpu[];
+extern uint32_t T0dn_cpu[];
+extern uint32_t T1up_cpu[];
+extern uint32_t T1dn_cpu[];
+extern uint32_t T2up_cpu[];
+extern uint32_t T2dn_cpu[];
+extern uint32_t T3up_cpu[];
+extern uint32_t T3dn_cpu[];
+
+__device__ __forceinline__
+void groestl256_perm_P(int thread, uint32_t *a, char *mixtabs)
+{
+	#pragma unroll 10
+	for (int r = 0; r<10; r++)
+	{
+		uint32_t t[16];
+
+		a[0x0] ^= PC32up(0x00, r);
+		a[0x2] ^= PC32up(0x10, r);
+		a[0x4] ^= PC32up(0x20, r);
+		a[0x6] ^= PC32up(0x30, r);
+		a[0x8] ^= PC32up(0x40, r);
+		a[0xA] ^= PC32up(0x50, r);
+		a[0xC] ^= PC32up(0x60, r);
+		a[0xE] ^= PC32up(0x70, r);
+		RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF);
+		RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1);
+		RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3);
+		RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5);
+		RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7);
+		RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9);
+		RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB);
+		RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD);
+
+		#pragma unroll 16
+		for (int k = 0; k<16; k++)
+			a[k] = t[k];
+	}
+}
+
+__device__ __forceinline__
+void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs)
+{
+	#pragma unroll
+	for (int r = 0; r<10; r++)
+	{
+		uint32_t t[16];
+
+		a[0x0] ^= QC32up(0x00, r);
+		a[0x1] ^= QC32dn(0x00, r);
+		a[0x2] ^= QC32up(0x10, r);
+		a[0x3] ^= QC32dn(0x10, r);
+		a[0x4] ^= QC32up(0x20, r);
+		a[0x5] ^= QC32dn(0x20, r);
+		a[0x6] ^= QC32up(0x30, r);
+		a[0x7] ^= QC32dn(0x30, r);
+		a[0x8] ^= QC32up(0x40, r);
+		a[0x9] ^= QC32dn(0x40, r);
+		a[0xA] ^= QC32up(0x50, r);
+		a[0xB] ^= QC32dn(0x50, r);
+		a[0xC] ^= QC32up(0x60, r);
+		a[0xD] ^= QC32dn(0x60, r);
+		a[0xE] ^= QC32up(0x70, r);
+		a[0xF] ^= QC32dn(0x70, r);
+		RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD);
+		RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF);
+		RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1);
+		RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3);
+		RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5);
+		RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7);
+		RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9);
+		RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB);
+
+		#pragma unroll
+		for (int k = 0; k<16; k++)
+			a[k] = t[k];
+	}
+}
+
+__global__ __launch_bounds__(256,1)
+void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
+{
+#if
USE_SHARED
+	extern __shared__ char mixtabs[];
+
+	if (threadIdx.x < 256) {
+		*((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(t0up2, threadIdx.x);
+		*((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(t0dn2, threadIdx.x);
+		*((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(t1up2, threadIdx.x);
+		*((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(t1dn2, threadIdx.x);
+		*((uint32_t*)mixtabs + (1024 + threadIdx.x)) = tex1Dfetch(t2up2, threadIdx.x);
+		*((uint32_t*)mixtabs + (1280 + threadIdx.x)) = tex1Dfetch(t2dn2, threadIdx.x);
+		*((uint32_t*)mixtabs + (1536 + threadIdx.x)) = tex1Dfetch(t3up2, threadIdx.x);
+		*((uint32_t*)mixtabs + (1792 + threadIdx.x)) = tex1Dfetch(t3dn2, threadIdx.x);
+	}
+
+	__syncthreads();
+#endif
+
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		// GROESTL
+		uint32_t message[16];
+		uint32_t state[16];
+
+		#pragma unroll
+		for (int k = 0; k<4; k++)
+			LOHI(message[2*k], message[2*k+1], outputHash[k*threads+thread]);
+
+		#pragma unroll
+		for (int k = 9; k<15; k++)
+			message[k] = 0;
+
+		message[8] = 0x80;
+		message[15] = 0x01000000;
+
+		#pragma unroll 16
+		for (int u = 0; u<16; u++)
+			state[u] = message[u];
+
+		state[15] ^= 0x10000;
+
+		// Perm
+
+#if USE_SHARED
+		groestl256_perm_P(thread, state, mixtabs);
+		state[15] ^= 0x10000;
+		groestl256_perm_Q(thread, message, mixtabs);
+#else
+		groestl256_perm_P(thread, state, NULL);
+		state[15] ^= 0x10000;
+		groestl256_perm_Q(thread, message, NULL);
+#endif
+		#pragma unroll 16
+		for (int u = 0; u<16; u++) state[u] ^= message[u];
+		#pragma unroll 16
+		for (int u = 0; u<16; u++) message[u] = state[u];
+#if USE_SHARED
+		groestl256_perm_P(thread, message, mixtabs);
+#else
+		groestl256_perm_P(thread, message, NULL);
+#endif
+		state[14] ^= message[14];
+		state[15] ^= message[15];
+
+		uint32_t nonce = startNounce + thread;
+		if (state[15] <= pTarget[7]) {
+			nonceVector[0] = nonce;
+		}
+	}
+}
+
+#define texDef(texname, texmem, texsource, texsize) \
+	unsigned int *texmem; \
+	cudaMalloc(&texmem, texsize); \
+	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+	texname.normalized = 0; \
+	texname.filterMode = cudaFilterModePoint; \
+	texname.addressMode[0] = cudaAddressModeClamp; \
+	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
+	cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+
+__host__
+void groestl256_cpu_init(int thr_id, int threads)
+{
+
+	// initialize the textures with the macro above
+	texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
+	texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
+	texDef(t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
+	texDef(t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
+	texDef(t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
+	texDef(t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
+	texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
+	texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
+
+	cudaMalloc(&d_GNonce[thr_id], sizeof(uint32_t));
+	cudaMallocHost(&d_gnounce[thr_id], 1*sizeof(uint32_t));
+}
+
+__host__
+uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+{
+	uint32_t result = 0xffffffff;
+	cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t));
+	const int threadsperblock = 256;
+
+	// compute how many thread blocks we need
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+#if USE_SHARED
+	size_t shared_size = 8 * 256 * sizeof(uint32_t);
+#else
+	size_t shared_size = 0;
+#endif
+	groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+	cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+	result = *d_gnounce[thr_id];
+
+	return result;
+}
+
+__host__
+void groestl256_setTarget(const void *pTargetIn)
+{
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+}
\ No newline at end of file
diff --git a/keccak/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu
similarity index 51%
rename from keccak/cuda_keccak256.cu
rename to Algo256/cuda_keccak256.cu
index 2b9315e..8b3546e 100644
--- a/keccak/cuda_keccak256.cu
+++ b/Algo256/cuda_keccak256.cu
@@ -27,11 +27,81 @@ uint32_t *d_KNonce[8];
 
 __constant__ uint32_t pTarget[8];
 __constant__ uint64_t keccak_round_constants[24];
-__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding)
+__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
 
+#if __CUDA_ARCH__ >= 350
+__device__ __forceinline__
+static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
+{
+	size_t i;
+	uint2 t[5], u[5], v, w;
+
+	#pragma unroll
+	for (i = 0; i < 24; i++) {
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		u[0] = t[4] ^ ROL2(t[1], 1);
+		u[1] = t[0] ^ ROL2(t[2], 1);
+		u[2] = t[1] ^ ROL2(t[3], 1);
+		u[3] = t[2] ^ ROL2(t[4], 1);
+		u[4] = t[3] ^ ROL2(t[0], 1);
+
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
+		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
+		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
+		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
+		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
+
+		/* rho pi: b[..] = rotl(a[..], ..)
*/ + v = s[1]; + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; + v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; -static __device__ __forceinline__ -void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) { + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(keccak_round_constants[i]); + } +} +#endif + +__device__ __forceinline__ +static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants) +{ size_t i; uint64_t t[5], u[5], v, w; @@ -109,14 +179,16 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, //#pragma unroll 25 for (int i=0; i<25; i++) { - if(i<9) {keccak_gpu_state[i] = c_PaddedMessage80[i];} - else {keccak_gpu_state[i] = 0;} + if (i < 9) + keccak_gpu_state[i] = c_PaddedMessage80[i]; + else + keccak_gpu_state[i] = 0; } - keccak_gpu_state[9]=REPLACE_HIWORD(c_PaddedMessage80[9],cuda_swab32(nounce)); - keccak_gpu_state[10]=0x0000000000000001; - keccak_gpu_state[16]=0x8000000000000000; + keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); + keccak_gpu_state[10] = 0x0000000000000001; + keccak_gpu_state[16] = 0x8000000000000000; - keccak_block(keccak_gpu_state,keccak_round_constants); + keccak_blockv30(keccak_gpu_state, keccak_round_constants); bool rc = false; if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;} @@ -125,18 +197,7 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, if(resNounce[0] > nounce) resNounce[0] = nounce; } - } //thread -} - -void keccak256_cpu_init(int thr_id, int threads) -{ - CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice)); - - CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t))); + } } __host__ @@ -161,6 +222,66 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui return result; } +#ifdef _MSC_VER +#define UINT2(a, b) { a, b } +#else +#define UINT2(a, b) (uint2) { a, b } +#endif + +__global__ __launch_bounds__(256,3) +void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + 
if (thread < threads)
+	{
+#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
+		uint2 keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i = 0; i<25; i++) {
+			if (i < 4)
+				keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
+			else
+				keccak_gpu_state[i] = UINT2(0, 0);
+		}
+		keccak_gpu_state[4] = UINT2(1, 0);
+		keccak_gpu_state[16] = UINT2(0, 0x80000000);
+		keccak_blockv35(keccak_gpu_state, keccak_round_constants);
+
+		#pragma unroll 4
+		for (int i=0; i<4; i++)
+			outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
+#else
+		uint64_t keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i = 0; i<25; i++) {
+			if (i<4)
+				keccak_gpu_state[i] = outputHash[i*threads+thread];
+			else
+				keccak_gpu_state[i] = 0;
+		}
+		keccak_gpu_state[4] = 0x0000000000000001;
+		keccak_gpu_state[16] = 0x8000000000000000;
+
+		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
+		#pragma unroll 4
+		for (int i = 0; i<4; i++)
+			outputHash[i*threads + thread] = keccak_gpu_state[i];
+#endif
+	}
+}
+
+__host__
+void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+{
+	const int threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	keccak256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
+	MyStreamSynchronize(NULL, order, thr_id);
+}
+
 __host__
 void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
 {
@@ -168,4 +289,13 @@ void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
 	memcpy(PaddedMessage, pdata, 80);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
-}
\ No newline at end of file
+}
+
+__host__
+void keccak256_cpu_init(int thr_id, int threads)
+{
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
+		sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
+}
diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
new file mode 100644
index 0000000..9e4013f
--- /dev/null
+++ b/Algo256/cuda_skein256.cu
@@ -0,0 +1,196 @@
+#include
+
+#include "cuda_helper.h"
+
+#if 0
+static __constant__ uint64_t SKEIN_IV512_256[8] = {
+	0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
+	0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
+	0xEC06025E74DD7683, 0xE7A436CDC4746251,
+	0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
+};
+#endif
+
+static __constant__ uint2 vSKEIN_IV512_256[8] = {
+	{ 0x2FDB3E13, 0xCCD044A1 },
+	{ 0x1A79A9EB, 0xE8359030 },
+	{ 0x4F816E6F, 0x55AEA061 },
+	{ 0xAE9B94DB, 0x2A2767A4 },
+	{ 0x74DD7683, 0xEC06025E },
+	{ 0xC4746251, 0xE7A436CD },
+	{ 0x393AD185, 0xC36FBAF9 },
+	{ 0x33EDFC13, 0x3EEDBA18 }
+};
+
+static __constant__ int ROT256[8][4] =
+{
+	46, 36, 19, 37,
+	33, 27, 14, 42,
+	17, 49, 36, 39,
+	44,  9, 54, 56,
+	39, 30, 34, 24,
+	13, 50, 10, 17,
+	25, 29, 39, 43,
+	 8, 35, 56, 22,
+};
+
+static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA };
+static __constant__ uint2 t12[6] = {
+	{ 0x20, 0 },
+	{ 0, 0xf0000000 },
+	{ 0x20, 0xf0000000 },
+	{ 0x08, 0 },
+	{ 0, 0xff000000 },
+	{ 0x08, 0xff000000 }
+};
+
+#if 0
+static __constant__ uint64_t t12_30[6] = {
+	0x20,
+	0xf000000000000000,
+	0xf000000000000020,
+	0x08,
+	0xff00000000000000,
+	0xff00000000000008
+};
+#endif
+
+static
__forceinline__ __device__
+void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT)
+{
+	p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]); p1 ^= p0;
+	p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]); p3 ^= p2;
+	p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]); p5 ^= p4;
+	p6 += p7; p7 = ROL2(p7, ROT256[ROT][3]); p7 ^= p6;
+}
+
+
+static __forceinline__ __device__
+void Round_8_512v35(uint2 *ks, uint2 *ts,
+	uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
+	uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R)
+{
+	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 0);
+	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1);
+	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2);
+	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 3);
+	p0 += ks[((R)+0) % 9]; /* inject the key schedule value */
+	p1 += ks[((R)+1) % 9];
+	p2 += ks[((R)+2) % 9];
+	p3 += ks[((R)+3) % 9];
+	p4 += ks[((R)+4) % 9];
+	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
+	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
+	p7 += ks[((R)+7) % 9] + make_uint2((R),0);
+	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4);
+	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5);
+	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6);
+	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7);
+	p0 += ks[((R)+1) % 9]; /* inject the key schedule value */
+	p1 += ks[((R)+2) % 9];
+	p2 += ks[((R)+3) % 9];
+	p3 += ks[((R)+4) % 9];
+	p4 += ks[((R)+5) % 9];
+	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
+	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
+	p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0);
+}
+
+
+__global__ __launch_bounds__(256,3)
+void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint2 h[9];
+		uint2 t[3];
+		uint2 dt0,dt1,dt2,dt3;
+		uint2 p0, p1, p2, p3, p4, p5, p6, p7;
+
+		h[8] = skein_ks_parity;
+		for (int i = 0; i<8; i++) {
+			h[i] = vSKEIN_IV512_256[i];
+			h[8] ^= h[i];
+		}
+
+		t[0] = t12[0];
+		t[1] = t12[1];
+		t[2] = t12[2];
+
+		LOHI(dt0.x,dt0.y,outputHash[thread]);
+		LOHI(dt1.x,dt1.y,outputHash[threads+thread]);
+		LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]);
+		LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]);
+
+		p0 = h[0] + dt0;
+		p1 = h[1] + dt1;
+		p2 = h[2] + dt2;
+		p3 = h[3] + dt3;
+		p4 = h[4];
+		p5 = h[5] + t[0];
+		p6 = h[6] + t[1];
+		p7 = h[7];
+
+		#pragma unroll
+		for (int i = 1; i<19; i+=2) {
+			Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i);
+		}
+
+		p0 ^= dt0;
+		p1 ^= dt1;
+		p2 ^= dt2;
+		p3 ^= dt3;
+
+		h[0] = p0;
+		h[1] = p1;
+		h[2] = p2;
+		h[3] = p3;
+		h[4] = p4;
+		h[5] = p5;
+		h[6] = p6;
+		h[7] = p7;
+		h[8] = skein_ks_parity;
+
+		#pragma unroll 8
+		for (int i = 0; i<8; i++) {
+			h[8] ^= h[i];
+		}
+
+		t[0] = t12[3];
+		t[1] = t12[4];
+		t[2] = t12[5];
+		p5 += t[0]; // p5 already equals h[5]
+		p6 += t[1];
+
+		#pragma unroll
+		for (int i = 1; i<19; i+=2) {
+			Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
+		}
+
+		outputHash[thread] = devectorize(p0);
+		outputHash[threads+thread] = devectorize(p1);
+		outputHash[2*threads+thread] = devectorize(p2);
+		outputHash[3*threads+thread] = devectorize(p3);
+	}
+}
+
+__host__
+void skein256_cpu_init(int thr_id, int threads)
+{
+	//empty
+}
+
+__host__
+void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+{
+	const int threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+}
+
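For context, the new lyra2/lyra2RE.cu (wired into the Makefile below) is what drives the host wrappers defined in these files, launching one kernel per pipeline stage over a shared per-thread hash buffer. A minimal sketch of one scan iteration, assuming the lyra2_cpu_hash_32 wrapper exported by the new lyra2/cuda_lyra2.cu follows the same signature convention:

	// Lyra2RE pipeline: blake256(80-byte header) -> keccak256 -> lyra2 -> skein256 -> groestl256
	// (thr_id, throughput, pdata and d_hash come from the scanhash loop; lyra2_cpu_hash_32 is assumed)
	int order = 0;
	blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash, order++);
	keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash, order++);
	lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash, order++);
	skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash, order++);
	uint32_t foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash, order++);
	// groestl256_cpu_hash_32() copies back the first nonce whose last state word meets pTarget[7]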
diff --git a/keccak/keccak256.cu b/Algo256/keccak256.cu
similarity index 100%
rename from keccak/keccak256.cu
rename to Algo256/keccak256.cu
diff --git a/Makefile.am b/Makefile.am
index 50ec95e..168ac14 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,11 +10,11 @@ EXTRA_DIST = autogen.sh README.txt LICENSE.txt \
 	cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \
 	compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in
 
-SUBDIRS = compat
+SUBDIRS = compat
 
-bin_PROGRAMS = ccminer
+bin_PROGRAMS = ccminer
 
-ccminer_SOURCES = elist.h miner.h compat.h \
+ccminer_SOURCES = elist.h miner.h compat.h \
 	compat/inttypes.h compat/stdbool.h compat/unistd.h \
 	compat/sys/time.h compat/getopt/getopt.h \
 	crc32.c hefty1.c scrypt.c \
@@ -27,17 +27,20 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 	heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
 	heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
 	heavy/cuda_sha256.cu heavy/cuda_sha256.h \
-	keccak/cuda_keccak256.cu keccak/keccak256.cu \
-	fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
+	fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \
 	groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
 	myriadgroestl.cpp cuda_myriadgroestl.cu \
+	lyra2/Lyra2.c lyra2/Sponge.c \
+	lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
+	Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
+	Algo256/blake256.cu Algo256/keccak256.cu \
 	JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
 	JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
 	quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
 	quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
 	quark/quarkcoin.cu quark/animecoin.cu \
 	quark/cuda_quark_compactionTest.cu \
-	cuda_nist5.cu blake32.cu pentablake.cu \
+	cuda_nist5.cu pentablake.cu \
 	sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 	sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 	sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
diff --git a/README.txt b/README.txt
index a6cc8df..337b36d 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
-ccMiner release 1.5.0-tpruvot (27 Nov 2014) - "Extra nonce"
+ccMiner release 1.5.1-tpruvot (16 Dec 2014) - "Vertcoin Lyra2"
 ---------------------------------------------------------------
 
 ***************************************************************
@@ -38,6 +38,7 @@ Keccak (Maxcoin)
 Deep, Doom and Qubit
 Pentablake (Blake 512 x5)
 S3 (OneCoin)
+Lyra2RE (new VertCoin algo)
 
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL Only) implementations.
@@ -68,6 +69,7 @@ its command line interface and options.
   jackpot     use to mine Jackpotcoin
   keccak      use to mine Maxcoin
   luffa       use to mine Doomcoin
+  lyra2       use to mine Vertcoin
   mjollnir    use to mine Mjollnircoin
   myr-gr      use to mine Myriad-Groestl
   nist5       use to mine TalkCoin
@@ -169,6 +171,12 @@ features.
 
 >>> RELEASE HISTORY <<<
 
+  Dec. 2014 v1.5.1 (not released yet, planned for 16 Dec 2014)
+  Add lyra2 algo for Vertcoin
+  Multiple shares support (2 for the moment)
+  X11 optimisations (from klaust and sp-hash)
+  HTML5 WebSocket api compatibility (see api/websocket.htm)
+
  Nov.
27th 2014 v1.5.0 Upgrade compat jansson to 2.6 (for windows) Add pool mining.set_extranonce support diff --git a/ccminer.cpp b/ccminer.cpp index 147c739..4afbcb2 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -138,6 +138,7 @@ enum sha_algos { ALGO_KECCAK, ALGO_JACKPOT, ALGO_LUFFA_DOOM, + ALGO_LYRA, ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MYR_GR, ALGO_NIST5, @@ -167,6 +168,7 @@ static const char *algo_names[] = { "keccak", "jackpot", "luffa", + "lyra2", "mjollnir", "myr-gr", "nist5", @@ -272,6 +274,7 @@ Options:\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ luffa Doomcoin\n\ + lyra2 VertCoin\n\ mjollnir Mjollnircoin\n\ myr-gr Myriad-Groestl\n\ nist5 NIST5 (TalkCoin)\n\ @@ -1255,6 +1258,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_LYRA: + rc = scanhash_lyra(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_NIST5: rc = scanhash_nist5(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index ad3559e..415e9ce 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -105,7 +105,7 @@ 80 true false - compute_50,sm_50 + compute_30,sm_30;compute_50,sm_50 @@ -173,7 +173,7 @@ 80 true false - compute_30,sm_30;compute_50,sm_50; + compute_50,sm_50; --ptxas-options="-O2" %(AdditionalOptions) @@ -257,6 +257,8 @@ Full /Tp %(AdditionalOptions) + + @@ -330,13 +332,15 @@ + + + + true - - @@ -369,15 +373,19 @@ - + 64 --ptxas-options="-dlcm=cg" %(AdditionalOptions) true - + + + + + 92 - + 80 --ptxas-options="-dlcm=cg" %(AdditionalOptions) @@ -418,6 +426,8 @@ + + true diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index c2bf060..1f775de 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -61,12 +61,15 @@ {17b56151-79ec-4a32-bac3-9d94ae7f68fe} - - {9762c92c-9677-4044-8292-ff6ba4bfdd89} - {ef6f9983-bda5-4fb2-adfa-ac4f29b74f25} + + {9762c92c-9677-4044-8292-ff6ba4bfdd89} + + + {2ff6e4ce-7c92-4cb2-a3ad-c331e94fd81d} + @@ -213,6 +216,12 @@ Source Files\jansson + + Source Files\sph + + + Source Files\sph + @@ -347,14 +356,17 @@ Header Files\compat + + Header Files\lyra2 + + + Header Files\lyra2 + Source Files\CUDA - - Source Files\CUDA - Source Files\CUDA @@ -505,20 +517,38 @@ Source Files\CUDA\x17 - + + Source Files\CUDA + + + Source Files\CUDA\x11 + + Source Files\CUDA - + Source Files\CUDA - - Source Files\CUDA\keccak + + Source Files\CUDA\Algo256 - - Source Files\CUDA\keccak + + Source Files\CUDA\Algo256 - - Source Files\CUDA\x11 + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA + + + Source Files\CUDA \ No newline at end of file diff --git a/cuda_helper.h b/cuda_helper.h index 5d60288..d1bdc32 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -355,7 +355,7 @@ uint64_t ROTL64(const uint64_t x, const int offset) "setp.lt.u32 p, %2, 32;\n\t" "@!p mov.b64 %0, {vl,vh};\n\t" "@p mov.b64 %0, {vh,vl};\n\t" - "}" + "}" : "=l"(res) : "l"(x) , "r"(offset) ); return res; @@ -378,4 +378,99 @@ uint64_t SWAPDWORDS(uint64_t value) #endif } +/* lyra2 - int2 operators */ + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +} + +static __device__ __forceinline__ uint64_t devectorize(uint2 v) { return MAKE_ULONGLONG(v.x, v.y); } +static __device__ __forceinline__ uint2 vectorize(uint64_t v) { + uint2 result; + LOHI(result.x, result.y, v); + return result; +} + +static __device__ __forceinline__ uint2 
operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
+static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
+static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
+static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
+static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
+static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
+{
+	uint2 result;
+	asm("{\n\t"
+		"add.cc.u32 %0,%2,%4; \n\t"
+		"addc.u32 %1,%3,%5;   \n\t"
+		"}\n\t"
+		: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+	return result;
+}
+static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
+
+/**
+ * basic multiplication of two 64-bit values, with no carry out of the low 64 bits
+ * (i.e. mul.lo.b64(a*b), which is what the uint64_t "*" operator does)
+ */
+static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
+{
+	uint2 result;
+	asm("{\n\t"
+		"mul.lo.u32      %0,%2,%4;    \n\t"
+		"mul.hi.u32      %1,%2,%4;    \n\t"
+		"mad.lo.cc.u32   %1,%3,%4,%1; \n\t"
+		"madc.lo.u32     %1,%3,%5,%1; \n\t"
+		"}\n\t"
+		: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+	return result;
+}
+
+// uint2 method
+#if __CUDA_ARCH__ >= 350
+__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) {
+	uint2 result;
+	if (offset < 32) {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+	}
+	else {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	}
+	return result;
+}
+#else
+__device__ __inline__ uint2 ROR2(const uint2 v, const int n) {
+	// rotate as one 64-bit value (rotating each 32-bit half separately would be wrong)
+	const uint64_t w = devectorize(v);
+	return vectorize((w >> n) | (w << (64 - n)));
+}
+#endif
+
+#if __CUDA_ARCH__ >= 350
+__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
+	uint2 result;
+	if (offset >= 32) {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+	}
+	else {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	}
+	return result;
+}
+#else
+__inline__ __device__ uint2 ROL2(const uint2 v, const int n) {
+	// rotate as one 64-bit value
+	const uint64_t w = devectorize(v);
+	return vectorize((w << n) | (w >> (64 - n)));
+}
+#endif
+
+
 #endif // #ifndef CUDA_HELPER_H
diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c
new file mode 100644
index 0000000..697f435
--- /dev/null
+++ b/lyra2/Lyra2.c
@@ -0,0 +1,211 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "Lyra2.h"
+#include "Sponge.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed of all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ *
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt
+ * @param saltlen Salt length
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number of rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
+ */
+int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols)
+{
+	//============================= Basic variables ============================//
+	int64_t row = 2; //index of row to be processed
+	int64_t prev = 1; //index of prev (last row ever computed/modified)
+	int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+	int64_t tau; //Time Loop iterator
+	int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+	int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+	int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+	int64_t i; //auxiliary iteration counter
+	//==========================================================================/
+
+	//========== Initializing the Memory Matrix and pointers to it =============//
+	//Tries to allocate enough space for the whole memory matrix
+	i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
+	uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i);
+	if (wholeMatrix == NULL) {
+		return -1;
+	}
+	memset(wholeMatrix, 0, (size_t) i);
+
+	//Allocates pointers to each row of the matrix
+	uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*));
+	if (memMatrix == NULL) {
+		return -1;
+	}
+	//Places the pointers in the correct positions
+	uint64_t *ptrWord = wholeMatrix;
+	for (i = 0; i < (int64_t) nRows; i++) {
+		memMatrix[i] = ptrWord;
+		ptrWord += ROW_LEN_INT64;
+	}
+	//==========================================================================/
+
+	//============= Getting the password + salt + basil padded with 10*1 ===============//
+	//OBS.: The memory matrix will temporarily hold the password: not for saving memory,
+	//but this ensures that the password copied
locally will be overwritten as soon as possible
+
+	//First, we clean enough blocks for the password, salt, basil and padding
+	uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+
+	byte *ptrByte = (byte*) wholeMatrix;
+	memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);
+
+	//Prepends the password
+	memcpy(ptrByte, pwd, (size_t) pwdlen);
+	ptrByte += pwdlen;
+
+	//Concatenates the salt
+	memcpy(ptrByte, salt, (size_t) saltlen);
+	ptrByte += saltlen;
+
+	//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+	memcpy(ptrByte, &kLen, sizeof (uint64_t));
+	ptrByte += sizeof (uint64_t);
+	memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
+	ptrByte += sizeof (uint64_t);
+	memcpy(ptrByte, &saltlen, sizeof (uint64_t));
+	ptrByte += sizeof (uint64_t);
+	memcpy(ptrByte, &timeCost, sizeof (uint64_t));
+	ptrByte += sizeof (uint64_t);
+	memcpy(ptrByte, &nRows, sizeof (uint64_t));
+	ptrByte += sizeof (uint64_t);
+	memcpy(ptrByte, &nCols, sizeof (uint64_t));
+	ptrByte += sizeof (uint64_t);
+
+	//Now comes the padding
+	*ptrByte = 0x80; //first byte of padding: right after the password
+	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+	//==========================================================================/
+
+	//======================= Initializing the Sponge State ====================//
+	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+	uint64_t *state = malloc(16 * sizeof (uint64_t));
+	if (state == NULL) {
+		return -1;
+	}
+	initState(state);
+	//==========================================================================/
+
+	//================================ Setup Phase =============================//
+	//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+	ptrWord = wholeMatrix;
+	for (i = 0; i < (int64_t) nBlocksInput; i++) {
+		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+		ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
+	}
+
+	//Initializes M[0] and M[1]
+	reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here
+
+	reducedDuplexRow1(state, memMatrix[0], memMatrix[1]);
+
+	do {
+		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
+
+		//updates the value of row* (deterministically picked during Setup)
+		rowa = (rowa + step) & (window - 1);
+		//update prev: it now points to the last row ever computed
+		prev = row;
+		//updates row: goes to the next row to be computed
+		row++;
+
+		//Checks if all rows in the window were visited.
+ if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < (int64_t) nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= (int64_t) timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + //==========================================================================/ + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, (size_t) kLen); + //==========================================================================/ + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + //Wiping out the sponge's internal state before freeing it + memset(state, 0, 16 * sizeof (uint64_t)); + free(state); + //==========================================================================/ + + return 0; +} diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h new file mode 100644 index 0000000..229b2c9 --- /dev/null +++ b/lyra2/Lyra2.h @@ -0,0 +1,50 @@ +/** + * Header file for the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef LYRA2_H_
+#define LYRA2_H_
+
+#include <stdint.h>
+
+typedef unsigned char byte;
+
+//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
+#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
+#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
+
+
+#ifdef BLOCK_LEN_BITS
+	#define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length, in uint64_t words
+	#define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
+#else //default block length: 768 bits
+	#define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
+	#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
+#endif
+
+#ifndef N_COLS
+	#define N_COLS 8 //Number of columns in the memory matrix: 8 here (the upstream Lyra2 default is 64)
+#endif
+
+#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
+#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
+
+
+int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
+
+#endif /* LYRA2_H_ */
diff --git a/lyra2/Sponge.c b/lyra2/Sponge.c
new file mode 100644
index 0000000..bdf4397
--- /dev/null
+++ b/lyra2/Sponge.c
@@ -0,0 +1,755 @@
+/**
+ * A simple implementation of Blake2b's internal permutation
+ * in the form of a sponge.
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "Sponge.h"
+#include "Lyra2.h"
+
+
+
+/**
+ * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
+ * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges
+ * typically have their internal state initialized with zeros, Blake2b's G function
+ * has a fixed point: if the internal state and message are both filled with zeros,
the
+ * resulting permutation will always be a block filled with zeros; this happens because
+ * Blake2b does not use the constants originally employed in Blake2 inside its G function,
+ * relying on the IV for avoiding possible fixed points.
+ *
+ * @param state The 1024-bit array to be initialized
+ */
+ void initState(uint64_t state[/*16*/]) {
+	//First 512 bits are zeros
+	memset(state, 0, 64);
+	//The remaining BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved for the IV
+
+	state[8] = blake2b_IV[0];
+	state[9] = blake2b_IV[1];
+	state[10] = blake2b_IV[2];
+	state[11] = blake2b_IV[3];
+	state[12] = blake2b_IV[4];
+	state[13] = blake2b_IV[5];
+	state[14] = blake2b_IV[6];
+	state[15] = blake2b_IV[7];
+
+}
+
+/**
+ * Executes Blake2b's G function with all 12 rounds.
+ *
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+__inline static void blake2bLyra(uint64_t *v) {
+	ROUND_LYRA(0);
+	ROUND_LYRA(1);
+	ROUND_LYRA(2);
+	ROUND_LYRA(3);
+	ROUND_LYRA(4);
+	ROUND_LYRA(5);
+	ROUND_LYRA(6);
+	ROUND_LYRA(7);
+	ROUND_LYRA(8);
+	ROUND_LYRA(9);
+	ROUND_LYRA(10);
+	ROUND_LYRA(11);
+}
+
+/**
+ * Executes a reduced version of Blake2b's G function with only one round
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+__inline static void reducedBlake2bLyra(uint64_t *v) {
+	ROUND_LYRA(0);
+}
+
+/**
+ * Performs a squeeze operation, using Blake2b's G function as the
+ * internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param out Array that will receive the data squeezed
+ * @param len The number of bytes to be squeezed into the "out" array
+ */
+ void squeeze(uint64_t *state, byte *out, unsigned int len) {
+	int fullBlocks = len / BLOCK_LEN_BYTES;
+	byte *ptr = out;
+	int i;
+	//Squeezes full blocks
+	for (i = 0; i < fullBlocks; i++) {
+		memcpy(ptr, state, BLOCK_LEN_BYTES);
+		blake2bLyra(state);
+		ptr += BLOCK_LEN_BYTES;
+	}
+
+	//Squeezes remaining bytes
+	memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
+}
+
+/**
+ * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
+ * of type uint64_t), using Blake2b's G function as the internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param in The block to be absorbed (BLOCK_LEN_INT64 words)
+ */
+void absorbBlock(uint64_t *state, const uint64_t *in) {
+	//XORs the first BLOCK_LEN_INT64 words of "in" with the current state
+	state[0] ^= in[0];
+	state[1] ^= in[1];
+	state[2] ^= in[2];
+	state[3] ^= in[3];
+	state[4] ^= in[4];
+	state[5] ^= in[5];
+	state[6] ^= in[6];
+	state[7] ^= in[7];
+	state[8] ^= in[8];
+	state[9] ^= in[9];
+	state[10] ^= in[10];
+	state[11] ^= in[11];
+
+	//Applies the transformation f to the sponge's state
+	blake2bLyra(state);
+}
+
+/**
+ * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
+ * words of type uint64_t), using Blake2b's G function as the internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
+ */
+void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
+	//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
+	state[0] ^= in[0];
+	state[1] ^= in[1];
+	state[2] ^= in[2];
+	state[3] ^= in[3];
+	state[4] ^= in[4];
+	state[5] ^= in[5];
+	state[6] ^= in[6];
+	state[7] ^= in[7];
+
+	//Applies the transformation f to the sponge's state
+	blake2bLyra(state);
+/*
+	for(int i = 0; i<16; i++) {
+		printf(" final state %d %08x %08x in %08x %08x\n",
i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32), + (uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32)); + } +*/ +} + +/** + * Performs a reduced squeeze operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowOut Row to receive the data squeezed + */ +void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { + uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < N_COLS; i++) { + + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + /* +for (int i = 0; i<12; i++) { + printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32), + (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32)); + } +*/ + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + } +} + +/** + * Performs a reduced duplex operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowIn Row to feed the sponge + * @param rowOut Row to receive the sponge's output + */ + void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). 
The
+ * output of this operation, "rand", is then used to make
+ * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
+ * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
+ * rotation to the left and N_COLS is a system parameter.
+ *
+ * @param state The current state of the sponge
+ * @param rowIn Row used only as input
+ * @param rowInOut Row used as input and to receive output after rotation
+ * @param rowOut Row receiving the output
+ *
+ */
+ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
+ uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
+ uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
+ uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+ int i;
+ for (i = 0; i < N_COLS; i++) {
+ //Absorbing "M[prev] [+] M[row*]"
+ state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
+ state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
+ state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
+ state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
+ state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
+ state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
+ state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
+ state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
+ state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
+ state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
+ state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
+ state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
+
+ //Applies the reduced-round transformation f to the sponge's state
+ reducedBlake2bLyra(state);
+
+ //M[row][C-1-col] = M[prev][col] XOR rand
+ ptrWordOut[0] = ptrWordIn[0] ^ state[0];
+ ptrWordOut[1] = ptrWordIn[1] ^ state[1];
+ ptrWordOut[2] = ptrWordIn[2] ^ state[2];
+ ptrWordOut[3] = ptrWordIn[3] ^ state[3];
+ ptrWordOut[4] = ptrWordIn[4] ^ state[4];
+ ptrWordOut[5] = ptrWordIn[5] ^ state[5];
+ ptrWordOut[6] = ptrWordIn[6] ^ state[6];
+ ptrWordOut[7] = ptrWordIn[7] ^ state[7];
+ ptrWordOut[8] = ptrWordIn[8] ^ state[8];
+ ptrWordOut[9] = ptrWordIn[9] ^ state[9];
+ ptrWordOut[10] = ptrWordIn[10] ^ state[10];
+ ptrWordOut[11] = ptrWordIn[11] ^ state[11];
+
+ //M[row*][col] = M[row*][col] XOR rotW(rand)
+ ptrWordInOut[0] ^= state[11];
+ ptrWordInOut[1] ^= state[0];
+ ptrWordInOut[2] ^= state[1];
+ ptrWordInOut[3] ^= state[2];
+ ptrWordInOut[4] ^= state[3];
+ ptrWordInOut[5] ^= state[4];
+ ptrWordInOut[6] ^= state[5];
+ ptrWordInOut[7] ^= state[6];
+ ptrWordInOut[8] ^= state[7];
+ ptrWordInOut[9] ^= state[8];
+ ptrWordInOut[10] ^= state[9];
+ ptrWordInOut[11] ^= state[10];
+
+ //Inputs: next column (i.e., next block in sequence)
+ ptrWordInOut += BLOCK_LEN_INT64;
+ ptrWordIn += BLOCK_LEN_INT64;
+ //Output: goes to previous column
+ ptrWordOut -= BLOCK_LEN_INT64;
+ }
+}
+
+/**
+ * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
+ * the wordwise addition of two columns, ignoring carries between words). The
+ * output of this operation, "rand", is then used to make
+ * "M[rowOut][col] = M[rowOut][col] XOR rand" and
+ * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
+ * rotation to the left.
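+ * In terms of the 12-word block this is a rotation by one word,
+ * rotW(rand)[i] = rand[(i + 11) % 12], which is why ptrWordInOut[0] below is
+ * XORed with state[11], ptrWordInOut[1] with state[0], and so on.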
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand" + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
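+ * (Legacy variant, retained commented out below for reference; the active
+ * implementation is reducedDuplexRowSetup above.)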
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = rand + ptrWordOut[0] = state[0]; + ptrWordOut[1] = state[1]; + ptrWordOut[2] = state[2]; + ptrWordOut[3] = state[3]; + ptrWordOut[4] = state[4]; + ptrWordOut[5] = state[5]; + ptrWordOut[6] = state[6]; + ptrWordOut[7] = state[7]; + ptrWordOut[8] = state[8]; + ptrWordOut[9] = state[9]; + ptrWordOut[10] = state[10]; + ptrWordOut[11] = state[11]; + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; + ptrWordInOut[11] ^= state[9]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += BLOCK_LEN_INT64; + } +} +*/ + +/** + * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
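+ * Note that, despite the "XOR" in this description, the variant below absorbs
+ * the wordwise sum M[rowInOut] + M[rowIn], just like the active implementation.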
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; + ptrWordInOut[11] ^= state[9]; + + + //M[row][col] = rand + ptrWordOut[0] = state[0] ^ ptrWordIn[0]; + ptrWordOut[1] = state[1] ^ ptrWordIn[1]; + ptrWordOut[2] = state[2] ^ ptrWordIn[2]; + ptrWordOut[3] = state[3] ^ ptrWordIn[3]; + ptrWordOut[4] = state[4] ^ ptrWordIn[4]; + ptrWordOut[5] = state[5] ^ ptrWordIn[5]; + ptrWordOut[6] = state[6] ^ ptrWordIn[6]; + ptrWordOut[7] = state[7] ^ ptrWordIn[7]; + ptrWordOut[8] = state[8] ^ ptrWordIn[8]; + ptrWordOut[9] = state[9] ^ ptrWordIn[9]; + ptrWordOut[10] = state[10] ^ ptrWordIn[10]; + ptrWordOut[11] = state[11] ^ ptrWordIn[11]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += BLOCK_LEN_INT64; + } +} +*/ + +/** + * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
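+ * Unlike the other variants, this one fills the output row in two interleaved
+ * passes: even-indexed columns first, then odd-indexed ones.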
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut; + int i; + + for (i = 0; i < N_COLS / 2; i++) { + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; + ptrWordInOut[11] ^= state[9]; + + + //M[row][col] = rand + ptrWordOut[0] = state[0] ^ ptrWordIn[0]; + ptrWordOut[1] = state[1] ^ ptrWordIn[1]; + ptrWordOut[2] = state[2] ^ ptrWordIn[2]; + ptrWordOut[3] = state[3] ^ ptrWordIn[3]; + ptrWordOut[4] = state[4] ^ ptrWordIn[4]; + ptrWordOut[5] = state[5] ^ ptrWordIn[5]; + ptrWordOut[6] = state[6] ^ ptrWordIn[6]; + ptrWordOut[7] = state[7] ^ ptrWordIn[7]; + ptrWordOut[8] = state[8] ^ ptrWordIn[8]; + ptrWordOut[9] = state[9] ^ ptrWordIn[9]; + ptrWordOut[10] = state[10] ^ ptrWordIn[10]; + ptrWordOut[11] = state[11] ^ ptrWordIn[11]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += 2 * BLOCK_LEN_INT64; + } + + ptrWordOut = rowOut + BLOCK_LEN_INT64; + for (i = 0; i < N_COLS / 2; i++) { + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[10]; + ptrWordInOut[1] ^= state[11]; + ptrWordInOut[2] ^= state[0]; + ptrWordInOut[3] ^= state[1]; + ptrWordInOut[4] ^= state[2]; + ptrWordInOut[5] ^= state[3]; + ptrWordInOut[6] ^= state[4]; + ptrWordInOut[7] ^= state[5]; + ptrWordInOut[8] ^= state[6]; + ptrWordInOut[9] ^= state[7]; + ptrWordInOut[10] ^= state[8]; 
+ ptrWordInOut[11] ^= state[9]; + + + //M[row][col] = rand + ptrWordOut[0] = state[0] ^ ptrWordIn[0]; + ptrWordOut[1] = state[1] ^ ptrWordIn[1]; + ptrWordOut[2] = state[2] ^ ptrWordIn[2]; + ptrWordOut[3] = state[3] ^ ptrWordIn[3]; + ptrWordOut[4] = state[4] ^ ptrWordIn[4]; + ptrWordOut[5] = state[5] ^ ptrWordIn[5]; + ptrWordOut[6] = state[6] ^ ptrWordIn[6]; + ptrWordOut[7] = state[7] ^ ptrWordIn[7]; + ptrWordOut[8] = state[8] ^ ptrWordIn[8]; + ptrWordOut[9] = state[9] ^ ptrWordIn[9]; + ptrWordOut[10] = state[10] ^ ptrWordIn[10]; + ptrWordOut[11] = state[11] ^ ptrWordIn[11]; + + //Goes to next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + ptrWordOut += 2 * BLOCK_LEN_INT64; + } +} +*/ + +/** + * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand" + * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", + * where rotW is a 64-bit rotation to the left. + * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +/* +inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[rowInOut] XOR M[rowIn]" + state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; + state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; + state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; + state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; + state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; + state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; + state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; + state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; + state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; + state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; + state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; + state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} +*/ + +/** + Prints an array of unsigned chars + */ +void printArray(unsigned char *array, unsigned int size, char *name) { + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/lyra2/Sponge.h b/lyra2/Sponge.h new file mode 100644 index 0000000..9bd8ed6 --- /dev/null +++ b/lyra2/Sponge.h @@ -0,0 +1,108 @@ +/** + * Header file for Blake2b's internal permutation in the form of a sponge. 
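+ * It declares the squeeze, absorb and reduced-duplex operations implemented in Sponge.c.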
+ * This code is based on the original Blake2b's implementation provided by
+ * Samuel Neves (https://blake2.net/)
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPONGE_H_
+#define SPONGE_H_
+
+#include <stdint.h>
+
+#if defined(__GNUC__)
+#define ALIGN __attribute__ ((aligned(32)))
+#elif defined(_MSC_VER)
+#define ALIGN __declspec(align(32))
+#else
+#define ALIGN
+#endif
+
+
+/*Blake2b IV Array*/
+static const uint64_t blake2b_IV[8] =
+{
+ 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+ 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+ 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+ 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+/*Blake2b's rotation*/
+static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+ return ( w >> c ) | ( w << ( 64 - c ) );
+}
+
+/*Blake2b's G function*/
+#define G(r,i,a,b,c,d) \
+ do { \
+ a = a + b; \
+ d = rotr64(d ^ a, 32); \
+ c = c + d; \
+ b = rotr64(b ^ c, 24); \
+ a = a + b; \
+ d = rotr64(d ^ a, 16); \
+ c = c + d; \
+ b = rotr64(b ^ c, 63); \
+ } while(0)
+
+
+/*One Round of the Blake2b's compression function*/
+#define ROUND_LYRA(r) \
+ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+ G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+ G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+ G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+ G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+ G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+
+
+//---- Housekeeping
+void initState(uint64_t state[/*16*/]);
+
+//---- Squeezes
+void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
+void reducedSqueezeRow0(uint64_t* state, uint64_t* row);
+
+//---- Absorbs
+void absorbBlock(uint64_t *state, const uint64_t *in);
+void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
+
+//---- Duplexes
+void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
+void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+
+//---- Misc
+void printArray(unsigned char *array, unsigned int size, char *name);
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////TESTS////
+//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
+//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
+//void
reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +///////////// + + +#endif /* SPONGE_H_ */ diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu new file mode 100644 index 0000000..3508d34 --- /dev/null +++ b/lyra2/cuda_lyra2.cu @@ -0,0 +1,536 @@ +#include + +#include "cuda_helper.h" + +static __constant__ uint2 blake2b_IV[8] = { + { 0xf3bcc908, 0x6a09e667 }, + { 0x84caa73b, 0xbb67ae85 }, + { 0xfe94f82b, 0x3c6ef372 }, + { 0x5f1d36f1, 0xa54ff53a }, + { 0xade682d1, 0x510e527f }, + { 0x2b3e6c1f, 0x9b05688c }, + { 0xfb41bd6b, 0x1f83d9ab }, + { 0x137e2179, 0x5be0cd19 } +}; +// data: 0-4 outputhash 4-8 outputhash 8-16 basil + +#define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ + round_lyra_v35(state); \ + for (int j = 0; j < 12; j++) \ + Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ + Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ + } \ + } + +#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ + round_lyra_v35(state); \ + for (int j = 0; j < 12; j++) \ + Matrix[j + 12 * i][rowOut] ^= state[j]; \ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ + Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ + } \ + } + +#define absorbblock(in) { \ + state[0] ^= Matrix[0][in]; \ + state[1] ^= Matrix[1][in]; \ + state[2] ^= Matrix[2][in]; \ + state[3] ^= Matrix[3][in]; \ + state[4] ^= Matrix[4][in]; \ + state[5] ^= Matrix[5][in]; \ + state[6] ^= Matrix[6][in]; \ + state[7] ^= Matrix[7][in]; \ + state[8] ^= Matrix[8][in]; \ + state[9] ^= Matrix[9][in]; \ + state[10] ^= Matrix[10][in]; \ + state[11] ^= Matrix[11][in]; \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + } + +//// test version +#define reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ + round_lyra_v35(state); \ + for (int j = 0; j < 12; 
j++) \ + Matrix[j][7-i][rowOut] = Matrix[j][i][rowIn] ^ state[j]; \ + Matrix[0][i][rowInOut] ^= state[11]; \ + Matrix[1][i][rowInOut] ^= state[0]; \ + Matrix[2][i][rowInOut] ^= state[1]; \ + Matrix[3][i][rowInOut] ^= state[2]; \ + Matrix[4][i][rowInOut] ^= state[3]; \ + Matrix[5][i][rowInOut] ^= state[4]; \ + Matrix[6][i][rowInOut] ^= state[5]; \ + Matrix[7][i][rowInOut] ^= state[6]; \ + Matrix[8][i][rowInOut] ^= state[7]; \ + Matrix[9][i][rowInOut] ^= state[8]; \ + Matrix[10][i][rowInOut] ^= state[9]; \ + Matrix[11][i][rowInOut] ^= state[10]; \ + } \ + } + +#define reduceDuplexRow_test(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ + round_lyra_v35(state); \ + for (int j = 0; j < 12; j++) \ + Matrix[j][i][rowOut] ^= state[j]; \ + Matrix[0][i][rowInOut] ^= state[11]; \ + Matrix[1][i][rowInOut] ^= state[0]; \ + Matrix[2][i][rowInOut] ^= state[1]; \ + Matrix[3][i][rowInOut] ^= state[2]; \ + Matrix[4][i][rowInOut] ^= state[3]; \ + Matrix[5][i][rowInOut] ^= state[4]; \ + Matrix[6][i][rowInOut] ^= state[5]; \ + Matrix[7][i][rowInOut] ^= state[6]; \ + Matrix[8][i][rowInOut] ^= state[7]; \ + Matrix[9][i][rowInOut] ^= state[8]; \ + Matrix[10][i][rowInOut] ^= state[9]; \ + Matrix[11][i][rowInOut] ^= state[10]; \ + } \ + } + +#define absorbblock_test(in) { \ + state[0] ^= Matrix[0][0][ in]; \ + state[1] ^= Matrix[1][0][in]; \ + state[2] ^= Matrix[2][0][in]; \ + state[3] ^= Matrix[3][0][in]; \ + state[4] ^= Matrix[4][0][in]; \ + state[5] ^= Matrix[5][0][in]; \ + state[6] ^= Matrix[6][0][in]; \ + state[7] ^= Matrix[7][0][in]; \ + state[8] ^= Matrix[8][0][in]; \ + state[9] ^= Matrix[9][0][in]; \ + state[10] ^= Matrix[10][0][in]; \ + state[11] ^= Matrix[11][0][in]; \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + round_lyra_v35(state); \ + } + +//// compute 30 version +#define reduceDuplexRowSetup_v30(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ + round_lyra_v30(state); \ + for (int j = 0; j < 12; j++) \ + Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ + Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ + } \ + } + +#define reduceDuplexRow_v30(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ + round_lyra_v30(state); \ + for (int j = 0; j < 12; j++) \ + Matrix[j + 12 * i][rowOut] ^= state[j]; \ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 
+ 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ + Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ + } \ + } + +#define absorbblock_v30(in) { \ + state[0] ^= Matrix[0][in]; \ + state[1] ^= Matrix[1][in]; \ + state[2] ^= Matrix[2][in]; \ + state[3] ^= Matrix[3][in]; \ + state[4] ^= Matrix[4][in]; \ + state[5] ^= Matrix[5][in]; \ + state[6] ^= Matrix[6][in]; \ + state[7] ^= Matrix[7][in]; \ + state[8] ^= Matrix[8][in]; \ + state[9] ^= Matrix[9][in]; \ + state[10] ^= Matrix[10][in]; \ + state[11] ^= Matrix[11][in]; \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + round_lyra_v30(state); \ + } + +static __device__ __forceinline__ +void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = ROR2(d, 32); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +} + +static __device__ __forceinline__ +void Gfunc_v30(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +#define round_lyra_v35_new(state) { \ + Gfunc_v35(state[0], state[4], state[8], state[12]); \ + Gfunc_v35(state[1], state[5], state[9], state[13]); \ + Gfunc_v35(state[2], state[6], state[10], state[14]); \ + Gfunc_v35(state[3], state[7], state[11], state[15]); \ + Gfunc_v35(state[0], state[5], state[10], state[15]); \ + Gfunc_v35(state[1], state[6], state[11], state[12]); \ + Gfunc_v35(state[2], state[7], state[8], state[13]); \ + Gfunc_v35(state[3], state[4], state[9], state[14]); \ +} + +static __device__ __forceinline__ void round_lyra_v35(uint2 *s) +{ + Gfunc_v35(s[0], s[4], s[8], s[12]); + Gfunc_v35(s[1], s[5], s[9], s[13]); + Gfunc_v35(s[2], s[6], s[10], s[14]); + Gfunc_v35(s[3], s[7], s[11], s[15]); + Gfunc_v35(s[0], s[5], s[10], s[15]); + Gfunc_v35(s[1], s[6], s[11], s[12]); + Gfunc_v35(s[2], s[7], s[8], s[13]); + Gfunc_v35(s[3], s[4], s[9], s[14]); +} + +static __device__ __forceinline__ void round_lyra_v30(uint64_t *s) +{ + Gfunc_v30(s[0], s[4], s[8], s[12]); + Gfunc_v30(s[1], s[5], s[9], s[13]); + Gfunc_v30(s[2], s[6], s[10], s[14]); + Gfunc_v30(s[3], s[7], s[11], s[15]); + Gfunc_v30(s[0], s[5], s[10], s[15]); + Gfunc_v30(s[1], s[6], s[11], s[12]); + Gfunc_v30(s[2], s[7], s[8], s[13]); + Gfunc_v30(s[3], s[4], s[9], s[14]); +} + +__global__ __launch_bounds__(256, 1) +void lyra2_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash) +{ + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t state[16]; + #pragma unroll + for (int i = 0; i<4; i++) { state[i] = outputHash[threads*i + thread]; } //password + #pragma unroll + for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt + #pragma unroll + for (int i = 0; i<8; i++) { state[i + 8] = devectorize(blake2b_IV[i]); } + + // blake2blyra x2 + #pragma unroll 24 + for (int i = 0; i<24; i++) { round_lyra_v30(state); } //because 12 is not enough + + uint64_t Matrix[96][8]; // not cool + // 
reducedSqueezeRow0 + #pragma unroll 8 + for (int i = 0; i < 8; i++) { + int idx = 84-12*i; + #pragma unroll 12 + for (int j = 0; j<12; j++) { Matrix[j + idx][0] = state[j]; } + round_lyra_v30(state); + } + + // reducedSqueezeRow1 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + int idx0= 12*i; + int idx1= 84-idx0; + #pragma unroll 12 + for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + idx0][0]; } + round_lyra_v30(state); + #pragma unroll 12 + for (int j = 0; j<12; j++) { Matrix[j + idx1][1] = Matrix[j + idx0][0] ^ state[j]; } + } + + reduceDuplexRowSetup_v30(1, 0, 2); + reduceDuplexRowSetup_v30(2, 1, 3); + reduceDuplexRowSetup_v30(3, 0, 4); + reduceDuplexRowSetup_v30(4, 3, 5); + reduceDuplexRowSetup_v30(5, 2, 6); + reduceDuplexRowSetup_v30(6, 1, 7); + + uint64_t rowa; + rowa = state[0] & 7; + reduceDuplexRow_v30(7, rowa, 0); + rowa = state[0] & 7; + reduceDuplexRow_v30(0, rowa, 3); + rowa = state[0] & 7; + reduceDuplexRow_v30(3, rowa, 6); + rowa = state[0] & 7; + reduceDuplexRow_v30(6, rowa, 1); + rowa = state[0] & 7; + reduceDuplexRow_v30(1, rowa, 4); + rowa = state[0] & 7; + reduceDuplexRow_v30(4, rowa, 7); + rowa = state[0] & 7; + reduceDuplexRow_v30(7, rowa, 2); + rowa = state[0] & 7; + reduceDuplexRow_v30(2, rowa, 5); + + absorbblock_v30(rowa); + + #pragma unroll + for (int i = 0; i<4; i++) { + outputHash[threads*i + thread] = state[i]; + } //password + + } //thread +} + +__global__ __launch_bounds__(256, 1) +void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) +{ + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2 state[16]; + #pragma unroll + for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password + #pragma unroll + for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt + #pragma unroll + for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; } + + // blake2blyra x2 + #pragma unroll 24 + for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough + + uint2 Matrix[96][8]; // not cool + + // reducedSqueezeRow0 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; } + round_lyra_v35(state); + } + + // reducedSqueezeRow1 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; } + round_lyra_v35(state); + #pragma unroll 12 + for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; } + } + + reduceDuplexRowSetup(1, 0, 2); + reduceDuplexRowSetup(2, 1, 3); + reduceDuplexRowSetup(3, 0, 4); + reduceDuplexRowSetup(4, 3, 5); + reduceDuplexRowSetup(5, 2, 6); + reduceDuplexRowSetup(6, 1, 7); + + uint32_t rowa; + rowa = state[0].x & 7; + reduceDuplexRow(7, rowa, 0); + rowa = state[0].x & 7; + reduceDuplexRow(0, rowa, 3); + rowa = state[0].x & 7; + reduceDuplexRow(3, rowa, 6); + rowa = state[0].x & 7; + reduceDuplexRow(6, rowa, 1); + rowa = state[0].x & 7; + reduceDuplexRow(1, rowa, 4); + rowa = state[0].x & 7; + reduceDuplexRow(4, rowa, 7); + rowa = state[0].x & 7; + reduceDuplexRow(7, rowa, 2); + rowa = state[0].x & 7; + reduceDuplexRow(2, rowa, 5); + + absorbblock(rowa); + + #pragma unroll + for (int i = 0; i<4; i++) { + outputHash[threads*i + thread] = devectorize(state[i]); + } //password + + } //thread +} + +__global__ +void __launch_bounds__(256, 1) lyra2_gpu_hash_32_test(int threads, uint32_t startNounce, uint64_t *outputHash) +{ + int 
thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint2 state[16];
+ #pragma unroll
+ for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
+ #pragma unroll
+ for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
+ #pragma unroll
+ for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
+
+ // blake2blyra x2
+ #pragma unroll 24
+ for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
+
+ uint2 Matrix[12][8][8]; // not cool
+
+ // reducedSqueezeRow0
+ #pragma unroll 8
+ for (int i = 0; i < 8; i++) {
+ #pragma unroll 12
+ for (int j = 0; j<12; j++) { Matrix[j][7-i][0] = state[j]; }
+ round_lyra_v35(state);
+ }
+
+ // reducedSqueezeRow1
+ #pragma unroll 8
+ for (int i = 0; i < 8; i++)
+ {
+ #pragma unroll 12
+ for (int j = 0; j<12; j++) { state[j] ^= Matrix[j][i][0]; }
+ round_lyra_v35(state);
+ #pragma unroll 12
+ for (int j = 0; j<12; j++) { Matrix[j][7-i][1] = Matrix[j][i][0] ^ state[j]; }
+ }
+
+ reduceDuplexRowSetup_test(1, 0, 2);
+ reduceDuplexRowSetup_test(2, 1, 3);
+ reduceDuplexRowSetup_test(3, 0, 4);
+ reduceDuplexRowSetup_test(4, 3, 5);
+ reduceDuplexRowSetup_test(5, 2, 6);
+ reduceDuplexRowSetup_test(6, 1, 7);
+
+ uint64_t rowa;
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(7, rowa, 0);
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(0, rowa, 3);
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(3, rowa, 6);
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(6, rowa, 1);
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(1, rowa, 4);
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(4, rowa, 7);
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(7, rowa, 2);
+ rowa = devectorize(state[0]) & 7;
+ reduceDuplexRow_test(2, rowa, 5);
+
+ absorbblock_test(rowa);
+
+ #pragma unroll
+ for (int i = 0; i<4; i++) {
+ outputHash[threads*i + thread] = devectorize(state[i]);
+ } //password
+
+ } //thread
+}
+
+__host__
+void lyra2_cpu_init(int thr_id, int threads)
+{
+ //not used
+}
+
+__host__
+void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+
+ if (device_sm[device_map[thr_id]] >= 350) {
+ lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
+ } else {
+ // kernel for compute30 card
+ lyra2_gpu_hash_32_v30 <<<grid, block>>> (threads, startNounce, d_outputHash);
+ }
+
+ cudaDeviceSynchronize();
+ //MyStreamSynchronize(NULL, order, thr_id);
+}
+
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
new file mode 100644
index 0000000..2368e90
--- /dev/null
+++ b/lyra2/lyra2RE.cu
@@ -0,0 +1,133 @@
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_keccak.h"
+#include "lyra2/Lyra2.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+
+static _ALIGN(64) uint64_t *d_hash[8];
+
+extern void quark_check_cpu_init(int thr_id, int threads);
+extern void quark_check_cpu_setTarget(const void *ptarget);
+extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern uint32_t quark_check_cpu_hash_64_2(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint64_t *d_inputHash, int order);
+
+extern void blake256_cpu_init(int thr_id, int threads);
+extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
+extern void blake256_cpu_setBlock_80(uint32_t *pdata);
+extern void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+extern void keccak256_cpu_init(int thr_id, int threads);
+extern void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+extern void skein256_cpu_init(int thr_id, int threads);
+
+extern void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+extern void lyra2_cpu_init(int thr_id, int threads);
+
+extern void groestl256_setTarget(const void *ptarget);
+extern uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
+extern void groestl256_cpu_init(int thr_id, int threads);
+
+extern "C" void lyra_hash(void *state, const void *input)
+{
+ sph_blake256_context ctx_blake;
+ sph_keccak256_context ctx_keccak;
+ sph_skein256_context ctx_skein;
+ sph_groestl256_context ctx_groestl;
+
+ uint32_t hashA[8], hashB[8], hash[8];
+
+ sph_blake256_init(&ctx_blake);
+ sph_blake256(&ctx_blake, input, 80);
+ sph_blake256_close(&ctx_blake, hashA);
+
+ sph_keccak256_init(&ctx_keccak);
+ sph_keccak256(&ctx_keccak, hashA, 32);
+ sph_keccak256_close(&ctx_keccak, hashB);
+
+ LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+
+ sph_skein256_init(&ctx_skein);
+ sph_skein256(&ctx_skein, hashA, 32);
+ sph_skein256_close(&ctx_skein, hashB);
+
+ sph_groestl256_init(&ctx_groestl);
+ sph_groestl256(&ctx_groestl, hashB, 32);
+ sph_groestl256_close(&ctx_groestl, hash);
+
+ // groestl256 is the last step of the chain, so "hash" holds the final digest
+ memcpy(state, hash, 32);
+}
+
+static bool init[8] = { 0 };
+
+extern "C" int scanhash_lyra(int thr_id, uint32_t *pdata,
+ const uint32_t *ptarget, uint32_t max_nonce,
+ unsigned long *hashes_done)
+{
+ const uint32_t first_nonce = pdata[19];
+ int intensity = (device_sm[device_map[thr_id]] >= 500) ? 19 : 18;
+ int throughput = opt_work_size ?
opt_work_size : (1 << intensity); // 18=256*256*4; + throughput = min(throughput, (int)(max_nonce - first_nonce)); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + blake256_cpu_init(thr_id, throughput); + keccak256_cpu_init(thr_id,throughput); + skein256_cpu_init(thr_id, throughput); + groestl256_cpu_init(thr_id, throughput); + lyra2_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + blake256_cpu_setBlock_80(pdata); + groestl256_setTarget(ptarget); + + do { + int order = 0; + uint32_t foundNonce; + + blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + // const uint32_t Htarg = ptarget[6]; + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + lyra_hash(vhash64, endiandata); + +// if (vhash64[7]<=Htarg) { // && fulltest(vhash64, ptarget)) { + *hashes_done = pdata[19] - first_nonce + throughput; + pdata[19] = foundNonce; + return 1; +// } else { +// applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); +// } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/miner.h b/miner.h index 90b9d52..59e3423 100644 --- a/miner.h +++ b/miner.h @@ -328,6 +328,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_lyra(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + extern int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -645,6 +649,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void keccak256_hash(void *state, const void *input); unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); +void lyra_hash(void *state, const void *input); void myriadhash(void *state, const void *input); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); diff --git a/util.cpp b/util.cpp index 3bc0dab..319961d 100644 --- a/util.cpp +++ b/util.cpp @@ -1633,18 +1633,22 @@ void print_hash_tests(void) heavycoin_hash(&hash[0], &buf[0], 32); printpfx("heavy", hash); - memset(hash, 0, sizeof hash); - keccak256_hash(&hash[0], &buf[0]); - printpfx("keccak", hash); - memset(hash, 0, sizeof hash); jackpothash(&hash[0], &buf[0]); printpfx("jackpot", hash); + memset(hash, 0, sizeof hash); + keccak256_hash(&hash[0], &buf[0]); + printpfx("keccak", hash); + memset(hash, 0, sizeof hash); doomhash(&hash[0], &buf[0]); printpfx("luffa", hash); - +/* to double check with a lyra2 cpu miner + memset(hash, 0, sizeof hash); + lyra_hash(&hash[0], &buf[0]); + printpfx("lyra2", hash); +*/ memset(hash, 0, sizeof hash); 
myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash);