From 71f90039011ff9594b54a15440176e80e09e59da Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 24 Nov 2014 22:26:27 +0100 Subject: [PATCH] x13: use tsiv hamsi implementation (+70KH) --- Makefile.am | 3 + ccminer.vcxproj | 2 +- x13/cuda_x13_hamsi512.cu | 331 +++++++++++++++++---------------------- 3 files changed, 147 insertions(+), 189 deletions(-) diff --git a/Makefile.am b/Makefile.am index fb530a6..816d835 100644 --- a/Makefile.am +++ b/Makefile.am @@ -93,6 +93,9 @@ x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< +x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $< + x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< diff --git a/ccminer.vcxproj b/ccminer.vcxproj index cf4a2b7..1cf7791 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -447,7 +447,7 @@ - 80 + 72 diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index 0b3a770..1e5de99 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -1,49 +1,21 @@ /* - * Quick and dirty addition of Hamsi-512 for X13 - * - * Built on cbuchner1's implementation, actual hashing code - * heavily based on phm's sgminer - * + * Quick Hamsi-512 for X13 + * by tsiv - 2014 */ -/* - * X13 kernel implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2014 phm - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author phm - */ +#include +#include +#include #include "cuda_helper.h" -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +typedef unsigned char BitSequence; -__device__ __constant__ -static const uint32_t d_alpha_n[] = { +static __constant__ uint32_t d_alpha_n[32]; +static __constant__ uint32_t d_alpha_f[32]; +static __constant__ uint32_t d_T512[64][16]; + +static const uint32_t alpha_n[] = { SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), @@ -57,8 +29,7 @@ static const uint32_t d_alpha_n[] = { SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) }; -__device__ __constant__ -static const uint32_t d_alpha_f[] = { +static const uint32_t alpha_f[] = { SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), @@ -106,73 +77,73 @@ static const uint32_t d_alpha_f[] = { #define hamsi_s1F mF #define SBOX(a, b, c, d) { \ - uint32_t t; \ - t = (a); \ - (a) &= (c); \ - (a) ^= (d); \ - (c) ^= (b); \ - (c) ^= (a); \ - (d) |= t; \ - (d) ^= (b); \ - t ^= (c); \ - (b) = (d); \ - (d) |= t; \ - (d) ^= (a); \ - (a) &= (b); \ - t ^= (a); \ - (b) ^= (d); \ - (b) ^= t; \ - (a) = (c); \ - (c) = (b); \ - (b) = (d); \ - (d) = SPH_T32(~t); \ - } + uint32_t t; \ + t = (a); \ + (a) &= (c); \ + (a) ^= (d); \ + (c) ^= (b); \ + (c) ^= (a); \ + (d) |= t; \ + (d) ^= (b); \ + t ^= (c); \ + (b) = (d); \ + (d) |= t; \ + (d) ^= (a); \ + (a) &= (b); \ + t ^= (a); \ + (b) ^= (d); \ + (b) ^= t; \ + (a) = (c); \ + (c) = (b); \ + (b) = (d); \ + (d) = SPH_T32(~t); \ + } #define HAMSI_L(a, b, c, d) { \ - (a) = ROTL32(a, 13); \ - (c) = ROTL32(c, 3); \ - (b) ^= (a) ^ (c); \ - (d) ^= (c) ^ SPH_T32((a) << 3); \ - (b) = ROTL32(b, 1); \ - (d) = ROTL32(d, 7); \ - (a) ^= (b) ^ (d); \ - (c) ^= (d) ^ SPH_T32((b) << 7); \ - (a) = ROTL32(a, 5); \ - (c) = ROTL32(c, 22); \ - } + (a) = ROTL32(a, 13); \ + (c) = ROTL32(c, 3); \ + (b) ^= (a) ^ (c); \ + (d) ^= (c) ^ ((a) << 3); \ + (b) = ROTL32(b, 1); \ + (d) = ROTL32(d, 7); \ + (a) ^= (b) ^ (d); \ + (c) ^= (d) ^ ((b) << 7); \ + (a) = ROTL32(a, 5); \ + (c) = ROTL32(c, 22); \ + } #define ROUND_BIG(rc, alpha) { \ hamsi_s00 ^= alpha[0x00]; \ - hamsi_s01 ^= alpha[0x01] ^ (uint32_t)(rc); \ - hamsi_s02 ^= alpha[0x02]; \ - hamsi_s03 ^= alpha[0x03]; \ - hamsi_s04 ^= alpha[0x04]; \ - hamsi_s05 ^= alpha[0x05]; \ - hamsi_s06 ^= alpha[0x06]; \ - hamsi_s07 ^= alpha[0x07]; \ hamsi_s08 ^= alpha[0x08]; \ - hamsi_s09 ^= alpha[0x09]; \ - hamsi_s0A ^= alpha[0x0A]; \ - hamsi_s0B ^= alpha[0x0B]; \ - hamsi_s0C ^= alpha[0x0C]; \ - hamsi_s0D ^= alpha[0x0D]; \ - hamsi_s0E ^= alpha[0x0E]; \ - hamsi_s0F ^= alpha[0x0F]; \ hamsi_s10 ^= alpha[0x10]; \ - hamsi_s11 ^= alpha[0x11]; \ - hamsi_s12 ^= alpha[0x12]; \ - hamsi_s13 ^= alpha[0x13]; \ - hamsi_s14 ^= alpha[0x14]; \ - hamsi_s15 ^= alpha[0x15]; \ - hamsi_s16 ^= alpha[0x16]; \ - hamsi_s17 ^= alpha[0x17]; \ hamsi_s18 ^= alpha[0x18]; \ + hamsi_s01 ^= alpha[0x01] ^ (uint32_t)(rc); \ + hamsi_s09 ^= alpha[0x09]; \ + hamsi_s11 ^= alpha[0x11]; \ hamsi_s19 ^= alpha[0x19]; \ + hamsi_s02 ^= alpha[0x02]; \ + hamsi_s0A ^= alpha[0x0A]; \ + hamsi_s12 ^= alpha[0x12]; \ hamsi_s1A ^= alpha[0x1A]; \ + hamsi_s03 ^= alpha[0x03]; \ + hamsi_s0B ^= alpha[0x0B]; \ + hamsi_s13 ^= alpha[0x13]; \ hamsi_s1B ^= alpha[0x1B]; \ + hamsi_s04 ^= alpha[0x04]; \ + hamsi_s0C ^= alpha[0x0C]; \ + hamsi_s14 ^= alpha[0x14]; \ hamsi_s1C ^= alpha[0x1C]; \ + hamsi_s05 ^= alpha[0x05]; \ + hamsi_s0D ^= alpha[0x0D]; \ + hamsi_s15 ^= alpha[0x15]; \ hamsi_s1D ^= alpha[0x1D]; \ + hamsi_s06 ^= alpha[0x06]; \ + hamsi_s0E ^= alpha[0x0E]; \ + hamsi_s16 ^= alpha[0x16]; \ hamsi_s1E ^= alpha[0x1E]; \ + hamsi_s07 ^= alpha[0x07]; \ + hamsi_s0F ^= alpha[0x0F]; \ + hamsi_s17 ^= alpha[0x17]; \ hamsi_s1F ^= alpha[0x1F]; \ SBOX(hamsi_s00, hamsi_s08, hamsi_s10, hamsi_s18); \ SBOX(hamsi_s01, hamsi_s09, hamsi_s11, hamsi_s19); \ @@ -198,30 +169,16 @@ static const uint32_t d_alpha_f[] = { #define P_BIG { \ - ROUND_BIG(0, d_alpha_n); \ - ROUND_BIG(1, d_alpha_n); \ - ROUND_BIG(2, d_alpha_n); \ - ROUND_BIG(3, d_alpha_n); \ - ROUND_BIG(4, d_alpha_n); \ - ROUND_BIG(5, d_alpha_n); \ + for( int r = 0; r < 6; r++ ) \ + ROUND_BIG(r, d_alpha_n); \ } -#define PF_BIG { \ - ROUND_BIG(0, d_alpha_f); \ - ROUND_BIG(1, d_alpha_f); \ - ROUND_BIG(2, d_alpha_f); \ - ROUND_BIG(3, d_alpha_f); \ - ROUND_BIG(4, d_alpha_f); \ - ROUND_BIG(5, d_alpha_f); \ - ROUND_BIG(6, d_alpha_f); \ - ROUND_BIG(7, d_alpha_f); \ - ROUND_BIG(8, d_alpha_f); \ - ROUND_BIG(9, d_alpha_f); \ - ROUND_BIG(10, d_alpha_f); \ - ROUND_BIG(11, d_alpha_f); \ +#define PF_BIG { \ + for( int r = 0; r < 12; r++ ) \ + ROUND_BIG(r, d_alpha_f); \ } -#define T_BIG { \ +#define T_BIG { \ /* order is important */ \ cF = (h[0xF] ^= hamsi_s17); \ cE = (h[0xE] ^= hamsi_s16); \ @@ -241,8 +198,8 @@ static const uint32_t d_alpha_f[] = { c0 = (h[0x0] ^= hamsi_s00); \ } -__device__ __constant__ -static const uint32_t d_T512[64][16] = { + +static const uint32_t T512[64][16] = { { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), @@ -629,53 +586,8 @@ static const uint32_t d_T512[64][16] = { SPH_C32(0xe7e00a94) } }; -#define INPUT_BIG { \ - const uint32_t *tp = &d_T512[0][0]; \ - unsigned u, v; \ - m0 = 0; \ - m1 = 0; \ - m2 = 0; \ - m3 = 0; \ - m4 = 0; \ - m5 = 0; \ - m6 = 0; \ - m7 = 0; \ - m8 = 0; \ - m9 = 0; \ - mA = 0; \ - mB = 0; \ - mC = 0; \ - mD = 0; \ - mE = 0; \ - mF = 0; \ - for (u = 0; u < 8; u ++) { \ - unsigned db = buf(u); \ - for (v = 0; v < 8; v ++, db >>= 1) { \ - uint32_t dm = SPH_T32(-(uint32_t)(db & 1)); \ - m0 ^= dm & *tp ++; \ - m1 ^= dm & *tp ++; \ - m2 ^= dm & *tp ++; \ - m3 ^= dm & *tp ++; \ - m4 ^= dm & *tp ++; \ - m5 ^= dm & *tp ++; \ - m6 ^= dm & *tp ++; \ - m7 ^= dm & *tp ++; \ - m8 ^= dm & *tp ++; \ - m9 ^= dm & *tp ++; \ - mA ^= dm & *tp ++; \ - mB ^= dm & *tp ++; \ - mC ^= dm & *tp ++; \ - mD ^= dm & *tp ++; \ - mE ^= dm & *tp ++; \ - mF ^= dm & *tp ++; \ - } \ - } \ - } - - -/***************************************************/ -// Die Hash-Funktion -__global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ +void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -690,28 +602,73 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c); uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48); uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d); - uint32_t m0, m1, m2, m3, m4, m5, m6, m7; - uint32_t m8, m9, mA, mB, mC, mD, mE, mF; + uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + uint32_t *tp, db, dm; -#define buf(u) (h1[i+u]) - #pragma unroll 8 for(int i = 0; i < 64; i += 8) { - INPUT_BIG; - P_BIG; + + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; + m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; + tp = &d_T512[0][0]; + + #pragma unroll 2 + for (int u = 0; u < 8; u ++) { + db = h1[i+u]; + #pragma unroll 2 + for (int v = 0; v < 8; v ++, db >>= 1) { + dm = -(uint32_t)(db & 1); + m0 ^= dm & *(tp+ 0); m1 ^= dm & *(tp+ 1); + m2 ^= dm & *(tp+ 2); m3 ^= dm & *(tp+ 3); + m4 ^= dm & *(tp+ 4); m5 ^= dm & *(tp+ 5); + m6 ^= dm & *(tp+ 6); m7 ^= dm & *(tp+ 7); + m8 ^= dm & *(tp+ 8); m9 ^= dm & *(tp+ 9); + mA ^= dm & *(tp+10); mB ^= dm & *(tp+11); + mC ^= dm & *(tp+12); mD ^= dm & *(tp+13); + mE ^= dm & *(tp+14); mF ^= dm & *(tp+15); + tp += 16; + } + } + + for( int r = 0; r < 6; r += 2 ) { + ROUND_BIG(r, d_alpha_n); + ROUND_BIG(r+1, d_alpha_n); + } T_BIG; } -#undef buf -#define buf(u) (u == 0 ? 0x80 : 0) - INPUT_BIG; - P_BIG; + tp = &d_T512[0][0] + 112; + + m0 = *(tp+ 0); m1 = *(tp+ 1); + m2 = *(tp+ 2); m3 = *(tp+ 3); + m4 = *(tp+ 4); m5 = *(tp+ 5); + m6 = *(tp+ 6); m7 = *(tp+ 7); + m8 = *(tp+ 8); m9 = *(tp+ 9); + mA = *(tp+10); mB = *(tp+11); + mC = *(tp+12); mD = *(tp+13); + mE = *(tp+14); mF = *(tp+15); + + for( int r = 0; r < 6; r += 2 ) { + ROUND_BIG(r, d_alpha_n); + ROUND_BIG(r+1, d_alpha_n); + } T_BIG; -#undef buf -#define buf(u) (u == 6 ? 2 : 0) - INPUT_BIG; - PF_BIG; + tp = &d_T512[0][0] + 784; + + m0 = *(tp+ 0); m1 = *(tp+ 1); + m2 = *(tp+ 2); m3 = *(tp+ 3); + m4 = *(tp+ 4); m5 = *(tp+ 5); + m6 = *(tp+ 6); m7 = *(tp+ 7); + m8 = *(tp+ 8); m9 = *(tp+ 9); + mA = *(tp+10); mB = *(tp+11); + mC = *(tp+12); mD = *(tp+13); + mE = *(tp+14); mF = *(tp+15); + + for( int r = 0; r < 12; r += 2 ) { + ROUND_BIG(r, d_alpha_f); + ROUND_BIG(r+1, d_alpha_f); + } T_BIG; #pragma unroll 16 @@ -720,24 +677,22 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint } } - -__host__ void x13_hamsi512_cpu_init(int thr_id, int threads) +__host__ +void x13_hamsi512_cpu_init(int thr_id, int threads) { + cudaMemcpyToSymbol(d_alpha_n, alpha_n, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_alpha_f, alpha_f, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_T512, T512, sizeof(uint32_t)*64*16, 0, cudaMemcpyHostToDevice)); } -__host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const int threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); -} +} \ No newline at end of file