Browse Source

Add skein algo (Skeincoin, Myriad, Unat...)

SKEIN512 + SHA256

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
master
Tanguy Pruvot 9 years ago
parent
commit
f86784ee56
  1. 6
      Makefile.am
  2. 5
      README.txt
  3. 8
      ccminer.cpp
  4. 3
      ccminer.vcxproj
  5. 5
      ccminer.vcxproj.filters
  6. 5
      miner.h
  7. 4
      myriadgroestl.cpp
  8. 208
      quark/cuda_skein512.cu
  9. 480
      skein.cu
  10. 8
      util.cpp

6
Makefile.am

@ -41,7 +41,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -41,7 +41,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu pentablake.cu zr5.cu \
cuda_nist5.cu pentablake.cu skein.cu zr5.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
@ -117,3 +117,7 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu @@ -117,3 +117,7 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
skein.o: skein.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<

5
README.txt

@ -34,10 +34,11 @@ TalkCoin @@ -34,10 +34,11 @@ TalkCoin
DarkCoin and other X11 coins
Saffroncoin blake (256 14-rounds)
BlakeCoin (256 8-rounds)
Keccak (Maxcoin)
Deep, Doom and Qubit
Keccak (Maxcoin)
Pentablake (Blake 512 x5)
1Coin Triple S
Skein (Skein + SHA)
Vertcoin Lyra2RE
Ziftrcoin (ZR5)
@ -79,6 +80,7 @@ its command line interface and options. @@ -79,6 +80,7 @@ its command line interface and options.
quark use to mine Quarkcoin
qubit use to mine Qubit Algo
s3 use to mine 1coin
skein use to mine Skeincoin
whirl use to mine Whirlcoin
whirlpoolx use to mine Vanillacoin
x11 use to mine DarkCoin
@ -187,6 +189,7 @@ features. @@ -187,6 +189,7 @@ features.
Mar. 27th 2015 v1.6.0
Add the ZR5 Algo for Ziftcoin
Implement Skeincoin algo (skein + sha)
Import pluck (djm34) and whirlpoolx (alexis78) algos
Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs)
Default config file (also help to debug without command line)

8
ccminer.cpp

@ -100,6 +100,7 @@ enum sha_algos { @@ -100,6 +100,7 @@ enum sha_algos {
ALGO_PLUCK,
ALGO_QUARK,
ALGO_QUBIT,
ALGO_SKEIN,
ALGO_S3,
ALGO_WHIRLCOIN,
ALGO_WHIRLPOOLX,
@ -133,6 +134,7 @@ static const char *algo_names[] = { @@ -133,6 +134,7 @@ static const char *algo_names[] = {
"pluck",
"quark",
"qubit",
"skein",
"s3",
"whirl",
"whirlpoolx",
@ -249,6 +251,7 @@ Options:\n\ @@ -249,6 +251,7 @@ Options:\n\
pluck SupCoin\n\
quark Quark\n\
qubit Qubit\n\
skein Skein SHA2 (Skeincoin)\n\
s3 S3 (1Coin)\n\
x11 X11 (DarkCoin)\n\
x13 X13 (MaruCoin)\n\
@ -1512,6 +1515,11 @@ static void *miner_thread(void *userdata) @@ -1512,6 +1515,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
case ALGO_SKEIN:
rc = scanhash_skeincoin(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_S3:
rc = scanhash_s3(thr_id, work.data, work.target,
max_nonce, &hashes_done);

3
ccminer.vcxproj

@ -440,6 +440,9 @@ @@ -440,6 +440,9 @@
</CudaCompile>
<CudaCompile Include="lyra2\lyra2RE.cu" />
<CudaCompile Include="lyra2\cuda_lyra2.cu" />
<CudaCompile Include="skein.cu">
<MaxRegCount>64</MaxRegCount>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_aes.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>

5
ccminer.vcxproj.filters

@ -574,6 +574,9 @@ @@ -574,6 +574,9 @@
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA\quark</Filter>
</CudaCompile>
<CudaCompile Include="skein.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<Image Include="res\ccminer.ico">
@ -590,4 +593,4 @@ @@ -590,4 +593,4 @@
<Filter>Ressources</Filter>
</Text>
</ItemGroup>
</Project>
</Project>

5
miner.h

@ -346,6 +346,10 @@ extern int scanhash_scrypt(int thr_id, uint32_t *pdata, @@ -346,6 +346,10 @@ extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skeincoin(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_s3(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@ -675,6 +679,7 @@ void pentablakehash(void *output, const void *input); @@ -675,6 +679,7 @@ void pentablakehash(void *output, const void *input);
void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input);
void skeincoinhash(void *output, const void *input);
void s3hash(void *output, const void *input);
void wcoinhash(void *state, const void *input);
void whirlxHash(void *state, const void *input);

4
myriadgroestl.cpp

@ -65,12 +65,12 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar @@ -65,12 +65,12 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar
do {
// GPU
uint32_t foundNounce = 0xFFFFFFFF;
uint32_t foundNounce = UINT32_MAX;
const uint32_t Htarg = ptarget[7];
myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
if(foundNounce < 0xffffffff)
if (foundNounce < UINT32_MAX)
{
uint32_t tmpHash[8];
endiandata[19] = SWAP32(foundNounce);

208
quark/cuda_skein512.cu

@ -4,6 +4,8 @@ @@ -4,6 +4,8 @@
#include "cuda_helper.h"
static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
// Take a look at: https://www.schneier.com/skein1.3.pdf
#define SHL(x, n) ((x) << (n))
@ -375,9 +377,9 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t @@ -375,9 +377,9 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
h6 = vectorize(0x991112C71A75B523ull);
h7 = vectorize(0xAE18A40B660FCC33ull);
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
#pragma unroll 8
for(int i=0; i<8; i++)
for (int i = 0; i < 8; i++)
p[i] = vectorize(inpHash[i]);
t0 = vectorize(64); // ptr
@ -475,7 +477,7 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint @@ -475,7 +477,7 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
h6 = 0x991112C71A75B523ull;
h7 = 0xAE18A40B660FCC33ull;
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
#pragma unroll 8
for(int i=0; i<8; i++)
p[i] = inpHash[i];
@ -540,7 +542,7 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint @@ -540,7 +542,7 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
TFBIG_4o(17);
TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
// fertig
// output
uint64_t *outpHash = &g_hash[8 * hashPosition];
#pragma unroll 8
@ -549,6 +551,176 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint @@ -549,6 +551,176 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
}
}
__global__ __launch_bounds__(128,6)
void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2 t0 = vectorize(8); // extra
uint2 t1 = vectorize(0xFF00000000000000ull); // etype
uint2 t2 = vectorize(0xB000000000000050ull);
uint64_t *state = &g_hash[8 * thread];
uint2 h0 = vectorize(state[0]);
uint2 h1 = vectorize(state[1]);
uint2 h2 = vectorize(state[2]);
uint2 h3 = vectorize(state[3]);
uint2 h4 = vectorize(state[4]);
uint2 h5 = vectorize(state[5]);
uint2 h6 = vectorize(state[6]);
uint2 h7 = vectorize(state[7]);
uint2 h8;
TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
uint2 p[8] = { 0 };
//#pragma unroll 8
//for (int i = 0; i<8; i++)
// p[i] = make_uint2(0, 0);
TFBIG_4e_UI2(0);
TFBIG_4o_UI2(1);
TFBIG_4e_UI2(2);
TFBIG_4o_UI2(3);
TFBIG_4e_UI2(4);
TFBIG_4o_UI2(5);
TFBIG_4e_UI2(6);
TFBIG_4o_UI2(7);
TFBIG_4e_UI2(8);
TFBIG_4o_UI2(9);
TFBIG_4e_UI2(10);
TFBIG_4o_UI2(11);
TFBIG_4e_UI2(12);
TFBIG_4o_UI2(13);
TFBIG_4e_UI2(14);
TFBIG_4o_UI2(15);
TFBIG_4e_UI2(16);
TFBIG_4o_UI2(17);
TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
uint64_t *outpHash = state;
#pragma unroll 8
for (int i = 0; i < 8; i++)
outpHash[i] = devectorize(p[i]);
}
}
__global__ __launch_bounds__(128,5)
void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64, int swap)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// Skein
uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8;
uint2 t0, t1, t2;
// Init
h0 = vectorize(0x4903ADFF749C51CEull);
h1 = vectorize(0x0D95DE399746DF03ull);
h2 = vectorize(0x8FD1934127C79BCEull);
h3 = vectorize(0x9A255629FF352CB1ull);
h4 = vectorize(0x5DB62599DF6CA7B0ull);
h5 = vectorize(0xEABE394CA9D5C3F4ull);
h6 = vectorize(0x991112C71A75B523ull);
h7 = vectorize(0xAE18A40B660FCC33ull);
// 1st step -> etype = 0xE0, ptr = 64, bcount = 0, extra = 0
t0 = vectorize(64); // ptr
//t1 = vectorize(0xE0ull << 55); // etype
t1 = vectorize(0x7000000000000000ull);
TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
uint2 p[8];
#pragma unroll 8
for (int i = 0; i<8; i++)
p[i] = vectorize(c_PaddedMessage80[i]);
TFBIG_4e_UI2(0);
TFBIG_4o_UI2(1);
TFBIG_4e_UI2(2);
TFBIG_4o_UI2(3);
TFBIG_4e_UI2(4);
TFBIG_4o_UI2(5);
TFBIG_4e_UI2(6);
TFBIG_4o_UI2(7);
TFBIG_4e_UI2(8);
TFBIG_4o_UI2(9);
TFBIG_4e_UI2(10);
TFBIG_4o_UI2(11);
TFBIG_4e_UI2(12);
TFBIG_4o_UI2(13);
TFBIG_4e_UI2(14);
TFBIG_4o_UI2(15);
TFBIG_4e_UI2(16);
TFBIG_4o_UI2(17);
TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
h0 = vectorize(c_PaddedMessage80[0]) ^ p[0];
h1 = vectorize(c_PaddedMessage80[1]) ^ p[1];
h2 = vectorize(c_PaddedMessage80[2]) ^ p[2];
h3 = vectorize(c_PaddedMessage80[3]) ^ p[3];
h4 = vectorize(c_PaddedMessage80[4]) ^ p[4];
h5 = vectorize(c_PaddedMessage80[5]) ^ p[5];
h6 = vectorize(c_PaddedMessage80[6]) ^ p[6];
h7 = vectorize(c_PaddedMessage80[7]) ^ p[7];
uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread;
uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage80[9]), nonce);
// skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16
p[0] = vectorize(c_PaddedMessage80[8]);
p[1] = nounce2;
#pragma unroll
for (int i = 2; i < 8; i++)
p[i] = make_uint2(0,0);
t0 = vectorize(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra);
t1 = vectorize(0xB000000000000000ull); // (bcount >> 58) + ((sph_u64)(etype) << 55);
TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
TFBIG_4e_UI2(0);
TFBIG_4o_UI2(1);
TFBIG_4e_UI2(2);
TFBIG_4o_UI2(3);
TFBIG_4e_UI2(4);
TFBIG_4o_UI2(5);
TFBIG_4e_UI2(6);
TFBIG_4o_UI2(7);
TFBIG_4e_UI2(8);
TFBIG_4o_UI2(9);
TFBIG_4e_UI2(10);
TFBIG_4o_UI2(11);
TFBIG_4e_UI2(12);
TFBIG_4o_UI2(13);
TFBIG_4e_UI2(14);
TFBIG_4o_UI2(15);
TFBIG_4e_UI2(16);
TFBIG_4o_UI2(17);
TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
h0 = vectorize(c_PaddedMessage80[8]) ^ p[0];
h1 = nounce2 ^ p[1];
h2 = p[2];
h3 = p[3];
h4 = p[4];
h5 = p[5];
h6 = p[6];
h7 = p[7];
// skein_big_close 2nd loop -> etype = 0x1fe, ptr = 8, bcount = 0
uint64_t *outpHash = &output64[thread * 8];
outpHash[0] = devectorize(h0);
outpHash[1] = devectorize(h1);
outpHash[2] = devectorize(h2);
outpHash[3] = devectorize(h3);
outpHash[4] = devectorize(h4);
outpHash[5] = devectorize(h5);
outpHash[6] = devectorize(h6);
outpHash[7] = devectorize(h7);
}
}
__host__
void quark_skein512_cpu_init(int thr_id, uint32_t threads)
{
@ -559,7 +731,6 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun @@ -559,7 +731,6 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
{
const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
@ -569,6 +740,31 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun @@ -569,6 +740,31 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
else
quark_skein512_gpu_hash_64_v30 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id);
}
/* skeincoin */
__host__
void skein512_cpu_setBlock_80(void *pdata)
{
uint32_t PaddedMessage[32] = { 0 };
memcpy(&PaddedMessage[0], pdata, 80);
CUDA_SAFE_CALL(
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)
);
}
__host__
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// hash function is cut in 2 parts
skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, swap);
skein512_gpu_hash_close <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash);
}

480
skein.cu

@ -0,0 +1,480 @@ @@ -0,0 +1,480 @@
/**
* SKEIN512 80 + SHA256 64
* by tpruvot@github - 2015
*/
extern "C" {
#include "sph/sph_skein.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include <openssl/sha.h>
static uint32_t *d_hash[MAX_GPUS];
extern void skein512_cpu_setBlock_80(void *pdata);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
static __device__ __constant__ uint32_t sha256_hashTable[] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
static __device__ __constant__ uint32_t sha256_constantTable[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static __device__ __constant__ uint32_t sha256_endingTable[] = {
0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484
};
/* Elementary functions used by SHA256 */
#define SWAB32(x) cuda_swab32(x)
//#define ROTR32(x,n) SPH_ROTR32(x,n)
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
#define ADVANCED_SHA2
#ifndef ADVANCED_SHA2
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
do { \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1; \
} while (0)
/* Adjusted round function for rotating state */
#define RNDr(S, W, i) \
RND(S[(64 - i) & 7], S[(65 - i) & 7], \
S[(66 - i) & 7], S[(67 - i) & 7], \
S[(68 - i) & 7], S[(69 - i) & 7], \
S[(70 - i) & 7], S[(71 - i) & 7], \
W[i] + sha256_constantTable[i])
static __constant__ uint32_t sha256_ending[16] = {
0x80000000UL, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x200UL
};
__device__
void sha256_transform_gpu(uint32_t *state, uint32_t *message)
{
uint32_t S[8];
uint32_t W[64];
uint32_t t0, t1;
/* Initialize work variables. */
for (int i = 0; i < 8; i++) {
S[i] = state[i];
}
for (int i = 0; i < 16; i++) {
W[i] = message[i];
}
for (int i = 16; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
/* 3. Mix. */
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
RNDr(S, W, 6);
RNDr(S, W, 7);
RNDr(S, W, 8);
RNDr(S, W, 9);
RNDr(S, W, 10);
RNDr(S, W, 11);
RNDr(S, W, 12);
RNDr(S, W, 13);
RNDr(S, W, 14);
RNDr(S, W, 15);
RNDr(S, W, 16);
RNDr(S, W, 17);
RNDr(S, W, 18);
RNDr(S, W, 19);
RNDr(S, W, 20);
RNDr(S, W, 21);
RNDr(S, W, 22);
RNDr(S, W, 23);
RNDr(S, W, 24);
RNDr(S, W, 25);
RNDr(S, W, 26);
RNDr(S, W, 27);
RNDr(S, W, 28);
RNDr(S, W, 29);
RNDr(S, W, 30);
RNDr(S, W, 31);
RNDr(S, W, 32);
RNDr(S, W, 33);
RNDr(S, W, 34);
RNDr(S, W, 35);
RNDr(S, W, 36);
RNDr(S, W, 37);
RNDr(S, W, 38);
RNDr(S, W, 39);
RNDr(S, W, 40);
RNDr(S, W, 41);
RNDr(S, W, 42);
RNDr(S, W, 43);
RNDr(S, W, 44);
RNDr(S, W, 45);
RNDr(S, W, 46);
RNDr(S, W, 47);
RNDr(S, W, 48);
RNDr(S, W, 49);
RNDr(S, W, 50);
RNDr(S, W, 51);
RNDr(S, W, 52);
RNDr(S, W, 53);
RNDr(S, W, 54);
RNDr(S, W, 55);
RNDr(S, W, 56);
RNDr(S, W, 57);
RNDr(S, W, 58);
RNDr(S, W, 59);
RNDr(S, W, 60);
RNDr(S, W, 61);
RNDr(S, W, 62);
RNDr(S, W, 63);
for (int i = 0; i < 8; i++)
state[i] += S[i];
}
#endif
#ifdef ADVANCED_SHA2
__device__
void skeincoin_gpu_sha256(uint32_t *message)
{
uint32_t W1[16];
uint32_t W2[16];
uint32_t regs[8];
uint32_t hash[8];
// Init with Hash-Table
#pragma unroll 8
for (int k=0; k < 8; k++) {
hash[k] = regs[k] = sha256_hashTable[k];
}
#pragma unroll 16
for (int k = 0; k<16; k++)
W1[k] = SWAB32(message[k]);
// Progress W1
#pragma unroll 16
for (int j = 0; j<16; j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k];
regs[0] = T1 + T2;
regs[4] += T1;
}
// Progress W2...W3
////// PART 1
#pragma unroll 2
for (int j = 0; j<2; j++)
W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
#pragma unroll 5
for (int j = 2; j<7; j++)
W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
#pragma unroll 8
for (int j = 7; j<15; j++)
W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
// Round function
#pragma unroll 16
for (int j = 0; j<16; j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
////// PART 2
#pragma unroll 2
for (int j = 0; j<2; j++)
W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j];
#pragma unroll 5
for (int j = 2; j<7; j++)
W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j];
#pragma unroll 8
for (int j = 7; j<15; j++)
W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j];
W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
// Round function
#pragma unroll 16
for (int j = 0; j<16; j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
////// PART 3
#pragma unroll 2
for (int j = 0; j<2; j++)
W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
#pragma unroll 5
for (int j = 2; j<7; j++)
W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
#pragma unroll 8
for (int j = 7; j<15; j++)
W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
// Round function
#pragma unroll 16
for (int j = 0; j<16; j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
#pragma unroll 8
for (int k = 0; k<8; k++)
hash[k] += regs[k];
#if 1
/////
///// Second Pass (ending)
/////
#pragma unroll 8
for (int k = 0; k<8; k++)
regs[k] = hash[k];
// Progress W1
#pragma unroll 64
for (int j = 0; j<64; j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + sha256_endingTable[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k];
regs[0] = T1 + T2;
regs[4] += T1;
}
#pragma unroll 8
for (int k = 0; k<8; k++)
hash[k] += regs[k];
// Final Hash
#pragma unroll 8
for (int k = 0; k<8; k++)
message[k] = SWAB32(hash[k]);
#else
// sha256_transform only, require an additional sha256_transform_gpu() call
#pragma unroll 8
for (int k = 0; k<8; k++)
message[k] = hash[k];
#endif
}
#endif
__global__
void sha2_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t *hash = &hashBuffer[thread << 4];
#ifdef ADVANCED_SHA2
skeincoin_gpu_sha256(hash);
#else
uint32_t state[16];
uint32_t msg[16];
#pragma unroll
for (int i = 0; i < 8; i++)
state[i] = sha256_hashTable[i];
#pragma unroll
for (int i = 0; i < 16; i++)
msg[i] = SWAB32(hash[i]);
sha256_transform_gpu(state, msg);
sha256_transform_gpu(state, sha256_ending);
#pragma unroll
for (int i = 0; i < 8; i++)
hash[i] = SWAB32(state[i]);
#endif
}
}
__host__
void sha2_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHashes, int order)
{
uint32_t threadsperblock = 128;
dim3 block(threadsperblock);
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
//cudaMemset(d_outputHashes, 0, 64 * threads);
sha2_gpu_hash_64 <<< grid, block >>>(threads, startNounce, d_outputHashes);
MyStreamSynchronize(NULL, 0, thr_id);
}
extern "C" void skeincoinhash(void *output, const void *input)
{
sph_skein512_context ctx_skein;
SHA256_CTX sha256;
uint32_t hash[16];
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, input, 80);
sph_skein512_close(&ctx_skein, hash);
SHA256_Init(&sha256);
SHA256_Update(&sha256, (unsigned char *)hash, 64);
SHA256_Final((unsigned char *)hash, &sha256);
memcpy(output, hash, 32);
}
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
return iftrue ? swab32(val) : val;
}
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
const int swap = 1;
uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
throughput = min(throughput, (max_nonce - first_nonce));
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0FFF;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
skein512_cpu_setBlock_80((void*)endiandata);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
*hashes_done = pdata[19] - first_nonce + throughput;
// Hash with CUDA
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
sha2_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != UINT32_MAX)
{
uint32_t vhash64[8];
endiandata[19] = swab32_if(foundNonce, swap);
skeincoinhash(vhash64, endiandata);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
int res = 1;
uint8_t num = res;
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], num);
while (secNonce != 0 && res < 6)
{
endiandata[19] = swab32_if(secNonce, swap);
skeincoinhash(vhash64, endiandata);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
pdata[19+res] = swab32_if(secNonce, !swap);
res++;
}
num++;
secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], num);
}
if (res > 1 && opt_debug)
applog(LOG_BLUE, "GPU #%d: %d/%d valid nonces !!!", device_map[thr_id], res, (int)num);
pdata[19] = swab32_if(foundNonce, !swap);
return res;
}
else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
pdata[19]++;
}
} else
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

8
util.cpp

@ -1701,9 +1701,12 @@ void do_gpu_tests(void) @@ -1701,9 +1701,12 @@ void do_gpu_tests(void)
scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);
//memset(buf, 0, sizeof buf);
//scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
//scanhash_skeincoin(0, (uint32_t*)buf, tgt, 1, &done);
memset(buf, 0, sizeof buf);
scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
//memset(buf, 0, sizeof buf);
// buf[0] = 1; buf[64] = 2; // for endian tests
//scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
@ -1786,6 +1789,9 @@ void print_hash_tests(void) @@ -1786,6 +1789,9 @@ void print_hash_tests(void)
qubithash(&hash[0], &buf[0]);
printpfx("qubit", hash);
skeincoinhash(&hash[0], &buf[0]);
printpfx("skein", hash);
s3hash(&hash[0], &buf[0]);
printpfx("S3", hash);

Loading…
Cancel
Save