x17 cleanup
haval256 is now 2x faster, but sha512 perf depends a lot on cuda version...
This commit is contained in:
parent
e7ed0dea5f
commit
dad0110557
@ -57,7 +57,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
|||||||
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
|
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
|
||||||
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
|
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
|
||||||
x15/whirlpool.cu \
|
x15/whirlpool.cu \
|
||||||
x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
|
x17/x17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
|
||||||
x11/c11.cu x11/s3.cu x11/sib.cu x11/cuda_streebog.cu
|
x11/c11.cu x11/s3.cu x11/sib.cu x11/cuda_streebog.cu
|
||||||
|
|
||||||
# scrypt
|
# scrypt
|
||||||
|
@ -508,7 +508,7 @@
|
|||||||
<CudaCompile Include="x15\whirlpool.cu" />
|
<CudaCompile Include="x15\whirlpool.cu" />
|
||||||
<CudaCompile Include="x17\x17.cu">
|
<CudaCompile Include="x17\x17.cu">
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="x17\cuda_x17_haval512.cu">
|
<CudaCompile Include="x17\cuda_x17_haval256.cu">
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="x17\cuda_x17_sha512.cu">
|
<CudaCompile Include="x17\cuda_x17_sha512.cu">
|
||||||
<MaxRegCount>80</MaxRegCount>
|
<MaxRegCount>80</MaxRegCount>
|
||||||
|
@ -595,7 +595,7 @@
|
|||||||
<CudaCompile Include="x15\whirlpool.cu">
|
<CudaCompile Include="x15\whirlpool.cu">
|
||||||
<Filter>Source Files\CUDA\x15</Filter>
|
<Filter>Source Files\CUDA\x15</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="x17\cuda_x17_haval512.cu">
|
<CudaCompile Include="x17\cuda_x17_haval256.cu">
|
||||||
<Filter>Source Files\CUDA\x17</Filter>
|
<Filter>Source Files\CUDA\x17</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile Include="x17\cuda_x17_sha512.cu">
|
<CudaCompile Include="x17\cuda_x17_sha512.cu">
|
||||||
|
@ -133,8 +133,16 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
|
|||||||
{
|
{
|
||||||
// Input: 77665544 33221100
|
// Input: 77665544 33221100
|
||||||
// Output: 00112233 44556677
|
// Output: 00112233 44556677
|
||||||
uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
|
uint64_t result;
|
||||||
return (result << 32) | __byte_perm(_HIDWORD(x), 0, 0x0123);
|
//result = __byte_perm((uint32_t) x, 0, 0x0123);
|
||||||
|
//return (result << 32) + __byte_perm(_HIDWORD(x), 0, 0x0123);
|
||||||
|
asm("{ .reg .b32 x, y; // swab64\n\t"
|
||||||
|
"mov.b64 {x,y}, %1;\n\t"
|
||||||
|
"prmt.b32 x, x, 0, 0x0123;\n\t"
|
||||||
|
"prmt.b32 y, y, 0, 0x0123;\n\t"
|
||||||
|
"mov.b64 %0, {y,x};\n\t"
|
||||||
|
"}\n" : "=l"(result): "l"(x));
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
/* host */
|
/* host */
|
||||||
@ -198,7 +206,7 @@ __device__ __forceinline__
|
|||||||
uint64_t xor1(uint64_t a, uint64_t b)
|
uint64_t xor1(uint64_t a, uint64_t b)
|
||||||
{
|
{
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b));
|
asm("xor.b64 %0, %1, %2; // xor1" : "=l"(result) : "l"(a), "l"(b));
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -211,7 +219,7 @@ __device__ __forceinline__
|
|||||||
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
|
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
|
||||||
{
|
{
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("xor.b64 %0, %2, %3;\n\t"
|
asm("xor.b64 %0, %2, %3; // xor3\n\t"
|
||||||
"xor.b64 %0, %0, %1;\n\t"
|
"xor.b64 %0, %0, %1;\n\t"
|
||||||
/* output : input registers */
|
/* output : input registers */
|
||||||
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||||
@ -246,49 +254,31 @@ uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
|
|||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("{\n\t"
|
asm("{ // xandx \n\t"
|
||||||
".reg .u64 n;\n\t"
|
".reg .u64 n;\n\t"
|
||||||
"xor.b64 %0, %2, %3;\n\t"
|
"xor.b64 %0, %2, %3;\n\t"
|
||||||
"and.b64 n, %0, %1;\n\t"
|
"and.b64 n, %0, %1;\n\t"
|
||||||
"xor.b64 %0, n, %3;"
|
"xor.b64 %0, n, %3;\n\t"
|
||||||
"}\n"
|
"}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||||
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
return ((b^c) & a) ^ c;
|
return ((b^c) & a) ^ c;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// device asm for x17
|
|
||||||
__device__ __forceinline__
|
|
||||||
uint64_t sph_t64(uint64_t x)
|
|
||||||
{
|
|
||||||
#ifdef __CUDA_ARCH__
|
|
||||||
uint64_t result;
|
|
||||||
asm("{\n\t"
|
|
||||||
"and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t"
|
|
||||||
"}\n"
|
|
||||||
: "=l"(result) : "l"(x));
|
|
||||||
return result;
|
|
||||||
#else
|
|
||||||
return x;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// device asm for x17
|
// device asm for x17
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
|
uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
|
||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("{\n\t"
|
asm("{ // andor\n\t"
|
||||||
".reg .u64 m,n;\n\t"
|
".reg .u64 m,n;\n\t"
|
||||||
"and.b64 m, %1, %2;\n\t"
|
"and.b64 m, %1, %2;\n\t"
|
||||||
" or.b64 n, %1, %2;\n\t"
|
" or.b64 n, %1, %2;\n\t"
|
||||||
"and.b64 %0, n, %3;\n\t"
|
"and.b64 %0, n, %3;\n\t"
|
||||||
" or.b64 %0, %0, m;\n\t"
|
" or.b64 %0, %0, m;\n\t"
|
||||||
"}\n"
|
"}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||||
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
return ((a | b) & c) | (a & b);
|
return ((a | b) & c) | (a & b);
|
||||||
@ -302,7 +292,6 @@ uint64_t shr_t64(uint64_t x, uint32_t n)
|
|||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("shr.b64 %0,%1,%2;\n\t"
|
asm("shr.b64 %0,%1,%2;\n\t"
|
||||||
"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
|
|
||||||
: "=l"(result) : "l"(x), "r"(n));
|
: "=l"(result) : "l"(x), "r"(n));
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
@ -316,7 +305,6 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
|
|||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("shl.b64 %0,%1,%2;\n\t"
|
asm("shl.b64 %0,%1,%2;\n\t"
|
||||||
"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
|
|
||||||
: "=l"(result) : "l"(x), "r"(n));
|
: "=l"(result) : "l"(x), "r"(n));
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
@ -370,15 +358,14 @@ __device__ __forceinline__
|
|||||||
uint64_t ROTR64(const uint64_t x, const int offset)
|
uint64_t ROTR64(const uint64_t x, const int offset)
|
||||||
{
|
{
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("{\n\t"
|
asm("{ // ROTR64 \n\t"
|
||||||
".reg .b64 lhs;\n\t"
|
".reg .b64 lhs;\n\t"
|
||||||
".reg .u32 roff;\n\t"
|
".reg .u32 roff;\n\t"
|
||||||
"shr.b64 lhs, %1, %2;\n\t"
|
"shr.b64 lhs, %1, %2;\n\t"
|
||||||
"sub.u32 roff, 64, %2;\n\t"
|
"sub.u32 roff, 64, %2;\n\t"
|
||||||
"shl.b64 %0, %1, roff;\n\t"
|
"shl.b64 %0, %1, roff;\n\t"
|
||||||
"add.u64 %0, %0, lhs;\n\t"
|
"add.u64 %0, %0, lhs;\n\t"
|
||||||
"}\n"
|
"}\n" : "=l"(result) : "l"(x), "r"(offset));
|
||||||
: "=l"(result) : "l"(x), "r"(offset));
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -405,15 +392,14 @@ __device__ __forceinline__
|
|||||||
uint64_t ROTL64(const uint64_t x, const int offset)
|
uint64_t ROTL64(const uint64_t x, const int offset)
|
||||||
{
|
{
|
||||||
uint64_t result;
|
uint64_t result;
|
||||||
asm("{\n\t"
|
asm("{ // ROTL64 \n\t"
|
||||||
".reg .b64 lhs;\n\t"
|
".reg .b64 lhs;\n\t"
|
||||||
".reg .u32 roff;\n\t"
|
".reg .u32 roff;\n\t"
|
||||||
"shl.b64 lhs, %1, %2;\n\t"
|
"shl.b64 lhs, %1, %2;\n\t"
|
||||||
"sub.u32 roff, 64, %2;\n\t"
|
"sub.u32 roff, 64, %2;\n\t"
|
||||||
"shr.b64 %0, %1, roff;\n\t"
|
"shr.b64 %0, %1, roff;\n\t"
|
||||||
"add.u64 %0, lhs, %0;\n\t"
|
"add.u64 %0, lhs, %0;\n\t"
|
||||||
"}\n"
|
"}\n" : "=l"(result) : "l"(x), "r"(offset));
|
||||||
: "=l"(result) : "l"(x), "r"(offset));
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
|
#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
|
||||||
@ -421,7 +407,7 @@ __device__
|
|||||||
uint64_t ROTL64(const uint64_t x, const int offset)
|
uint64_t ROTL64(const uint64_t x, const int offset)
|
||||||
{
|
{
|
||||||
uint64_t res;
|
uint64_t res;
|
||||||
asm("{\n\t"
|
asm("{ // ROTL64 \n\t"
|
||||||
".reg .u32 tl,th,vl,vh;\n\t"
|
".reg .u32 tl,th,vl,vh;\n\t"
|
||||||
".reg .pred p;\n\t"
|
".reg .pred p;\n\t"
|
||||||
"mov.b64 {tl,th}, %1;\n\t"
|
"mov.b64 {tl,th}, %1;\n\t"
|
||||||
@ -430,8 +416,7 @@ uint64_t ROTL64(const uint64_t x, const int offset)
|
|||||||
"setp.lt.u32 p, %2, 32;\n\t"
|
"setp.lt.u32 p, %2, 32;\n\t"
|
||||||
"@!p mov.b64 %0, {vl,vh};\n\t"
|
"@!p mov.b64 %0, {vl,vh};\n\t"
|
||||||
"@p mov.b64 %0, {vh,vl};\n\t"
|
"@p mov.b64 %0, {vh,vl};\n\t"
|
||||||
"}"
|
"}\n" : "=l"(res) : "l"(x) , "r"(offset)
|
||||||
: "=l"(res) : "l"(x) , "r"(offset)
|
|
||||||
);
|
);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
@ -498,11 +483,10 @@ static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^
|
|||||||
static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
|
static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
uint2 result;
|
uint2 result;
|
||||||
asm("{\n\t"
|
asm("{ // uint2 a+b \n\t"
|
||||||
"add.cc.u32 %0, %2, %4; \n\t"
|
"add.cc.u32 %0, %2, %4; \n\t"
|
||||||
"addc.u32 %1, %3, %5; \n\t"
|
"addc.u32 %1, %3, %5; \n\t"
|
||||||
"}\n\t"
|
"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
||||||
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
return vectorize(devectorize(a) + devectorize(b));
|
return vectorize(devectorize(a) + devectorize(b));
|
||||||
@ -514,11 +498,10 @@ static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a +
|
|||||||
static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) {
|
static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) {
|
||||||
#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
|
#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
|
||||||
uint2 result;
|
uint2 result;
|
||||||
asm("{\n\t"
|
asm("{ // uint2 a-b \n\t"
|
||||||
"sub.cc.u32 %0, %2, %4; \n\t"
|
"sub.cc.u32 %0, %2, %4; \n\t"
|
||||||
"subc.u32 %1, %3, %5; \n\t"
|
"subc.u32 %1, %3, %5; \n\t"
|
||||||
"}\n\t"
|
"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
||||||
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
return vectorize(devectorize(a) - devectorize(b));
|
return vectorize(devectorize(a) - devectorize(b));
|
||||||
@ -534,13 +517,12 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
|
|||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
uint2 result;
|
uint2 result;
|
||||||
asm("{\n\t"
|
asm("{ // uint2 a*b \n\t"
|
||||||
"mul.lo.u32 %0, %2, %4; \n\t"
|
"mul.lo.u32 %0, %2, %4; \n\t"
|
||||||
"mul.hi.u32 %1, %2, %4; \n\t"
|
"mul.hi.u32 %1, %2, %4; \n\t"
|
||||||
"mad.lo.cc.u32 %1, %3, %4, %1; \n\t"
|
"mad.lo.cc.u32 %1, %3, %4, %1; \n\t"
|
||||||
"madc.lo.u32 %1, %3, %5, %1; \n\t"
|
"madc.lo.u32 %1, %3, %5, %1; \n\t"
|
||||||
"}\n\t"
|
"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
||||||
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
// incorrect but unused host equiv
|
// incorrect but unused host equiv
|
||||||
@ -646,17 +628,15 @@ static uint2 SHL2(uint2 a, int offset)
|
|||||||
#if __CUDA_ARCH__ > 300
|
#if __CUDA_ARCH__ > 300
|
||||||
uint2 result;
|
uint2 result;
|
||||||
if (offset < 32) {
|
if (offset < 32) {
|
||||||
asm("{\n\t"
|
asm("{ // SHL2 (l) \n\t"
|
||||||
"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
|
"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
|
||||||
"shl.b32 %0, %2, %4; \n\t"
|
"shl.b32 %0, %2, %4; \n\t"
|
||||||
"}\n\t"
|
"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||||
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
|
||||||
} else {
|
} else {
|
||||||
asm("{\n\t"
|
asm("{ // SHL2 (h) \n\t"
|
||||||
"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
|
"shf.l.clamp.b32 %1, %2, %3, %4; \n\t"
|
||||||
"shl.b32 %0, %2, %4; \n\t"
|
"shl.b32 %0, %2, %4; \n\t"
|
||||||
"}\n\t"
|
"}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||||
: "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
|
351
x17/cuda_x17_haval256.cu
Normal file
351
x17/cuda_x17_haval256.cu
Normal file
@ -0,0 +1,351 @@
|
|||||||
|
/*
|
||||||
|
* haval-256 kernel implementation.
|
||||||
|
*
|
||||||
|
* ==========================(LICENSE BEGIN)============================
|
||||||
|
*
|
||||||
|
* Copyright (c) 2014 djm34
|
||||||
|
* 2016 tpruvot
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining
|
||||||
|
* a copy of this software and associated documentation files (the
|
||||||
|
* "Software"), to deal in the Software without restriction, including
|
||||||
|
* without limitation the rights to use, copy, modify, merge, publish,
|
||||||
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||||
|
* permit persons to whom the Software is furnished to do so, subject to
|
||||||
|
* the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||||
|
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||||
|
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||||
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*
|
||||||
|
* ===========================(LICENSE END)=============================
|
||||||
|
*/
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <memory.h>
|
||||||
|
|
||||||
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
|
#define F1(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
(((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0))
|
||||||
|
|
||||||
|
#define F2(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
(((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \
|
||||||
|
^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0)))
|
||||||
|
|
||||||
|
#define F3(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
(((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \
|
||||||
|
^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0))
|
||||||
|
|
||||||
|
#define F4(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
(((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \
|
||||||
|
^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \
|
||||||
|
^ ((x2) & (x6)) ^ (x0))
|
||||||
|
|
||||||
|
#define F5(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
(((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \
|
||||||
|
^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)))
|
||||||
|
|
||||||
|
#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
F1(x3, x4, x1, x0, x5, x2, x6)
|
||||||
|
#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
F2(x6, x2, x1, x0, x3, x4, x5)
|
||||||
|
#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
F3(x2, x6, x0, x4, x3, x1, x5)
|
||||||
|
#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
F4(x1, x5, x3, x2, x0, x4, x6)
|
||||||
|
#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
|
||||||
|
F5(x2, x5, x0, x6, x4, x3, x1)
|
||||||
|
|
||||||
|
#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \
|
||||||
|
uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
|
||||||
|
(x7) = (uint32_t)(ROTR32(t, 7) + ROTR32((x7), 11) + (w) + (c)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define PASS1(n, in) { \
|
||||||
|
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], 0U); \
|
||||||
|
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], 0U); \
|
||||||
|
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], 0U); \
|
||||||
|
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], 0U); \
|
||||||
|
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], 0U); \
|
||||||
|
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], 0U); \
|
||||||
|
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], 0U); \
|
||||||
|
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], 0U); \
|
||||||
|
\
|
||||||
|
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], 0U); \
|
||||||
|
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0U); \
|
||||||
|
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], 0U); \
|
||||||
|
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], 0U); \
|
||||||
|
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], 0U); \
|
||||||
|
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], 0U); \
|
||||||
|
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], 0U); \
|
||||||
|
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], 0U); \
|
||||||
|
\
|
||||||
|
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], 0U); \
|
||||||
|
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], 0U); \
|
||||||
|
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], 0U); \
|
||||||
|
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], 0U); \
|
||||||
|
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], 0U); \
|
||||||
|
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], 0U); \
|
||||||
|
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], 0U); \
|
||||||
|
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[23], 0U); \
|
||||||
|
\
|
||||||
|
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], 0U); \
|
||||||
|
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], 0U); \
|
||||||
|
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], 0U); \
|
||||||
|
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], 0U); \
|
||||||
|
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], 0U); \
|
||||||
|
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], 0U); \
|
||||||
|
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], 0U); \
|
||||||
|
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], 0U); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define PASS2(n, in) { \
|
||||||
|
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], 0x452821E6); \
|
||||||
|
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 0x38D01377); \
|
||||||
|
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], 0xBE5466CF); \
|
||||||
|
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], 0x34E90C6C); \
|
||||||
|
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], 0xC0AC29B7); \
|
||||||
|
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], 0xC97C50DD); \
|
||||||
|
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], 0x3F84D5B5); \
|
||||||
|
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], 0xB5470917); \
|
||||||
|
\
|
||||||
|
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], 0x9216D5D9); \
|
||||||
|
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], 0x8979FB1B); \
|
||||||
|
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], 0xD1310BA6); \
|
||||||
|
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], 0x98DFB5AC); \
|
||||||
|
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], 0x2FFD72DB); \
|
||||||
|
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], 0xD01ADFB7); \
|
||||||
|
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], 0xB8E1AFED); \
|
||||||
|
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], 0x6A267E96); \
|
||||||
|
\
|
||||||
|
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], 0xBA7C9045); \
|
||||||
|
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], 0xF12C7F99); \
|
||||||
|
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0x24A19947); \
|
||||||
|
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], 0xB3916CF7); \
|
||||||
|
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x0801F2E2); \
|
||||||
|
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], 0x858EFC16); \
|
||||||
|
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], 0x636920D8); \
|
||||||
|
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], 0x71574E69); \
|
||||||
|
\
|
||||||
|
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0xA458FEA3); \
|
||||||
|
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], 0xF4933D7E); \
|
||||||
|
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], 0x0D95748F); \
|
||||||
|
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], 0x728EB658); \
|
||||||
|
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], 0x718BCD58); \
|
||||||
|
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], 0x82154AEE); \
|
||||||
|
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], 0x7B54A41D); \
|
||||||
|
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], 0xC25A59B5); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define PASS3(n, in) { \
|
||||||
|
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0x9C30D539); \
|
||||||
|
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0x2AF26013); \
|
||||||
|
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], 0xC5D1B023); \
|
||||||
|
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], 0x286085F0); \
|
||||||
|
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], 0xCA417918); \
|
||||||
|
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], 0xB8DB38EF); \
|
||||||
|
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], 0x8E79DCB0); \
|
||||||
|
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], 0x603A180E); \
|
||||||
|
\
|
||||||
|
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], 0x6C9E0E8B); \
|
||||||
|
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 0xB01E8A3E); \
|
||||||
|
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], 0xD71577C1); \
|
||||||
|
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], 0xBD314B27); \
|
||||||
|
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], 0x78AF2FDA); \
|
||||||
|
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], 0x55605C60); \
|
||||||
|
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], 0xE65525F3); \
|
||||||
|
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], 0xAA55AB94); \
|
||||||
|
\
|
||||||
|
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], 0x57489862); \
|
||||||
|
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], 0x63E81440); \
|
||||||
|
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], 0x55CA396A); \
|
||||||
|
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], 0x2AAB10B6); \
|
||||||
|
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], 0xB4CC5C34); \
|
||||||
|
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[ 0], 0x1141E8CE); \
|
||||||
|
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], 0xA15486AF); \
|
||||||
|
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], 0x7C72E993); \
|
||||||
|
\
|
||||||
|
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], 0xB3EE1411); \
|
||||||
|
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], 0x636FBC2A); \
|
||||||
|
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0x2BA9C55D); \
|
||||||
|
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], 0x741831F6); \
|
||||||
|
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], 0xCE5C3E16); \
|
||||||
|
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], 0x9B87931E); \
|
||||||
|
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], 0xAFD6BA33); \
|
||||||
|
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], 0x6C24CF5C); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define PASS4(n, in) { \
|
||||||
|
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], 0x7A325381); \
|
||||||
|
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], 0x28958677); \
|
||||||
|
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], 0x3B8F4898); \
|
||||||
|
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], 0x6B4BB9AF); \
|
||||||
|
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], 0xC4BFE81B); \
|
||||||
|
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], 0x66282193); \
|
||||||
|
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], 0x61D809CC); \
|
||||||
|
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], 0xFB21A991); \
|
||||||
|
\
|
||||||
|
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], 0x487CAC60); \
|
||||||
|
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], 0x5DEC8032); \
|
||||||
|
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], 0xEF845D5D); \
|
||||||
|
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], 0xE98575B1); \
|
||||||
|
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], 0xDC262302); \
|
||||||
|
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], 0xEB651B88); \
|
||||||
|
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], 0x23893E81); \
|
||||||
|
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], 0xD396ACC5); \
|
||||||
|
\
|
||||||
|
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], 0x0F6D6FF3); \
|
||||||
|
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], 0x83F44239); \
|
||||||
|
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], 0x2E0B4482); \
|
||||||
|
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], 0xA4842004); \
|
||||||
|
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], 0x69C8F04A); \
|
||||||
|
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], 0x9E1F9B5E); \
|
||||||
|
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], 0x21C66842); \
|
||||||
|
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], 0xF6E96C9A); \
|
||||||
|
\
|
||||||
|
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], 0x670C9C61); \
|
||||||
|
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], 0xABD388F0); \
|
||||||
|
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], 0x6A51A0D2); \
|
||||||
|
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], 0xD8542F68); \
|
||||||
|
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x960FA728); \
|
||||||
|
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], 0xAB5133A3); \
|
||||||
|
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], 0x6EEF0B6C); \
|
||||||
|
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], 0x137A3BE4); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define PASS5(n, in) { \
|
||||||
|
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], 0xBA3BF050); \
|
||||||
|
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], 0x7EFB2A98); \
|
||||||
|
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0xA1F1651D); \
|
||||||
|
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], 0x39AF0176); \
|
||||||
|
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x66CA593E); \
|
||||||
|
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], 0x82430E88); \
|
||||||
|
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], 0x8CEE8619); \
|
||||||
|
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], 0x456F9FB4); \
|
||||||
|
\
|
||||||
|
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0x7D84A5C3); \
|
||||||
|
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], 0x3B8B5EBE); \
|
||||||
|
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], 0xE06F75D8); \
|
||||||
|
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], 0x85C12073); \
|
||||||
|
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], 0x401A449F); \
|
||||||
|
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], 0x56C16AA6); \
|
||||||
|
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], 0x4ED3AA62); \
|
||||||
|
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], 0x363F7706); \
|
||||||
|
\
|
||||||
|
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], 0x1BFEDF72); \
|
||||||
|
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0x429B023D); \
|
||||||
|
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], 0x37D0D724); \
|
||||||
|
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], 0xD00A1248); \
|
||||||
|
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], 0xDB0FEAD3); \
|
||||||
|
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], 0x49F1C09B); \
|
||||||
|
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], 0x075372C9); \
|
||||||
|
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], 0x80991B7B); \
|
||||||
|
\
|
||||||
|
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], 0x25D479D8); \
|
||||||
|
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], 0xF6E8DEF7); \
|
||||||
|
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], 0xE3FE501A); \
|
||||||
|
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], 0xB6794C3B); \
|
||||||
|
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], 0x976CE0BD); \
|
||||||
|
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], 0x04C006BA); \
|
||||||
|
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], 0xC1A94FB6); \
|
||||||
|
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], 0x409F60C4); \
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ /* __launch_bounds__(256, 6) */
|
||||||
|
void x17_haval256_gpu_hash_64(uint32_t threads, uint64_t *g_hash)
|
||||||
|
{
|
||||||
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint64_t *pHash = &g_hash[thread*8U];
|
||||||
|
|
||||||
|
uint32_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||||
|
const uint32_t u0 = s0 = 0x243F6A88;
|
||||||
|
const uint32_t u1 = s1 = 0x85A308D3;
|
||||||
|
const uint32_t u2 = s2 = 0x13198A2E;
|
||||||
|
const uint32_t u3 = s3 = 0x03707344;
|
||||||
|
const uint32_t u4 = s4 = 0xA4093822;
|
||||||
|
const uint32_t u5 = s5 = 0x299F31D0;
|
||||||
|
const uint32_t u6 = s6 = 0x082EFA98;
|
||||||
|
const uint32_t u7 = s7 = 0xEC4E6C89;
|
||||||
|
|
||||||
|
union {
|
||||||
|
uint32_t h4[16];
|
||||||
|
uint64_t h8[8];
|
||||||
|
} hash;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i=0; i<8; i++) {
|
||||||
|
hash.h8[i] = pHash[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
///////// input big /////////////////////
|
||||||
|
|
||||||
|
uint32_t buf[32];
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i=0; i<16; i++)
|
||||||
|
buf[i] = hash.h4[i];
|
||||||
|
|
||||||
|
buf[16] = 0x00000001;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i=17; i<29; i++)
|
||||||
|
buf[i] = 0;
|
||||||
|
|
||||||
|
buf[29] = 0x40290000;
|
||||||
|
buf[30] = 0x00000200;
|
||||||
|
buf[31] = 0;
|
||||||
|
|
||||||
|
PASS1(5, buf);
|
||||||
|
PASS2(5, buf);
|
||||||
|
PASS3(5, buf);
|
||||||
|
PASS4(5, buf);
|
||||||
|
PASS5(5, buf);
|
||||||
|
|
||||||
|
hash.h4[0] = s0 + u0;
|
||||||
|
hash.h4[1] = s1 + u1;
|
||||||
|
hash.h4[2] = s2 + u2;
|
||||||
|
hash.h4[3] = s3 + u3;
|
||||||
|
hash.h4[4] = s4 + u4;
|
||||||
|
hash.h4[5] = s5 + u5;
|
||||||
|
hash.h4[6] = s6 + u6;
|
||||||
|
hash.h4[7] = s7 + u7;
|
||||||
|
|
||||||
|
pHash[0] = hash.h8[0];
|
||||||
|
pHash[1] = hash.h8[1];
|
||||||
|
pHash[2] = hash.h8[2];
|
||||||
|
pHash[3] = hash.h8[3];
|
||||||
|
#ifdef NEED_HASH_512
|
||||||
|
pHash[4] = hash.h8[4];
|
||||||
|
pHash[5] = hash.h8[5];
|
||||||
|
pHash[6] = hash.h8[6];
|
||||||
|
pHash[7] = hash.h8[7];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void x17_haval256_cpu_init(int thr_id, uint32_t threads)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 256;
|
||||||
|
|
||||||
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
x17_haval256_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
|
||||||
|
|
||||||
|
//MyStreamSynchronize(NULL, order, thr_id);
|
||||||
|
}
|
@ -1,400 +0,0 @@
|
|||||||
/*
|
|
||||||
* Haval-512 for X17
|
|
||||||
*
|
|
||||||
* Built on cbuchner1's implementation, actual hashing code
|
|
||||||
* heavily based on phm's sgminer
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Haval-512 kernel implementation.
|
|
||||||
*
|
|
||||||
* ==========================(LICENSE BEGIN)============================
|
|
||||||
*
|
|
||||||
* Copyright (c) 2014 djm34
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining
|
|
||||||
* a copy of this software and associated documentation files (the
|
|
||||||
* "Software"), to deal in the Software without restriction, including
|
|
||||||
* without limitation the rights to use, copy, modify, merge, publish,
|
|
||||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
||||||
* permit persons to whom the Software is furnished to do so, subject to
|
|
||||||
* the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*
|
|
||||||
* ===========================(LICENSE END)=============================
|
|
||||||
*
|
|
||||||
* @author phm <phm@inbox.com>
|
|
||||||
*/
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <memory.h>
|
|
||||||
|
|
||||||
#define USE_SHARED 1
|
|
||||||
|
|
||||||
#define SPH_T64(x) ((x) & 0xFFFFFFFFFFFFFFFFULL)
|
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
|
||||||
|
|
||||||
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
|
||||||
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
|
|
||||||
|
|
||||||
static __constant__ uint32_t initVector[8];
|
|
||||||
|
|
||||||
static const uint32_t c_initVector[8] = {
|
|
||||||
SPH_C32(0x243F6A88),
|
|
||||||
SPH_C32(0x85A308D3),
|
|
||||||
SPH_C32(0x13198A2E),
|
|
||||||
SPH_C32(0x03707344),
|
|
||||||
SPH_C32(0xA4093822),
|
|
||||||
SPH_C32(0x299F31D0),
|
|
||||||
SPH_C32(0x082EFA98),
|
|
||||||
SPH_C32(0xEC4E6C89)
|
|
||||||
};
|
|
||||||
|
|
||||||
#define PASS1(n, in) { \
|
|
||||||
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], SPH_C32(0x00000000)); \
|
|
||||||
\
|
|
||||||
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x00000000)); \
|
|
||||||
\
|
|
||||||
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0x00000000)); \
|
|
||||||
\
|
|
||||||
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], SPH_C32(0x00000000)); \
|
|
||||||
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], SPH_C32(0x00000000)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define PASS2(n, in) { \
|
|
||||||
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x452821E6)); \
|
|
||||||
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0x38D01377)); \
|
|
||||||
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0xBE5466CF)); \
|
|
||||||
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], SPH_C32(0x34E90C6C)); \
|
|
||||||
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], SPH_C32(0xC0AC29B7)); \
|
|
||||||
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], SPH_C32(0xC97C50DD)); \
|
|
||||||
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], SPH_C32(0x3F84D5B5)); \
|
|
||||||
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], SPH_C32(0xB5470917)); \
|
|
||||||
\
|
|
||||||
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x9216D5D9)); \
|
|
||||||
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0x8979FB1B)); \
|
|
||||||
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], SPH_C32(0xD1310BA6)); \
|
|
||||||
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0x98DFB5AC)); \
|
|
||||||
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0x2FFD72DB)); \
|
|
||||||
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xD01ADFB7)); \
|
|
||||||
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], SPH_C32(0xB8E1AFED)); \
|
|
||||||
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], SPH_C32(0x6A267E96)); \
|
|
||||||
\
|
|
||||||
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], SPH_C32(0xBA7C9045)); \
|
|
||||||
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0xF12C7F99)); \
|
|
||||||
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x24A19947)); \
|
|
||||||
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], SPH_C32(0xB3916CF7)); \
|
|
||||||
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x0801F2E2)); \
|
|
||||||
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], SPH_C32(0x858EFC16)); \
|
|
||||||
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], SPH_C32(0x636920D8)); \
|
|
||||||
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], SPH_C32(0x71574E69)); \
|
|
||||||
\
|
|
||||||
STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0xA458FEA3)); \
|
|
||||||
STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], SPH_C32(0xF4933D7E)); \
|
|
||||||
STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], SPH_C32(0x0D95748F)); \
|
|
||||||
STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], SPH_C32(0x728EB658)); \
|
|
||||||
STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0x718BCD58)); \
|
|
||||||
STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0x82154AEE)); \
|
|
||||||
STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x7B54A41D)); \
|
|
||||||
STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0xC25A59B5)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define PASS3(n, in) { \
|
|
||||||
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x9C30D539)); \
|
|
||||||
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x2AF26013)); \
|
|
||||||
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], SPH_C32(0xC5D1B023)); \
|
|
||||||
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0x286085F0)); \
|
|
||||||
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0xCA417918)); \
|
|
||||||
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], SPH_C32(0xB8DB38EF)); \
|
|
||||||
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], SPH_C32(0x8E79DCB0)); \
|
|
||||||
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], SPH_C32(0x603A180E)); \
|
|
||||||
\
|
|
||||||
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], SPH_C32(0x6C9E0E8B)); \
|
|
||||||
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0xB01E8A3E)); \
|
|
||||||
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], SPH_C32(0xD71577C1)); \
|
|
||||||
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], SPH_C32(0xBD314B27)); \
|
|
||||||
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], SPH_C32(0x78AF2FDA)); \
|
|
||||||
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], SPH_C32(0x55605C60)); \
|
|
||||||
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0xE65525F3)); \
|
|
||||||
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], SPH_C32(0xAA55AB94)); \
|
|
||||||
\
|
|
||||||
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], SPH_C32(0x57489862)); \
|
|
||||||
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], SPH_C32(0x63E81440)); \
|
|
||||||
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], SPH_C32(0x55CA396A)); \
|
|
||||||
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x2AAB10B6)); \
|
|
||||||
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0xB4CC5C34)); \
|
|
||||||
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[ 0], SPH_C32(0x1141E8CE)); \
|
|
||||||
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], SPH_C32(0xA15486AF)); \
|
|
||||||
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0x7C72E993)); \
|
|
||||||
\
|
|
||||||
STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], SPH_C32(0xB3EE1411)); \
|
|
||||||
STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x636FBC2A)); \
|
|
||||||
STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x2BA9C55D)); \
|
|
||||||
STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], SPH_C32(0x741831F6)); \
|
|
||||||
STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], SPH_C32(0xCE5C3E16)); \
|
|
||||||
STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x9B87931E)); \
|
|
||||||
STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], SPH_C32(0xAFD6BA33)); \
|
|
||||||
STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], SPH_C32(0x6C24CF5C)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define PASS4(n, in) { \
|
|
||||||
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x7A325381)); \
|
|
||||||
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], SPH_C32(0x28958677)); \
|
|
||||||
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], SPH_C32(0x3B8F4898)); \
|
|
||||||
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], SPH_C32(0x6B4BB9AF)); \
|
|
||||||
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0xC4BFE81B)); \
|
|
||||||
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], SPH_C32(0x66282193)); \
|
|
||||||
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x61D809CC)); \
|
|
||||||
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0xFB21A991)); \
|
|
||||||
\
|
|
||||||
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], SPH_C32(0x487CAC60)); \
|
|
||||||
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x5DEC8032)); \
|
|
||||||
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], SPH_C32(0xEF845D5D)); \
|
|
||||||
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0xE98575B1)); \
|
|
||||||
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDC262302)); \
|
|
||||||
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0xEB651B88)); \
|
|
||||||
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], SPH_C32(0x23893E81)); \
|
|
||||||
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], SPH_C32(0xD396ACC5)); \
|
|
||||||
\
|
|
||||||
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], SPH_C32(0x0F6D6FF3)); \
|
|
||||||
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], SPH_C32(0x83F44239)); \
|
|
||||||
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], SPH_C32(0x2E0B4482)); \
|
|
||||||
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], SPH_C32(0xA4842004)); \
|
|
||||||
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], SPH_C32(0x69C8F04A)); \
|
|
||||||
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], SPH_C32(0x9E1F9B5E)); \
|
|
||||||
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], SPH_C32(0x21C66842)); \
|
|
||||||
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], SPH_C32(0xF6E96C9A)); \
|
|
||||||
\
|
|
||||||
STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], SPH_C32(0x670C9C61)); \
|
|
||||||
STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], SPH_C32(0xABD388F0)); \
|
|
||||||
STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], SPH_C32(0x6A51A0D2)); \
|
|
||||||
STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], SPH_C32(0xD8542F68)); \
|
|
||||||
STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x960FA728)); \
|
|
||||||
STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xAB5133A3)); \
|
|
||||||
STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0x6EEF0B6C)); \
|
|
||||||
STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], SPH_C32(0x137A3BE4)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define PASS5(n, in) { \
|
|
||||||
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], SPH_C32(0xBA3BF050)); \
|
|
||||||
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0x7EFB2A98)); \
|
|
||||||
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0xA1F1651D)); \
|
|
||||||
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], SPH_C32(0x39AF0176)); \
|
|
||||||
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x66CA593E)); \
|
|
||||||
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x82430E88)); \
|
|
||||||
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], SPH_C32(0x8CEE8619)); \
|
|
||||||
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], SPH_C32(0x456F9FB4)); \
|
|
||||||
\
|
|
||||||
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x7D84A5C3)); \
|
|
||||||
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], SPH_C32(0x3B8B5EBE)); \
|
|
||||||
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], SPH_C32(0xE06F75D8)); \
|
|
||||||
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], SPH_C32(0x85C12073)); \
|
|
||||||
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], SPH_C32(0x401A449F)); \
|
|
||||||
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], SPH_C32(0x56C16AA6)); \
|
|
||||||
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x4ED3AA62)); \
|
|
||||||
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], SPH_C32(0x363F7706)); \
|
|
||||||
\
|
|
||||||
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x1BFEDF72)); \
|
|
||||||
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x429B023D)); \
|
|
||||||
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], SPH_C32(0x37D0D724)); \
|
|
||||||
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], SPH_C32(0xD00A1248)); \
|
|
||||||
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDB0FEAD3)); \
|
|
||||||
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], SPH_C32(0x49F1C09B)); \
|
|
||||||
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x075372C9)); \
|
|
||||||
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], SPH_C32(0x80991B7B)); \
|
|
||||||
\
|
|
||||||
STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], SPH_C32(0x25D479D8)); \
|
|
||||||
STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0xF6E8DEF7)); \
|
|
||||||
STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], SPH_C32(0xE3FE501A)); \
|
|
||||||
STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0xB6794C3B)); \
|
|
||||||
STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x976CE0BD)); \
|
|
||||||
STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], SPH_C32(0x04C006BA)); \
|
|
||||||
STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], SPH_C32(0xC1A94FB6)); \
|
|
||||||
STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x409F60C4)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define F1(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
(((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0))
|
|
||||||
|
|
||||||
#define F2(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
(((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \
|
|
||||||
^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0)))
|
|
||||||
|
|
||||||
#define F3(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
(((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \
|
|
||||||
^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0))
|
|
||||||
|
|
||||||
#define F4(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
(((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \
|
|
||||||
^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \
|
|
||||||
^ ((x2) & (x6)) ^ (x0))
|
|
||||||
|
|
||||||
#define F5(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
(((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \
|
|
||||||
^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)))
|
|
||||||
|
|
||||||
#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
F1(x3, x4, x1, x0, x5, x2, x6)
|
|
||||||
#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
F2(x6, x2, x1, x0, x3, x4, x5)
|
|
||||||
#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
F3(x2, x6, x0, x4, x3, x1, x5)
|
|
||||||
#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
F4(x1, x5, x3, x2, x0, x4, x6)
|
|
||||||
#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
|
|
||||||
F5(x2, x5, x0, x6, x4, x3, x1)
|
|
||||||
|
|
||||||
|
|
||||||
#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \
|
|
||||||
uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
|
|
||||||
(x7) = SPH_T32(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \
|
|
||||||
+ (w) + (c)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
__global__
|
|
||||||
void x17_haval256_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
|
||||||
{
|
|
||||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
|
||||||
if (thread < threads)
|
|
||||||
{
|
|
||||||
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
|
|
||||||
int hashPosition = nounce - startNounce;
|
|
||||||
uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
|
|
||||||
union {
|
|
||||||
uint8_t h1[64];
|
|
||||||
uint32_t h4[16];
|
|
||||||
uint64_t h8[8];
|
|
||||||
} hash;
|
|
||||||
|
|
||||||
uint32_t u0, u1, u2, u3, u4, u5, u6, u7;
|
|
||||||
uint32_t s0,s1,s2,s3,s4,s5,s6,s7;
|
|
||||||
uint32_t buf[32];
|
|
||||||
|
|
||||||
s0 = initVector[0];
|
|
||||||
s1 = initVector[1];
|
|
||||||
s2 = initVector[2];
|
|
||||||
s3 = initVector[3];
|
|
||||||
s4 = initVector[4];
|
|
||||||
s5 = initVector[5];
|
|
||||||
s6 = initVector[6];
|
|
||||||
s7 = initVector[7];
|
|
||||||
|
|
||||||
u0 = s0;
|
|
||||||
u1 = s1;
|
|
||||||
u2 = s2;
|
|
||||||
u3 = s3;
|
|
||||||
u4 = s4;
|
|
||||||
u5 = s5;
|
|
||||||
u6 = s6;
|
|
||||||
u7 = s7;
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int i=0; i<16; i++) {
|
|
||||||
hash.h4[i]= inpHash[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
///////// input big /////////////////////
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int i=0; i<32; i++) {
|
|
||||||
if (i<16) {
|
|
||||||
buf[i]=hash.h4[i];
|
|
||||||
} else {
|
|
||||||
buf[i]=0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
buf[16]=0x00000001;
|
|
||||||
buf[29]=0x40290000;
|
|
||||||
buf[30]=0x00000200;
|
|
||||||
|
|
||||||
PASS1(5, buf);
|
|
||||||
PASS2(5, buf);
|
|
||||||
PASS3(5, buf);
|
|
||||||
PASS4(5, buf);
|
|
||||||
PASS5(5, buf);
|
|
||||||
|
|
||||||
s0 = SPH_T32(s0 + u0);
|
|
||||||
s1 = SPH_T32(s1 + u1);
|
|
||||||
s2 = SPH_T32(s2 + u2);
|
|
||||||
s3 = SPH_T32(s3 + u3);
|
|
||||||
s4 = SPH_T32(s4 + u4);
|
|
||||||
s5 = SPH_T32(s5 + u5);
|
|
||||||
s6 = SPH_T32(s6 + u6);
|
|
||||||
s7 = SPH_T32(s7 + u7);
|
|
||||||
|
|
||||||
hash.h4[0]=s0;
|
|
||||||
hash.h4[1]=s1;
|
|
||||||
hash.h4[2]=s2;
|
|
||||||
hash.h4[3]=s3;
|
|
||||||
hash.h4[4]=s4;
|
|
||||||
hash.h4[5]=s5;
|
|
||||||
hash.h4[6]=s6;
|
|
||||||
hash.h4[7]=s7;
|
|
||||||
|
|
||||||
#pragma unroll 16
|
|
||||||
for (int u = 0; u < 16; u ++)
|
|
||||||
inpHash[u] = hash.h4[u];
|
|
||||||
} // threads
|
|
||||||
}
|
|
||||||
|
|
||||||
__host__
|
|
||||||
void x17_haval256_cpu_init(int thr_id, uint32_t threads)
|
|
||||||
{
|
|
||||||
cudaMemcpyToSymbol(initVector,c_initVector,sizeof(c_initVector),0, cudaMemcpyHostToDevice);
|
|
||||||
}
|
|
||||||
|
|
||||||
__host__
|
|
||||||
void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
|
||||||
{
|
|
||||||
const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
|
|
||||||
|
|
||||||
// berechne wie viele Thread Blocks wir brauchen
|
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
|
||||||
dim3 block(threadsperblock);
|
|
||||||
|
|
||||||
size_t shared_size = 0;
|
|
||||||
|
|
||||||
x17_haval256_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
|
||||||
|
|
||||||
MyStreamSynchronize(NULL, order, thr_id);
|
|
||||||
}
|
|
@ -1,14 +1,10 @@
|
|||||||
/**
|
|
||||||
* sha512 djm34
|
|
||||||
* (cleaned by tpruvot)
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* sha-512 kernel implementation.
|
* sha-512 cuda kernel implementation.
|
||||||
*
|
*
|
||||||
* ==========================(LICENSE BEGIN)============================
|
* ==========================(LICENSE BEGIN)============================
|
||||||
*
|
*
|
||||||
* Copyright (c) 2014 djm34
|
* Copyright (c) 2014 djm34
|
||||||
|
* 2016 tpruvot
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining
|
* Permission is hereby granted, free of charge, to any person obtaining
|
||||||
* a copy of this software and associated documentation files (the
|
* a copy of this software and associated documentation files (the
|
||||||
@ -30,206 +26,146 @@
|
|||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*
|
*
|
||||||
* ===========================(LICENSE END)=============================
|
* ===========================(LICENSE END)=============================
|
||||||
*
|
|
||||||
* @author phm <phm@inbox.com>
|
|
||||||
*/
|
*/
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#define USE_SHARED 1
|
#define NEED_HASH_512
|
||||||
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
#define SWAP64(u64) cuda_swab64(u64)
|
#define SWAP64(u64) cuda_swab64(u64)
|
||||||
|
|
||||||
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
static __constant__ uint64_t c_WB[80];
|
||||||
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
|
|
||||||
|
|
||||||
static __constant__ uint64_t H_512[8];
|
static const uint64_t WB[80] = {
|
||||||
|
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
|
||||||
static const uint64_t H512[8] = {
|
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
|
||||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
|
||||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
|
||||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
|
||||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
|
||||||
};
|
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
|
||||||
static __constant__ uint64_t K_512[80];
|
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
|
||||||
|
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
|
||||||
static const uint64_t K512[80] = {
|
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
|
||||||
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
|
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
|
||||||
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
|
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
|
||||||
SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
|
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
|
||||||
SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
|
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
|
||||||
SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
|
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
|
||||||
SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
|
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
|
||||||
SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
|
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
|
||||||
SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
|
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
|
||||||
SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
|
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
|
||||||
SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
|
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
|
||||||
SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
|
|
||||||
SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
|
|
||||||
SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
|
|
||||||
SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
|
|
||||||
SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
|
|
||||||
SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
|
|
||||||
SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
|
|
||||||
SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
|
|
||||||
SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
|
|
||||||
SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
|
|
||||||
SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
|
|
||||||
SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
|
|
||||||
SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
|
|
||||||
SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
|
|
||||||
SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
|
|
||||||
SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
|
|
||||||
SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
|
|
||||||
SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
|
|
||||||
SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
|
|
||||||
SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
|
|
||||||
SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
|
|
||||||
SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
|
|
||||||
SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
|
|
||||||
SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
|
|
||||||
SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
|
|
||||||
SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
|
|
||||||
SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
|
|
||||||
SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
|
|
||||||
SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
|
|
||||||
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#define SHA3_STEP(ord,r,i) { \
|
|
||||||
uint64_t T1, T2; \
|
|
||||||
int a = 8-ord; \
|
|
||||||
T1 = SPH_T64(r[(7+a)%8] + BSG5_1(r[(4+a)%8]) + CH(r[(4+a)%8], r[(5+a)%8], r[(6+a)%8]) + K_512[i] + W[i]); \
|
|
||||||
T2 = SPH_T64(BSG5_0(r[(0+a)%8]) + MAJ(r[(0+a)%8], r[(1+a)%8], r[(2+a)%8])); \
|
|
||||||
r[(3+a)%8] = SPH_T64(r[(3+a)%8] + T1); \
|
|
||||||
r[(7+a)%8] = SPH_T64(T1 + T2); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define SHA3_STEP2(truc,ord,r,i) { \
|
|
||||||
uint64_t T1, T2; \
|
|
||||||
int a = 8-ord; \
|
|
||||||
T1 = Tone(truc,r,W,a,i); \
|
|
||||||
T2 = SPH_T64(BSG5_0(r[(0+a)%8]) + MAJ(r[(0+a)%8], r[(1+a)%8], r[(2+a)%8])); \
|
|
||||||
r[(3+a)%8] = SPH_T64(r[(3+a)%8] + T1); \
|
|
||||||
r[(7+a)%8] = SPH_T64(T1 + T2); \
|
|
||||||
}
|
|
||||||
//#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
|
|
||||||
#define BSG5_0(x) xor3(ROTR64(x,28), ROTR64(x,34), ROTR64(x,39))
|
#define BSG5_0(x) xor3(ROTR64(x,28), ROTR64(x,34), ROTR64(x,39))
|
||||||
|
|
||||||
//#define BSG5_1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
|
|
||||||
#define BSG5_1(x) xor3(ROTR64(x, 14),ROTR64(x, 18),ROTR64(x, 41))
|
|
||||||
|
|
||||||
//#define SSG5_0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
|
|
||||||
#define SSG5_0(x) xor3(ROTR64(x, 1), ROTR64(x ,8), shr_t64(x,7))
|
#define SSG5_0(x) xor3(ROTR64(x, 1), ROTR64(x ,8), shr_t64(x,7))
|
||||||
|
|
||||||
//#define SSG5_1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
|
|
||||||
#define SSG5_1(x) xor3(ROTR64(x,19), ROTR64(x,61), shr_t64(x,6))
|
#define SSG5_1(x) xor3(ROTR64(x,19), ROTR64(x,61), shr_t64(x,6))
|
||||||
|
|
||||||
//#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
|
|
||||||
#define CH(x, y, z) xandx(x,y,z)
|
|
||||||
//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
|
//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
|
||||||
#define MAJ(x, y, z) andor(x,y,z)
|
#define MAJ(x, y, z) andor(x,y,z)
|
||||||
|
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
uint64_t Tone(const uint64_t* sharedMemory, uint64_t r[8], uint64_t W[80], uint32_t a, uint32_t i)
|
uint64_t Tone(uint64_t* K, uint64_t* r, uint64_t* W, const int a, const int i)
|
||||||
{
|
{
|
||||||
uint64_t h = r[(7+a)%8];
|
//asm("// TONE \n");
|
||||||
uint64_t e = r[(4+a)%8];
|
const uint64_t e = r[(a+4) & 7];
|
||||||
uint64_t f = r[(5+a)%8];
|
|
||||||
uint64_t g = r[(6+a)%8];
|
|
||||||
//uint64_t BSG51 = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41);
|
|
||||||
uint64_t BSG51 = xor3(ROTR64(e, 14), ROTR64(e, 18), ROTR64(e, 41));
|
uint64_t BSG51 = xor3(ROTR64(e, 14), ROTR64(e, 18), ROTR64(e, 41));
|
||||||
//uint64_t CHl = (((f) ^ (g)) & (e)) ^ (g);
|
const uint64_t f = r[(a+5) & 7];
|
||||||
uint64_t CHl = xandx(e,f,g);
|
const uint64_t g = r[(a+6) & 7];
|
||||||
uint64_t result = SPH_T64(h+BSG51+CHl+sharedMemory[i]+W[i]);
|
uint64_t CHl = ((f ^ g) & e) ^ g; // xandx(e, f, g);
|
||||||
return result;
|
return (r[(a+7) & 7] + BSG51 + CHl + K[i] + W[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__
|
#define SHA3_STEP(K, r, W, ord, i) { \
|
||||||
void x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
const int a = (8 - ord) & 7; \
|
||||||
|
uint64_t T1 = Tone(K, r, W, a, i); \
|
||||||
|
r[(a+3) & 7] += T1; \
|
||||||
|
uint64_t T2 = (BSG5_0(r[a]) + MAJ(r[a], r[(a+1) & 7], r[(a+2) & 7])); \
|
||||||
|
r[(a+7) & 7] = T1 + T2; \
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ /*__launch_bounds__(256, 4)*/
|
||||||
|
void x17_sha512_gpu_hash_64(const uint32_t threads, uint64_t *g_hash)
|
||||||
{
|
{
|
||||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
uint64_t *pHash = &g_hash[thread*8U];
|
||||||
int hashPosition = nounce - startNounce;
|
|
||||||
uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
|
|
||||||
union {
|
|
||||||
uint8_t h1[64];
|
|
||||||
uint32_t h4[16];
|
|
||||||
uint64_t h8[8];
|
|
||||||
} hash;
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int i=0;i<16;i++) {
|
|
||||||
hash.h4[i]= inpHash[i];
|
|
||||||
}
|
|
||||||
uint64_t W[80];
|
uint64_t W[80];
|
||||||
uint64_t r[8];
|
|
||||||
|
|
||||||
#pragma unroll 71
|
|
||||||
for (int i=9;i<80;i++) {
|
|
||||||
W[i]=0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < 8; i ++) {
|
for (int i = 0; i < 8; i ++) {
|
||||||
W[i] = SWAP64(hash.h8[i]);
|
W[i] = SWAP64(pHash[i]);
|
||||||
r[i] = H_512[i];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
W[8] = 0x8000000000000000;
|
W[8] = 0x8000000000000000;
|
||||||
|
|
||||||
|
#pragma unroll 69
|
||||||
|
for (int i = 9; i<78; i++) {
|
||||||
|
W[i] = 0U;
|
||||||
|
}
|
||||||
W[15] = 0x0000000000000200;
|
W[15] = 0x0000000000000200;
|
||||||
|
|
||||||
#pragma unroll 64
|
#pragma unroll 64
|
||||||
for (int i = 16; i < 80; i ++)
|
for (int i = 16; i < 80; i ++) {
|
||||||
W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7]
|
W[i] = SSG5_1(W[i-2]) + W[i-7];
|
||||||
+ SSG5_0(W[i - 15]) + W[i - 16]);
|
W[i] += SSG5_0(W[i-15]) + W[i-16];
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint64_t IV512[8] = {
|
||||||
|
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||||
|
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||||
|
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||||
|
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||||
|
};
|
||||||
|
|
||||||
|
uint64_t r[8];
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < 8; i ++) {
|
||||||
|
r[i] = IV512[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
#if CUDART_VERSION >= 7050
|
||||||
#pragma unroll 10
|
#pragma unroll 10
|
||||||
|
#endif
|
||||||
for (int i = 0; i < 80; i += 8) {
|
for (int i = 0; i < 80; i += 8) {
|
||||||
#pragma unroll 8
|
#pragma unroll
|
||||||
for (int ord = 0; ord < 8; ord++) {
|
for (int ord = 0; ord < 8; ord++) {
|
||||||
SHA3_STEP2(K_512,ord,r,i+ord);
|
SHA3_STEP(c_WB, r, W, ord, i+ord);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll
|
||||||
for (int i = 0; i < 8; i++) {
|
for (int u = 0; u < 4; u ++) {
|
||||||
r[i] = SPH_T64(r[i] + H_512[i]);
|
pHash[u] = SWAP64(r[u] + IV512[u]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll 8
|
#ifdef NEED_HASH_512
|
||||||
for(int i=0;i<8;i++) {
|
#pragma unroll
|
||||||
hash.h8[i] = SWAP64(r[i]);
|
for (int u = 4; u < 8; u ++) {
|
||||||
}
|
pHash[u] = SWAP64(r[u] + IV512[u]);
|
||||||
|
|
||||||
#pragma unroll 16
|
|
||||||
for (int u = 0; u < 16; u ++) {
|
|
||||||
inpHash[u] = hash.h4[u];
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
void x17_sha512_cpu_init(int thr_id, uint32_t threads)
|
void x17_sha512_cpu_init(int thr_id, uint32_t threads)
|
||||||
{
|
{
|
||||||
cudaMemcpyToSymbol(K_512,K512,80*sizeof(uint64_t),0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_WB, WB, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(H_512,H512,sizeof(H512),0, cudaMemcpyHostToDevice);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
|
||||||
{
|
{
|
||||||
const uint32_t threadsperblock = 256;
|
const uint32_t threadsperblock = 256;
|
||||||
|
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
size_t shared_size =0;
|
|
||||||
x17_sha512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
|
||||||
|
|
||||||
MyStreamSynchronize(NULL, order, thr_id);
|
x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
|
||||||
|
|
||||||
|
//MyStreamSynchronize(NULL, order, thr_id);
|
||||||
}
|
}
|
||||||
|
16
x17/x17.cu
16
x17/x17.cu
@ -49,10 +49,10 @@ extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t sta
|
|||||||
extern void x15_whirlpool_cpu_free(int thr_id);
|
extern void x15_whirlpool_cpu_free(int thr_id);
|
||||||
|
|
||||||
extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
|
extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
|
||||||
extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
|
extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
|
||||||
extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
|
||||||
// X17 Hashfunktion
|
// X17 Hashfunktion
|
||||||
@ -206,6 +206,8 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
|
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
|
||||||
cuda_check_cpu_setTarget(ptarget);
|
cuda_check_cpu_setTarget(ptarget);
|
||||||
|
|
||||||
|
int warn = 0;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
int order = 0;
|
int order = 0;
|
||||||
|
|
||||||
@ -224,8 +226,8 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||||
x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||||
|
|
||||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||||
|
|
||||||
@ -251,8 +253,14 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
}
|
}
|
||||||
pdata[19] = foundNonce;
|
pdata[19] = foundNonce;
|
||||||
return res;
|
return res;
|
||||||
|
} else {
|
||||||
|
// x11+ coins could do some random error, but not on retry
|
||||||
|
if (!warn) {
|
||||||
|
warn++; continue;
|
||||||
} else {
|
} else {
|
||||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
||||||
|
warn = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user