Browse Source

tribus: optimised version for recent cards

main improvement is on echo, based on alexis sib kernel work

tested on SM 3.0 and more recent
master
Tanguy Pruvot 7 years ago
parent
commit
d47dd9de39
  1. 2
      Makefile.am
  2. 6
      ccminer.vcxproj
  3. 13
      ccminer.vcxproj.filters
  4. 8
      res/ccminer.rc
  5. 318
      tribus/cuda_echo512_aes.cuh
  6. 285
      tribus/cuda_echo512_final.cu
  7. 47
      tribus/tribus.cu

2
Makefile.am

@ -66,7 +66,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -66,7 +66,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/ripemd.c sph/sph_sha2.c \
lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
tribus.cu \
tribus/tribus.cu tribus/cuda_echo512_final.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \

6
ccminer.vcxproj

@ -538,7 +538,11 @@ @@ -538,7 +538,11 @@
<MaxRegCount>64</MaxRegCount>
</CudaCompile>
<CudaCompile Include="skunk\cuda_skunk_streebog.cu" />
<CudaCompile Include="tribus.cu" />
<ClInclude Include="tribus\cuda_echo512_aes.cuh" />
<CudaCompile Include="tribus\cuda_echo512_final.cu">
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
</CudaCompile>
<CudaCompile Include="tribus\tribus.cu" />
<ClInclude Include="x11\cuda_x11_aes.cuh" />
<CudaCompile Include="x11\cuda_x11_cubehash512.cu" />
<CudaCompile Include="x11\cuda_x11_echo.cu">

13
ccminer.vcxproj.filters

@ -109,6 +109,9 @@ @@ -109,6 +109,9 @@
<Filter Include="Source Files\equi">
<UniqueIdentifier>{031afae7-2a78-4e32-9738-4b589b6f7ff3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\tribus">
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="compat\jansson\dump.c">
@ -766,9 +769,15 @@ @@ -766,9 +769,15 @@
<CudaCompile Include="skunk\cuda_skunk_streebog.cu">
<Filter>Source Files\CUDA\skunk</Filter>
</CudaCompile>
<CudaCompile Include="tribus.cu">
<Filter>Source Files\CUDA</Filter>
<CudaCompile Include="tribus\tribus.cu">
<Filter>Source Files\CUDA\tribus</Filter>
</CudaCompile>
<CudaCompile Include="tribus\cuda_echo512_final.cu">
<Filter>Source Files\CUDA\tribus</Filter>
</CudaCompile>
<ClInclude Include="tribus\cuda_echo512_aes.cuh">
<Filter>Source Files\CUDA\tribus</Filter>
</ClInclude>
<CudaCompile Include="x11\sib.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>

8
res/ccminer.rc

@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,2,0,0
PRODUCTVERSION 2,2,0,0
FILEVERSION 2,2,1,0
PRODUCTVERSION 2,2,1,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN @@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "2.2"
VALUE "FileVersion", "2.2.1"
VALUE "LegalCopyright", "Copyright (C) 2017"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "2.2"
VALUE "ProductVersion", "2.2.1"
END
END
BLOCK "VarFileInfo"

318
tribus/cuda_echo512_aes.cuh

@ -0,0 +1,318 @@ @@ -0,0 +1,318 @@
#ifdef __INTELLISENSE__
#define __byte_perm(x, y, b) x
#define __CUDA_ARCH__ 520
#include <cuda_helper.h>
#endif
#undef ROL8
#undef ROR8
#undef ROL16
#ifdef __CUDA_ARCH__
__device__ __forceinline__
uint32_t ROL8(const uint32_t a) {
return __byte_perm(a, 0, 0x2103);
}
__device__ __forceinline__
uint32_t ROR8(const uint32_t a) {
return __byte_perm(a, 0, 0x0321);
}
__device__ __forceinline__
uint32_t ROL16(const uint32_t a) {
return __byte_perm(a, 0, 0x1032);
}
#else
#define ROL8(u) ROTL32(u, 8)
#define ROR8(u) ROTR32(u, 8)
#define ROL16(u) ROTL32(u,16)
#endif
__device__ uint32_t d_AES0[256] = {
0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591,
0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC,
0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB,
0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B,
0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83,
0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A,
0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F,
0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA,
0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B,
0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413,
0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6,
0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85,
0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511,
0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B,
0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1,
0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF,
0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E,
0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6,
0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B,
0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD,
0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8,
0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2,
0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949,
0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810,
0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697,
0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F,
0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C,
0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27,
0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433,
0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5,
0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0,
0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C
};
__device__ uint32_t d_AES3[256] = {
0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5,
0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676,
0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0,
0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0,
0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC,
0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515,
0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A,
0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575,
0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0,
0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484,
0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B,
0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF,
0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585,
0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8,
0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5,
0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2,
0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717,
0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373,
0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888,
0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB,
0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C,
0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979,
0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9,
0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808,
0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6,
0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A,
0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E,
0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E,
0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494,
0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF,
0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868,
0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616
};
__device__ __forceinline__
void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256])
{
/* each thread startup will fill a uint32 */
if (threadIdx.x < 256) {
uint32_t temp = __ldg(&d_AES0[threadIdx.x]);
sharedMemory[0][threadIdx.x] = temp;
sharedMemory[1][threadIdx.x] = ROL8(temp);
sharedMemory[2][threadIdx.x] = ROL16(temp);
sharedMemory[3][threadIdx.x] = ROR8(temp);
}
}
__device__ __forceinline__
void aes_gpu_init256(uint32_t sharedMemory[4][256])
{
/* each thread startup will fill a uint32 */
uint32_t temp = __ldg(&d_AES0[threadIdx.x]);
sharedMemory[0][threadIdx.x] = temp;
sharedMemory[1][threadIdx.x] = ROL8(temp);
sharedMemory[2][threadIdx.x] = ROL16(temp);
sharedMemory[3][threadIdx.x] = ROR8(temp);
}
__device__ __forceinline__
void aes_gpu_init128(uint32_t sharedMemory[4][256])
{
/* each thread startup will fill 2 uint32 */
uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);
sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x;
sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y;
sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x);
sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y);
sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
}
__device__ __forceinline__
void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256])
{
if (threadIdx.x < 128) {
/* each thread startup will fill 2 uint32 */
uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);
sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x;
sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y;
sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x);
sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y);
sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
}
}
__device__ __forceinline__
static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]);
y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
#ifdef INTENSIVE_GMF
y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
#else
y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)];
#endif
y0 ^= k0;
y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]);
y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
}
__device__ __forceinline__
static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]);
y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
y0 ^= k0;
y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]);
y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]);
y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
}
__device__ __forceinline__
static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)];
y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
#ifdef INTENSIVE_GMF
y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]);
#else
y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
#endif
y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)];
y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
}
__device__ __forceinline__
static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)];
y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]);
y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)];
y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
}
__device__ __forceinline__
static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
{
uint32_t y0, y1, y2, y3;
aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
// hier werden wir ein carry brauchen (oder auch nicht)
k0++;
}
__device__ __forceinline__
static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
{
uint32_t y0, y1, y2, y3;
aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
// hier werden wir ein carry brauchen (oder auch nicht)
k0++;
}
__device__ __forceinline__
static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x)
{
uint32_t y0, y1, y2, y3;
aes_round(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3);
x->x = y0;
x->y = y1;
x->z = y2;
x->w = y3;
}
__device__ __forceinline__
static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k)
{
uint32_t y0, y1, y2, y3;
aes_round(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3);
k[0] = y1;
k[1] = y2;
k[2] = y3;
k[3] = y0;
}

285
tribus/cuda_echo512_final.cu

@ -0,0 +1,285 @@ @@ -0,0 +1,285 @@
/**
* Based on Provos Alexis work - 2016 FOR SM 5+
*
* final touch by tpruvot for tribus - 09 2017
*/
#include <cuda_helper.h>
#include <cuda_vector_uint2x4.h>
#include <cuda_vectors.h>
#define INTENSIVE_GMF
#include "tribus/cuda_echo512_aes.cuh"
#ifdef __INTELLISENSE__
#define __byte_perm(x, y, b) x
#define atomicExch(p,y) (*p) = y
#endif
__device__
static void echo_round(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0)
{
// Big Sub Words
#pragma unroll 16
for (int idx = 0; idx < 16; idx++)
AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0);
// Shift Rows
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t t[4];
/// 1, 5, 9, 13
t[0] = W[i + 4];
t[1] = W[i + 8];
t[2] = W[i + 24];
t[3] = W[i + 60];
W[i + 4] = W[i + 20];
W[i + 8] = W[i + 40];
W[i + 24] = W[i + 56];
W[i + 60] = W[i + 44];
W[i + 20] = W[i + 36];
W[i + 40] = t[1];
W[i + 56] = t[2];
W[i + 44] = W[i + 28];
W[i + 28] = W[i + 12];
W[i + 12] = t[3];
W[i + 36] = W[i + 52];
W[i + 52] = t[0];
}
// Mix Columns
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
#pragma unroll 4
for (int idx = 0; idx < 64; idx += 16)
{
uint32_t a[4];
a[0] = W[idx + i];
a[1] = W[idx + i + 4];
a[2] = W[idx + i + 8];
a[3] = W[idx + i +12];
uint32_t ab = a[0] ^ a[1];
uint32_t bc = a[1] ^ a[2];
uint32_t cd = a[2] ^ a[3];
uint32_t t, t2, t3;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[idx + i] = bc ^ a[3] ^ abx;
W[idx + i + 4] = a[0] ^ cd ^ bcx;
W[idx + i + 8] = ab ^ a[3] ^ cdx;
W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
}
}
}
__global__ __launch_bounds__(256, 3) /* will force 80 registers */
static void tribus_echo512_gpu_final(uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target)
{
__shared__ uint32_t sharedMemory[4][256];
aes_gpu_init256(sharedMemory);
const uint32_t P[48] = {
0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//8-12
0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//21-25
0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//34-38
0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
//58-61
};
uint32_t k0;
uint32_t h[16];
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t *hash = (uint32_t*)&g_hash[thread<<3];
*(uint2x4*)&h[0] = __ldg4((uint2x4*)&hash[0]);
*(uint2x4*)&h[8] = __ldg4((uint2x4*)&hash[8]);
uint64_t backup = *(uint64_t*)&h[6];
k0 = 512 + 8;
#pragma unroll 4
for (uint32_t idx = 0; idx < 16; idx += 4)
AES_2ROUND(sharedMemory,h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
k0 += 4;
uint32_t W[64];
#pragma unroll 4
for (uint32_t i = 0; i < 4; i++)
{
uint32_t a = P[i];
uint32_t b = P[i + 4];
uint32_t c = h[i + 8];
uint32_t d = P[i + 8];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t = ((a ^ b) & 0x80808080);
uint32_t t2 = ((b ^ c) & 0x80808080);
uint32_t t3 = ((c ^ d) & 0x80808080);
uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1);
uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1);
uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1);
W[0 + i] = bc ^ d ^ abx;
W[4 + i] = a ^ cd ^ bcx;
W[8 + i] = ab ^ d ^ cdx;
W[12+ i] = abx ^ bcx ^ cdx ^ ab ^ c;
a = P[12 + i];
b = h[i + 4];
c = P[12 + i + 4];
d = P[12 + i + 8];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27U ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[16 + i] = abx ^ bc ^ d;
W[16 + i + 4] = bcx ^ a ^ cd;
W[16 + i + 8] = cdx ^ ab ^ d;
W[16 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
a = h[i];
b = P[24 + i];
c = P[24 + i + 4];
d = P[24 + i + 8];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27U ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[32 + i] = abx ^ bc ^ d;
W[32 + i + 4] = bcx ^ a ^ cd;
W[32 + i + 8] = cdx ^ ab ^ d;
W[32 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
a = P[36 + i ];
b = P[36 + i + 4];
c = P[36 + i + 8];
d = h[i + 12];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27U ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[48 + i] = abx ^ bc ^ d;
W[48 + i + 4] = bcx ^ a ^ cd;
W[48 + i + 8] = cdx ^ ab ^ d;
W[48 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
for (int k = 1; k < 9; k++)
echo_round(sharedMemory,W,k0);
// Big Sub Words
uint32_t y0, y1, y2, y3;
// AES_2ROUND(sharedMemory,W[ 0], W[ 1], W[ 2], W[ 3], k0);
aes_round(sharedMemory, W[ 0], W[ 1], W[ 2], W[ 3], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[ 0], W[ 1], W[ 2], W[ 3]);
aes_round(sharedMemory, W[ 4], W[ 5], W[ 6], W[ 7], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[ 4], W[ 5], W[ 6], W[ 7]);
aes_round(sharedMemory, W[ 8], W[ 9], W[10], W[11], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[ 8], W[ 9], W[10], W[11]);
aes_round(sharedMemory, W[20], W[21], W[22], W[23], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[20], W[21], W[22], W[23]);
aes_round(sharedMemory, W[28], W[29], W[30], W[31], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[28], W[29], W[30], W[31]);
aes_round(sharedMemory, W[32], W[33], W[34], W[35], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[32], W[33], W[34], W[35]);
aes_round(sharedMemory, W[40], W[41], W[42], W[43], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[40], W[41], W[42], W[43]);
aes_round(sharedMemory, W[52], W[53], W[54], W[55], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[52], W[53], W[54], W[55]);
aes_round(sharedMemory, W[60], W[61], W[62], W[63], k0, y0, y1, y2, y3);
aes_round(sharedMemory, y0, y1, y2, y3, W[60], W[61], W[62], W[63]);
uint32_t bc = W[22] ^ W[42];
uint32_t t2 = (bc & 0x80808080);
W[ 6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
bc = W[23] ^ W[43];
t2 = (bc & 0x80808080);
W[ 7] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
bc = W[10] ^ W[54];
t2 = (bc & 0x80808080);
W[38] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
bc = W[11] ^ W[55];
t2 = (bc & 0x80808080);
W[39] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
uint64_t check = backup ^ *(uint64_t*)&W[2] ^ *(uint64_t*)&W[6] ^ *(uint64_t*)&W[10] ^ *(uint64_t*)&W[30]
^ *(uint64_t*)&W[34] ^ *(uint64_t*)&W[38] ^ *(uint64_t*)&W[42] ^ *(uint64_t*)&W[62];
if(check <= target){
uint32_t tmp = atomicExch(&resNonce[0], thread);
if (tmp != UINT32_MAX)
resNonce[1] = tmp;
}
}
}
__host__
void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
tribus_echo512_gpu_final <<<grid, block>>> (threads, (uint64_t*)d_hash, d_resNonce, target);
}

47
tribus.cu → tribus/tribus.cu

@ -1,7 +1,7 @@ @@ -1,7 +1,7 @@
/**
* Tribus Algo for Denarius
*
* tpruvot@github 06 2017 - GPLv3
* tpruvot@github 09 2017 - GPLv3
*
*/
extern "C" {
@ -16,9 +16,10 @@ extern "C" { @@ -16,9 +16,10 @@ extern "C" {
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
// cpu hash
@ -46,6 +47,7 @@ extern "C" void tribus_hash(void *state, const void *input) @@ -46,6 +47,7 @@ extern "C" void tribus_hash(void *state, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
@ -63,7 +65,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce @@ -63,7 +65,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
int dev_id = device_map[thr_id];
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@ -74,10 +77,15 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce @@ -74,10 +77,15 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
// char[64] work space for hashes results
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
@ -87,33 +95,43 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce @@ -87,33 +95,43 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
be32enc(&endiandata[k], pdata[k]);
jh512_setBlock_80(thr_id, endiandata);
cuda_check_cpu_setTarget(ptarget);
if (use_compat_kernels[thr_id])
cuda_check_cpu_setTarget(ptarget);
else
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
work->valid_nonces = 0;
do {
int order = 1;
// Hash with CUDA
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (use_compat_kernels[thr_id]) {
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
work->nonces[1] = UINT32_MAX;
} else {
tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
}
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
const uint32_t Htarg = ptarget[7];
const uint32_t startNounce = pdata[19];
if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce;
be32enc(&endiandata[19], work->nonces[0]);
tribus_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
if (work->nonces[1] != 0) {
if (work->nonces[1] != UINT32_MAX) {
work->nonces[1] += startNounce;
be32enc(&endiandata[19], work->nonces[1]);
tribus_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
@ -127,7 +145,7 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce @@ -127,7 +145,7 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
@ -144,7 +162,6 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce @@ -144,7 +162,6 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
out:
// *hashes_done = pdata[19] - first_nonce;
return work->valid_nonces;
}
@ -157,8 +174,8 @@ extern "C" void free_tribus(int thr_id) @@ -157,8 +174,8 @@ extern "C" void free_tribus(int thr_id)
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cudaFree(d_resNonce[thr_id]);
quark_groestl512_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
Loading…
Cancel
Save