tribus: optimised version for recent cards
main improvement is on echo, based on alexis sib kernel work tested on SM 3.0 and more recent
This commit is contained in:
parent
949061d223
commit
d47dd9de39
@ -66,7 +66,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
||||
sph/ripemd.c sph/sph_sha2.c \
|
||||
lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
|
||||
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
|
||||
tribus.cu \
|
||||
tribus/tribus.cu tribus/cuda_echo512_final.cu \
|
||||
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
|
||||
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
|
||||
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \
|
||||
|
@ -538,7 +538,11 @@
|
||||
<MaxRegCount>64</MaxRegCount>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="skunk\cuda_skunk_streebog.cu" />
|
||||
<CudaCompile Include="tribus.cu" />
|
||||
<ClInclude Include="tribus\cuda_echo512_aes.cuh" />
|
||||
<CudaCompile Include="tribus\cuda_echo512_final.cu">
|
||||
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="tribus\tribus.cu" />
|
||||
<ClInclude Include="x11\cuda_x11_aes.cuh" />
|
||||
<CudaCompile Include="x11\cuda_x11_cubehash512.cu" />
|
||||
<CudaCompile Include="x11\cuda_x11_echo.cu">
|
||||
|
@ -109,6 +109,9 @@
|
||||
<Filter Include="Source Files\equi">
|
||||
<UniqueIdentifier>{031afae7-2a78-4e32-9738-4b589b6f7ff3}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="Source Files\CUDA\tribus">
|
||||
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="compat\jansson\dump.c">
|
||||
@ -766,9 +769,15 @@
|
||||
<CudaCompile Include="skunk\cuda_skunk_streebog.cu">
|
||||
<Filter>Source Files\CUDA\skunk</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="tribus.cu">
|
||||
<Filter>Source Files\CUDA</Filter>
|
||||
<CudaCompile Include="tribus\tribus.cu">
|
||||
<Filter>Source Files\CUDA\tribus</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="tribus\cuda_echo512_final.cu">
|
||||
<Filter>Source Files\CUDA\tribus</Filter>
|
||||
</CudaCompile>
|
||||
<ClInclude Include="tribus\cuda_echo512_aes.cuh">
|
||||
<Filter>Source Files\CUDA\tribus</Filter>
|
||||
</ClInclude>
|
||||
<CudaCompile Include="x11\sib.cu">
|
||||
<Filter>Source Files\CUDA\x11</Filter>
|
||||
</CudaCompile>
|
||||
|
@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
|
||||
//
|
||||
|
||||
VS_VERSION_INFO VERSIONINFO
|
||||
FILEVERSION 2,2,0,0
|
||||
PRODUCTVERSION 2,2,0,0
|
||||
FILEVERSION 2,2,1,0
|
||||
PRODUCTVERSION 2,2,1,0
|
||||
FILEFLAGSMASK 0x3fL
|
||||
#ifdef _DEBUG
|
||||
FILEFLAGS 0x21L
|
||||
@ -76,10 +76,10 @@ BEGIN
|
||||
BEGIN
|
||||
BLOCK "040904e4"
|
||||
BEGIN
|
||||
VALUE "FileVersion", "2.2"
|
||||
VALUE "FileVersion", "2.2.1"
|
||||
VALUE "LegalCopyright", "Copyright (C) 2017"
|
||||
VALUE "ProductName", "ccminer"
|
||||
VALUE "ProductVersion", "2.2"
|
||||
VALUE "ProductVersion", "2.2.1"
|
||||
END
|
||||
END
|
||||
BLOCK "VarFileInfo"
|
||||
|
318
tribus/cuda_echo512_aes.cuh
Normal file
318
tribus/cuda_echo512_aes.cuh
Normal file
@ -0,0 +1,318 @@
|
||||
#ifdef __INTELLISENSE__
|
||||
#define __byte_perm(x, y, b) x
|
||||
#define __CUDA_ARCH__ 520
|
||||
#include <cuda_helper.h>
|
||||
#endif
|
||||
|
||||
#undef ROL8
|
||||
#undef ROR8
|
||||
#undef ROL16
|
||||
|
||||
#ifdef __CUDA_ARCH__
// Device compilation: the byte-aligned 32-bit rotations are expressed as a
// single byte permutation. Each selector nibble picks the source byte for one
// destination byte (lowest nibble = destination byte 0).

// Rotate left by 8 bits: destination bytes 0..3 take source bytes 3,0,1,2.
__device__ __forceinline__
uint32_t ROL8(const uint32_t a) {
	const uint32_t rotated = __byte_perm(a, 0, 0x2103);
	return rotated;
}

// Rotate right by 8 bits: destination bytes 0..3 take source bytes 1,2,3,0.
__device__ __forceinline__
uint32_t ROR8(const uint32_t a) {
	const uint32_t rotated = __byte_perm(a, 0, 0x0321);
	return rotated;
}

// Rotate by 16 bits (swap half-words): destination bytes 0..3 take source bytes 2,3,0,1.
__device__ __forceinline__
uint32_t ROL16(const uint32_t a) {
	const uint32_t rotated = __byte_perm(a, 0, 0x1032);
	return rotated;
}
#else
// Non-device compilation pass: fall back to the generic rotate macros.
#define ROL8(u) ROTL32(u, 8)
#define ROR8(u) ROTR32(u, 8)
#define ROL16(u) ROTL32(u,16)
#endif
|
||||
|
||||
__device__ uint32_t d_AES0[256] = {
|
||||
0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591,
|
||||
0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC,
|
||||
0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB,
|
||||
0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B,
|
||||
0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83,
|
||||
0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A,
|
||||
0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F,
|
||||
0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA,
|
||||
0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B,
|
||||
0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413,
|
||||
0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6,
|
||||
0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85,
|
||||
0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511,
|
||||
0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B,
|
||||
0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1,
|
||||
0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF,
|
||||
0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E,
|
||||
0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6,
|
||||
0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B,
|
||||
0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD,
|
||||
0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8,
|
||||
0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2,
|
||||
0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949,
|
||||
0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810,
|
||||
0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697,
|
||||
0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F,
|
||||
0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C,
|
||||
0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27,
|
||||
0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433,
|
||||
0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5,
|
||||
0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0,
|
||||
0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C
|
||||
};
|
||||
|
||||
__device__ uint32_t d_AES3[256] = {
|
||||
0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5,
|
||||
0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676,
|
||||
0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0,
|
||||
0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0,
|
||||
0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC,
|
||||
0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515,
|
||||
0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A,
|
||||
0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575,
|
||||
0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0,
|
||||
0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484,
|
||||
0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B,
|
||||
0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF,
|
||||
0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585,
|
||||
0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8,
|
||||
0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5,
|
||||
0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2,
|
||||
0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717,
|
||||
0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373,
|
||||
0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888,
|
||||
0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB,
|
||||
0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C,
|
||||
0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979,
|
||||
0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9,
|
||||
0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808,
|
||||
0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6,
|
||||
0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A,
|
||||
0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E,
|
||||
0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E,
|
||||
0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494,
|
||||
0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF,
|
||||
0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868,
|
||||
0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616
|
||||
};
|
||||
|
||||
/**
 * Fill the four 256-entry shared lookup tables from d_AES0, for blocks that
 * may have more than 256 threads: only threads 0..255 take part, each one
 * writing a single entry of every table.
 *
 *   sharedMemory[0] = d_AES0
 *   sharedMemory[1] = d_AES0 rotated left by 8
 *   sharedMemory[2] = d_AES0 rotated by 16
 *   sharedMemory[3] = d_AES0 rotated right by 8
 *
 * No barrier is issued here; the caller has to synchronise the block before
 * any thread reads entries written by other threads.
 */
__device__ __forceinline__
void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256])
{
	const uint32_t slot = threadIdx.x;

	// threads beyond the table size have nothing to write
	if (slot >= 256)
		return;

	const uint32_t base = __ldg(&d_AES0[slot]);
	sharedMemory[0][slot] = base;
	sharedMemory[1][slot] = ROL8(base);
	sharedMemory[2][slot] = ROL16(base);
	sharedMemory[3][slot] = ROR8(base);
}
|
||||
|
||||
/**
 * Fill the four 256-entry shared lookup tables from d_AES0, one entry per
 * thread. threadIdx.x is used directly as the table index without a range
 * check, so the block must not have more than 256 threads (and needs all 256
 * to fill the tables completely).
 *
 *   sharedMemory[0] = d_AES0
 *   sharedMemory[1] = d_AES0 rotated left by 8
 *   sharedMemory[2] = d_AES0 rotated by 16
 *   sharedMemory[3] = d_AES0 rotated right by 8
 *
 * No barrier is issued here; the caller has to synchronise the block before
 * any thread reads entries written by other threads.
 */
__device__ __forceinline__
void aes_gpu_init256(uint32_t sharedMemory[4][256])
{
	const uint32_t slot = threadIdx.x;
	const uint32_t base = __ldg(&d_AES0[slot]);

	sharedMemory[0][slot] = base;
	sharedMemory[1][slot] = ROL8(base);
	sharedMemory[2][slot] = ROL16(base);
	sharedMemory[3][slot] = ROR8(base);
}
|
||||
|
||||
/**
 * Fill the four 256-entry shared lookup tables from d_AES0, two consecutive
 * entries per thread (one 64-bit load). threadIdx.x is not range checked, so
 * the block must not have more than 128 threads (and needs all 128 to fill
 * the tables completely).
 *
 * Table r holds d_AES0 rotated: [0] as is, [1] left by 8, [2] by 16,
 * [3] right by 8. No barrier is issued here.
 */
__device__ __forceinline__
void aes_gpu_init128(uint32_t sharedMemory[4][256])
{
	const uint32_t lo = (threadIdx.x << 1) + 0;
	const uint32_t hi = (threadIdx.x << 1) + 1;

	// one 8-byte read covers entries lo and hi
	const uint2 pair = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);

	sharedMemory[0][lo] = pair.x;
	sharedMemory[0][hi] = pair.y;

	sharedMemory[1][lo] = ROL8(pair.x);
	sharedMemory[1][hi] = ROL8(pair.y);

	sharedMemory[2][lo] = ROL16(pair.x);
	sharedMemory[2][hi] = ROL16(pair.y);

	sharedMemory[3][lo] = ROR8(pair.x);
	sharedMemory[3][hi] = ROR8(pair.y);
}
|
||||
|
||||
/**
 * Fill the four 256-entry shared lookup tables from d_AES0 using only the
 * first 128 threads of the block, two consecutive entries per thread.
 * Threads with threadIdx.x >= 128 do nothing.
 *
 * Table r holds d_AES0 rotated: [0] as is, [1] left by 8, [2] by 16,
 * [3] right by 8. No barrier is issued here.
 */
__device__ __forceinline__
void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256])
{
	if (threadIdx.x >= 128)
		return;

	const uint32_t lo = (threadIdx.x << 1) + 0;
	const uint32_t hi = (threadIdx.x << 1) + 1;

	// one 8-byte read covers entries lo and hi
	const uint2 pair = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);

	sharedMemory[0][lo] = pair.x;
	sharedMemory[0][hi] = pair.y;

	sharedMemory[1][lo] = ROL8(pair.x);
	sharedMemory[1][hi] = ROL8(pair.y);

	sharedMemory[2][lo] = ROL16(pair.x);
	sharedMemory[2][hi] = ROL16(pair.y);

	sharedMemory[3][lo] = ROR8(pair.x);
	sharedMemory[3][hi] = ROR8(pair.y);
}
|
||||
|
||||
/**
 * One table-driven AES round with a round key word.
 *
 * Each output word is the XOR of four table lookups, one per input word,
 * indexed by byte 0..3 of that input (__byte_perm selectors 0x4440..0x4443
 * isolate a single byte). k0 is XORed into y0 only.
 *
 * Lookups are split between the shared tables and read-only global loads of
 * d_AES0 / d_AES3. With INTENSIVE_GMF defined, the lookup for the top byte
 * of x1 also goes to global memory instead of sharedMemory[3].
 *
 * @param sharedMemory  tables as filled by the aes_gpu_init* helpers
 * @param x0..x3        input state words (taken by value)
 * @param k0            key word
 * @param y0..y3        output state words
 */
__device__ __forceinline__
static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
	const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
	// top byte of x1: memory source is selected at compile time
#ifdef INTENSIVE_GMF
	const uint32_t x1_top = __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
#else
	const uint32_t x1_top = sharedMemory[3][__byte_perm(x1, 0, 0x4443)];
#endif

	y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)])
	   ^ sharedMemory[1][__byte_perm(x1, 0, 0x4441)]
	   ^ k0
	   ^ sharedMemory[2][__byte_perm(x2, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);

	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)])
	   ^ sharedMemory[0][__byte_perm(x1, 0, 0x4440)]
	   ^ sharedMemory[1][__byte_perm(x2, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x3, 0, 0x4442)];

	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]
	   ^ x1_top
	   ^ __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)])
	   ^ sharedMemory[1][__byte_perm(x3, 0, 0x4441)];

	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x1, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)])
	   ^ sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
}
|
||||
|
||||
/**
 * One table-driven AES round with a round key word, variant that sends more
 * lookups through read-only global loads (__ldg) than aes_round does.
 *
 * Each output word is the XOR of four table lookups, one per input word,
 * indexed by byte 0..3 of that input; k0 is XORed into y0 only.
 *
 * @param sharedMemory  tables as filled by the aes_gpu_init* helpers
 * @param x0..x3        input state words (taken by value)
 * @param k0            key word
 * @param y0..y3        output state words
 */
__device__ __forceinline__
static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
	const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
	y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)])
	   ^ sharedMemory[1][__byte_perm(x1, 0, 0x4441)]
	   ^ k0
	   ^ sharedMemory[2][__byte_perm(x2, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);

	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)])
	   ^ sharedMemory[0][__byte_perm(x1, 0, 0x4440)]
	   ^ sharedMemory[1][__byte_perm(x2, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x3, 0, 0x4442)];

	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)])
	   ^ __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)])
	   ^ sharedMemory[1][__byte_perm(x3, 0, 0x4441)];

	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x1, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)])
	   ^ __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]);
}
|
||||
|
||||
/**
 * One table-driven AES round without a key word.
 *
 * Each output word is the XOR of four table lookups, one per input word,
 * indexed by byte 0..3 of that input (__byte_perm selectors 0x4440..0x4443
 * isolate a single byte).
 *
 * All top-byte lookups use read-only global loads of d_AES3. With
 * INTENSIVE_GMF defined, the lookup for byte 0 of x1 is also a global load
 * (d_AES0) instead of sharedMemory[0].
 *
 * @param sharedMemory  tables as filled by the aes_gpu_init* helpers
 * @param x0..x3        input state words (taken by value)
 * @param y0..y3        output state words
 */
__device__ __forceinline__
static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
	uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
	// byte 0 of x1: memory source is selected at compile time
#ifdef INTENSIVE_GMF
	const uint32_t x1_low = __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]);
#else
	const uint32_t x1_low = sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
#endif

	y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]
	   ^ sharedMemory[1][__byte_perm(x1, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x2, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);

	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)])
	   ^ x1_low
	   ^ sharedMemory[1][__byte_perm(x2, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x3, 0, 0x4442)];

	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)])
	   ^ sharedMemory[0][__byte_perm(x2, 0, 0x4440)]
	   ^ sharedMemory[1][__byte_perm(x3, 0, 0x4441)];

	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x1, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)])
	   ^ sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
}
|
||||
|
||||
/**
 * One table-driven AES round without a key word, variant that always reads
 * the byte-0 lookup of x1 through a read-only global load of d_AES0.
 *
 * Each output word is the XOR of four table lookups, one per input word,
 * indexed by byte 0..3 of that input.
 *
 * @param sharedMemory  tables as filled by the aes_gpu_init* helpers
 * @param x0..x3        input state words (taken by value)
 * @param y0..y3        output state words
 */
__device__ __forceinline__
static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
	uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
	y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]
	   ^ sharedMemory[1][__byte_perm(x1, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x2, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);

	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)])
	   ^ __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)])
	   ^ sharedMemory[1][__byte_perm(x2, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x3, 0, 0x4442)];

	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)])
	   ^ sharedMemory[0][__byte_perm(x2, 0, 0x4440)]
	   ^ sharedMemory[1][__byte_perm(x3, 0, 0x4441)];

	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]
	   ^ sharedMemory[2][__byte_perm(x1, 0, 0x4442)]
	   ^ __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)])
	   ^ sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
}
|
||||
|
||||
/**
 * Apply two AES rounds in place to the state x0..x3: a keyed round using k0,
 * followed by a keyless round. k0 is incremented by one afterwards so the
 * next call uses the next counter value.
 */
__device__ __forceinline__
static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
{
	uint32_t t0, t1, t2, t3;

	// round 1: keyed, state -> temporaries
	aes_round(sharedMemory, x0, x1, x2, x3, k0, t0, t1, t2, t3);

	// round 2: keyless, temporaries -> state
	aes_round(sharedMemory, t0, t1, t2, t3, x0, x1, x2, x3);

	// we will need a carry here (or maybe not)
	++k0;
}
|
||||
|
||||
/**
 * Same as AES_2ROUND but built on the aes_round_LDG variants: a keyed round
 * using k0, followed by a keyless round, applied in place to x0..x3.
 * k0 is incremented by one afterwards.
 */
__device__ __forceinline__
static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
{
	uint32_t t0, t1, t2, t3;

	// round 1: keyed, state -> temporaries
	aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, t0, t1, t2, t3);

	// round 2: keyless, temporaries -> state
	aes_round_LDG(sharedMemory, t0, t1, t2, t3, x0, x1, x2, x3);

	// we will need a carry here (or maybe not)
	++k0;
}
|
||||
|
||||
/**
 * Apply one keyless AES round in place to the four words of *x.
 */
__device__ __forceinline__
static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x)
{
	uint4 out;

	// inputs are passed by value, results land in a separate vector
	aes_round(sharedMemory, x->x, x->y, x->z, x->w, out.x, out.y, out.z, out.w);

	x->x = out.x;
	x->y = out.y;
	x->z = out.z;
	x->w = out.w;
}
|
||||
|
||||
/**
 * Key-expansion element: run one keyless AES round over k[0..3] and store
 * the result rotated by one word, i.e. k[0..3] = (r1, r2, r3, r0) where
 * r0..r3 are the round outputs.
 */
__device__ __forceinline__
static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k)
{
	uint32_t r0, r1, r2, r3;

	aes_round(sharedMemory, k[0], k[1], k[2], k[3], r0, r1, r2, r3);

	// word rotation of the round output
	k[3] = r0;
	k[0] = r1;
	k[1] = r2;
	k[2] = r3;
}
|
285
tribus/cuda_echo512_final.cu
Normal file
285
tribus/cuda_echo512_final.cu
Normal file
@ -0,0 +1,285 @@
|
||||
/**
|
||||
* Based on Provos Alexis work - 2016 FOR SM 5+
|
||||
*
|
||||
* final touch by tpruvot for tribus - 09 2017
|
||||
*/
|
||||
#include <cuda_helper.h>
|
||||
#include <cuda_vector_uint2x4.h>
|
||||
#include <cuda_vectors.h>
|
||||
|
||||
#define INTENSIVE_GMF
|
||||
#include "tribus/cuda_echo512_aes.cuh"
|
||||
|
||||
#ifdef __INTELLISENSE__
|
||||
#define __byte_perm(x, y, b) x
|
||||
#define atomicExch(p,y) (*p) = y
|
||||
#endif
|
||||
|
||||
/**
 * Double each of the four bytes packed in v in GF(2^8): the top bit of every
 * byte is split off, the remaining bits are shifted left by one, and 27
 * (0x1b) is folded into each byte whose top bit was set.
 */
__device__ __forceinline__
static uint32_t echo_gf_double(const uint32_t v)
{
	const uint32_t top = (v & 0x80808080);
	return (top >> 7) * 27U ^ ((v ^ top) << 1);
}

/**
 * One full round over the 64-word state W (16 groups of 4 words):
 *   1. Big Sub Words - AES_2ROUND on every 4-word group, consuming and
 *      incrementing k0 once per group;
 *   2. Shift Rows    - fixed permutation of the 4-word groups;
 *   3. Mix Columns   - byte-wise GF(2^8) mixing across groups 4 words apart.
 *
 * @param sharedMemory  AES tables as filled by the aes_gpu_init* helpers
 * @param W             64-word state, updated in place
 * @param k0            running counter, advanced by 16
 */
__device__
static void echo_round(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0)
{
	// Big Sub Words
	#pragma unroll 16
	for (int word = 0; word < 64; word += 4)
		AES_2ROUND(sharedMemory, W[word + 0], W[word + 1], W[word + 2], W[word + 3], k0);

	// Shift Rows
	#pragma unroll 4
	for (int lane = 0; lane < 4; lane++)
	{
		// values that get overwritten before they are consumed
		const uint32_t keep4  = W[lane + 4];
		const uint32_t keep8  = W[lane + 8];
		const uint32_t keep24 = W[lane + 24];
		const uint32_t keep60 = W[lane + 60];

		W[lane + 4]  = W[lane + 20];
		W[lane + 8]  = W[lane + 40];
		W[lane + 24] = W[lane + 56];
		W[lane + 60] = W[lane + 44];

		W[lane + 20] = W[lane + 36];
		W[lane + 40] = keep8;
		W[lane + 56] = keep24;
		W[lane + 44] = W[lane + 28];

		W[lane + 28] = W[lane + 12];
		W[lane + 12] = keep60;
		W[lane + 36] = W[lane + 52];
		W[lane + 52] = keep4;
	}

	// Mix Columns
	#pragma unroll 4
	for (int lane = 0; lane < 4; lane++)
	{
		#pragma unroll 4
		for (int col = 0; col < 64; col += 16)
		{
			const uint32_t a0 = W[col + lane];
			const uint32_t a1 = W[col + lane + 4];
			const uint32_t a2 = W[col + lane + 8];
			const uint32_t a3 = W[col + lane + 12];

			const uint32_t ab = a0 ^ a1;
			const uint32_t bc = a1 ^ a2;
			const uint32_t cd = a2 ^ a3;

			const uint32_t abx = echo_gf_double(ab);
			const uint32_t bcx = echo_gf_double(bc);
			const uint32_t cdx = echo_gf_double(cd);

			W[col + lane]      = bc ^ a3 ^ abx;
			W[col + lane + 4]  = a0 ^ cd ^ bcx;
			W[col + lane + 8]  = ab ^ a3 ^ cdx;
			W[col + lane + 12] = ab ^ a2 ^ (abx ^ bcx ^ cdx);
		}
	}
}
|
||||
|
||||
/**
 * Final Echo-512 pass with an on-GPU target check.
 *
 * Launch layout: 1D grid, exactly 256 threads per block (aes_gpu_init256
 * indexes the shared tables with threadIdx.x and needs every entry filled).
 * Uses 4 KiB of static shared memory for the AES lookup tables.
 *
 * Each thread below `threads` reads its 64-byte input hash from g_hash
 * (8 uint64 per thread), runs the Echo rounds and compares one 64-bit word
 * of the result against `target`.
 *
 * @param threads   number of hashes to process
 * @param g_hash    input hashes, 8 x uint64 per thread
 * @param resNonce  two result slots; slot 0 receives the index of a thread
 *                  that met the target, and the value it replaced is moved
 *                  to slot 1 unless it was UINT32_MAX (so the slots are
 *                  expected to be preset to 0xFFFFFFFF by the host)
 * @param target    64-bit threshold; a candidate is reported when
 *                  check <= target
 */
__global__ __launch_bounds__(256, 3) /* will force 80 registers */
static void tribus_echo512_gpu_final(uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target)
{
	__shared__ uint32_t sharedMemory[4][256];

	aes_gpu_init256(sharedMemory);
	// Every thread filled only its own table slots; wait until the whole
	// block has finished writing before any thread performs lookups.
	// The barrier sits outside the `thread < threads` branch so that all
	// threads of the block reach it.
	__syncthreads();

	// Precomputed state words that do not depend on the input hash.
	const uint32_t P[48] = {
		0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
		//8-12
		0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
		//21-25
		0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
		//34-38
		0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
		0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
		0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
		//58-61
	};
	uint32_t k0;
	uint32_t h[16];

	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t *hash = (uint32_t*)&g_hash[thread<<3];

		// load the 64-byte input hash as two 32-byte vectors
		*(uint2x4*)&h[0] = __ldg4((uint2x4*)&hash[0]);
		*(uint2x4*)&h[8] = __ldg4((uint2x4*)&hash[8]);

		// h[6..7] are saved before h is transformed; they are XORed back
		// into the checked word at the end
		uint64_t backup = *(uint64_t*)&h[6];

		k0 = 512 + 8;

		// first Big Sub Words step on the four input-dependent groups
		#pragma unroll 4
		for (uint32_t idx = 0; idx < 16; idx += 4)
			AES_2ROUND(sharedMemory,h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);

		k0 += 4;

		uint32_t W[64];

		// first Mix Columns step, combining the transformed input words
		// with the precomputed constants in P
		#pragma unroll 4
		for (uint32_t i = 0; i < 4; i++)
		{
			uint32_t a = P[i];
			uint32_t b = P[i + 4];
			uint32_t c = h[i + 8];
			uint32_t d = P[i + 8];

			uint32_t ab = a ^ b;
			uint32_t bc = b ^ c;
			uint32_t cd = c ^ d;

			uint32_t t  = ((a ^ b) & 0x80808080);
			uint32_t t2 = ((b ^ c) & 0x80808080);
			uint32_t t3 = ((c ^ d) & 0x80808080);

			uint32_t abx = ((t  >> 7) * 27U) ^ ((ab^t) << 1);
			uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1);
			uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1);

			W[0 + i] = bc ^ d ^ abx;
			W[4 + i] = a ^ cd ^ bcx;
			W[8 + i] = ab ^ d ^ cdx;
			W[12+ i] = abx ^ bcx ^ cdx ^ ab ^ c;

			a = P[12 + i];
			b = h[i + 4];
			c = P[12 + i + 4];
			d = P[12 + i + 8];

			ab = a ^ b;
			bc = b ^ c;
			cd = c ^ d;

			t  = (ab & 0x80808080);
			t2 = (bc & 0x80808080);
			t3 = (cd & 0x80808080);

			abx = (t  >> 7) * 27U ^ ((ab^t) << 1);
			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);

			W[16 + i] = abx ^ bc ^ d;
			W[16 + i + 4] = bcx ^ a ^ cd;
			W[16 + i + 8] = cdx ^ ab ^ d;
			W[16 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;

			a = h[i];
			b = P[24 + i];
			c = P[24 + i + 4];
			d = P[24 + i + 8];

			ab = a ^ b;
			bc = b ^ c;
			cd = c ^ d;

			t  = (ab & 0x80808080);
			t2 = (bc & 0x80808080);
			t3 = (cd & 0x80808080);

			abx = (t  >> 7) * 27U ^ ((ab^t) << 1);
			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);

			W[32 + i] = abx ^ bc ^ d;
			W[32 + i + 4] = bcx ^ a ^ cd;
			W[32 + i + 8] = cdx ^ ab ^ d;
			W[32 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;

			a = P[36 + i ];
			b = P[36 + i + 4];
			c = P[36 + i + 8];
			d = h[i + 12];

			ab = a ^ b;
			bc = b ^ c;
			cd = c ^ d;

			t  = (ab & 0x80808080);
			t2 = (bc & 0x80808080);
			t3 = (cd & 0x80808080);

			abx = (t  >> 7) * 27U ^ ((ab^t) << 1);
			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);

			W[48 + i] = abx ^ bc ^ d;
			W[48 + i + 4] = bcx ^ a ^ cd;
			W[48 + i + 8] = cdx ^ ab ^ d;
			W[48 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
		}

		// eight full rounds
		for (int k = 1; k < 9; k++)
			echo_round(sharedMemory,W,k0);

		// Big Sub Words
		// Last round, reduced to the groups that feed the checked word.
		// NOTE(review): k0 is not incremented between these groups; only
		// words 2 and 3 of each group are consumed below, which presumably
		// makes the low counter byte irrelevant here - confirm before
		// reusing this sequence elsewhere.
		uint32_t y0, y1, y2, y3;
//		AES_2ROUND(sharedMemory,W[ 0], W[ 1], W[ 2], W[ 3], k0);
		aes_round(sharedMemory, W[ 0], W[ 1], W[ 2], W[ 3], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[ 0], W[ 1], W[ 2], W[ 3]);

		aes_round(sharedMemory, W[ 4], W[ 5], W[ 6], W[ 7], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[ 4], W[ 5], W[ 6], W[ 7]);
		aes_round(sharedMemory, W[ 8], W[ 9], W[10], W[11], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[ 8], W[ 9], W[10], W[11]);

		aes_round(sharedMemory, W[20], W[21], W[22], W[23], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[20], W[21], W[22], W[23]);
		aes_round(sharedMemory, W[28], W[29], W[30], W[31], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[28], W[29], W[30], W[31]);

		aes_round(sharedMemory, W[32], W[33], W[34], W[35], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[32], W[33], W[34], W[35]);
		aes_round(sharedMemory, W[40], W[41], W[42], W[43], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[40], W[41], W[42], W[43]);

		aes_round(sharedMemory, W[52], W[53], W[54], W[55], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[52], W[53], W[54], W[55]);
		aes_round(sharedMemory, W[60], W[61], W[62], W[63], k0, y0, y1, y2, y3);
		aes_round(sharedMemory, y0, y1, y2, y3, W[60], W[61], W[62], W[63]);

		// partial mixing: only the GF(2^8)-doubled terms needed by the check
		uint32_t bc = W[22] ^ W[42];
		uint32_t t2 = (bc & 0x80808080);
		W[ 6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);

		bc = W[23] ^ W[43];
		t2 = (bc & 0x80808080);
		W[ 7] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);

		bc = W[10] ^ W[54];
		t2 = (bc & 0x80808080);
		W[38] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);

		bc = W[11] ^ W[55];
		t2 = (bc & 0x80808080);
		W[39] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);

		// 64-bit word compared against the target
		uint64_t check = backup ^ *(uint64_t*)&W[2] ^ *(uint64_t*)&W[6] ^ *(uint64_t*)&W[10] ^ *(uint64_t*)&W[30]
			^ *(uint64_t*)&W[34] ^ *(uint64_t*)&W[38] ^ *(uint64_t*)&W[42] ^ *(uint64_t*)&W[62];

		if(check <= target){
			// publish this thread's index; keep a previously stored
			// candidate in the second slot
			uint32_t tmp = atomicExch(&resNonce[0], thread);
			if (tmp != UINT32_MAX)
				resNonce[1] = tmp;
		}
	}
}
|
||||
|
||||
/**
 * Host launcher for tribus_echo512_gpu_final.
 *
 * Launches ceil(threads / 256) blocks of 256 threads on the default stream.
 * The call is asynchronous and performs no error checking.
 *
 * @param thr_id      miner thread id (not used by this function)
 * @param threads     number of hashes to process
 * @param d_hash      device buffer with the input hashes
 * @param d_resNonce  device buffer with two uint32 result slots
 * @param target      64-bit target passed through to the kernel
 */
__host__
void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target)
{
	const uint32_t tpb = 256;

	// round up so a partial last block still covers the tail
	const uint32_t blocks = (threads + tpb-1)/tpb;

	const dim3 grid(blocks);
	const dim3 block(tpb);

	tribus_echo512_gpu_final <<<grid, block>>> (threads, (uint64_t*)d_hash, d_resNonce, target);
}
|
@ -1,7 +1,7 @@
|
||||
/**
|
||||
* Tribus Algo for Denarius
|
||||
*
|
||||
* tpruvot@github 06 2017 - GPLv3
|
||||
* tpruvot@github 09 2017 - GPLv3
|
||||
*
|
||||
*/
|
||||
extern "C" {
|
||||
@ -16,9 +16,10 @@ extern "C" {
|
||||
|
||||
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
|
||||
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
|
||||
void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
|
||||
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
static uint32_t *d_resNonce[MAX_GPUS];
|
||||
|
||||
// cpu hash
|
||||
|
||||
@ -46,6 +47,7 @@ extern "C" void tribus_hash(void *state, const void *input)
|
||||
}
|
||||
|
||||
static bool init[MAX_GPUS] = { 0 };
|
||||
static bool use_compat_kernels[MAX_GPUS] = { 0 };
|
||||
|
||||
extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
@ -63,7 +65,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
|
||||
|
||||
if (!init[thr_id])
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
int dev_id = device_map[thr_id];
|
||||
cudaSetDevice(dev_id);
|
||||
if (opt_cudaschedule == -1 && gpu_threads == 1) {
|
||||
cudaDeviceReset();
|
||||
// reduce cpu usage
|
||||
@ -74,10 +77,15 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
|
||||
|
||||
quark_jh512_cpu_init(thr_id, throughput);
|
||||
quark_keccak512_cpu_init(thr_id, throughput);
|
||||
x11_echo512_cpu_init(thr_id, throughput);
|
||||
|
||||
cuda_get_arch(thr_id);
|
||||
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
|
||||
if (use_compat_kernels[thr_id])
|
||||
x11_echo512_cpu_init(thr_id, throughput);
|
||||
|
||||
// char[64] work space for hashes results
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput));
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
|
||||
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
init[thr_id] = true;
|
||||
@ -87,33 +95,43 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
|
||||
jh512_setBlock_80(thr_id, endiandata);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
if (use_compat_kernels[thr_id])
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
else
|
||||
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
|
||||
|
||||
work->valid_nonces = 0;
|
||||
|
||||
do {
|
||||
int order = 1;
|
||||
|
||||
// Hash with CUDA
|
||||
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
|
||||
if (use_compat_kernels[thr_id]) {
|
||||
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||
work->nonces[1] = UINT32_MAX;
|
||||
} else {
|
||||
tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
|
||||
cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||
|
||||
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||
if (work->nonces[0] != UINT32_MAX)
|
||||
{
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t _ALIGN(64) vhash[8];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t startNounce = pdata[19];
|
||||
if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce;
|
||||
be32enc(&endiandata[19], work->nonces[0]);
|
||||
tribus_hash(vhash, endiandata);
|
||||
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
|
||||
work->valid_nonces = 1;
|
||||
work_set_target_ratio(work, vhash);
|
||||
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
|
||||
if (work->nonces[1] != 0) {
|
||||
if (work->nonces[1] != UINT32_MAX) {
|
||||
work->nonces[1] += startNounce;
|
||||
be32enc(&endiandata[19], work->nonces[1]);
|
||||
tribus_hash(vhash, endiandata);
|
||||
bn_set_target_ratio(work, vhash, 1);
|
||||
@ -127,7 +145,7 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
|
||||
else if (vhash[7] > Htarg) {
|
||||
gpu_increment_reject(thr_id);
|
||||
if (!opt_quiet)
|
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
|
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
|
||||
pdata[19] = work->nonces[0] + 1;
|
||||
continue;
|
||||
}
|
||||
@ -144,7 +162,6 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
|
||||
|
||||
out:
|
||||
// *hashes_done = pdata[19] - first_nonce;
|
||||
|
||||
return work->valid_nonces;
|
||||
}
|
||||
|
||||
@ -157,8 +174,8 @@ extern "C" void free_tribus(int thr_id)
|
||||
cudaThreadSynchronize();
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
cudaFree(d_resNonce[thr_id]);
|
||||
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
cuda_check_cpu_free(thr_id);
|
||||
init[thr_id] = false;
|
||||
|
Loading…
x
Reference in New Issue
Block a user