Browse Source

Revert "x11: improve aes (shavite/echo)"

make a lot of cpu validation errors on windows,
to be double checked in the next version...

This reverts commit 1187a6e7e3.
master
Tanguy Pruvot 10 years ago
parent
commit
e21c75793a
  1. 540
      x11/cuda_x11_aes.cu
  2. 37
      x11/cuda_x11_echo.cu
  3. 104
      x11/cuda_x11_shavite512.cu

540
x11/cuda_x11_aes.cu

@ -1,181 +1,371 @@
// change nothing...
/* AES Helper for inline-usage from SPH */
#define AESx(x) (x ##UL) /* SPH_C32(x) */
//#define DEVICE_DIRECT_CONSTANTS //#define DEVICE_DIRECT_CONSTANTS
#ifdef DEVICE_DIRECT_CONSTANTS #ifdef DEVICE_DIRECT_CONSTANTS
__constant__ uint32_t c_AES[1024] = { __constant__ __align__(64) uint32_t d_AES0[256] = {
#else
static const uint32_t h_AES0[256] = {
#endif
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
#ifdef DEVICE_DIRECT_CONSTANTS
__constant__ __align__(64) uint32_t d_AES1[256] = {
#else #else
static const uint32_t h_AES[1024] = { static const uint32_t h_AES1[256] = {
#endif #endif
0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C, AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
// ROL 8 AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
0x6363C6A5, 0x7C7CF884, 0x7777EE99, 0x7B7BF68D, 0xF2F2FF0D, 0x6B6BD6BD, 0x6F6FDEB1, 0xC5C59154, AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
0x30306050, 0x01010203, 0x6767CEA9, 0x2B2B567D, 0xFEFEE719, 0xD7D7B562, 0xABAB4DE6, 0x7676EC9A, AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
0xCACA8F45, 0x82821F9D, 0xC9C98940, 0x7D7DFA87, 0xFAFAEF15, 0x5959B2EB, 0x47478EC9, 0xF0F0FB0B, AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
0xADAD41EC, 0xD4D4B367, 0xA2A25FFD, 0xAFAF45EA, 0x9C9C23BF, 0xA4A453F7, 0x7272E496, 0xC0C09B5B, AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
0xB7B775C2, 0xFDFDE11C, 0x93933DAE, 0x26264C6A, 0x36366C5A, 0x3F3F7E41, 0xF7F7F502, 0xCCCC834F, AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
0x3434685C, 0xA5A551F4, 0xE5E5D134, 0xF1F1F908, 0x7171E293, 0xD8D8AB73, 0x31316253, 0x15152A3F, AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
0x0404080C, 0xC7C79552, 0x23234665, 0xC3C39D5E, 0x18183028, 0x969637A1, 0x05050A0F, 0x9A9A2FB5, AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
0x07070E09, 0x12122436, 0x80801B9B, 0xE2E2DF3D, 0xEBEBCD26, 0x27274E69, 0xB2B27FCD, 0x7575EA9F, AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
0x0909121B, 0x83831D9E, 0x2C2C5874, 0x1A1A342E, 0x1B1B362D, 0x6E6EDCB2, 0x5A5AB4EE, 0xA0A05BFB, AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
0x5252A4F6, 0x3B3B764D, 0xD6D6B761, 0xB3B37DCE, 0x2929527B, 0xE3E3DD3E, 0x2F2F5E71, 0x84841397, AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
0x5353A6F5, 0xD1D1B968, 0x00000000, 0xEDEDC12C, 0x20204060, 0xFCFCE31F, 0xB1B179C8, 0x5B5BB6ED, AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
0x6A6AD4BE, 0xCBCB8D46, 0xBEBE67D9, 0x3939724B, 0x4A4A94DE, 0x4C4C98D4, 0x5858B0E8, 0xCFCF854A, AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
0xD0D0BB6B, 0xEFEFC52A, 0xAAAA4FE5, 0xFBFBED16, 0x434386C5, 0x4D4D9AD7, 0x33336655, 0x85851194, AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
0x45458ACF, 0xF9F9E910, 0x02020406, 0x7F7FFE81, 0x5050A0F0, 0x3C3C7844, 0x9F9F25BA, 0xA8A84BE3, AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
0x5151A2F3, 0xA3A35DFE, 0x404080C0, 0x8F8F058A, 0x92923FAD, 0x9D9D21BC, 0x38387048, 0xF5F5F104, AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
0xBCBC63DF, 0xB6B677C1, 0xDADAAF75, 0x21214263, 0x10102030, 0xFFFFE51A, 0xF3F3FD0E, 0xD2D2BF6D, AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
0xCDCD814C, 0x0C0C1814, 0x13132635, 0xECECC32F, 0x5F5FBEE1, 0x979735A2, 0x444488CC, 0x17172E39, AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
0xC4C49357, 0xA7A755F2, 0x7E7EFC82, 0x3D3D7A47, 0x6464C8AC, 0x5D5DBAE7, 0x1919322B, 0x7373E695, AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
0x6060C0A0, 0x81811998, 0x4F4F9ED1, 0xDCDCA37F, 0x22224466, 0x2A2A547E, 0x90903BAB, 0x88880B83, AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
0x46468CCA, 0xEEEEC729, 0xB8B86BD3, 0x1414283C, 0xDEDEA779, 0x5E5EBCE2, 0x0B0B161D, 0xDBDBAD76, AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
0xE0E0DB3B, 0x32326456, 0x3A3A744E, 0x0A0A141E, 0x494992DB, 0x06060C0A, 0x2424486C, 0x5C5CB8E4, AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
0xC2C29F5D, 0xD3D3BD6E, 0xACAC43EF, 0x6262C4A6, 0x919139A8, 0x959531A4, 0xE4E4D337, 0x7979F28B, AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
0xE7E7D532, 0xC8C88B43, 0x37376E59, 0x6D6DDAB7, 0x8D8D018C, 0xD5D5B164, 0x4E4E9CD2, 0xA9A949E0, AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
0x6C6CD8B4, 0x5656ACFA, 0xF4F4F307, 0xEAEACF25, 0x6565CAAF, 0x7A7AF48E, 0xAEAE47E9, 0x08081018, AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
0xBABA6FD5, 0x7878F088, 0x25254A6F, 0x2E2E5C72, 0x1C1C3824, 0xA6A657F1, 0xB4B473C7, 0xC6C69751, AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
0xE8E8CB23, 0xDDDDA17C, 0x7474E89C, 0x1F1F3E21, 0x4B4B96DD, 0xBDBD61DC, 0x8B8B0D86, 0x8A8A0F85, AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
0x7070E090, 0x3E3E7C42, 0xB5B571C4, 0x6666CCAA, 0x484890D8, 0x03030605, 0xF6F6F701, 0x0E0E1C12, AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
0x6161C2A3, 0x35356A5F, 0x5757AEF9, 0xB9B969D0, 0x86861791, 0xC1C19958, 0x1D1D3A27, 0x9E9E27B9, AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
0xE1E1D938, 0xF8F8EB13, 0x98982BB3, 0x11112233, 0x6969D2BB, 0xD9D9A970, 0x8E8E0789, 0x949433A7, AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
0x9B9B2DB6, 0x1E1E3C22, 0x87871592, 0xE9E9C920, 0xCECE8749, 0x5555AAFF, 0x28285078, 0xDFDFA57A, AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
0x8C8C038F, 0xA1A159F8, 0x89890980, 0x0D0D1A17, 0xBFBF65DA, 0xE6E6D731, 0x424284C6, 0x6868D0B8, AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
0x414182C3, 0x999929B0, 0x2D2D5A77, 0x0F0F1E11, 0xB0B07BCB, 0x5454A8FC, 0xBBBB6DD6, 0x16162C3A, };
// ROL 16
0x63C6A563, 0x7CF8847C, 0x77EE9977, 0x7BF68D7B, 0xF2FF0DF2, 0x6BD6BD6B, 0x6FDEB16F, 0xC59154C5, #ifdef DEVICE_DIRECT_CONSTANTS
0x30605030, 0x01020301, 0x67CEA967, 0x2B567D2B, 0xFEE719FE, 0xD7B562D7, 0xAB4DE6AB, 0x76EC9A76, __constant__ __align__(64) uint32_t d_AES2[256] = {
0xCA8F45CA, 0x821F9D82, 0xC98940C9, 0x7DFA877D, 0xFAEF15FA, 0x59B2EB59, 0x478EC947, 0xF0FB0BF0, #else
0xAD41ECAD, 0xD4B367D4, 0xA25FFDA2, 0xAF45EAAF, 0x9C23BF9C, 0xA453F7A4, 0x72E49672, 0xC09B5BC0, static const uint32_t h_AES2[256] = {
0xB775C2B7, 0xFDE11CFD, 0x933DAE93, 0x264C6A26, 0x366C5A36, 0x3F7E413F, 0xF7F502F7, 0xCC834FCC, #endif
0x34685C34, 0xA551F4A5, 0xE5D134E5, 0xF1F908F1, 0x71E29371, 0xD8AB73D8, 0x31625331, 0x152A3F15, AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
0x04080C04, 0xC79552C7, 0x23466523, 0xC39D5EC3, 0x18302818, 0x9637A196, 0x050A0F05, 0x9A2FB59A, AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
0x070E0907, 0x12243612, 0x801B9B80, 0xE2DF3DE2, 0xEBCD26EB, 0x274E6927, 0xB27FCDB2, 0x75EA9F75, AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
0x09121B09, 0x831D9E83, 0x2C58742C, 0x1A342E1A, 0x1B362D1B, 0x6EDCB26E, 0x5AB4EE5A, 0xA05BFBA0, AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
0x52A4F652, 0x3B764D3B, 0xD6B761D6, 0xB37DCEB3, 0x29527B29, 0xE3DD3EE3, 0x2F5E712F, 0x84139784, AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
0x53A6F553, 0xD1B968D1, 0x00000000, 0xEDC12CED, 0x20406020, 0xFCE31FFC, 0xB179C8B1, 0x5BB6ED5B, AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
0x6AD4BE6A, 0xCB8D46CB, 0xBE67D9BE, 0x39724B39, 0x4A94DE4A, 0x4C98D44C, 0x58B0E858, 0xCF854ACF, AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
0xD0BB6BD0, 0xEFC52AEF, 0xAA4FE5AA, 0xFBED16FB, 0x4386C543, 0x4D9AD74D, 0x33665533, 0x85119485, AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
0x458ACF45, 0xF9E910F9, 0x02040602, 0x7FFE817F, 0x50A0F050, 0x3C78443C, 0x9F25BA9F, 0xA84BE3A8, AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
0x51A2F351, 0xA35DFEA3, 0x4080C040, 0x8F058A8F, 0x923FAD92, 0x9D21BC9D, 0x38704838, 0xF5F104F5, AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
0xBC63DFBC, 0xB677C1B6, 0xDAAF75DA, 0x21426321, 0x10203010, 0xFFE51AFF, 0xF3FD0EF3, 0xD2BF6DD2, AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
0xCD814CCD, 0x0C18140C, 0x13263513, 0xECC32FEC, 0x5FBEE15F, 0x9735A297, 0x4488CC44, 0x172E3917, AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
0xC49357C4, 0xA755F2A7, 0x7EFC827E, 0x3D7A473D, 0x64C8AC64, 0x5DBAE75D, 0x19322B19, 0x73E69573, AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
0x60C0A060, 0x81199881, 0x4F9ED14F, 0xDCA37FDC, 0x22446622, 0x2A547E2A, 0x903BAB90, 0x880B8388, AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
0x468CCA46, 0xEEC729EE, 0xB86BD3B8, 0x14283C14, 0xDEA779DE, 0x5EBCE25E, 0x0B161D0B, 0xDBAD76DB, AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
0xE0DB3BE0, 0x32645632, 0x3A744E3A, 0x0A141E0A, 0x4992DB49, 0x060C0A06, 0x24486C24, 0x5CB8E45C, AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
0xC29F5DC2, 0xD3BD6ED3, 0xAC43EFAC, 0x62C4A662, 0x9139A891, 0x9531A495, 0xE4D337E4, 0x79F28B79, AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
0xE7D532E7, 0xC88B43C8, 0x376E5937, 0x6DDAB76D, 0x8D018C8D, 0xD5B164D5, 0x4E9CD24E, 0xA949E0A9, AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
0x6CD8B46C, 0x56ACFA56, 0xF4F307F4, 0xEACF25EA, 0x65CAAF65, 0x7AF48E7A, 0xAE47E9AE, 0x08101808, AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
0xBA6FD5BA, 0x78F08878, 0x254A6F25, 0x2E5C722E, 0x1C38241C, 0xA657F1A6, 0xB473C7B4, 0xC69751C6, AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
0xE8CB23E8, 0xDDA17CDD, 0x74E89C74, 0x1F3E211F, 0x4B96DD4B, 0xBD61DCBD, 0x8B0D868B, 0x8A0F858A, AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
0x70E09070, 0x3E7C423E, 0xB571C4B5, 0x66CCAA66, 0x4890D848, 0x03060503, 0xF6F701F6, 0x0E1C120E, AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
0x61C2A361, 0x356A5F35, 0x57AEF957, 0xB969D0B9, 0x86179186, 0xC19958C1, 0x1D3A271D, 0x9E27B99E, AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
0xE1D938E1, 0xF8EB13F8, 0x982BB398, 0x11223311, 0x69D2BB69, 0xD9A970D9, 0x8E07898E, 0x9433A794, AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
0x9B2DB69B, 0x1E3C221E, 0x87159287, 0xE9C920E9, 0xCE8749CE, 0x55AAFF55, 0x28507828, 0xDFA57ADF, AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
0x8C038F8C, 0xA159F8A1, 0x89098089, 0x0D1A170D, 0xBF65DABF, 0xE6D731E6, 0x4284C642, 0x68D0B868, AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
0x4182C341, 0x9929B099, 0x2D5A772D, 0x0F1E110F, 0xB07BCBB0, 0x54A8FC54, 0xBB6DD6BB, 0x162C3A16, AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
// ROL 24 AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
#ifdef DEVICE_DIRECT_CONSTANTS
__constant__ __align__(64) uint32_t d_AES3[256] = {
#else
static const uint32_t h_AES3[256] = {
#endif
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
}; };
#ifndef DEVICE_DIRECT_CONSTANTS #ifndef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t c_AES[1024]; static __constant__ __align__(64) uint32_t d_AES0[256];
static __constant__ __align__(64) uint32_t d_AES1[256];
static __constant__ __align__(64) uint32_t d_AES2[256];
static __constant__ __align__(64) uint32_t d_AES3[256];
static void aes_cpu_init(int thr_id)
{
CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES0,
h_AES0,
sizeof(h_AES0),
0, cudaMemcpyHostToDevice));
static void aes_cpu_init(int thr_id) { CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES1,
CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_AES, h_AES, sizeof(h_AES), 0, cudaMemcpyHostToDevice)); h_AES1,
sizeof(h_AES1),
0, cudaMemcpyHostToDevice));
CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES2,
h_AES2,
sizeof(h_AES2),
0, cudaMemcpyHostToDevice));
CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES3,
h_AES3,
sizeof(h_AES3),
0, cudaMemcpyHostToDevice));
} }
#else #else
static void aes_cpu_init(int thr_id) {} static void aes_cpu_init(int thr_id) {}
#endif #endif
__device__ __forceinline__
void aes_gpu_init(uint32_t *sharedMemory)
{
/* each thread startup will fill a uint32 */
if (threadIdx.x < 256) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x];
sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x];
}
}
/* tried with 3 xor.b32 asm, not faster */
#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d));
__device__ __device__
static void aes_round( static void aes_round(
const uint32_t *sharedMemory, const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{ {
y0 = sharedMemory[x0 & 0xFF] y0 = xor4_32(
^ sharedMemory[__byte_perm(x1, 0x1, 0x5541)] sharedMemory[__byte_perm(x0, 0, 0x4440)],
^ sharedMemory[__byte_perm(x2, 0x2, 0x5542)] sharedMemory[__byte_perm(x1, 0, 0x4441) + 256],
^ sharedMemory[__byte_perm(x3, 0x3, 0x5543)]; sharedMemory[__byte_perm(x2, 0, 0x4442) + 512],
sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]);
y1 = sharedMemory[x1 & 0xFF] y1 = xor4_32(
^ sharedMemory[__byte_perm(x2, 0x1, 0x5541)] sharedMemory[__byte_perm(x1, 0, 0x4440)],
^ sharedMemory[__byte_perm(x3, 0x2, 0x5542)] sharedMemory[__byte_perm(x2, 0, 0x4441) + 256],
^ sharedMemory[__byte_perm(x0, 0x3, 0x5543)]; sharedMemory[__byte_perm(x3, 0, 0x4442) + 512],
sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]);
y2 = sharedMemory[x2 & 0xFF] y2 = xor4_32(
^ sharedMemory[__byte_perm(x3, 0x1, 0x5541)] sharedMemory[__byte_perm(x2, 0, 0x4440)],
^ sharedMemory[__byte_perm(x0, 0x2, 0x5542)] sharedMemory[__byte_perm(x3, 0, 0x4441) + 256],
^ sharedMemory[__byte_perm(x1, 0x3, 0x5543)]; sharedMemory[__byte_perm(x0, 0, 0x4442) + 512],
sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2
y0 ^= k0; y0 ^= k0;
y3 = sharedMemory[x3 & 0xFF] y3 = xor4_32(
^ sharedMemory[__byte_perm(x0, 0x1, 0x5541)] sharedMemory[__byte_perm(x3, 0, 0x4440)],
^ sharedMemory[__byte_perm(x1, 0x2, 0x5542)] sharedMemory[__byte_perm(x0, 0, 0x4441) + 256],
^ sharedMemory[__byte_perm(x2, 0x3, 0x5543)]; sharedMemory[__byte_perm(x1, 0, 0x4442) + 512],
sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3
} }
__device__ __device__
@ -184,23 +374,27 @@ static void aes_round(
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{ {
y0 = sharedMemory[x0 & 0xFF] y0 = xor4_32(
^ sharedMemory[__byte_perm(x1, 0x1, 0x5541)] sharedMemory[__byte_perm(x0, 0, 0x4440)],
^ sharedMemory[__byte_perm(x2, 0x2, 0x5542)] sharedMemory[__byte_perm(x1, 0, 0x4441) + 256],
^ sharedMemory[__byte_perm(x3, 0x3, 0x5543)]; sharedMemory[__byte_perm(x2, 0, 0x4442) + 512],
sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]);
y1 = sharedMemory[x1 & 0xFF]
^ sharedMemory[__byte_perm(x2, 0x1, 0x5541)] y1 = xor4_32(
^ sharedMemory[__byte_perm(x3, 0x2, 0x5542)] sharedMemory[__byte_perm(x1, 0, 0x4440)],
^ sharedMemory[__byte_perm(x0, 0x3, 0x5543)]; sharedMemory[__byte_perm(x2, 0, 0x4441) + 256],
sharedMemory[__byte_perm(x3, 0, 0x4442) + 512],
y2 = sharedMemory[x2 & 0xFF] sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]);
^ sharedMemory[__byte_perm(x3, 0x1, 0x5541)]
^ sharedMemory[__byte_perm(x0, 0x2, 0x5542)] y2 = xor4_32(
^ sharedMemory[__byte_perm(x1, 0x3, 0x5543)]; sharedMemory[__byte_perm(x2, 0, 0x4440)],
sharedMemory[__byte_perm(x3, 0, 0x4441) + 256],
y3 = sharedMemory[x3 & 0xFF] sharedMemory[__byte_perm(x0, 0, 0x4442) + 512],
^ sharedMemory[__byte_perm(x0, 0x1, 0x5541)] sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2
^ sharedMemory[__byte_perm(x1, 0x2, 0x5542)]
^ sharedMemory[__byte_perm(x2, 0x3, 0x5543)]; y3 = xor4_32(
} sharedMemory[__byte_perm(x3, 0, 0x4440)],
sharedMemory[__byte_perm(x0, 0, 0x4441) + 256],
sharedMemory[__byte_perm(x1, 0, 0x4442) + 512],
sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3
}

37
x11/cuda_x11_echo.cu

@ -261,18 +261,38 @@ void cuda_echo_round(
hash[i] ^= W[i]; hash[i] ^= W[i];
} }
__global__ __launch_bounds__(128, 6) /* 128,7 force 72 registers */ __device__ __forceinline__
void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
{
/* each thread startup will fill a uint32 */
if (threadIdx.x < 128) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
}
}
__global__ __launch_bounds__(128, 7) /* will force 72 registers */
void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
__shared__ uint32_t sharedMemory[1024];
echo_gpu_init(sharedMemory);
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// fill shared mem uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
__shared__ uint32_t sharedMemory[1024];
const uint16_t idx = thread & 0x3FF; // % 1024 int hashPosition = nounce - startNounce;
sharedMemory[idx] = c_AES[idx]; uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
uint32_t *Hash = &g_hash[thread<<4];
cuda_echo_round(sharedMemory, Hash); cuda_echo_round(sharedMemory, Hash);
} }
} }
@ -287,11 +307,10 @@ __host__
void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const uint32_t threadsperblock = 128; const uint32_t threadsperblock = 128;
threads = max(threads, 1024U);
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
x11_echo512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_hash); x11_echo512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

104
x11/cuda_x11_shavite512.cu

@ -1322,25 +1322,56 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, c
state[0xF] ^= p7; state[0xF] ^= p7;
} }
__device__ __forceinline__
void shavite_gpu_init(uint32_t *sharedMemory)
{
/* each thread startup will fill a uint32 */
if (threadIdx.x < 128) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
}
}
// GPU Hash // GPU Hash
__global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */ __global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */
void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNonce, uint32_t *g_hash) void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
__shared__ uint32_t sharedMemory[1024];
shavite_gpu_init(sharedMemory);
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t *Hash = &g_hash[thread * 16U]; uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
// fill shared mem (aes constants) // kopiere init-state
__shared__ uint32_t sharedMemory[1024]; uint32_t state[16] = {
thread &= 0x3ff; sharedMemory[thread] = c_AES[thread]; SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A)
};
// nachricht laden
uint32_t msg[32]; uint32_t msg[32];
// fülle die Nachricht mit 64-byte (vorheriger Hash)
#pragma unroll 16 #pragma unroll 16
for(int i=0;i<16;i++) for(int i=0;i<16;i++)
msg[i] = Hash[i]; msg[i] = Hash[i];
// Nachrichtenende
msg[16] = 0x80; msg[16] = 0x80;
#pragma unroll 10 #pragma unroll 10
for(int i=17;i<27;i++) for(int i=17;i<27;i++)
@ -1352,14 +1383,6 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNonce, uint32_t
msg[30] = 0; msg[30] = 0;
msg[31] = 0x02000000; msg[31] = 0x02000000;
// init-state
uint32_t state[16] = {
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
c512(sharedMemory, state, msg, 512); c512(sharedMemory, state, msg, 512);
#pragma unroll 16 #pragma unroll 16
@ -1368,18 +1391,25 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNonce, uint32_t
} }
} }
__global__ __launch_bounds__(TPB, 5) __global__ __launch_bounds__(TPB, 7)
void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint32_t *outputHash) void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
{ {
__shared__ uint32_t sharedMemory[1024];
shavite_gpu_init(sharedMemory);
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
const uint32_t nonce = startNonce + thread; const uint32_t nounce = startNounce + thread;
uint32_t *outHash = &outputHash[thread * 16U];
// fill shared mem (aes constants) // kopiere init-state
__shared__ uint32_t sharedMemory[1024]; uint32_t state[16] = {
thread &= 0x3ff; sharedMemory[thread] = c_AES[thread]; SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A)
};
uint32_t msg[32]; uint32_t msg[32];
@ -1387,34 +1417,31 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint32_t
for(int i=0;i<32;i++) { for(int i=0;i<32;i++) {
msg[i] = c_PaddedMessage80[i]; msg[i] = c_PaddedMessage80[i];
} }
msg[19] = cuda_swab32(nonce); msg[19] = cuda_swab32(nounce);
msg[20] = 0x80;
// init-state msg[27] = 0x2800000;
uint32_t state[16] = { msg[31] = 0x2000000;
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
c512(sharedMemory, state, msg, 640); c512(sharedMemory, state, msg, 640);
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 16 #pragma unroll 16
for(int i=0;i<16;i++) for(int i=0;i<16;i++)
outHash[i] = state[i]; outHash[i] = state[i];
}
} //thread < threads
} }
__host__ __host__
void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const uint32_t threadsperblock = TPB; const uint32_t threadsperblock = TPB;
threads = max(threads, 1024U);
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_hash); x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
//MyStreamSynchronize(NULL, order, thr_id); //MyStreamSynchronize(NULL, order, thr_id);
} }
@ -1422,7 +1449,6 @@ __host__
void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{ {
const uint32_t threadsperblock = TPB; const uint32_t threadsperblock = TPB;
threads = max(threads, 1024U);
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -1439,13 +1465,11 @@ void x11_shavite512_cpu_init(int thr_id, uint32_t threads)
__host__ __host__
void x11_shavite512_setBlock_80(void *pdata) void x11_shavite512_setBlock_80(void *pdata)
{ {
uint32_t PaddedMessage[32]; // Message mit Padding bereitstellen
// lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen.
unsigned char PaddedMessage[128];
memcpy(PaddedMessage, pdata, 80); memcpy(PaddedMessage, pdata, 80);
memset(&PaddedMessage[20], 0, 48); memset(PaddedMessage+80, 0, 48);
PaddedMessage[20] = 0x80;
PaddedMessage[27] = 0x2800000;
PaddedMessage[31] = 0x2000000;
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
} }

Loading…
Cancel
Save