From 1187a6e7e3211f0216111554a55b685687003b11 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 19 Jun 2015 01:47:39 +0200 Subject: [PATCH] x11: improve aes (shavite/echo) shavite is faster, echo doesn't really change due to the reg. overload This changes allow custom lauchbounds without other code changes and improve the portability against different devices. also set a minimum throughput to 1024 for these algos (shared mem req. size) --- x11/cuda_x11_aes.cu | 540 ++++++++++++------------------------- x11/cuda_x11_echo.cu | 37 +-- x11/cuda_x11_shavite512.cu | 104 +++---- 3 files changed, 222 insertions(+), 459 deletions(-) diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu index 362dd9c..4470563 100644 --- a/x11/cuda_x11_aes.cu +++ b/x11/cuda_x11_aes.cu @@ -1,371 +1,181 @@ - -/* AES Helper for inline-usage from SPH */ -#define AESx(x) (x ##UL) /* SPH_C32(x) */ - +// change nothing... //#define DEVICE_DIRECT_CONSTANTS #ifdef DEVICE_DIRECT_CONSTANTS -__constant__ __align__(64) uint32_t d_AES0[256] = { -#else -static const uint32_t h_AES0[256] = { -#endif - AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), - AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), - AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), - AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), - AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), - AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), - AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), - AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), - AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), - AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), - AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), - AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), - AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), - AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), - AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), - AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), - AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), - AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), - AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), - AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), - AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), - AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), - AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), - AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), - AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), - AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), - AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), - AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), - AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), - AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), - AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), - AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), - AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), - AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), - AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), - AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), - AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), - AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), - AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), - AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), - AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), - AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), - AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), - AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), - AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), - AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), - AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), - AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), - AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), - AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), - AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), - AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), - AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), - AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), - AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), - AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), - AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), - AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), - AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), - AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), - AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), - AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), - AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), - AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) -}; - -#ifdef DEVICE_DIRECT_CONSTANTS -__constant__ __align__(64) uint32_t d_AES1[256] = { +__constant__ uint32_t c_AES[1024] = { #else -static const uint32_t h_AES1[256] = { +static const uint32_t h_AES[1024] = { #endif - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), - AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) -}; - -#ifdef DEVICE_DIRECT_CONSTANTS -__constant__ __align__(64) uint32_t d_AES2[256] = { -#else -static const uint32_t h_AES2[256] = { -#endif - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), - AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -#ifdef DEVICE_DIRECT_CONSTANTS -__constant__ __align__(64) uint32_t d_AES3[256] = { -#else -static const uint32_t h_AES3[256] = { -#endif - AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), - AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), - AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), - AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), - AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), - AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), - AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), - AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), - AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), - AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), - AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), - AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), - AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), - AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), - AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), - AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), - AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), - AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), - AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), - AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), - AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), - AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), - AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), - AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), - AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), - AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), - AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), - AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), - AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), - AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), - AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), - AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), - AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), - AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), - AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), - AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), - AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), - AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), - AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), - AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), - AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), - AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), - AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), - AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), - AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), - AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), - AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), - AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), - AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), - AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), - AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), - AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), - AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), - AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), - AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), - AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), - AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), - AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), - AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), - AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), - AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), - AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), - AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), - AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) + 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, + 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, + 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, + 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, + 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, + 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, + 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, + 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, + 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, + 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, + 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, + 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, + 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, + 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, + 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, + 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, + 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, + 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C, + // ROL 8 + 0x6363C6A5, 0x7C7CF884, 0x7777EE99, 0x7B7BF68D, 0xF2F2FF0D, 0x6B6BD6BD, 0x6F6FDEB1, 0xC5C59154, + 0x30306050, 0x01010203, 0x6767CEA9, 0x2B2B567D, 0xFEFEE719, 0xD7D7B562, 0xABAB4DE6, 0x7676EC9A, + 0xCACA8F45, 0x82821F9D, 0xC9C98940, 0x7D7DFA87, 0xFAFAEF15, 0x5959B2EB, 0x47478EC9, 0xF0F0FB0B, + 0xADAD41EC, 0xD4D4B367, 0xA2A25FFD, 0xAFAF45EA, 0x9C9C23BF, 0xA4A453F7, 0x7272E496, 0xC0C09B5B, + 0xB7B775C2, 0xFDFDE11C, 0x93933DAE, 0x26264C6A, 0x36366C5A, 0x3F3F7E41, 0xF7F7F502, 0xCCCC834F, + 0x3434685C, 0xA5A551F4, 0xE5E5D134, 0xF1F1F908, 0x7171E293, 0xD8D8AB73, 0x31316253, 0x15152A3F, + 0x0404080C, 0xC7C79552, 0x23234665, 0xC3C39D5E, 0x18183028, 0x969637A1, 0x05050A0F, 0x9A9A2FB5, + 0x07070E09, 0x12122436, 0x80801B9B, 0xE2E2DF3D, 0xEBEBCD26, 0x27274E69, 0xB2B27FCD, 0x7575EA9F, + 0x0909121B, 0x83831D9E, 0x2C2C5874, 0x1A1A342E, 0x1B1B362D, 0x6E6EDCB2, 0x5A5AB4EE, 0xA0A05BFB, + 0x5252A4F6, 0x3B3B764D, 0xD6D6B761, 0xB3B37DCE, 0x2929527B, 0xE3E3DD3E, 0x2F2F5E71, 0x84841397, + 0x5353A6F5, 0xD1D1B968, 0x00000000, 0xEDEDC12C, 0x20204060, 0xFCFCE31F, 0xB1B179C8, 0x5B5BB6ED, + 0x6A6AD4BE, 0xCBCB8D46, 0xBEBE67D9, 0x3939724B, 0x4A4A94DE, 0x4C4C98D4, 0x5858B0E8, 0xCFCF854A, + 0xD0D0BB6B, 0xEFEFC52A, 0xAAAA4FE5, 0xFBFBED16, 0x434386C5, 0x4D4D9AD7, 0x33336655, 0x85851194, + 0x45458ACF, 0xF9F9E910, 0x02020406, 0x7F7FFE81, 0x5050A0F0, 0x3C3C7844, 0x9F9F25BA, 0xA8A84BE3, + 0x5151A2F3, 0xA3A35DFE, 0x404080C0, 0x8F8F058A, 0x92923FAD, 0x9D9D21BC, 0x38387048, 0xF5F5F104, + 0xBCBC63DF, 0xB6B677C1, 0xDADAAF75, 0x21214263, 0x10102030, 0xFFFFE51A, 0xF3F3FD0E, 0xD2D2BF6D, + 0xCDCD814C, 0x0C0C1814, 0x13132635, 0xECECC32F, 0x5F5FBEE1, 0x979735A2, 0x444488CC, 0x17172E39, + 0xC4C49357, 0xA7A755F2, 0x7E7EFC82, 0x3D3D7A47, 0x6464C8AC, 0x5D5DBAE7, 0x1919322B, 0x7373E695, + 0x6060C0A0, 0x81811998, 0x4F4F9ED1, 0xDCDCA37F, 0x22224466, 0x2A2A547E, 0x90903BAB, 0x88880B83, + 0x46468CCA, 0xEEEEC729, 0xB8B86BD3, 0x1414283C, 0xDEDEA779, 0x5E5EBCE2, 0x0B0B161D, 0xDBDBAD76, + 0xE0E0DB3B, 0x32326456, 0x3A3A744E, 0x0A0A141E, 0x494992DB, 0x06060C0A, 0x2424486C, 0x5C5CB8E4, + 0xC2C29F5D, 0xD3D3BD6E, 0xACAC43EF, 0x6262C4A6, 0x919139A8, 0x959531A4, 0xE4E4D337, 0x7979F28B, + 0xE7E7D532, 0xC8C88B43, 0x37376E59, 0x6D6DDAB7, 0x8D8D018C, 0xD5D5B164, 0x4E4E9CD2, 0xA9A949E0, + 0x6C6CD8B4, 0x5656ACFA, 0xF4F4F307, 0xEAEACF25, 0x6565CAAF, 0x7A7AF48E, 0xAEAE47E9, 0x08081018, + 0xBABA6FD5, 0x7878F088, 0x25254A6F, 0x2E2E5C72, 0x1C1C3824, 0xA6A657F1, 0xB4B473C7, 0xC6C69751, + 0xE8E8CB23, 0xDDDDA17C, 0x7474E89C, 0x1F1F3E21, 0x4B4B96DD, 0xBDBD61DC, 0x8B8B0D86, 0x8A8A0F85, + 0x7070E090, 0x3E3E7C42, 0xB5B571C4, 0x6666CCAA, 0x484890D8, 0x03030605, 0xF6F6F701, 0x0E0E1C12, + 0x6161C2A3, 0x35356A5F, 0x5757AEF9, 0xB9B969D0, 0x86861791, 0xC1C19958, 0x1D1D3A27, 0x9E9E27B9, + 0xE1E1D938, 0xF8F8EB13, 0x98982BB3, 0x11112233, 0x6969D2BB, 0xD9D9A970, 0x8E8E0789, 0x949433A7, + 0x9B9B2DB6, 0x1E1E3C22, 0x87871592, 0xE9E9C920, 0xCECE8749, 0x5555AAFF, 0x28285078, 0xDFDFA57A, + 0x8C8C038F, 0xA1A159F8, 0x89890980, 0x0D0D1A17, 0xBFBF65DA, 0xE6E6D731, 0x424284C6, 0x6868D0B8, + 0x414182C3, 0x999929B0, 0x2D2D5A77, 0x0F0F1E11, 0xB0B07BCB, 0x5454A8FC, 0xBBBB6DD6, 0x16162C3A, + // ROL 16 + 0x63C6A563, 0x7CF8847C, 0x77EE9977, 0x7BF68D7B, 0xF2FF0DF2, 0x6BD6BD6B, 0x6FDEB16F, 0xC59154C5, + 0x30605030, 0x01020301, 0x67CEA967, 0x2B567D2B, 0xFEE719FE, 0xD7B562D7, 0xAB4DE6AB, 0x76EC9A76, + 0xCA8F45CA, 0x821F9D82, 0xC98940C9, 0x7DFA877D, 0xFAEF15FA, 0x59B2EB59, 0x478EC947, 0xF0FB0BF0, + 0xAD41ECAD, 0xD4B367D4, 0xA25FFDA2, 0xAF45EAAF, 0x9C23BF9C, 0xA453F7A4, 0x72E49672, 0xC09B5BC0, + 0xB775C2B7, 0xFDE11CFD, 0x933DAE93, 0x264C6A26, 0x366C5A36, 0x3F7E413F, 0xF7F502F7, 0xCC834FCC, + 0x34685C34, 0xA551F4A5, 0xE5D134E5, 0xF1F908F1, 0x71E29371, 0xD8AB73D8, 0x31625331, 0x152A3F15, + 0x04080C04, 0xC79552C7, 0x23466523, 0xC39D5EC3, 0x18302818, 0x9637A196, 0x050A0F05, 0x9A2FB59A, + 0x070E0907, 0x12243612, 0x801B9B80, 0xE2DF3DE2, 0xEBCD26EB, 0x274E6927, 0xB27FCDB2, 0x75EA9F75, + 0x09121B09, 0x831D9E83, 0x2C58742C, 0x1A342E1A, 0x1B362D1B, 0x6EDCB26E, 0x5AB4EE5A, 0xA05BFBA0, + 0x52A4F652, 0x3B764D3B, 0xD6B761D6, 0xB37DCEB3, 0x29527B29, 0xE3DD3EE3, 0x2F5E712F, 0x84139784, + 0x53A6F553, 0xD1B968D1, 0x00000000, 0xEDC12CED, 0x20406020, 0xFCE31FFC, 0xB179C8B1, 0x5BB6ED5B, + 0x6AD4BE6A, 0xCB8D46CB, 0xBE67D9BE, 0x39724B39, 0x4A94DE4A, 0x4C98D44C, 0x58B0E858, 0xCF854ACF, + 0xD0BB6BD0, 0xEFC52AEF, 0xAA4FE5AA, 0xFBED16FB, 0x4386C543, 0x4D9AD74D, 0x33665533, 0x85119485, + 0x458ACF45, 0xF9E910F9, 0x02040602, 0x7FFE817F, 0x50A0F050, 0x3C78443C, 0x9F25BA9F, 0xA84BE3A8, + 0x51A2F351, 0xA35DFEA3, 0x4080C040, 0x8F058A8F, 0x923FAD92, 0x9D21BC9D, 0x38704838, 0xF5F104F5, + 0xBC63DFBC, 0xB677C1B6, 0xDAAF75DA, 0x21426321, 0x10203010, 0xFFE51AFF, 0xF3FD0EF3, 0xD2BF6DD2, + 0xCD814CCD, 0x0C18140C, 0x13263513, 0xECC32FEC, 0x5FBEE15F, 0x9735A297, 0x4488CC44, 0x172E3917, + 0xC49357C4, 0xA755F2A7, 0x7EFC827E, 0x3D7A473D, 0x64C8AC64, 0x5DBAE75D, 0x19322B19, 0x73E69573, + 0x60C0A060, 0x81199881, 0x4F9ED14F, 0xDCA37FDC, 0x22446622, 0x2A547E2A, 0x903BAB90, 0x880B8388, + 0x468CCA46, 0xEEC729EE, 0xB86BD3B8, 0x14283C14, 0xDEA779DE, 0x5EBCE25E, 0x0B161D0B, 0xDBAD76DB, + 0xE0DB3BE0, 0x32645632, 0x3A744E3A, 0x0A141E0A, 0x4992DB49, 0x060C0A06, 0x24486C24, 0x5CB8E45C, + 0xC29F5DC2, 0xD3BD6ED3, 0xAC43EFAC, 0x62C4A662, 0x9139A891, 0x9531A495, 0xE4D337E4, 0x79F28B79, + 0xE7D532E7, 0xC88B43C8, 0x376E5937, 0x6DDAB76D, 0x8D018C8D, 0xD5B164D5, 0x4E9CD24E, 0xA949E0A9, + 0x6CD8B46C, 0x56ACFA56, 0xF4F307F4, 0xEACF25EA, 0x65CAAF65, 0x7AF48E7A, 0xAE47E9AE, 0x08101808, + 0xBA6FD5BA, 0x78F08878, 0x254A6F25, 0x2E5C722E, 0x1C38241C, 0xA657F1A6, 0xB473C7B4, 0xC69751C6, + 0xE8CB23E8, 0xDDA17CDD, 0x74E89C74, 0x1F3E211F, 0x4B96DD4B, 0xBD61DCBD, 0x8B0D868B, 0x8A0F858A, + 0x70E09070, 0x3E7C423E, 0xB571C4B5, 0x66CCAA66, 0x4890D848, 0x03060503, 0xF6F701F6, 0x0E1C120E, + 0x61C2A361, 0x356A5F35, 0x57AEF957, 0xB969D0B9, 0x86179186, 0xC19958C1, 0x1D3A271D, 0x9E27B99E, + 0xE1D938E1, 0xF8EB13F8, 0x982BB398, 0x11223311, 0x69D2BB69, 0xD9A970D9, 0x8E07898E, 0x9433A794, + 0x9B2DB69B, 0x1E3C221E, 0x87159287, 0xE9C920E9, 0xCE8749CE, 0x55AAFF55, 0x28507828, 0xDFA57ADF, + 0x8C038F8C, 0xA159F8A1, 0x89098089, 0x0D1A170D, 0xBF65DABF, 0xE6D731E6, 0x4284C642, 0x68D0B868, + 0x4182C341, 0x9929B099, 0x2D5A772D, 0x0F1E110F, 0xB07BCBB0, 0x54A8FC54, 0xBB6DD6BB, 0x162C3A16, + // ROL 24 + 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, + 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, + 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, + 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, + 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, + 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, + 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, + 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, + 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, + 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, + 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, + 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, + 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, + 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, + 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, + 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, + 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, + 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, + 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, + 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, + 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, + 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, + 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, + 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, + 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, + 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, + 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, + 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, + 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, + 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, + 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, + 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 }; #ifndef DEVICE_DIRECT_CONSTANTS -static __constant__ __align__(64) uint32_t d_AES0[256]; -static __constant__ __align__(64) uint32_t d_AES1[256]; -static __constant__ __align__(64) uint32_t d_AES2[256]; -static __constant__ __align__(64) uint32_t d_AES3[256]; - -static void aes_cpu_init(int thr_id) -{ - CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES0, - h_AES0, - sizeof(h_AES0), - 0, cudaMemcpyHostToDevice)); +static __constant__ uint32_t c_AES[1024]; - CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES1, - h_AES1, - sizeof(h_AES1), - 0, cudaMemcpyHostToDevice)); - - CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES2, - h_AES2, - sizeof(h_AES2), - 0, cudaMemcpyHostToDevice)); - - CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES3, - h_AES3, - sizeof(h_AES3), - 0, cudaMemcpyHostToDevice)); +static void aes_cpu_init(int thr_id) { + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_AES, h_AES, sizeof(h_AES), 0, cudaMemcpyHostToDevice)); } #else static void aes_cpu_init(int thr_id) {} #endif -__device__ __forceinline__ -void aes_gpu_init(uint32_t *sharedMemory) -{ - /* each thread startup will fill a uint32 */ - if (threadIdx.x < 256) { - sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x]; - } -} - -/* tried with 3 xor.b32 asm, not faster */ -#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); - __device__ static void aes_round( const uint32_t *sharedMemory, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - y0 = xor4_32( - sharedMemory[__byte_perm(x0, 0, 0x4440)], - sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + y0 = sharedMemory[x0 & 0xFF] + ^ sharedMemory[__byte_perm(x1, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x2, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x3, 0x3, 0x5543)]; - y1 = xor4_32( - sharedMemory[__byte_perm(x1, 0, 0x4440)], - sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + y1 = sharedMemory[x1 & 0xFF] + ^ sharedMemory[__byte_perm(x2, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x3, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x0, 0x3, 0x5543)]; - y2 = xor4_32( - sharedMemory[__byte_perm(x2, 0, 0x4440)], - sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + y2 = sharedMemory[x2 & 0xFF] + ^ sharedMemory[__byte_perm(x3, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x0, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x1, 0x3, 0x5543)]; y0 ^= k0; - y3 = xor4_32( - sharedMemory[__byte_perm(x3, 0, 0x4440)], - sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 + y3 = sharedMemory[x3 & 0xFF] + ^ sharedMemory[__byte_perm(x0, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x1, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x2, 0x3, 0x5543)]; } __device__ @@ -374,27 +184,23 @@ static void aes_round( uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - y0 = xor4_32( - sharedMemory[__byte_perm(x0, 0, 0x4440)], - sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); - - y1 = xor4_32( - sharedMemory[__byte_perm(x1, 0, 0x4440)], - sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); - - y2 = xor4_32( - sharedMemory[__byte_perm(x2, 0, 0x4440)], - sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 - - y3 = xor4_32( - sharedMemory[__byte_perm(x3, 0, 0x4440)], - sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 -} + y0 = sharedMemory[x0 & 0xFF] + ^ sharedMemory[__byte_perm(x1, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x2, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x3, 0x3, 0x5543)]; + + y1 = sharedMemory[x1 & 0xFF] + ^ sharedMemory[__byte_perm(x2, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x3, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x0, 0x3, 0x5543)]; + + y2 = sharedMemory[x2 & 0xFF] + ^ sharedMemory[__byte_perm(x3, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x0, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x1, 0x3, 0x5543)]; + + y3 = sharedMemory[x3 & 0xFF] + ^ sharedMemory[__byte_perm(x0, 0x1, 0x5541)] + ^ sharedMemory[__byte_perm(x1, 0x2, 0x5542)] + ^ sharedMemory[__byte_perm(x2, 0x3, 0x5543)]; +} \ No newline at end of file diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 4a9af90..b173dfe 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -261,38 +261,18 @@ void cuda_echo_round( hash[i] ^= W[i]; } -__device__ __forceinline__ -void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) -{ - /* each thread startup will fill a uint32 */ - if (threadIdx.x < 128) { - sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; - - sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; - } -} - -__global__ __launch_bounds__(128, 7) /* will force 72 registers */ -void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(128, 6) /* 128,7 force 72 registers */ +void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { - __shared__ uint32_t sharedMemory[1024]; - - echo_gpu_init(sharedMemory); - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + // fill shared mem + __shared__ uint32_t sharedMemory[1024]; + const uint16_t idx = thread & 0x3FF; // % 1024 + sharedMemory[idx] = c_AES[idx]; + uint32_t *Hash = &g_hash[thread<<4]; cuda_echo_round(sharedMemory, Hash); } } @@ -307,10 +287,11 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 128; + threads = max(threads, 1024U); dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_echo512_gpu_hash_64<<>>(threads, startNounce, d_hash); MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index 2f5e806..c02749b 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1322,56 +1322,25 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, c state[0xF] ^= p7; } -__device__ __forceinline__ -void shavite_gpu_init(uint32_t *sharedMemory) -{ - /* each thread startup will fill a uint32 */ - if (threadIdx.x < 128) { - sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; - - sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; - } -} - // GPU Hash __global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */ -void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNonce, uint32_t *g_hash) { - __shared__ uint32_t sharedMemory[1024]; - - shavite_gpu_init(sharedMemory); - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + uint32_t *Hash = &g_hash[thread * 16U]; - // kopiere init-state - uint32_t state[16] = { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) - }; + // fill shared mem (aes constants) + __shared__ uint32_t sharedMemory[1024]; + thread &= 0x3ff; sharedMemory[thread] = c_AES[thread]; - // nachricht laden uint32_t msg[32]; - // fülle die Nachricht mit 64-byte (vorheriger Hash) #pragma unroll 16 for(int i=0;i<16;i++) msg[i] = Hash[i]; - // Nachrichtenende msg[16] = 0x80; #pragma unroll 10 for(int i=17;i<27;i++) @@ -1383,6 +1352,14 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t msg[30] = 0; msg[31] = 0x02000000; + // init-state + uint32_t state[16] = { + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A + }; + c512(sharedMemory, state, msg, 512); #pragma unroll 16 @@ -1391,25 +1368,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t } } -__global__ __launch_bounds__(TPB, 7) -void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +__global__ __launch_bounds__(TPB, 5) +void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint32_t *outputHash) { - __shared__ uint32_t sharedMemory[1024]; - - shavite_gpu_init(sharedMemory); - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - const uint32_t nounce = startNounce + thread; + const uint32_t nonce = startNonce + thread; + uint32_t *outHash = &outputHash[thread * 16U]; - // kopiere init-state - uint32_t state[16] = { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) - }; + // fill shared mem (aes constants) + __shared__ uint32_t sharedMemory[1024]; + thread &= 0x3ff; sharedMemory[thread] = c_AES[thread]; uint32_t msg[32]; @@ -1417,31 +1387,34 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou for(int i=0;i<32;i++) { msg[i] = c_PaddedMessage80[i]; } - msg[19] = cuda_swab32(nounce); - msg[20] = 0x80; - msg[27] = 0x2800000; - msg[31] = 0x2000000; + msg[19] = cuda_swab32(nonce); - c512(sharedMemory, state, msg, 640); + // init-state + uint32_t state[16] = { + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A + }; - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; + c512(sharedMemory, state, msg, 640); #pragma unroll 16 for(int i=0;i<16;i++) outHash[i] = state[i]; - - } //thread < threads + } } __host__ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = TPB; + threads = max(threads, 1024U); dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_shavite512_gpu_hash_64<<>>(threads, startNounce, d_hash); //MyStreamSynchronize(NULL, order, thr_id); } @@ -1449,6 +1422,7 @@ __host__ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) { const uint32_t threadsperblock = TPB; + threads = max(threads, 1024U); dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); @@ -1465,11 +1439,13 @@ void x11_shavite512_cpu_init(int thr_id, uint32_t threads) __host__ void x11_shavite512_setBlock_80(void *pdata) { - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. - unsigned char PaddedMessage[128]; + uint32_t PaddedMessage[32]; + memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); + memset(&PaddedMessage[20], 0, 48); + PaddedMessage[20] = 0x80; + PaddedMessage[27] = 0x2800000; + PaddedMessage[31] = 0x2000000; - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice); }