Browse Source

import xmr, to finish

todo: fix jh cuda and wrong decimal diff (0xffff problem ?)
master
Tanguy Pruvot 8 years ago
parent
commit
066a569357
  1. 2
      .gitignore
  2. 5
      Makefile.am
  3. 2
      algos.h
  4. 1
      bench.cpp
  5. 39
      ccminer.cpp
  6. 22
      ccminer.vcxproj
  7. 45
      ccminer.vcxproj.filters
  8. 170
      crypto/aesb.cpp
  9. 319
      crypto/cn_aes.cuh
  10. 180
      crypto/cn_blake.cuh
  11. 347
      crypto/cn_groestl.cuh
  12. 293
      crypto/cn_jh.cuh
  13. 211
      crypto/cn_keccak.cuh
  14. 345
      crypto/cn_skein.cuh
  15. 122
      crypto/cpu/c_keccak.c
  16. 34
      crypto/cpu/c_keccak.h
  17. 226
      crypto/cryptonight-cpu.cpp
  18. 170
      crypto/cryptonight.cu
  19. 156
      crypto/cryptonight.h
  20. 262
      crypto/cuda_cryptonight_core.cu
  21. 234
      crypto/cuda_cryptonight_extra.cu
  22. 51
      crypto/oaes_config.h
  23. 1446
      crypto/oaes_lib.cpp
  24. 214
      crypto/oaes_lib.h
  25. 68
      crypto/xmr-rpc.cpp
  26. 3
      miner.h
  27. 4
      pools.cpp
  28. 5
      util.cpp

2
.gitignore vendored

@ -25,7 +25,7 @@ config.sub
mingw32-config.cache mingw32-config.cache
*/.dirstamp .dirstamp
.DS_Store .DS_Store
Desktop.ini Desktop.ini
Thumbs.db Thumbs.db

5
Makefile.am

@ -40,6 +40,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
Algo256/blake2s.cu sph/blake2s.c \ Algo256/blake2s.cu sph/blake2s.c \
Algo256/bmw.cu Algo256/cuda_bmw.cu \ Algo256/bmw.cu Algo256/cuda_bmw.cu \
crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \ crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
crypto/cryptonight.cu crypto/cuda_cryptonight_core.cu crypto/cuda_cryptonight_extra.cu \
crypto/cryptonight-cpu.cpp crypto/oaes_lib.cpp crypto/aesb.cpp crypto/cpu/c_keccak.c \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
@ -105,6 +107,9 @@ Algo256/blake256.o: Algo256/blake256.cu
Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
crypto/cuda_cryptonight_extra.o: crypto/cuda_cryptonight_extra.cu
$(NVCC) $(nvcc_FLAGS) -o $@ -c $<
heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<

2
algos.h

@ -10,6 +10,7 @@ enum sha_algos {
ALGO_BLAKE2S, ALGO_BLAKE2S,
ALGO_BMW, ALGO_BMW,
ALGO_C11, ALGO_C11,
ALGO_CRYPTONIGHT,
ALGO_DEEP, ALGO_DEEP,
ALGO_DECRED, ALGO_DECRED,
ALGO_DMD_GR, ALGO_DMD_GR,
@ -62,6 +63,7 @@ static const char *algo_names[] = {
"blake2s", "blake2s",
"bmw", "bmw",
"c11", "c11",
"cryptonight",
"deep", "deep",
"decred", "decred",
"dmd-gr", "dmd-gr",

1
bench.cpp

@ -48,6 +48,7 @@ void algo_free_all(int thr_id)
free_blake2s(thr_id); free_blake2s(thr_id);
free_bmw(thr_id); free_bmw(thr_id);
free_c11(thr_id); free_c11(thr_id);
free_cryptonight(thr_id);
free_decred(thr_id); free_decred(thr_id);
free_deep(thr_id); free_deep(thr_id);
free_keccak256(thr_id); free_keccak256(thr_id);

39
ccminer.cpp

@ -620,6 +620,7 @@ static bool work_decode(const json_t *val, struct work *work)
data_size = 80; data_size = 80;
adata_sz = data_size / 4; adata_sz = data_size / 4;
break; break;
case ALGO_CRYPTONIGHT:
case ALGO_WILDKECCAK: case ALGO_WILDKECCAK:
return rpc2_job_decode(val, work); return rpc2_job_decode(val, work);
default: default:
@ -1766,12 +1767,15 @@ static void *miner_thread(void *userdata)
nonceptr = (uint32_t*) (((char*)work.data) + 1); nonceptr = (uint32_t*) (((char*)work.data) + 1);
wcmpoft = 2; wcmpoft = 2;
wcmplen = 32; wcmplen = 32;
} else if (opt_algo == ALGO_CRYPTONIGHT) {
nonceptr = (uint32_t*) (((char*)work.data) + 39);
wcmplen = 39;
} }
if (have_stratum) { if (have_stratum) {
uint32_t sleeptime = 0; uint32_t sleeptime = 0;
if (opt_algo == ALGO_DECRED || stratum.rpc2) if (opt_algo == ALGO_DECRED || opt_algo == ALGO_WILDKECCAK /* getjob */)
work_done = true; // force "regen" hash work_done = true; // force "regen" hash
while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) {
usleep(100*1000); usleep(100*1000);
@ -1798,6 +1802,8 @@ static void *miner_thread(void *userdata)
extrajob = false; extrajob = false;
if (stratum_gen_work(&stratum, &g_work)) if (stratum_gen_work(&stratum, &g_work))
g_work_time = time(NULL); g_work_time = time(NULL);
if (opt_algo == ALGO_CRYPTONIGHT)
nonceptr[0] += 0x100000;
} }
} else { } else {
uint32_t secs = 0; uint32_t secs = 0;
@ -1839,7 +1845,20 @@ static void *miner_thread(void *userdata)
wcmplen -= 4; wcmplen -= 4;
} }
if (opt_algo == ALGO_CRYPTONIGHT) {
uint32_t oldpos = nonceptr[0];
if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) { if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
memcpy(&work, &g_work, sizeof(struct work));
nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // reset cursor
}
// also check the end, nonce in the middle
else if (memcmp(&work.data[44/4], &g_work.data[0], 76-44)) {
memcpy(&work, &g_work, sizeof(struct work));
}
if (oldpos & 0xFFFF) nonceptr[0] = oldpos + 0x100000;
}
else if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
#if 0 #if 0
if (opt_debug) { if (opt_debug) {
for (int n=0; n <= (wcmplen-8); n+=8) { for (int n=0; n <= (wcmplen-8); n+=8) {
@ -1916,7 +1935,7 @@ static void *miner_thread(void *userdata)
gpulog(LOG_DEBUG, thr_id, "no data"); gpulog(LOG_DEBUG, thr_id, "no data");
continue; continue;
} }
if (stratum.rpc2 && !scratchpad_size) { if (opt_algo == ALGO_WILDKECCAK && !scratchpad_size) {
sleep(1); sleep(1);
if (!thr_id) pools[cur_pooln].wait_time += 1; if (!thr_id) pools[cur_pooln].wait_time += 1;
continue; continue;
@ -2078,6 +2097,7 @@ static void *miner_thread(void *userdata)
case ALGO_VELTOR: case ALGO_VELTOR:
minmax = 0x80000; minmax = 0x80000;
break; break;
case ALGO_CRYPTONIGHT:
case ALGO_SCRYPT_JANE: case ALGO_SCRYPT_JANE:
minmax = 0x1000; minmax = 0x1000;
break; break;
@ -2140,6 +2160,9 @@ static void *miner_thread(void *userdata)
case ALGO_C11: case ALGO_C11:
rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
break; break;
case ALGO_CRYPTONIGHT:
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_DECRED: case ALGO_DECRED:
//applog(LOG_BLUE, "version %x, nbits %x, ntime %x extra %x", //applog(LOG_BLUE, "version %x, nbits %x, ntime %x extra %x",
// work.data[0], work.data[29], work.data[34], work.data[38]); // work.data[0], work.data[29], work.data[34], work.data[38]);
@ -2283,6 +2306,7 @@ static void *miner_thread(void *userdata)
// todo: update all algos to use work->nonces and pdata[19] as counter // todo: update all algos to use work->nonces and pdata[19] as counter
switch (opt_algo) { switch (opt_algo) {
case ALGO_BLAKE2S: case ALGO_BLAKE2S:
case ALGO_CRYPTONIGHT:
case ALGO_DECRED: case ALGO_DECRED:
case ALGO_LBRY: case ALGO_LBRY:
case ALGO_SIA: case ALGO_SIA:
@ -2710,7 +2734,7 @@ wait_stratum_url:
} }
} }
if (opt_algo == ALGO_WILDKECCAK) { if (stratum.rpc2) {
rpc2_stratum_thread_stuff(pool); rpc2_stratum_thread_stuff(pool);
} }
@ -2806,7 +2830,7 @@ static void show_usage_and_exit(int status)
if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_SCRYPT_JANE) { if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_SCRYPT_JANE) {
printf(scrypt_usage); printf(scrypt_usage);
} }
if (opt_algo == ALGO_WILDKECCAK) { if (opt_algo == ALGO_WILDKECCAK || opt_algo == ALGO_CRYPTONIGHT) {
printf(xmr_usage); printf(xmr_usage);
} }
proper_exit(status); proper_exit(status);
@ -3661,9 +3685,14 @@ int main(int argc, char *argv[])
allow_mininginfo = false; allow_mininginfo = false;
} }
if (opt_algo == ALGO_CRYPTONIGHT) {
rpc2_init();
if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0");
}
if (opt_algo == ALGO_WILDKECCAK) { if (opt_algo == ALGO_WILDKECCAK) {
rpc2_init(); rpc2_init();
applog(LOG_INFO, "Using CryptoNote JSON-RPC 2.0"); if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0");
GetScratchpad(); GetScratchpad();
} }

22
ccminer.vcxproj

@ -233,6 +233,10 @@
<ClCompile Include="crypto\mman.c" /> <ClCompile Include="crypto\mman.c" />
<ClCompile Include="crypto\wildkeccak-cpu.cpp" /> <ClCompile Include="crypto\wildkeccak-cpu.cpp" />
<ClCompile Include="crypto\xmr-rpc.cpp" /> <ClCompile Include="crypto\xmr-rpc.cpp" />
<ClCompile Include="crypto\aesb.cpp" />
<ClCompile Include="crypto\oaes_lib.cpp" />
<ClCompile Include="crypto\cryptonight-cpu.cpp" />
<ClCompile Include="crypto\cpu\c_keccak.c" />
<ClCompile Include="nvapi.cpp" /> <ClCompile Include="nvapi.cpp" />
<ClCompile Include="pools.cpp" /> <ClCompile Include="pools.cpp" />
<ClCompile Include="util.cpp" /> <ClCompile Include="util.cpp" />
@ -251,9 +255,6 @@
<ClCompile Include="myriadgroestl.cpp" /> <ClCompile Include="myriadgroestl.cpp" />
<ClCompile Include="lyra2\Lyra2.c" /> <ClCompile Include="lyra2\Lyra2.c" />
<ClCompile Include="lyra2\Sponge.c" /> <ClCompile Include="lyra2\Sponge.c" />
<ClInclude Include="crypto\mman.h" />
<ClInclude Include="crypto\wildkeccak.h" />
<ClInclude Include="crypto\xmr-rpc.h" />
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" /> <ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="neoscrypt\neoscrypt.h" /> <ClInclude Include="neoscrypt\neoscrypt.h" />
<ClCompile Include="neoscrypt\neoscrypt.cpp" /> <ClCompile Include="neoscrypt\neoscrypt.cpp" />
@ -264,8 +265,17 @@
<CudaCompile Include="Algo256\cuda_bmw.cu"> <CudaCompile Include="Algo256\cuda_bmw.cu">
<MaxRegCount>76</MaxRegCount> <MaxRegCount>76</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile Include="crypto\cryptonight.cu">
<MaxRegCount>128</MaxRegCount>
</CudaCompile>
<CudaCompile Include="crypto\cuda_cryptonight_core.cu">
<MaxRegCount>128</MaxRegCount>
</CudaCompile>
<CudaCompile Include="crypto\cuda_cryptonight_extra.cu">
<MaxRegCount>255</MaxRegCount>
</CudaCompile>
<CudaCompile Include="crypto\wildkeccak.cu"> <CudaCompile Include="crypto\wildkeccak.cu">
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">128</MaxRegCount> <MaxRegCount>128</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile Include="neoscrypt\cuda_neoscrypt.cu"> <CudaCompile Include="neoscrypt\cuda_neoscrypt.cu">
<MaxRegCount>160</MaxRegCount> <MaxRegCount>160</MaxRegCount>
@ -319,6 +329,10 @@
<ClInclude Include="compat\unistd.h" /> <ClInclude Include="compat\unistd.h" />
<ClInclude Include="compat\winansi.h" /> <ClInclude Include="compat\winansi.h" />
<ClInclude Include="compat\ccminer-config.h" /> <ClInclude Include="compat\ccminer-config.h" />
<ClInclude Include="crypto\mman.h" />
<ClInclude Include="crypto\cryptonight.h" />
<ClInclude Include="crypto\wildkeccak.h" />
<ClInclude Include="crypto\xmr-rpc.h" />
<ClInclude Include="cuda_groestlcoin.h" /> <ClInclude Include="cuda_groestlcoin.h" />
<ClInclude Include="cuda_helper.h" /> <ClInclude Include="cuda_helper.h" />
<ClInclude Include="cuda_vector_uint2x4.h" /> <ClInclude Include="cuda_vector_uint2x4.h" />

45
ccminer.vcxproj.filters

@ -91,6 +91,12 @@
<Filter Include="Source Files\crypto"> <Filter Include="Source Files\crypto">
<UniqueIdentifier>{fea0fce3-c0fe-42f7-aa37-0cbba10b008a}</UniqueIdentifier> <UniqueIdentifier>{fea0fce3-c0fe-42f7-aa37-0cbba10b008a}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Source Files\crypto\xmr">
<UniqueIdentifier>{af52b078-ed91-4c6e-b07a-e9243acc85d2}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\crypto\bbr">
<UniqueIdentifier>{af387eac-e9e6-4e91-a5e8-637b1e7a8d93}</UniqueIdentifier>
</Filter>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClCompile Include="compat\jansson\dump.c"> <ClCompile Include="compat\jansson\dump.c">
@ -285,11 +291,23 @@
<ClCompile Include="crypto\xmr-rpc.cpp"> <ClCompile Include="crypto\xmr-rpc.cpp">
<Filter>Source Files\crypto</Filter> <Filter>Source Files\crypto</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="crypto\aesb.cpp">
<Filter>Source Files\crypto\xmr</Filter>
</ClCompile>
<ClCompile Include="crypto\cpu\c_keccak.c">
<Filter>Source Files\crypto\xmr</Filter>
</ClCompile>
<ClCompile Include="crypto\oaes_lib.cpp">
<Filter>Source Files\crypto\xmr</Filter>
</ClCompile>
<ClCompile Include="crypto\cryptonight-cpu.cpp">
<Filter>Source Files\crypto\xmr</Filter>
</ClCompile>
<ClCompile Include="crypto\mman.c"> <ClCompile Include="crypto\mman.c">
<Filter>Source Files\crypto</Filter> <Filter>Source Files\crypto\bbr</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="crypto\wildkeccak-cpu.cpp"> <ClCompile Include="crypto\wildkeccak-cpu.cpp">
<Filter>Source Files\crypto</Filter> <Filter>Source Files\crypto\bbr</Filter>
</ClCompile> </ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
@ -494,14 +512,17 @@
<ClInclude Include="sia\sia-rpc.h"> <ClInclude Include="sia\sia-rpc.h">
<Filter>Source Files\sia</Filter> <Filter>Source Files\sia</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="crypto\mman.h"> <ClInclude Include="crypto\xmr-rpc.h">
<Filter>Source Files\crypto</Filter> <Filter>Source Files\crypto</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="crypto\wildkeccak.h"> <ClInclude Include="crypto\cryptonight.h">
<Filter>Source Files\crypto</Filter> <Filter>Source Files\crypto\xmr</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="crypto\xmr-rpc.h"> <ClInclude Include="crypto\mman.h">
<Filter>Source Files\crypto</Filter> <Filter>Source Files\crypto\bbr</Filter>
</ClInclude>
<ClInclude Include="crypto\wildkeccak.h">
<Filter>Source Files\crypto\bbr</Filter>
</ClInclude> </ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
@ -784,6 +805,15 @@
<CudaCompile Include="crypto\wildkeccak.cu"> <CudaCompile Include="crypto\wildkeccak.cu">
<Filter>Source Files\crypto</Filter> <Filter>Source Files\crypto</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="crypto\cryptonight.cu">
<Filter>Source Files\crypto</Filter>
</CudaCompile>
<CudaCompile Include="crypto\cuda_cryptonight_core.cu">
<Filter>Source Files\crypto</Filter>
</CudaCompile>
<CudaCompile Include="crypto\cuda_cryptonight_extra.cu">
<Filter>Source Files\crypto</Filter>
</CudaCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Image Include="res\ccminer.ico"> <Image Include="res\ccminer.ico">
@ -801,4 +831,3 @@
</Text> </Text>
</ItemGroup> </ItemGroup>
</Project> </Project>

170
crypto/aesb.cpp

@ -0,0 +1,170 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation.
This software is provided 'as is' with no explicit or implied warranties
in respect of its operation, including, but not limited to, correctness
and fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
*/
#include <stdint.h>
#if defined(__cplusplus)
extern "C"
{
#endif
#define TABLE_ALIGN 32
#define WPOLY 0x011b
#define N_COLS 4
#define AES_BLOCK_SIZE 16
#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2))
#if defined(_MSC_VER)
#define ALIGN __declspec(align(TABLE_ALIGN))
#elif defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(16)))
#else
#define ALIGN
#endif
#define rf1(r,c) (r)
#define word_in(x,c) (*((uint32_t*)(x)+(c)))
#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v))
#define s(x,c) x[c]
#define si(y,x,c) (s(y,c) = word_in(x, c))
#define so(y,x,c) word_out(y, c, s(x,c))
#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3)
#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
#define round(y,x,k) \
y[0] = (k)[0] ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \
y[1] = (k)[1] ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \
y[2] = (k)[2] ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \
y[3] = (k)[3] ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]);
#define to_byte(x) ((x) & 0xff)
#define bval(x,n) to_byte((x) >> (8 * (n)))
#define fwd_var(x,r,c)\
( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
: r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
: r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
: ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
#define sb_data(w) {\
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
#define rc_data(w) {\
w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
w(0x1b), w(0x36) }
#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
#define h0(x) (x)
#define w0(p) bytes2word(p, 0, 0, 0)
#define w1(p) bytes2word(0, p, 0, 0)
#define w2(p) bytes2word(0, 0, p, 0)
#define w3(p) bytes2word(0, 0, 0, p)
#define u0(p) bytes2word(f2(p), p, p, f3(p))
#define u1(p) bytes2word(f3(p), f2(p), p, p)
#define u2(p) bytes2word(p, f3(p), f2(p), p)
#define u3(p) bytes2word(p, p, f3(p), f2(p))
#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p))
#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p))
#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p))
#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p))
#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY))
#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ (((x>>5) & 4) * WPOLY))
#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
#define t_dec(m,n) t_##m##n
#define t_set(m,n) t_##m##n
#define t_use(m,n) t_##m##n
#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) }
#define four_tables(x,tab,vf,rf,c) \
(tab[0][bval(vf(x,0,c),rf(0,c))] \
^ tab[1][bval(vf(x,1,c),rf(1,c))] \
^ tab[2][bval(vf(x,2,c),rf(2,c))] \
^ tab[3][bval(vf(x,3,c),rf(3,c))])
d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3);
void aesb_single_round(const uint8_t *in, uint8_t *out, uint8_t *expandedKey)
{
round(((uint32_t*) out), ((uint32_t*) in), ((uint32_t*) expandedKey));
}
void aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey)
{
uint32_t b1[4];
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey));
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 1 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 2 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 3 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 4 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 5 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 6 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 7 * N_COLS);
round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 8 * N_COLS);
round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 9 * N_COLS);
}
#if defined(__cplusplus)
}
#endif

319
crypto/cn_aes.cuh

@ -0,0 +1,319 @@
#define N_COLS 4
#define WPOLY 0x011b
static __constant__ uint32_t d_t_fn[1024] =
{
0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U,
0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U,
0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U,
0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU,
0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU,
0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU,
0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U,
0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU,
0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU,
0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U,
0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U,
0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU,
0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU,
0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU,
0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU,
0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU,
0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U,
0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU,
0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU,
0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U,
0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U,
0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U,
0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U,
0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U,
0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU,
0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U,
0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU,
0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU,
0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U,
0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U,
0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U,
0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU,
0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U,
0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU,
0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU,
0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U,
0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U,
0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU,
0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U,
0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU,
0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U,
0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U,
0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U,
0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U,
0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU,
0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U,
0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU,
0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U,
0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU,
0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U,
0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU,
0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU,
0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU,
0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU,
0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U,
0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U,
0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U,
0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U,
0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U,
0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U,
0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU,
0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U,
0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU,
0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU,
0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU,
0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U,
0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU,
0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU,
0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U,
0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU,
0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU,
0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU,
0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU,
0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU,
0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U,
0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU,
0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU,
0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U,
0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU,
0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU,
0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU,
0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU,
0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU,
0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U,
0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU,
0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU,
0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU,
0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU,
0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U,
0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U,
0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U,
0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U,
0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU,
0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U,
0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U,
0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU,
0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU,
0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U,
0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U,
0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U,
0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU,
0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U,
0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU,
0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U,
0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU,
0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U,
0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U,
0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU,
0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U,
0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U,
0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U,
0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U,
0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U,
0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U,
0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U,
0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U,
0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU,
0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U,
0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U,
0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U,
0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U,
0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U,
0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U,
0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU,
0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U,
0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U,
0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U,
0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU,
0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU,
0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U,
0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU,
0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U,
0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU,
0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U,
0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU,
0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U,
0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U,
0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU,
0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U,
0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U,
0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U,
0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU,
0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U,
0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U,
0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU,
0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U,
0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U,
0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U,
0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU,
0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU,
0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U,
0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU,
0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU,
0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U,
0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU,
0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U,
0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU,
0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U,
0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U,
0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U,
0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU,
0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U,
0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU,
0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U,
0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU,
0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U,
0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U,
0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU,
0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU,
0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU,
0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U,
0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U,
0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU,
0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U,
0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU,
0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U,
0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU,
0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U,
0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU,
0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU,
0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U,
0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU,
0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U,
0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU,
0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U,
0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U,
0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U,
0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU,
0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU,
0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U,
0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU,
0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U,
0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU,
0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U,
0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU,
0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U,
0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU,
0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U,
0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU,
0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U,
0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U,
0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU,
0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U,
0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U,
0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U,
0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU,
0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U,
0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U,
0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU,
0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U,
0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U,
0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U,
0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU,
0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU,
0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U,
0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU,
0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU,
0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U,
0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU,
0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U,
0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU,
0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U,
0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U,
0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U,
0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU,
0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U,
0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU,
0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U,
0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU,
0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U,
0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U,
0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU,
0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU,
0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU,
0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U,
0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U,
0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU,
0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U,
0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU,
0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U,
0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU,
0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U,
0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU,
0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU,
0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U,
0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU,
0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U,
0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU,
0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U,
0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U,
0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U,
0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU,
0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU,
0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U,
0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU,
0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U
};
#define t_fn0(x) (sharedMemory[ (x)])
#define t_fn1(x) (sharedMemory[256 + (x)])
#define t_fn2(x) (sharedMemory[512 + (x)])
#define t_fn3(x) (sharedMemory[768 + (x)])
#define round(dummy,y,x,k) \
y[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24) & 0xff)); \
y[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24) & 0xff)); \
y[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24) & 0xff)); \
y[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) & 0xff));
__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey)
{
round(sharedMemory, out, in, expandedKey);
}
__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey)
{
uint32_t b1[4];
round(sharedMemory, b1, val, expandedKey);
round(sharedMemory, val, b1, expandedKey + 1 * N_COLS);
round(sharedMemory, b1, val, expandedKey + 2 * N_COLS);
round(sharedMemory, val, b1, expandedKey + 3 * N_COLS);
round(sharedMemory, b1, val, expandedKey + 4 * N_COLS);
round(sharedMemory, val, b1, expandedKey + 5 * N_COLS);
round(sharedMemory, b1, val, expandedKey + 6 * N_COLS);
round(sharedMemory, val, b1, expandedKey + 7 * N_COLS);
round(sharedMemory, b1, val, expandedKey + 8 * N_COLS);
round(sharedMemory, val, b1, expandedKey + 9 * N_COLS);
}
__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
{
if(blockDim.x >= 32)
{
if(threadIdx.x < 32)
{
for(int i = 0; i < 1024; i += 32)
{
sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i];
}
}
}
else
{
if(threadIdx.x < 4)
{
for(int i = 0; i < 1024; i += 4)
{
sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i];
}
}
}
}

180
crypto/cn_blake.cuh

@ -0,0 +1,180 @@
//#include <cuda_helper.h>
typedef struct {
uint32_t h[8], s[4], t[2];
int buflen, nullt;
uint8_t buf[64];
} blake_state;
#define U8TO32(p) \
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
#define U32TO8(p, v) \
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
#define BLAKE_ROT(x,n) ROTR32(x, n)
#define BLAKE_G(a,b,c,d,e) \
v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \
v[d] = BLAKE_ROT(v[d] ^ v[a],16); \
v[c] += v[d]; \
v[b] = BLAKE_ROT(v[b] ^ v[c],12); \
v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]]) + v[b]; \
v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = BLAKE_ROT(v[b] ^ v[c], 7);
__constant__ uint8_t d_blake_sigma[14][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}
};
__constant__ uint32_t d_blake_cst[16] = {
0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
};
__device__
void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block)
{
uint32_t v[16], m[16], i;
for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
for (i = 0; i < 8; ++i) v[i] = S->h[i];
v[ 8] = S->s[0] ^ 0x243F6A88;
v[ 9] = S->s[1] ^ 0x85A308D3;
v[10] = S->s[2] ^ 0x13198A2E;
v[11] = S->s[3] ^ 0x03707344;
v[12] = 0xA4093822;
v[13] = 0x299F31D0;
v[14] = 0x082EFA98;
v[15] = 0xEC4E6C89;
if (S->nullt == 0) {
v[12] ^= S->t[0];
v[13] ^= S->t[0];
v[14] ^= S->t[1];
v[15] ^= S->t[1];
}
for (i = 0; i < 14; ++i) {
BLAKE_G(0, 4, 8, 12, 0);
BLAKE_G(1, 5, 9, 13, 2);
BLAKE_G(2, 6, 10, 14, 4);
BLAKE_G(3, 7, 11, 15, 6);
BLAKE_G(3, 4, 9, 14, 14);
BLAKE_G(2, 7, 8, 13, 12);
BLAKE_G(0, 5, 10, 15, 8);
BLAKE_G(1, 6, 11, 12, 10);
}
for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4];
}
__device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __restrict__ data, uint64_t datalen)
{
int left = S->buflen >> 3;
int fill = 64 - left;
if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) {
memcpy((void *) (S->buf + left), (void *) data, fill);
S->t[0] += 512;
if (S->t[0] == 0) S->t[1]++;
cn_blake_compress(S, S->buf);
data += fill;
datalen -= (fill << 3);
left = 0;
}
while (datalen >= 512) {
S->t[0] += 512;
if (S->t[0] == 0) S->t[1]++;
cn_blake_compress(S, data);
data += 64;
datalen -= 512;
}
if (datalen > 0) {
memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
S->buflen = (left << 3) + datalen;
} else {
S->buflen = 0;
}
}
__device__
void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
{
const uint8_t padding[] = {
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
uint8_t pa = 0x81, pb = 0x01;
uint8_t msglen[8];
uint32_t lo = S->t[0] + S->buflen, hi = S->t[1];
if (lo < (unsigned) S->buflen) hi++;
U32TO8(msglen + 0, hi);
U32TO8(msglen + 4, lo);
if (S->buflen == 440) {
S->t[0] -= 8;
cn_blake_update(S, &pa, 8);
} else {
if (S->buflen < 440) {
if (S->buflen == 0) S->nullt = 1;
S->t[0] -= 440 - S->buflen;
cn_blake_update(S, padding, 440 - S->buflen);
} else {
S->t[0] -= 512 - S->buflen;
cn_blake_update(S, padding, 512 - S->buflen);
S->t[0] -= 440;
cn_blake_update(S, padding + 1, 440);
S->nullt = 1;
}
cn_blake_update(S, &pb, 8);
S->t[0] -= 8;
}
S->t[0] -= 64;
cn_blake_update(S, msglen, 64);
U32TO8(digest + 0, S->h[0]);
U32TO8(digest + 4, S->h[1]);
U32TO8(digest + 8, S->h[2]);
U32TO8(digest + 12, S->h[3]);
U32TO8(digest + 16, S->h[4]);
U32TO8(digest + 20, S->h[5]);
U32TO8(digest + 24, S->h[6]);
U32TO8(digest + 28, S->h[7]);
}
__device__
void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
{
blake_state bs;
blake_state *S = (blake_state *)&bs;
S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372;
S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C;
S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19;
S->t[0] = S->t[1] = S->buflen = S->nullt = 0;
S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0;
cn_blake_update(S, (uint8_t *)in, inlen * 8);
cn_blake_final(S, (uint8_t *)out);
}

347
crypto/cn_groestl.cuh

@ -0,0 +1,347 @@
#define GROESTL_ROWS 8
#define GROESTL_LENGTHFIELDLEN GROESTL_ROWS
#define GROESTL_COLS512 8
#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512)
#define GROESTL_ROUNDS512 10
#define GROESTL_HASH_BIT_LEN 256
#define GROESTL_ROTL32(v, n) ROTL32(v, n)
#define li_32(h) 0x##h##u
#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
#define u32BIG(a) \
((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | \
(GROESTL_ROTL32(a,24) & li_32(FF00FF00)))
typedef struct {
uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)];
uint32_t block_counter1, block_counter2;
BitSequence buffer[GROESTL_SIZE512];
int buf_ptr;
int bits_in_last_byte;
} groestlHashState;
__constant__ uint32_t d_groestl_T[512] = {
0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0x0d17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc,
0x50f09060, 0x6050c0f0, 0x03050702, 0x02030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5,
0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0x0b1de6fb, 0xfb0bed1d,
0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded,
0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x0206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1,
0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x0818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441,
0x0c141c08, 0x080c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0x0f111b0a, 0x0a0f1411, 0xb5c4eb2f, 0x2fb55ec4,
0x091b150e, 0x0e091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba,
0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616,
0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2,
0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0, 0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c,
0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de,
0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7,
0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x060a0e04, 0x0406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e,
0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x058a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x040cfdf1, 0xf104f90c,
0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0x0e12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7,
0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b,
0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4,
0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e,
0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a,
0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0x0a1e120c, 0x0c0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37,
0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86,
0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x018c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b,
0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x0709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028,
0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3,
0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0x0d861a91, 0x85949b0f, 0xf851e94,
0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x050f0906, 0x06050c0f, 0x0103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836,
0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0,
0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x07890e80, 0xa7f2c133, 0x33a766f2,
0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e,
0x8f8a8903, 0x038f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x0980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3,
0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e
};
#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) { \
temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
v1 = temp_var; \
}
#define GROESTL_COLUMN(x,y,i, c0,c1,c2,c3,c4,c5,c6,c7, tv1,tv2,tu,tl,t) \
tu = d_groestl_T[2*(uint32_t)x[4*c0+0]]; \
tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1]; \
tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]]; \
tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1]; \
GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \
tu ^= tv1; \
tl ^= tv2; \
tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]]; \
tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1]; \
GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \
tu ^= tv1; \
tl ^= tv2; \
tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]]; \
tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1]; \
GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \
tu ^= tv1; \
tl ^= tv2; \
tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]]; \
tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1]; \
tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]]; \
tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1]; \
GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \
tl ^= tv1; \
tu ^= tv2; \
tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]]; \
tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1]; \
GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \
tl ^= tv1; \
tu ^= tv2; \
tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]]; \
tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1]; \
GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \
tl ^= tv1; \
tu ^= tv2; \
y[i] = tu; \
y[i+1] = tl;
__device__
void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
{
uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
uint32_t* x32 = (uint32_t*)x;
x32[ 0] ^= 0x00000000^r;
x32[ 2] ^= 0x00000010^r;
x32[ 4] ^= 0x00000020^r;
x32[ 6] ^= 0x00000030^r;
x32[ 8] ^= 0x00000040^r;
x32[10] ^= 0x00000050^r;
x32[12] ^= 0x00000060^r;
x32[14] ^= 0x00000070^r;
GROESTL_COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
}
__device__
void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
{
uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
uint32_t* x32 = (uint32_t*)x;
x32[ 0] = ~x32[ 0];
x32[ 1] ^= 0xffffffff^r;
x32[ 2] = ~x32[ 2];
x32[ 3] ^= 0xefffffff^r;
x32[ 4] = ~x32[ 4];
x32[ 5] ^= 0xdfffffff^r;
x32[ 6] = ~x32[ 6];
x32[ 7] ^= 0xcfffffff^r;
x32[ 8] = ~x32[ 8];
x32[ 9] ^= 0xbfffffff^r;
x32[10] = ~x32[10];
x32[11] ^= 0xafffffff^r;
x32[12] = ~x32[12];
x32[13] ^= 0x9fffffff^r;
x32[14] = ~x32[14];
x32[15] ^= 0x8fffffff^r;
GROESTL_COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
GROESTL_COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
}
__device__
void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m)
{
int i;
uint32_t Ptmp[2*GROESTL_COLS512];
uint32_t Qtmp[2*GROESTL_COLS512];
uint32_t y[2*GROESTL_COLS512];
uint32_t z[2*GROESTL_COLS512];
for (i = 0; i < 2*GROESTL_COLS512; i++) {
z[i] = m[i];
Ptmp[i] = h[i]^m[i];
}
cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000);
cn_groestl_RND512Q((uint8_t*)y, z, 0x01000000);
cn_groestl_RND512Q((uint8_t*)z, y, 0x02000000);
cn_groestl_RND512Q((uint8_t*)y, z, 0x03000000);
cn_groestl_RND512Q((uint8_t*)z, y, 0x04000000);
cn_groestl_RND512Q((uint8_t*)y, z, 0x05000000);
cn_groestl_RND512Q((uint8_t*)z, y, 0x06000000);
cn_groestl_RND512Q((uint8_t*)y, z, 0x07000000);
cn_groestl_RND512Q((uint8_t*)z, y, 0x08000000);
cn_groestl_RND512Q((uint8_t*)y, Qtmp, 0x09000000);
cn_groestl_RND512P((uint8_t*)Ptmp, y, 0x00000000);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000001);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000002);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000003);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000004);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000005);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000006);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000007);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009);
for (i = 0; i < 2*GROESTL_COLS512; i++)
h[i] ^= Ptmp[i]^Qtmp[i];
}
__device__
void cn_groestl_outputtransformation(groestlHashState *ctx)
{
int j;
uint32_t temp[2*GROESTL_COLS512];
uint32_t y[2*GROESTL_COLS512];
uint32_t z[2*GROESTL_COLS512];
for (j = 0; j < 2*GROESTL_COLS512; j++)
temp[j] = ctx->chaining[j];
cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000001);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000002);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000003);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000004);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000005);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000006);
cn_groestl_RND512P((uint8_t*)y, z, 0x00000007);
cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009);
for (j = 0; j < 2*GROESTL_COLS512; j++)
ctx->chaining[j] ^= temp[j];
}
__device__
void cn_groestl_transform(groestlHashState * __restrict__ ctx,
const uint8_t * __restrict__ input, int msglen)
{
for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) {
cn_groestl_F512(ctx->chaining,(uint32_t*)input);
ctx->block_counter1++;
if (ctx->block_counter1 == 0) ctx->block_counter2++;
}
}
__device__
void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restrict__ output)
{
int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8;
uint8_t *s = (BitSequence*)ctx->chaining;
if (ctx->bits_in_last_byte) {
ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte);
ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte);
ctx->bits_in_last_byte = 0;
}
else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) {
while (ctx->buf_ptr < GROESTL_SIZE512) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
ctx->buf_ptr = 0;
}
while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
ctx->block_counter1++;
if (ctx->block_counter1 == 0) ctx->block_counter2++;
ctx->buf_ptr = GROESTL_SIZE512;
while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t)) {
ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
ctx->block_counter1 >>= 8;
}
while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) {
ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
ctx->block_counter2 >>= 8;
}
cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
cn_groestl_outputtransformation(ctx);
for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
output[j] = s[i];
}
for (i = 0; i < GROESTL_COLS512; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < GROESTL_SIZE512; i++) {
ctx->buffer[i] = 0;
}
}
__device__
void cn_groestl_update(groestlHashState* __restrict__ ctx,
const BitSequence* __restrict__ input, DataLength databitlen)
{
int index = 0;
int msglen = (int)(databitlen/8);
int rem = (int)(databitlen%8);
if (ctx->buf_ptr) {
while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) {
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
}
if (ctx->buf_ptr < GROESTL_SIZE512) {
if (rem) {
ctx->bits_in_last_byte = rem;
ctx->buffer[(int)ctx->buf_ptr++] = input[index];
}
return;
}
ctx->buf_ptr = 0;
cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
}
cn_groestl_transform(ctx, input+index, msglen-index);
index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512;
while (index < msglen) {
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
}
if (rem) {
ctx->bits_in_last_byte = rem;
ctx->buffer[(int)ctx->buf_ptr++] = input[index];
}
}
__device__
void cn_groestl_init(groestlHashState* ctx)
{
int i = 0;
for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++)
ctx->chaining[i] = 0;
ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN);
ctx->buf_ptr = 0;
ctx->block_counter1 = 0;
ctx->block_counter2 = 0;
ctx->bits_in_last_byte = 0;
}
__device__
void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
{
DataLength databitlen = len << 3;
groestlHashState context;
cn_groestl_init(&context);
cn_groestl_update(&context, data, databitlen);
cn_groestl_final(&context, hashval);
}

293
crypto/cn_jh.cuh

@ -0,0 +1,293 @@
typedef struct {
int hashbitlen;
unsigned long long databitlen;
unsigned long long datasize_in_buffer;
uint64_t x[8][2];
unsigned char buffer[64];
} jhHashState;
__constant__ unsigned char d_JH256_H0[512] = {
0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1,
0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3,
0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77,
0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8,
0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62,
0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c,
0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf,
0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69
};
__constant__ unsigned char d_E8_rc[42][32] = {
{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}
};
#define JH_SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1));
#define JH_SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2));
#define JH_SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4));
#define JH_SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8));
#define JH_SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16));
#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32));
#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \
(m4) ^= (m1); \
(m5) ^= (m2); \
(m6) ^= (m0) ^ (m3); \
(m7) ^= (m0); \
(m0) ^= (m5); \
(m1) ^= (m6); \
(m2) ^= (m4) ^ (m7); \
(m3) ^= (m4);
#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \
m3 = ~(m3); \
m7 = ~(m7); \
m0 ^= ((~(m2)) & (cc0)); \
m4 ^= ((~(m6)) & (cc1)); \
temp0 = (cc0) ^ ((m0) & (m1));\
temp1 = (cc1) ^ ((m4) & (m5));\
m0 ^= ((m2) & (m3)); \
m4 ^= ((m6) & (m7)); \
m3 ^= ((~(m1)) & (m2)); \
m7 ^= ((~(m5)) & (m6)); \
m1 ^= ((m0) & (m2)); \
m5 ^= ((m4) & (m6)); \
m2 ^= ((m0) & (~(m3))); \
m6 ^= ((m4) & (~(m7))); \
m0 ^= ((m1) | (m3)); \
m4 ^= ((m5) | (m7)); \
m3 ^= ((m1) & (m2)); \
m7 ^= ((m5) & (m6)); \
m1 ^= (temp0 & (m0)); \
m5 ^= (temp1 & (m4)); \
m2 ^= temp0; \
m6 ^= temp1;
__device__
void cn_jh_E8(jhHashState *state)
{
uint64_t i,roundnumber,temp0,temp1;
for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7)
{
for (i = 0; i < 2; i++) {
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],
((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] );
JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]);
}
for (i = 0; i < 2; i++) {
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],
((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] );
JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]);
}
for (i = 0; i < 2; i++) {
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],
((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] );
JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]);
}
for (i = 0; i < 2; i++) {
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],
((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] );
JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]);
}
for (i = 0; i < 2; i++) {
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],
((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] );
JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]);
}
for (i = 0; i < 2; i++) {
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],
((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] );
JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]);
}
for (i = 0; i < 2; i++) {
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],
((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] );
JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
}
for (i = 1; i < 8; i = i+2) {
temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0;
}
}
}
__device__
void cn_jh_F8(jhHashState *state)
{
uint64_t i;
for (i = 0; i < 8; i++) {
state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i];
}
cn_jh_E8(state);
for (i = 0; i < 8; i++) {
state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i];
}
}
__device__
void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
{
DataLength index;
state->databitlen += databitlen;
index = 0;
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) )
{
if ( (databitlen & 7) == 0 ) {
memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3));
}
else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1);
state->datasize_in_buffer += databitlen;
databitlen = 0;
}
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
index = 64-(state->datasize_in_buffer >> 3);
databitlen = databitlen - (512 - state->datasize_in_buffer);
cn_jh_F8(state);
state->datasize_in_buffer = 0;
}
for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) {
memcpy(state->buffer, data+index, 64);
cn_jh_F8(state);
}
if ( databitlen > 0) {
if ((databitlen & 7) == 0)
memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
else
memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
state->datasize_in_buffer = databitlen;
}
}
/* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
__device__
void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval)
{
unsigned int i;
//uint32_t *bufptr = (uint32_t *)state->buffer;
if ( (state->databitlen & 0x1ff) == 0 ) {
/* pad the message when databitlen is multiple of 512 bits, then process the padded block */
memset(state->buffer, 0, 64);
//for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000;
state->buffer[0] = 0x80;
state->buffer[63] = state->databitlen & 0xff;
state->buffer[62] = (state->databitlen >> 8) & 0xff;
state->buffer[61] = (state->databitlen >> 16) & 0xff;
state->buffer[60] = (state->databitlen >> 24) & 0xff;
state->buffer[59] = (state->databitlen >> 32) & 0xff;
state->buffer[58] = (state->databitlen >> 40) & 0xff;
state->buffer[57] = (state->databitlen >> 48) & 0xff;
state->buffer[56] = (state->databitlen >> 56) & 0xff;
cn_jh_F8(state);
} else {
/*set the rest of the bytes in the buffer to 0*/
if ( (state->datasize_in_buffer & 7) == 0) {
for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
} else {
for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0;
}
/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
cn_jh_F8(state);
memset(state->buffer, 0, 64);
//for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000;
state->buffer[63] = state->databitlen & 0xff;
state->buffer[62] = (state->databitlen >> 8) & 0xff;
state->buffer[61] = (state->databitlen >> 16) & 0xff;
state->buffer[60] = (state->databitlen >> 24) & 0xff;
state->buffer[59] = (state->databitlen >> 32) & 0xff;
state->buffer[58] = (state->databitlen >> 40) & 0xff;
state->buffer[57] = (state->databitlen >> 48) & 0xff;
state->buffer[56] = (state->databitlen >> 56) & 0xff;
cn_jh_F8(state);
}
memcpy(hashval, (unsigned char*)state->x+64+32, 32);
}
__device__
void cn_jh_init(jhHashState *state, int hashbitlen)
{
state->databitlen = 0;
state->datasize_in_buffer = 0;
state->hashbitlen = hashbitlen;
memcpy(state->x, d_JH256_H0, 128);
}
__device__
void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
{
int hashbitlen = 256;
DataLength databitlen = len << 3;
jhHashState state;
cn_jh_init(&state, hashbitlen);
cn_jh_update(&state, data, databitlen);
cn_jh_final(&state, hashval);
}

211
crypto/cn_keccak.cuh

@ -0,0 +1,211 @@
__constant__ uint64_t keccakf_rndc[24] = {
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset)
{
uint2 result;
if(offset >= 32)
{
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
}
else
{
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#define rotl64_1(x, y) (cuda_rotl64((x), (y)))
#else
#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y))))
#endif
#define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y))
#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
__device__ __forceinline__
void cn_keccakf2(uint64_t *s)
{
uint8_t i;
for(i = 0; i < 24; ++i)
{
uint64_t bc[5], tmpxor[5], tmp1, tmp2;
tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);
tmp1 = s[1] ^ bc[0];
s[0] ^= bc[4];
s[1] = rotl64_2(s[6] ^ bc[0], 12);
s[6] = rotl64_1(s[9] ^ bc[3], 20);
s[9] = rotl64_2(s[22] ^ bc[1], 29);
s[22] = rotl64_2(s[14] ^ bc[3], 7);
s[14] = rotl64_1(s[20] ^ bc[4], 18);
s[20] = rotl64_2(s[2] ^ bc[1], 30);
s[2] = rotl64_2(s[12] ^ bc[1], 11);
s[12] = rotl64_1(s[13] ^ bc[2], 25);
s[13] = rotl64_1(s[19] ^ bc[3], 8);
s[19] = rotl64_2(s[23] ^ bc[2], 24);
s[23] = rotl64_2(s[15] ^ bc[4], 9);
s[15] = rotl64_1(s[4] ^ bc[3], 27);
s[4] = rotl64_1(s[24] ^ bc[3], 14);
s[24] = rotl64_1(s[21] ^ bc[0], 2);
s[21] = rotl64_2(s[8] ^ bc[2], 23);
s[8] = rotl64_2(s[16] ^ bc[0], 13);
s[16] = rotl64_2(s[5] ^ bc[4], 4);
s[5] = rotl64_1(s[3] ^ bc[2], 28);
s[3] = rotl64_1(s[18] ^ bc[2], 21);
s[18] = rotl64_1(s[17] ^ bc[1], 15);
s[17] = rotl64_1(s[11] ^ bc[0], 10);
s[11] = rotl64_1(s[7] ^ bc[1], 6);
s[7] = rotl64_1(s[10] ^ bc[4], 3);
s[10] = rotl64_1(tmp1, 1);
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
s[0] ^= keccakf_rndc[i];
}
}
__device__ __forceinline__
void cn_keccakf(uint64_t *s)
{
uint64_t bc[5], tmpxor[5], tmp1, tmp2;
tmpxor[0] = s[0] ^ s[5];
tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL;
tmpxor[2] = s[2] ^ s[7];
tmpxor[3] = s[3] ^ s[8];
tmpxor[4] = s[4] ^ s[9];
bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);
tmp1 = s[1] ^ bc[0];
s[0] ^= bc[4];
s[1] = rotl64_2(s[6] ^ bc[0], 12);
s[6] = rotl64_1(s[9] ^ bc[3], 20);
s[9] = rotl64_2(bc[1], 29);
s[22] = rotl64_2(bc[3], 7);
s[14] = rotl64_1(bc[4], 18);
s[20] = rotl64_2(s[2] ^ bc[1], 30);
s[2] = rotl64_2(bc[1], 11);
s[12] = rotl64_1(bc[2], 25);
s[13] = rotl64_1(bc[3], 8);
s[19] = rotl64_2(bc[2], 24);
s[23] = rotl64_2(bc[4], 9);
s[15] = rotl64_1(s[4] ^ bc[3], 27);
s[4] = rotl64_1(bc[3], 14);
s[24] = rotl64_1(bc[0], 2);
s[21] = rotl64_2(s[8] ^ bc[2], 23);
s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13);
s[16] = rotl64_2(s[5] ^ bc[4], 4);
s[5] = rotl64_1(s[3] ^ bc[2], 28);
s[3] = rotl64_1(bc[2], 21);
s[18] = rotl64_1(bc[1], 15);
s[17] = rotl64_1(bc[0], 10);
s[11] = rotl64_1(s[7] ^ bc[1], 6);
s[7] = rotl64_1(bc[4], 3);
s[10] = rotl64_1(tmp1, 1);
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
s[0] ^= 0x0000000000000001;
for(int i = 1; i < 24; ++i)
{
tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);
tmp1 = s[1] ^ bc[0];
s[0] ^= bc[4];
s[1] = rotl64_2(s[6] ^ bc[0], 12);
s[6] = rotl64_1(s[9] ^ bc[3], 20);
s[9] = rotl64_2(s[22] ^ bc[1], 29);
s[22] = rotl64_2(s[14] ^ bc[3], 7);
s[14] = rotl64_1(s[20] ^ bc[4], 18);
s[20] = rotl64_2(s[2] ^ bc[1], 30);
s[2] = rotl64_2(s[12] ^ bc[1], 11);
s[12] = rotl64_1(s[13] ^ bc[2], 25);
s[13] = rotl64_1(s[19] ^ bc[3], 8);
s[19] = rotl64_2(s[23] ^ bc[2], 24);
s[23] = rotl64_2(s[15] ^ bc[4], 9);
s[15] = rotl64_1(s[4] ^ bc[3], 27);
s[4] = rotl64_1(s[24] ^ bc[3], 14);
s[24] = rotl64_1(s[21] ^ bc[0], 2);
s[21] = rotl64_2(s[8] ^ bc[2], 23);
s[8] = rotl64_2(s[16] ^ bc[0], 13);
s[16] = rotl64_2(s[5] ^ bc[4], 4);
s[5] = rotl64_1(s[3] ^ bc[2], 28);
s[3] = rotl64_1(s[18] ^ bc[2], 21);
s[18] = rotl64_1(s[17] ^ bc[1], 15);
s[17] = rotl64_1(s[11] ^ bc[0], 10);
s[11] = rotl64_1(s[7] ^ bc[1], 6);
s[7] = rotl64_1(s[10] ^ bc[4], 3);
s[10] = rotl64_1(tmp1, 1);
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
s[0] ^= keccakf_rndc[i];
}
}
__device__ __forceinline__
void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
{
uint64_t st[25];
MEMCPY4(st, in, 19);
MEMSET8(&st[10], 0x00, 15);
st[9] = (st[9] & 0x00000000FFFFFFFFULL) | 0x0000000100000000ULL;
st[16] = 0x8000000000000000ULL;
cn_keccakf(st);
MEMCPY8(md, st, 25);
return;
}

345
crypto/cn_skein.cuh

@ -0,0 +1,345 @@
typedef unsigned int uint_t; /* native unsigned integer */
#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */
#define SKEIN_256_STATE_WORDS ( 4)
#define SKEIN_512_STATE_WORDS ( 8)
#define SKEIN1024_STATE_WORDS (16)
#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */
#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */
#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */
#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */
#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */
#define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD)
#define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL)
#define SKEIN_BLK_TYPE_MSG (48) /* message processing */
#define SKEIN_BLK_TYPE_OUT (63) /* output stage */
#define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */
#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */
#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \
Skein_Set_T0(ctxPtr,(T0)); \
Skein_Set_T1(ctxPtr,(T1)); }
#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \
{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; }
#define KW_TWK_BASE (0)
#define KW_KEY_BASE (3)
#define ks (kw + KW_KEY_BASE)
#define ts (kw + KW_TWK_BASE)
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \
X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \
X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \
X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \
X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6;
#define I512(R) \
X0 += ks[((R)+1) % 9]; \
X1 += ks[((R)+2) % 9]; \
X2 += ks[((R)+3) % 9]; \
X3 += ks[((R)+4) % 9]; \
X4 += ks[((R)+5) % 9]; \
X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \
X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \
X7 += ks[((R)+8) % 9] + (R)+1;
#define R512_8_rounds(R) \
R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \
R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \
R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \
R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \
I512(2*(R)); \
R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \
R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \
R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \
R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \
I512(2*(R)+1);
typedef struct
{
size_t hashBitLen;
size_t bCnt;
uint64_t T[SKEIN_MODIFIER_WORDS];
} Skein_Ctxt_Hdr_t;
typedef struct {
Skein_Ctxt_Hdr_t h;
uint64_t X[SKEIN_256_STATE_WORDS];
uint8_t b[SKEIN_256_BLOCK_BYTES];
} Skein_256_Ctxt_t;
typedef struct {
Skein_Ctxt_Hdr_t h;
uint64_t X[SKEIN_512_STATE_WORDS];
uint8_t b[SKEIN_512_BLOCK_BYTES];
} Skein_512_Ctxt_t;
typedef struct {
Skein_Ctxt_Hdr_t h;
uint64_t X[SKEIN1024_STATE_WORDS];
uint8_t b[SKEIN1024_BLOCK_BYTES];
} Skein1024_Ctxt_t;
typedef struct {
uint_t statebits;
union {
Skein_Ctxt_Hdr_t h;
Skein_256_Ctxt_t ctx_256;
Skein_512_Ctxt_t ctx_512;
Skein1024_Ctxt_t ctx1024;
} u;
} skeinHashState;
__device__
void cn_skein_init(skeinHashState *state, size_t hashBitLen)
{
const uint64_t SKEIN_512_IV_256[] =
{
SKEIN_MK_64(0xCCD044A1,0x2FDB3E13),
SKEIN_MK_64(0xE8359030,0x1A79A9EB),
SKEIN_MK_64(0x55AEA061,0x4F816E6F),
SKEIN_MK_64(0x2A2767A4,0xAE9B94DB),
SKEIN_MK_64(0xEC06025E,0x74DD7683),
SKEIN_MK_64(0xE7A436CD,0xC4746251),
SKEIN_MK_64(0xC36FBAF9,0x393AD185),
SKEIN_MK_64(0x3EEDBA18,0x33EDFC13)
};
Skein_512_Ctxt_t *ctx = &state->u.ctx_512;
ctx->h.hashBitLen = hashBitLen;
memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X));
Skein_Start_New_Type(ctx, MSG);
}
__device__
void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
{
enum {
R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22
};
uint64_t X0,X1,X2,X3,X4,X5,X6,X7;
uint64_t w[SKEIN_512_STATE_WORDS];
uint64_t kw[SKEIN_512_STATE_WORDS+4];
ts[0] = ctx->h.T[0];
ts[1] = ctx->h.T[1];
do {
ts[0] += byteCntAdd;
ks[0] = ctx->X[0];
ks[1] = ctx->X[1];
ks[2] = ctx->X[2];
ks[3] = ctx->X[3];
ks[4] = ctx->X[4];
ks[5] = ctx->X[5];
ks[6] = ctx->X[6];
ks[7] = ctx->X[7];
ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
ts[2] = ts[0] ^ ts[1];
memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3);
X0 = w[0] + ks[0];
X1 = w[1] + ks[1];
X2 = w[2] + ks[2];
X3 = w[3] + ks[3];
X4 = w[4] + ks[4];
X5 = w[5] + ks[5] + ts[0];
X6 = w[6] + ks[6] + ts[1];
X7 = w[7] + ks[7];
blkPtr += SKEIN_512_BLOCK_BYTES;
R512_8_rounds( 0);
R512_8_rounds( 1);
R512_8_rounds( 2);
R512_8_rounds( 3);
R512_8_rounds( 4);
R512_8_rounds( 5);
R512_8_rounds( 6);
R512_8_rounds( 7);
R512_8_rounds( 8);
ctx->X[0] = X0 ^ w[0];
ctx->X[1] = X1 ^ w[1];
ctx->X[2] = X2 ^ w[2];
ctx->X[3] = X3 ^ w[3];
ctx->X[4] = X4 ^ w[4];
ctx->X[5] = X5 ^ w[5];
ctx->X[6] = X6 ^ w[6];
ctx->X[7] = X7 ^ w[7];
ts[1] &= ~SKEIN_T1_FLAG_FIRST;
} while (--blkCnt);
ctx->h.T[0] = ts[0];
ctx->h.T[1] = ts[1];
}
__device__
void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
{
size_t i,n,byteCnt;
uint64_t X[SKEIN_512_STATE_WORDS];
Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
//size_t tmp;
//uint8_t *p8;
//uint64_t *p64;
ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) {
memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
//p8 = &ctx->b[ctx->h.bCnt];
//tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
//for( i = 0; i < tmp; i++ ) *(p8+i) = 0;
}
cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt);
byteCnt = (ctx->h.hashBitLen + 7) >> 3;
//uint8_t b[SKEIN_512_BLOCK_BYTES] == 64
memset(ctx->b,0,sizeof(ctx->b));
//p64 = (uint64_t *)ctx->b;
//for( i = 0; i < 8; i++ ) *(p64+i) = 0;
memcpy(X,ctx->X,sizeof(X));
for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) {
((uint64_t *)ctx->b)[0]= (uint64_t)i;
Skein_Start_New_Type(ctx,OUT_FINAL);
cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t));
n = byteCnt - i*SKEIN_512_BLOCK_BYTES;
if (n >= SKEIN_512_BLOCK_BYTES)
n = SKEIN_512_BLOCK_BYTES;
memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);
memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
}
}
__device__
void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
{
size_t n;
if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
{
if (ctx->h.bCnt) {
n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
if (n) {
memcpy(&ctx->b[ctx->h.bCnt],msg,n);
msgByteCnt -= n;
msg += n;
ctx->h.bCnt += n;
}
cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
ctx->h.bCnt = 0;
}
if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
msg += n * SKEIN_512_BLOCK_BYTES;
}
}
if (msgByteCnt) {
memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
ctx->h.bCnt += msgByteCnt;
}
}
__device__
void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
{
if ((databitlen & 7) == 0) {
cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3);
}
else {
size_t bCnt = (databitlen >> 3) + 1;
uint8_t b,mask;
mask = (uint8_t) (1u << (7 - (databitlen & 7)));
b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask);
cn_skein512_update(&state->u.ctx_512, data, bCnt-1);
cn_skein512_update(&state->u.ctx_512, &b, 1);
Skein_Set_Bit_Pad_Flag(state->u.h);
}
}
__device__
void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
{
int hashbitlen = 256;
DataLength databitlen = len << 3;
skeinHashState state;
state.statebits = 64*SKEIN_512_STATE_WORDS;
cn_skein_init(&state, hashbitlen);
cn_skein_update(&state, data, databitlen);
cn_skein_final(&state, hashval);
}

122
crypto/cpu/c_keccak.c

@ -0,0 +1,122 @@
// keccak.c
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
// A baseline Keccak (3rd round) implementation.
#include "c_keccak.h"
const uint64_t keccakf_rndc[24] =
{
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};
const int keccakf_rotc[24] =
{
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
};
const int keccakf_piln[24] =
{
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
};
// update the state with given number of rounds
void keccakf(uint64_t st[25], int rounds)
{
int i, j, round;
uint64_t t, bc[5];
for (round = 0; round < rounds; ++round) {
// Theta
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
for (i = 0; i < 5; ++i) {
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
st[i ] ^= t;
st[i + 5] ^= t;
st[i + 10] ^= t;
st[i + 15] ^= t;
st[i + 20] ^= t;
}
// Rho Pi
t = st[1];
for (i = 0; i < 24; ++i) {
bc[0] = st[keccakf_piln[i]];
st[keccakf_piln[i]] = ROTL64(t, keccakf_rotc[i]);
t = bc[0];
}
// Chi
for (j = 0; j < 25; j += 5) {
bc[0] = st[j ];
bc[1] = st[j + 1];
bc[2] = st[j + 2];
bc[3] = st[j + 3];
bc[4] = st[j + 4];
st[j ] ^= (~bc[1]) & bc[2];
st[j + 1] ^= (~bc[2]) & bc[3];
st[j + 2] ^= (~bc[3]) & bc[4];
st[j + 3] ^= (~bc[4]) & bc[0];
st[j + 4] ^= (~bc[0]) & bc[1];
}
// Iota
st[0] ^= keccakf_rndc[round];
}
}
// compute a keccak hash (md) of given byte length from "in"
typedef uint64_t state_t[25];
int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
{
state_t st;
uint8_t temp[144];
int i, rsiz, rsizw;
rsiz = sizeof(state_t) == mdlen ? HASH_DATA_AREA : 200 - 2 * mdlen;
rsizw = rsiz / 8;
memset(st, 0, sizeof(st));
for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) {
for (i = 0; i < rsizw; i++)
st[i] ^= ((uint64_t *) in)[i];
keccakf(st, KECCAK_ROUNDS);
}
// last block and padding
memcpy(temp, in, inlen);
temp[inlen++] = 1;
memset(temp + inlen, 0, rsiz - inlen);
temp[rsiz - 1] |= 0x80;
for (i = 0; i < rsizw; i++)
st[i] ^= ((uint64_t *) temp)[i];
keccakf(st, KECCAK_ROUNDS);
memcpy(md, st, mdlen);
return 0;
}
void keccak1600(const uint8_t *in, int inlen, uint8_t *md)
{
keccak(in, inlen, md, sizeof(state_t));
}

34
crypto/cpu/c_keccak.h

@ -0,0 +1,34 @@
// keccak.h
// 19-Nov-11 Markku-Juhani O. Saarinen <mjos@iki.fi>
#ifndef KECCAK_H
#define KECCAK_H
#include <stdint.h>
#include <string.h>
#ifndef KECCAK_ROUNDS
#define KECCAK_ROUNDS 24
#endif
#ifndef ROTL64
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
#endif
#ifndef HASH_SIZE
#define HASH_SIZE 32
#endif
#ifndef HASH_DATA_AREA
#define HASH_DATA_AREA 136
#endif
// compute a keccak hash (md) of given byte length from "in"
int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
// update the state
void keccakf(uint64_t st[25], int norounds);
void keccak1600(const uint8_t *in, int inlen, uint8_t *md);
#endif

226
crypto/cryptonight-cpu.cpp

@ -0,0 +1,226 @@
#include <miner.h>
#include <memory.h>
#include "oaes_lib.h"
#include "cryptonight.h"
extern "C" {
#include <sph/sph_blake.h>
#include <sph/sph_groestl.h>
#include <sph/sph_jh.h>
#include <sph/sph_skein.h>
#include "cpu/c_keccak.h"
}
struct cryptonight_ctx {
uint8_t long_state[MEMORY];
union cn_slow_hash_state state;
uint8_t text[INIT_SIZE_BYTE];
uint8_t a[AES_BLOCK_SIZE];
uint8_t b[AES_BLOCK_SIZE];
uint8_t c[AES_BLOCK_SIZE];
oaes_ctx* aes_ctx;
};
static void do_blake_hash(const void* input, size_t len, void* output)
{
uchar hash[32];
sph_blake256_context ctx;
sph_blake256_set_rounds(14);
sph_blake256_init(&ctx);
sph_blake256(&ctx, input, len);
sph_blake256_close(&ctx, hash);
memcpy(output, hash, 32);
}
static void do_groestl_hash(const void* input, size_t len, void* output)
{
uchar hash[32];
sph_groestl256_context ctx;
sph_groestl256_init(&ctx);
sph_groestl256(&ctx, input, len);
sph_groestl256_close(&ctx, hash);
memcpy(output, hash, 32);
}
static void do_jh_hash(const void* input, size_t len, void* output)
{
uchar hash[64];
sph_jh256_context ctx;
sph_jh256_init(&ctx);
sph_jh256(&ctx, input, len);
sph_jh256_close(&ctx, hash);
memcpy(output, hash, 32);
}
static void do_skein_hash(const void* input, size_t len, void* output)
{
uchar hash[32];
sph_skein256_context ctx;
sph_skein256_init(&ctx);
sph_skein256(&ctx, input, len);
sph_skein256_close(&ctx, hash);
memcpy(output, hash, 32);
}
// todo: use sph if possible
static void keccak_hash_permutation(union hash_state *state) {
keccakf((uint64_t*)state, 24);
}
static void keccak_hash_process(union hash_state *state, const uint8_t *buf, size_t count) {
keccak1600(buf, (int)count, (uint8_t*)state);
}
extern "C" int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern "C" int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern "C" int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
extern "C" int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
static void (* const extra_hashes[4])(const void*, size_t, void *) = {
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
};
uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
{
// multiplier = ab = a * 2^32 + b
// multiplicand = cd = c * 2^32 + d
// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
uint64_t a = hi_dword(multiplier);
uint64_t b = lo_dword(multiplier);
uint64_t c = hi_dword(multiplicand);
uint64_t d = lo_dword(multiplicand);
uint64_t ac = a * c;
uint64_t ad = a * d;
uint64_t bc = b * c;
uint64_t bd = b * d;
uint64_t adbc = ad + bc;
uint64_t adbc_carry = adbc < ad ? 1 : 0;
// multiplier * multiplicand = product_hi * 2^64 + product_lo
uint64_t product_lo = bd + (adbc << 32);
uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
return product_lo;
}
static size_t e2i(const uint8_t* a) {
return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (MEMORY / AES_BLOCK_SIZE - 1);
}
static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) {
((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res);
}
static void sum_half_blocks(uint8_t* a, const uint8_t* b) {
((uint64_t*) a)[0] += ((uint64_t*) b)[0];
((uint64_t*) a)[1] += ((uint64_t*) b)[1];
}
static void sum_half_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] + ((uint64_t*) b)[0];
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] + ((uint64_t*) b)[1];
}
static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, uint8_t* dst) {
((uint64_t*) dst)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) dst) + ((uint64_t*) c)[1];
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
((uint64_t*) dst)[0] = ((uint64_t*) src)[0];
((uint64_t*) dst)[1] = ((uint64_t*) src)[1];
}
static void xor_blocks(uint8_t* a, const uint8_t* b) {
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
}
static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
{
size_t i, j;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
RND(3);
RND(4);
RND(5);
RND(6);
RND(7);
memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
}
xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
for (i = 0; likely(i < ITER / 4); ++i) {
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
RND(3);
RND(4);
RND(5);
RND(6);
RND(7);
}
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
keccak_hash_permutation(&ctx->state.hs);
int extra_algo = ctx->state.hs.b[0] & 3;
extra_hashes[extra_algo](&ctx->state, 200, output);
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
void cryptonight_hash(void* output, const void* input, size_t len)
{
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptonight_hash_ctx(output, input, len, ctx);
free(ctx);
}

170
crypto/cryptonight.cu

@ -0,0 +1,170 @@
#include <ctype.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <miner.h>
#include "cryptonight.h"
extern char *device_config[MAX_GPUS]; // -l 24x32
uint32_t cn_blocks = 24;
uint32_t cn_threads = 32;
static uint32_t *d_long_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
int res = 0;
uint32_t throughput = 0;
uint32_t *ptarget = work->target;
uint8_t *pdata = (uint8_t*) work->data;
uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
const uint32_t first_nonce = *nonceptr;
uint32_t nonce = first_nonce;
if(opt_benchmark) {
ptarget[7] = 0x00ff;
}
if(!init[thr_id])
{
if (device_config[thr_id]) {
sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
gpulog(LOG_INFO, thr_id, "Using %u x %u threads kernel launch config", cn_blocks, cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
} else {
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
}
if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) {
gpulog(LOG_ERR, thr_id, "THE 32bit VERSION CAN'T ALLOCATE MORE THAN 4GB OF MEMORY!");
gpulog(LOG_ERR, thr_id, "PLEASE REDUCE THE NUMBER OF THREADS OR BLOCKS");
exit(1);
}
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage (linux)
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
CUDA_LOG_ERROR();
}
const size_t alloc = MEMORY * throughput;
cryptonight_extra_cpu_init(thr_id, throughput);
cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key2[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_text[thr_id], 32 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_a[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
init[thr_id] = true;
}
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_blocks);
do
{
const uint32_t Htarg = ptarget[7];
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
*hashes_done = nonce - first_nonce + throughput;
if(resNonces[0] != UINT32_MAX)
{
uint32_t vhash[8];
uint32_t tempdata[19];
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
gpulog(LOG_DEBUG, thr_id, "found nonce %x", resNonces[0]);
cryptonight_hash(vhash, tempdata, 76);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
res = 1;
work->nonces[0] = resNonces[0];
work_set_target_ratio(work, vhash);
// second nonce
if(resNonces[1] != UINT32_MAX)
{
*tempnonceptr = resNonces[1];
cryptonight_hash(vhash, tempdata, 76);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
res++;
work->nonces[1] = resNonces[1];
} else if (vhash[7] > Htarg) {
gpulog(LOG_WARNING, thr_id, "result for second nonce %08x does not validate on CPU!", resNonces[1]);
}
}
goto done;
} else if (vhash[7] > Htarg) {
gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", resNonces[0]);
}
}
if ((uint64_t) throughput + nonce >= max_nonce - 127) {
nonce = max_nonce;
break;
}
nonce += throughput;
gpulog(LOG_DEBUG, thr_id, "nonce %08x", nonce);
} while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + nonce);
done:
gpulog(LOG_DEBUG, thr_id, "nonce %08x exit", nonce);
*nonceptr = nonce;
return res;
}
void free_cryptonight(int thr_id)
{
if (!init[thr_id])
return;
cudaFree(d_long_state[thr_id]);
cudaFree(d_ctx_state[thr_id]);
cudaFree(d_ctx_key1[thr_id]);
cudaFree(d_ctx_key2[thr_id]);
cudaFree(d_ctx_text[thr_id]);
cudaFree(d_ctx_a[thr_id]);
cudaFree(d_ctx_b[thr_id]);
cryptonight_extra_cpu_free(thr_id);
cudaDeviceSynchronize();
init[thr_id] = false;
}

156
crypto/cryptonight.h

@ -0,0 +1,156 @@
#pragma once
#include <cuda_runtime.h>
#include <miner.h>
#ifdef __INTELLISENSE__
/* avoid red underlining */
#define __CUDA_ARCH__ 520
struct uint3 {
unsigned int x, y, z;
};
struct uint3 threadIdx;
struct uint3 blockIdx;
struct uint3 blockDim;
#define __funnelshift_r(a,b,c) 1
#define __syncthreads()
#define asm(x)
#define __shfl(a,b,c) 1
#endif
#define MEMORY (1 << 21) // 2 MiB / 2097152 B
#define ITER (1 << 20) // 1048576
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE 32
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B
#define AES_RKEY_LEN 4
#define AES_COL_LEN 4
#define AES_ROUND_BASE 7
#ifndef HASH_SIZE
#define HASH_SIZE 32
#endif
#ifndef HASH_DATA_AREA
#define HASH_DATA_AREA 136
#endif
#define hi_dword(x) (x >> 32)
#define lo_dword(x) (x & 0xFFFFFFFF)
#define C32(x) ((uint32_t)(x ## U))
#define T32(x) ((x) & C32(0xFFFFFFFF))
#ifndef ROTL64
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#define ROTL64(x, n) (cuda_ROTL64(x, n))
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#endif
#ifndef ROTL32
#if __CUDA_ARCH__ < 350
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
#endif
#ifndef ROTR32
#if __CUDA_ARCH__ < 350
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else
#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
#endif
#endif
#define MEMSET8(dst,what,cnt) { \
int i_memset8; \
uint64_t *out_memset8 = (uint64_t *)(dst); \
for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \
out_memset8[i_memset8] = (what); }
#define MEMSET4(dst,what,cnt) { \
int i_memset4; \
uint32_t *out_memset4 = (uint32_t *)(dst); \
for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \
out_memset4[i_memset4] = (what); }
#define MEMCPY8(dst,src,cnt) { \
int i_memcpy8; \
uint64_t *in_memcpy8 = (uint64_t *)(src); \
uint64_t *out_memcpy8 = (uint64_t *)(dst); \
for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \
out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }
#define MEMCPY4(dst,src,cnt) { \
int i_memcpy4; \
uint32_t *in_memcpy4 = (uint32_t *)(src); \
uint32_t *out_memcpy4 = (uint32_t *)(dst); \
for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }
#define XOR_BLOCKS(a,b) { \
((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \
((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; }
#define XOR_BLOCKS_DST(x,y,z) { \
((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }
#define MUL_SUM_XOR_DST(a,c,dst) { \
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
hi += ((uint64_t *)c)[0]; \
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
((uint64_t *)dst)[0] = hi; \
((uint64_t *)dst)[1] = lo; }
#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))
union hash_state {
uint8_t b[200];
uint64_t w[25];
};
union cn_slow_hash_state {
union hash_state hs;
struct {
uint8_t k[64];
uint8_t init[INIT_SIZE_BYTE];
};
};
static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
{
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) {
gpulog(LOG_ERR, thr_id, "%s %s line %d", cudaGetErrorString(err), src, line);
exit(1);
}
}
void hash_permutation(union hash_state *state);
void hash_process(union hash_state *state, const uint8_t *buf, size_t count);
void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint32_t *d_ctx_state);

262
crypto/cuda_cryptonight_core.cu

@ -0,0 +1,262 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "cryptonight.h"
#ifdef WIN32
int cn_bfactor = 6;
int cn_bsleep = 100;
#else
int cn_bfactor = 0;
int cn_bsleep = 0;
#endif
#include "cn_aes.cuh"
__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
{
*product_hi = __umul64hi(multiplier, multiplicand);
return(multiplier * multiplicand);
}
__global__
void cryptonight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
const int sub = (threadIdx.x & 7) << 2;
if(thread < threads)
{
uint32_t key[40], text[4];
MEMCPY8(key, ctx_key1 + thread * 40, 20);
MEMCPY8(text, ctx_state + thread * 50 + sub + 16, 2);
__syncthreads();
for(int i = 0; i < 0x80000; i += 32)
{
cn_aes_pseudo_round_mut(sharedMemory, text, key);
MEMCPY8(&long_state[(thread << 19) + sub + i], text, 2);
}
}
}
__global__
void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
#if __CUDA_ARCH__ >= 300
const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
const int sub = threadIdx.x & 3;
if(thread < threads)
{
const int batchsize = ITER >> (2 + bfactor);
const int start = partidx * batchsize;
const int end = start + batchsize;
uint32_t * __restrict__ long_state = &d_long_state[thread << 19];
uint32_t * __restrict__ ctx_a = d_ctx_a + thread * 4;
uint32_t * __restrict__ ctx_b = d_ctx_b + thread * 4;
uint32_t a, b, c, x[4];
uint32_t t1[4], t2[4], res;
uint64_t reshi, reslo;
int j;
a = ctx_a[sub];
b = ctx_b[sub];
#pragma unroll 8
for(int i = start; i < end; ++i)
{
//j = ((uint32_t *)a)[0] & 0x1FFFF0;
j = (__shfl((int)a, 0, 4) & 0x1FFFF0) >> 2;
//cn_aes_single_round(sharedMemory, &long_state[j], c, a);
x[0] = long_state[j + sub];
x[1] = __shfl((int)x[0], sub + 1, 4);
x[2] = __shfl((int)x[0], sub + 2, 4);
x[3] = __shfl((int)x[0], sub + 3, 4);
c = a ^
t_fn0(x[0] & 0xff) ^
t_fn1((x[1] >> 8) & 0xff) ^
t_fn2((x[2] >> 16) & 0xff) ^
t_fn3((x[3] >> 24) & 0xff);
//XOR_BLOCKS_DST(c, b, &long_state[j]);
long_state[j + sub] = c ^ b;
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
j = (__shfl((int)c, 0, 4) & 0x1FFFF0) >> 2;
#pragma unroll
for(int k = 0; k < 2; k++)
t1[k] = __shfl((int)c, k, 4);
#pragma unroll
for(int k = 0; k < 4; k++)
t2[k] = __shfl((int)a, k, 4);
asm(
"mad.lo.u64 %0, %2, %3, %4;\n\t"
"mad.hi.u64 %1, %2, %3, %5;\n\t"
: "=l"(reslo), "=l"(reshi)
: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0);
a = long_state[j + sub] ^ res;
long_state[j + sub] = res;
//j = ((uint32_t *)a)[0] & 0x1FFFF0;
j = (__shfl((int)a, 0, 4) & 0x1FFFF0) >> 2;
//cn_aes_single_round(sharedMemory, &long_state[j], b, a);
x[0] = long_state[j + sub];
x[1] = __shfl((int)x[0], sub + 1, 4);
x[2] = __shfl((int)x[0], sub + 2, 4);
x[3] = __shfl((int)x[0], sub + 3, 4);
b = a ^
t_fn0(x[0] & 0xff) ^
t_fn1((x[1] >> 8) & 0xff) ^
t_fn2((x[2] >> 16) & 0xff) ^
t_fn3((x[3] >> 24) & 0xff);
//XOR_BLOCKS_DST(b, c, &long_state[j]);
long_state[j + sub] = c ^ b;
//MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
j = (__shfl((int)b, 0, 4) & 0x1FFFF0) >> 2;
#pragma unroll
for(int k = 0; k < 2; k++)
t1[k] = __shfl((int)b, k, 4);
#pragma unroll
for(int k = 0; k < 4; k++)
t2[k] = __shfl((int)a, k, 4);
asm(
"mad.lo.u64 %0, %2, %3, %4;\n\t"
"mad.hi.u64 %1, %2, %3, %5;\n\t"
: "=l"(reslo), "=l"(reshi)
: "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0]));
res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0);
a = long_state[j + sub] ^ res;
long_state[j + sub] = res;
}
if(bfactor > 0)
{
ctx_a[sub] = a;
ctx_b[sub] = b;
}
}
#else // __CUDA_ARCH__ < 300
const int thread = blockDim.x * blockIdx.x + threadIdx.x;
if(thread < threads)
{
const int batchsize = ITER >> (2 + bfactor);
const int start = partidx * batchsize;
const int end = start + batchsize;
uint32_t * __restrict__ long_state = &d_long_state[thread << 19];
uint32_t * __restrict__ ctx_a = d_ctx_a + thread * 4;
uint32_t * __restrict__ ctx_b = d_ctx_b + thread * 4;
uint32_t a[4], b[4], c[4];
int j;
MEMCPY8(a, ctx_a, 2);
MEMCPY8(b, ctx_b, 2);
for(int i = start; i < end; ++i)
{
j = (a[0] & 0x1FFFF0) >> 2;
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
XOR_BLOCKS_DST(c, b, &long_state[j]);
MUL_SUM_XOR_DST(c, a, (uint8_t *)&long_state[(c[0] & 0x1FFFF0) >> 2]);
j = (a[0] & 0x1FFFF0) >> 2;
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
XOR_BLOCKS_DST(b, c, &long_state[j]);
MUL_SUM_XOR_DST(b, a, &long_state[(b[0] & 0x1FFFF0) >> 2]);
}
if(bfactor > 0)
{
MEMCPY8(ctx_a, a, 2);
MEMCPY8(ctx_b, b, 2);
}
}
#endif // __CUDA_ARCH__ >= 300
}
__global__
void cryptonight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2)
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
int sub = (threadIdx.x & 7) << 2;
if(thread < threads)
{
uint32_t key[40], text[4];
MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
__syncthreads();
for(int i = 0; i < 0x80000; i += 32)
{
#pragma unroll
for(int j = 0; j < 4; ++j)
text[j] ^= long_state[(thread << 19) + sub + i + j];
cn_aes_pseudo_round_mut(sharedMemory, text, key);
}
MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
}
}
__host__
void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
{
dim3 grid(blocks);
dim3 block(threads);
dim3 block4(threads << 2);
dim3 block8(threads << 3);
const int bfactor = cn_bfactor; // device_bfactor[thr_id];
const int bsleep = cn_bsleep; //device_bsleep[thr_id];
int i, partcount = 1 << bfactor;
int dev_id = device_map[thr_id];
cryptonight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for(i = 0; i < partcount; i++)
{
cryptonight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptonight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}

234
crypto/cuda_cryptonight_extra.cu

@ -0,0 +1,234 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <miner.h>
#include <cuda_helper.h>
//#include <cuda.h>
//#include <cuda_runtime.h>
#include "cryptonight.h"
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
static uint32_t *d_input[MAX_GPUS] = { 0 };
static uint32_t *d_target[MAX_GPUS];
static uint32_t *d_result[MAX_GPUS];
#include "cn_keccak.cuh"
#include "cn_blake.cuh"
#include "cn_groestl.cuh"
#include "cn_jh.cuh"
#include "cn_skein.cuh"
__constant__ uint8_t d_sub_byte[16][16] = {
{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76},
{0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0},
{0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15},
{0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75},
{0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84},
{0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf},
{0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8},
{0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2},
{0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73},
{0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb},
{0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79},
{0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08},
{0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a},
{0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e},
{0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf},
{0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}
};
__device__ __forceinline__
void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
{
const uint32_t aes_gf[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
};
MEMSET4(key, 0, 40);
MEMCPY4(key, data, 8);
#pragma unroll
for(int i = 8; i < 40; i++)
{
uint8_t temp[4];
*(uint32_t *)temp = key[i - 1];
if(i % 8 == 0) {
*(uint32_t *)temp = ROTR32(*(uint32_t *)temp, 8);
for(int j = 0; j < 4; j++)
temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f];
*(uint32_t *)temp ^= aes_gf[i / 8 - 1];
}
else if(i % 8 == 4) {
#pragma unroll
for(int j = 0; j < 4; j++)
temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f];
}
key[i] = key[(i - 8)] ^ *(uint32_t *)temp;
}
}
__global__
void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint32_t ctx_state[50];
uint32_t ctx_a[4];
uint32_t ctx_b[4];
uint32_t ctx_key1[40];
uint32_t ctx_key2[40];
uint32_t input[19];
MEMCPY4(input, d_input, 19);
*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
cryptonight_aes_set_key(ctx_key1, ctx_state);
cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
memcpy(d_ctx_state + thread * 50, ctx_state, 50 * 4);
memcpy(d_ctx_a + thread * 4, ctx_a, 4 * 4);
memcpy(d_ctx_b + thread * 4, ctx_b, 4 * 4);
memcpy(d_ctx_key1 + thread * 40, ctx_key1, 40 * 4);
memcpy(d_ctx_key2 + thread * 40, ctx_key2, 40 * 4);
}
}
__global__
void cryptonight_extra_gpu_keccakf2(uint32_t threads, uint32_t * __restrict__ d_ctx_state)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint64_t*ctx_state = (uint64_t*) (&d_ctx_state[thread * 50]);
uint64_t state[25];
#pragma unroll
for(int i = 0; i < 25; i++)
state[i] = ctx_state[i];
cn_keccakf2(state);
#pragma unroll
for(int i = 0; i < 25; i++)
ctx_state[i] = state[i];
}
}
__global__
void cryptonight_extra_gpu_nonces(uint32_t threads, const uint32_t startNonce, const uint32_t * __restrict__ d_ctx_state,
const uint32_t* d_target, uint32_t * resNonces)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 50]);
uint64_t state[25];
#pragma unroll
for(int i = 0; i < 25; i++)
state[i] = ctx_state[i];
uint32_t hash[8];
switch(((uint8_t *)state)[0] & 0x03)
{
case 0: {
cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
break;
}
case 1: {
cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
break;
}
case 2: {
// to double check..
cn_jh((BitSequence*)state, 200, (BitSequence*)hash);
break;
}
case 3: {
cn_skein((BitSequence*)state, 200, (BitSequence*)hash);
break;
}
}
if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
{
const uint32_t nonce = startNonce + thread;
uint32_t tmp = atomicExch(resNonces, nonce);
if(tmp != UINT32_MAX)
resNonces[1] = tmp;
}
}
}
__host__
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
{
uint32_t *pTargetIn = (uint32_t*) ptarget;
cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
{
cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
{
int threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cryptonight_extra_gpu_prepare <<<grid, block >>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state)
{
int threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_keccakf2 <<<grid, block >>> (threads, d_ctx_state);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_nonces <<<grid, block >>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_free(int thr_id)
{
if (d_input[thr_id]) {
cudaFree(d_input[thr_id]);
cudaFree(d_target[thr_id]);
cudaFree(d_result[thr_id]);
d_input[thr_id] = NULL;
}
}

51
crypto/oaes_config.h

@ -0,0 +1,51 @@
/*
* ---------------------------------------------------------------------------
* OpenAES License
* ---------------------------------------------------------------------------
* Copyright (c) 2012, Nabil S. Al Ramli, www.nalramli.com
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* ---------------------------------------------------------------------------
*/
#ifndef _OAES_CONFIG_H
#define _OAES_CONFIG_H
#ifdef __cplusplus
extern "C" {
#endif
//#ifndef OAES_HAVE_ISAAC
//#define OAES_HAVE_ISAAC 1
//#endif // OAES_HAVE_ISAAC
//#ifndef OAES_DEBUG
//#define OAES_DEBUG 0
//#endif // OAES_DEBUG
//#define OAES_DEBUG 1
#ifdef __cplusplus
}
#endif
#endif // _OAES_CONFIG_H

1446
crypto/oaes_lib.cpp

File diff suppressed because it is too large Load Diff

214
crypto/oaes_lib.h

@ -0,0 +1,214 @@
/*
* ---------------------------------------------------------------------------
* OpenAES License
* ---------------------------------------------------------------------------
* Copyright (c) 2012, Nabil S. Al Ramli, www.nalramli.com
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* ---------------------------------------------------------------------------
*/
#ifndef _OAES_LIB_H
#define _OAES_LIB_H
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _WIN32
# ifdef OAES_SHARED
# ifdef oaes_lib_EXPORTS
# define OAES_API __declspec(dllexport)
# else
# define OAES_API __declspec(dllimport)
# endif
# else
# define OAES_API
# endif
#else
# define OAES_API
#endif // WIN32
#define OAES_VERSION "0.8.1"
#define OAES_BLOCK_SIZE 16
typedef void OAES_CTX;
typedef enum
{
OAES_RET_FIRST = 0,
OAES_RET_SUCCESS = 0,
OAES_RET_UNKNOWN,
OAES_RET_ARG1,
OAES_RET_ARG2,
OAES_RET_ARG3,
OAES_RET_ARG4,
OAES_RET_ARG5,
OAES_RET_NOKEY,
OAES_RET_MEM,
OAES_RET_BUF,
OAES_RET_HEADER,
OAES_RET_COUNT
} OAES_RET;
/*
* oaes_set_option() takes one of these values for its [option] parameter
* some options accept either an optional or a required [value] parameter
*/
// no option
#define OAES_OPTION_NONE 0
// enable ECB mode, disable CBC mode
#define OAES_OPTION_ECB 1
// enable CBC mode, disable ECB mode
// value is optional, may pass uint8_t iv[OAES_BLOCK_SIZE] to specify
// the value of the initialization vector, iv
#define OAES_OPTION_CBC 2
#ifdef OAES_DEBUG
typedef int ( * oaes_step_cb ) (
const uint8_t state[OAES_BLOCK_SIZE],
const char * step_name,
int step_count,
void * user_data );
// enable state stepping mode
// value is required, must pass oaes_step_cb to receive the state at each step
#define OAES_OPTION_STEP_ON 4
// disable state stepping mode
#define OAES_OPTION_STEP_OFF 8
#endif // OAES_DEBUG
typedef uint16_t OAES_OPTION;
typedef struct _oaes_key
{
size_t data_len;
uint8_t *data;
size_t exp_data_len;
uint8_t *exp_data;
size_t num_keys;
size_t key_base;
} oaes_key;
typedef struct _oaes_ctx
{
#ifdef OAES_HAVE_ISAAC
randctx * rctx;
#endif // OAES_HAVE_ISAAC
#ifdef OAES_DEBUG
oaes_step_cb step_cb;
#endif // OAES_DEBUG
oaes_key * key;
OAES_OPTION options;
uint8_t iv[OAES_BLOCK_SIZE];
} oaes_ctx;
/*
* // usage:
*
* OAES_CTX * ctx = oaes_alloc();
* .
* .
* .
* {
* oaes_gen_key_xxx( ctx );
* {
* oaes_key_export( ctx, _buf, &_buf_len );
* // or
* oaes_key_export_data( ctx, _buf, &_buf_len );\
* }
* }
* // or
* {
* oaes_key_import( ctx, _buf, _buf_len );
* // or
* oaes_key_import_data( ctx, _buf, _buf_len );
* }
* .
* .
* .
* oaes_encrypt( ctx, m, m_len, c, &c_len );
* .
* .
* .
* oaes_decrypt( ctx, c, c_len, m, &m_len );
* .
* .
* .
* oaes_free( &ctx );
*/
OAES_API OAES_CTX * oaes_alloc(void);
OAES_API OAES_RET oaes_free( OAES_CTX ** ctx );
OAES_API OAES_RET oaes_set_option( OAES_CTX * ctx,
OAES_OPTION option, const void * value );
OAES_API OAES_RET oaes_key_gen_128( OAES_CTX * ctx );
OAES_API OAES_RET oaes_key_gen_192( OAES_CTX * ctx );
OAES_API OAES_RET oaes_key_gen_256( OAES_CTX * ctx );
// export key with header information
// set data == NULL to get the required data_len
OAES_API OAES_RET oaes_key_export( OAES_CTX * ctx,
uint8_t * data, size_t * data_len );
// directly export the data from key
// set data == NULL to get the required data_len
OAES_API OAES_RET oaes_key_export_data( OAES_CTX * ctx,
uint8_t * data, size_t * data_len );
// import key with header information
OAES_API OAES_RET oaes_key_import( OAES_CTX * ctx,
const uint8_t * data, size_t data_len );
// directly import data into key
OAES_API OAES_RET oaes_key_import_data( OAES_CTX * ctx,
const uint8_t * data, size_t data_len );
// set c == NULL to get the required c_len
OAES_API OAES_RET oaes_encrypt( OAES_CTX * ctx,
const uint8_t * m, size_t m_len, uint8_t * c, size_t * c_len );
// set m == NULL to get the required m_len
OAES_API OAES_RET oaes_decrypt( OAES_CTX * ctx,
const uint8_t * c, size_t c_len, uint8_t * m, size_t * m_len );
// set buf == NULL to get the required buf_len
OAES_API OAES_RET oaes_sprintf(
char * buf, size_t * buf_len, const uint8_t * data, size_t data_len );
OAES_API OAES_RET oaes_encryption_round( const uint8_t * key, uint8_t * c );
OAES_API OAES_RET oaes_pseudo_encrypt_ecb( OAES_CTX * ctx, uint8_t * c );
#ifdef __cplusplus
}
#endif
#endif // _OAES_LIB_H

68
crypto/xmr-rpc.cpp

@ -27,6 +27,7 @@
#define PRIu64 "I64u" #define PRIu64 "I64u"
#endif #endif
#include <algos.h>
#include "xmr-rpc.h" #include "xmr-rpc.h"
#include "wildkeccak.h" #include "wildkeccak.h"
@ -376,7 +377,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
goto err_out; goto err_out;
} }
if(!addendums_decode(job)) { if(opt_algo == ALGO_WILDKECCAK && !addendums_decode(job)) {
applog(LOG_ERR, "JSON failed to process addendums"); applog(LOG_ERR, "JSON failed to process addendums");
goto err_out; goto err_out;
} }
@ -396,6 +397,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
applog(LOG_ERR, "JSON invalid blob length"); applog(LOG_ERR, "JSON invalid blob length");
goto err_out; goto err_out;
} }
if (blobLen != 0) if (blobLen != 0)
{ {
pthread_mutex_lock(&rpc2_job_lock); pthread_mutex_lock(&rpc2_job_lock);
@ -420,7 +422,6 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
if(rpc2_target != target) { if(rpc2_target != target) {
double difficulty = (((double) UINT32_MAX) / target); double difficulty = (((double) UINT32_MAX) / target);
stratum.job.diff = difficulty; stratum.job.diff = difficulty;
//applog(LOG_WARNING, "Stratum difficulty set to %.1f M", difficulty/1e6);
rpc2_target = target; rpc2_target = target;
} }
@ -430,6 +431,7 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
rpc2_job_id = strdup(job_id); rpc2_job_id = strdup(job_id);
pthread_mutex_unlock(&rpc2_job_lock); pthread_mutex_unlock(&rpc2_job_lock);
} }
if(work) if(work)
{ {
if (!rpc2_blob) { if (!rpc2_blob) {
@ -438,16 +440,13 @@ bool rpc2_job_decode(const json_t *job, struct work *work)
} }
memcpy(work->data, rpc2_blob, rpc2_bloblen); memcpy(work->data, rpc2_blob, rpc2_bloblen);
memset(work->target, 0xff, sizeof(work->target)); memset(work->target, 0xff, sizeof(work->target));
// hmmpff ? seems wrong
//*((uint64_t*)&work->target[6]) = rpc2_target;
work->target[7] = rpc2_target; work->target[7] = rpc2_target;
work->targetdiff = target_to_diff(work->target); work->targetdiff = target_to_diff(work->target);
snprintf(work->job_id, sizeof(work->job_id), "%s", rpc2_job_id); snprintf(work->job_id, sizeof(work->job_id), "%s", rpc2_job_id);
} }
if (opt_algo == ALGO_WILDKECCAK)
wildkeccak_scratchpad_need_update(pscratchpad_buff); wildkeccak_scratchpad_need_update(pscratchpad_buff);
return true; return true;
@ -465,8 +464,9 @@ bool rpc2_stratum_job(struct stratum_ctx *sctx, json_t *id, json_t *params)
pthread_mutex_lock(&rpc2_work_lock); pthread_mutex_lock(&rpc2_work_lock);
ret = rpc2_job_decode(params, &rpc2_work); ret = rpc2_job_decode(params, &rpc2_work);
// update miner threads work // update miner threads work
rpc2_stratum_gen_work(sctx, &g_work); if (ret) rpc2_stratum_gen_work(sctx, &g_work);
//memcpy(&g_work, &rpc2_work, sizeof(struct work)); //memcpy(&g_work, &rpc2_work, sizeof(struct work));
restart_threads();
pthread_mutex_unlock(&rpc2_work_lock); pthread_mutex_unlock(&rpc2_work_lock);
return ret; return ret;
} }
@ -480,7 +480,10 @@ bool rpc2_stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
stratum_diff = sctx->job.diff; stratum_diff = sctx->job.diff;
if (opt_showdiff && work->targetdiff != stratum_diff) if (opt_showdiff && work->targetdiff != stratum_diff)
snprintf(sdiff, 32, " (%.5f)", work->targetdiff); snprintf(sdiff, 32, " (%.5f)", work->targetdiff);
if (stratum_diff >= 1e6)
applog(LOG_WARNING, "Stratum difficulty set to %.1f M%s", stratum_diff/1e6, sdiff); applog(LOG_WARNING, "Stratum difficulty set to %.1f M%s", stratum_diff/1e6, sdiff);
else
applog(LOG_WARNING, "Stratum difficulty set to %.0f%s", stratum_diff, sdiff);
} }
if (work->target[7] != rpc2_target) { if (work->target[7] != rpc2_target) {
work->target[7] = rpc2_target; work->target[7] = rpc2_target;
@ -501,19 +504,30 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
char *noncestr, *hashhex; char *noncestr, *hashhex;
memcpy(&data[0], work->data, 88); memcpy(&data[0], work->data, 88);
memcpy(&data[1], work->nonces, 8);
if (opt_algo == ALGO_WILDKECCAK) {
memcpy(&data[1], work->nonces, 8);
// pass if the previous hash is not the current previous hash // pass if the previous hash is not the current previous hash
if(!submit_old && memcmp(&work->data[3], &g_work.data[3], 28)) { if(!submit_old && memcmp(&work->data[3], &g_work.data[3], 28)) {
if (opt_debug) applog(LOG_DEBUG, "stale work detected", work->sharediff, work->targetdiff); if (opt_debug) applog(LOG_DEBUG, "stale work detected", work->sharediff, work->targetdiff);
pool->stales_count++; pool->stales_count++;
return true; return true;
} }
noncestr = bin2hex((unsigned char*) &data[1], 8); noncestr = bin2hex((unsigned char*) &data[1], 8);
memcpy(&last_found_nonce, work->nonces, 8); // "nonce":"5794ec8000000000" => 0x0000000080ec9457 memcpy(&last_found_nonce, work->nonces, 8); // "nonce":"5794ec8000000000" => 0x0000000080ec9457
wildkeccak_hash(hash, data, NULL, 0); wildkeccak_hash(hash, data, NULL, 0);
work_set_target_ratio(work, (uint32_t*) hash);
}
else if (opt_algo == ALGO_CRYPTONIGHT) {
uint32_t nonce;
memcpy(&nonce, &data[39], 4);
noncestr = bin2hex((unsigned char*) &nonce, 4);
last_found_nonce = nonce;
cryptonight_hash(hash, data, 76);
work_set_target_ratio(work, (uint32_t*) hash);
}
//applog(LOG_DEBUG, "submit diff %g > %g", work->sharediff, work->targetdiff); //applog(LOG_DEBUG, "submit diff %g > %g", work->sharediff, work->targetdiff);
//applog_hex(data, 81); //applog_hex(data, 81);
//applog_hex(hash, 32); //applog_hex(hash, 32);
@ -592,7 +606,8 @@ bool store_scratchpad_to_file(bool do_fsync)
FILE *fp; FILE *fp;
int ret; int ret;
if(!scratchpad_size || !pscratchpad_buff) return true; // opt_algo != ALGO_WILDKECCAK || if(opt_algo != ALGO_WILDKECCAK) return true;
if(!scratchpad_size || !pscratchpad_buff) return true;
snprintf(file_name_buff, sizeof(file_name_buff), "%s.tmp", pscratchpad_local_cache); snprintf(file_name_buff, sizeof(file_name_buff), "%s.tmp", pscratchpad_local_cache);
unlink(file_name_buff); unlink(file_name_buff);
@ -646,6 +661,8 @@ bool load_scratchpad_from_file(const char *fname)
FILE *fp; FILE *fp;
long flen; long flen;
if(opt_algo != ALGO_WILDKECCAK) return true;
fp = fopen(fname, "rb"); fp = fopen(fname, "rb");
if (!fp) { if (!fp) {
if (errno != ENOENT) { if (errno != ENOENT) {
@ -692,6 +709,8 @@ bool load_scratchpad_from_file(const char *fname)
bool dump_scratchpad_to_file_debug() bool dump_scratchpad_to_file_debug()
{ {
char file_name_buff[1024] = { 0 }; char file_name_buff[1024] = { 0 };
if(opt_algo != ALGO_WILDKECCAK) return true;
snprintf(file_name_buff, sizeof(file_name_buff), "scratchpad_%" PRIu64 "_%llx.scr", snprintf(file_name_buff, sizeof(file_name_buff), "scratchpad_%" PRIu64 "_%llx.scr",
current_scratchpad_hi.height, (long long) last_found_nonce); current_scratchpad_hi.height, (long long) last_found_nonce);
@ -1027,6 +1046,7 @@ static bool rpc2_stratum_getscratchpad(struct stratum_ctx *sctx)
json_t *val = NULL; json_t *val = NULL;
json_error_t err; json_error_t err;
char *s, *sret; char *s, *sret;
if(opt_algo != ALGO_WILDKECCAK) return true;
s = (char*) calloc(1, 1024); s = (char*) calloc(1, 1024);
if (!s) if (!s)
@ -1067,14 +1087,21 @@ bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const ch
bool ret = false; bool ret = false;
json_t *val = NULL, *res_val, *err_val, *job_val = NULL; json_t *val = NULL, *res_val, *err_val, *job_val = NULL;
json_error_t err; json_error_t err;
char *s, *sret; char *sret;
char *s = (char*) calloc(1, 320 + strlen(user) + strlen(pass));
if (opt_algo == ALGO_WILDKECCAK) {
char *prevhash = bin2hex((const unsigned char*)current_scratchpad_hi.prevhash, 32); char *prevhash = bin2hex((const unsigned char*)current_scratchpad_hi.prevhash, 32);
s = (char*) calloc(1, 320 + strlen(user) + strlen(pass));
sprintf(s, "{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\"," sprintf(s, "{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\","
"\"hi\":{\"height\":%" PRIu64 ",\"block_id\":\"%s\"}," "\"hi\":{\"height\":%" PRIu64 ",\"block_id\":\"%s\"},"
"\"agent\":\"" USER_AGENT "\"},\"id\":2}", "\"agent\":\"" USER_AGENT "\"},\"id\":2}",
user, pass, current_scratchpad_hi.height, prevhash); user, pass, current_scratchpad_hi.height, prevhash);
free(prevhash); free(prevhash);
} else {
sprintf(s, "{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\","
"\"agent\":\"" USER_AGENT "\"},\"id\":2}",
user, pass);
}
if (!stratum_send_line(sctx, s)) if (!stratum_send_line(sctx, s))
goto out; goto out;
@ -1146,12 +1173,16 @@ bool rpc2_stratum_request_job(struct stratum_ctx *sctx)
return ret; return ret;
} }
if (opt_algo == ALGO_WILDKECCAK) {
char* prevhash = bin2hex((const unsigned char*)current_scratchpad_hi.prevhash, 32); char* prevhash = bin2hex((const unsigned char*)current_scratchpad_hi.prevhash, 32);
sprintf(s, "{\"method\":\"getjob\",\"params\": {" sprintf(s, "{\"method\":\"getjob\",\"params\": {"
"\"id\":\"%s\", \"hi\": {\"height\": %" PRIu64 ",\"block_id\":\"%s\" }, \"agent\": \"" USER_AGENT "\"}," "\"id\":\"%s\", \"hi\": {\"height\": %" PRIu64 ",\"block_id\":\"%s\" }, \"agent\": \"" USER_AGENT "\"},"
"\"id\":1}", "\"id\":1}",
rpc2_id, current_scratchpad_hi.height, prevhash); rpc2_id, current_scratchpad_hi.height, prevhash);
free(prevhash); free(prevhash);
} else {
sprintf(s, "{\"method\":\"getjob\",\"params\":{\"id\":\"%s\"},\"id\":1}", rpc2_id);
}
if(!stratum_send_line(sctx, s)) { if(!stratum_send_line(sctx, s)) {
applog(LOG_ERR, "Stratum failed to send getjob line"); applog(LOG_ERR, "Stratum failed to send getjob line");
@ -1210,7 +1241,7 @@ int rpc2_stratum_thread_stuff(struct pool_infos* pool)
} }
} }
if(!scratchpad_size) { if(!scratchpad_size && opt_algo == ALGO_WILDKECCAK) {
if(!rpc2_stratum_getscratchpad(&stratum)) { if(!rpc2_stratum_getscratchpad(&stratum)) {
stratum_disconnect(&stratum); stratum_disconnect(&stratum);
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
@ -1226,6 +1257,15 @@ int rpc2_stratum_thread_stuff(struct pool_infos* pool)
} }
} }
// if getjob supported
if(0 && opt_algo == ALGO_CRYPTONIGHT) {
if(!rpc2_stratum_request_job(&stratum)) {
stratum_disconnect(&stratum);
applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
sleep(opt_fail_pause);
}
}
/* save every 12 hours */ /* save every 12 hours */
if ((time(NULL) - prev_save) > 12*3600) { if ((time(NULL) - prev_save) > 12*3600) {
store_scratchpad_to_file(false); store_scratchpad_to_file(false);

3
miner.h

@ -267,6 +267,7 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -315,6 +316,7 @@ extern void free_blake256(int thr_id);
extern void free_blake2s(int thr_id); extern void free_blake2s(int thr_id);
extern void free_bmw(int thr_id); extern void free_bmw(int thr_id);
extern void free_c11(int thr_id); extern void free_c11(int thr_id);
extern void free_cryptonight(int thr_id);
extern void free_decred(int thr_id); extern void free_decred(int thr_id);
extern void free_deep(int thr_id); extern void free_deep(int thr_id);
extern void free_keccak256(int thr_id); extern void free_keccak256(int thr_id);
@ -803,6 +805,7 @@ void blake256hash(void *output, const void *input, int8_t rounds);
void blake2s_hash(void *output, const void *input); void blake2s_hash(void *output, const void *input);
void bmw_hash(void *state, const void *input); void bmw_hash(void *state, const void *input);
void c11hash(void *output, const void *input); void c11hash(void *output, const void *input);
void cryptonight_hash(void* output, const void* input, size_t len);
void decred_hash(void *state, const void *input); void decred_hash(void *state, const void *input);
void deephash(void *state, const void *input); void deephash(void *state, const void *input);
void luffa_hash(void *state, const void *input); void luffa_hash(void *state, const void *input);

4
pools.cpp

@ -254,7 +254,7 @@ bool pool_switch(int thr_id, int pooln)
// temporary... until stratum code cleanup // temporary... until stratum code cleanup
stratum = p->stratum; stratum = p->stratum;
stratum.pooln = cur_pooln; stratum.pooln = cur_pooln;
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK); stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT);
// unlock the stratum thread // unlock the stratum thread
tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url));
@ -277,7 +277,7 @@ bool pool_switch(int thr_id, int pooln)
} }
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK); stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT);
return true; return true;
} }

5
util.cpp

@ -1856,7 +1856,7 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
ret = stratum_show_message(sctx, id, params); ret = stratum_show_message(sctx, id, params);
goto out; goto out;
} }
if (sctx->rpc2 && !strcasecmp(method, "job")) { // cryptonote if (sctx->rpc2 && !strcasecmp(method, "job")) { // xmr/bbr
ret = rpc2_stratum_job(sctx, id, params); ret = rpc2_stratum_job(sctx, id, params);
goto out; goto out;
} }
@ -2141,6 +2141,9 @@ void print_hash_tests(void)
c11hash(&hash[0], &buf[0]); c11hash(&hash[0], &buf[0]);
printpfx("c11", hash); printpfx("c11", hash);
cryptonight_hash(&hash[0], &buf[0], 76);
printpfx("cryptonight", hash);
memset(buf, 0, 180); memset(buf, 0, 180);
decred_hash(&hash[0], &buf[0]); decred_hash(&hash[0], &buf[0]);
printpfx("decred", hash); printpfx("decred", hash);

Loading…
Cancel
Save