Browse Source

cubehash: improve a bit and cleanup

master
Tanguy Pruvot 7 years ago
parent
commit
9c053042c9
  1. 160
      x11/cuda_x11_cubehash512.cu

160
x11/cuda_x11_cubehash512.cu

@ -1,6 +1,5 @@ @@ -1,6 +1,5 @@
#include "cuda_helper.h"
typedef unsigned char BitSequence;
#include <cuda_helper.h>
#include <cuda_vectors.h>
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
@ -18,17 +17,14 @@ typedef unsigned char BitSequence; @@ -18,17 +17,14 @@ typedef unsigned char BitSequence;
__device__ __constant__
static const uint32_t c_IV_512[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B,
0x4167D83E, 0x3FEE2313, 0xC701CF8C,
0xCC39968E, 0x50AC5695, 0x4D42C787,
0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33,
0xA23911AE, 0xFCD398D9, 0x148FE485,
0x1B017BEF, 0xB6444532, 0x6A536159,
0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456,
0xBC796576, 0x1921C8F7, 0xE7989AF1,
0x7795D246, 0xD43E3B44
0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
};
__device__ __forceinline__
@ -149,107 +145,68 @@ static void rrounds(uint32_t x[2][2][2][2][2]) @@ -149,107 +145,68 @@ static void rrounds(uint32_t x[2][2][2][2][2])
}
__device__ __forceinline__
static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2])
{
int k;
int l;
int m;
uint32_t *in = block;
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][0][k][l][m] ^= *in++;
// read 32 bytes input from global mem with uint2 chunks
AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]);
AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]);
AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]);
AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]);
}
__device__ __forceinline__
static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2])
{
int j;
int k;
int l;
int m;
uint32_t *out = hash;
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
*out++ = x[0][j][k][l][m];
// used to write final hash to global mem
AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]);
AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]);
AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]);
AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]);
AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]);
AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]);
AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]);
AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]);
}
__device__
void Init(uint32_t x[2][2][2][2][2])
{
int i,j,k,l,m;
#if 0
/* "the first three state words x_00000, x_00001, x_00010" */
/* "are set to the integers h/8, b, r respectively." */
/* "the remaining state words are set to 0." */
#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = 0;
x[0][0][0][0][0] = 512/8;
x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES;
x[0][0][0][1][0] = CUBEHASH_ROUNDS;
/* "the state is then transformed invertibly through 10r identical rounds */
for (i = 0;i < 10;++i) rrounds(x);
#else
const uint32_t *iv = c_IV_512;
#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = *iv++;
#endif
}
#define Init(x) \
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]);
__device__ __forceinline__
static void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data)
{
/* "xor the block into the first b bytes of the state" */
block_tox(data, x);
/* "and then transform the state invertibly through r identical rounds" */
block_tox((uint32_t*)data, x);
rrounds(x);
}
__device__ __forceinline__
static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
{
int i;
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (i = 0;i < 10;++i) rrounds(x);
#pragma unroll 10
for (int i = 0; i < 10; i++) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx((uint32_t*)hashval, x);
hash_fromx(hashval, x);
}
@ -269,20 +226,17 @@ void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_ @@ -269,20 +226,17 @@ void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_
uint32_t x[2][2][2][2][2];
Init(x);
// erste Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)Hash);
// zweite Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)(Hash+8));
Update32(x, &Hash[0]);
Update32(x, &Hash[8]);
// Padding Block
uint32_t last[8];
last[0] = 0x80;
#pragma unroll 7
for (int i=1; i < 8; i++) last[i] = 0;
Update32(x, (const BitSequence*)last);
Update32(x, last);
Final(x, (BitSequence*)Hash);
Final(x, Hash);
}
}
@ -332,12 +286,12 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, @@ -332,12 +286,12 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]);
Update32(x, (const BitSequence*)message);
Update32(x, message);
// second 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[8]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]);
Update32(x, (const BitSequence*)message);
Update32(x, message);
// last 16 bytes + Padding
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]);
@ -346,9 +300,9 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, @@ -346,9 +300,9 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
message[5] = 0;
message[6] = 0;
message[7] = 0;
Update32(x, (const BitSequence*)message);
Update32(x, message);
BitSequence* output = (BitSequence*) (&g_outhash[(size_t)8 * thread]);
uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]);
Final(x, output);
}
}

Loading…
Cancel
Save