1
0
mirror of https://github.com/GOSTSec/ccminer synced 2025-01-08 22:07:56 +00:00

cubehash: improve a bit and cleanup

This commit is contained in:
Tanguy Pruvot 2017-07-27 23:36:09 +02:00
parent 08a3f3402c
commit 9c053042c9

View File

@ -1,6 +1,5 @@
#include "cuda_helper.h"
typedef unsigned char BitSequence;
#include <cuda_helper.h>
#include <cuda_vectors.h>
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
@ -18,17 +17,14 @@ typedef unsigned char BitSequence;
__device__ __constant__
static const uint32_t c_IV_512[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B,
0x4167D83E, 0x3FEE2313, 0xC701CF8C,
0xCC39968E, 0x50AC5695, 0x4D42C787,
0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33,
0xA23911AE, 0xFCD398D9, 0x148FE485,
0x1B017BEF, 0xB6444532, 0x6A536159,
0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456,
0xBC796576, 0x1921C8F7, 0xE7989AF1,
0x7795D246, 0xD43E3B44
0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
};
__device__ __forceinline__
@ -149,107 +145,68 @@ static void rrounds(uint32_t x[2][2][2][2][2])
}
__device__ __forceinline__
static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2])
{
int k;
int l;
int m;
uint32_t *in = block;
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][0][k][l][m] ^= *in++;
// read 32 bytes input from global mem with uint2 chunks
AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]);
AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]);
AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]);
AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]);
}
__device__ __forceinline__
static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2])
{
int j;
int k;
int l;
int m;
uint32_t *out = hash;
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
*out++ = x[0][j][k][l][m];
// used to write final hash to global mem
AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]);
AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]);
AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]);
AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]);
AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]);
AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]);
AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]);
AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]);
}
__device__
void Init(uint32_t x[2][2][2][2][2])
#define Init(x) \
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]);
__device__ __forceinline__
static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data)
{
int i,j,k,l,m;
#if 0
/* "the first three state words x_00000, x_00001, x_00010" */
/* "are set to the integers h/8, b, r respectively." */
/* "the remaining state words are set to 0." */
#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = 0;
x[0][0][0][0][0] = 512/8;
x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES;
x[0][0][0][1][0] = CUBEHASH_ROUNDS;
/* "the state is then transformed invertibly through 10r identical rounds */
for (i = 0;i < 10;++i) rrounds(x);
#else
const uint32_t *iv = c_IV_512;
#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = *iv++;
#endif
/* "xor the block into the first b bytes of the state" */
block_tox(data, x);
/* "and then transform the state invertibly through r identical rounds" */
rrounds(x);
}
__device__ __forceinline__
static void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
{
/* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */
block_tox((uint32_t*)data, x);
rrounds(x);
}
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
__device__ __forceinline__
static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
{
int i;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (int i = 0; i < 10; i++) rrounds(x);
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (i = 0;i < 10;++i) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx((uint32_t*)hashval, x);
/* "output the first h/8 bytes of the state" */
hash_fromx(hashval, x);
}
@ -269,20 +226,17 @@ void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_
uint32_t x[2][2][2][2][2];
Init(x);
// erste Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)Hash);
// zweite Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)(Hash+8));
Update32(x, &Hash[0]);
Update32(x, &Hash[8]);
// Padding Block
uint32_t last[8];
last[0] = 0x80;
#pragma unroll 7
for (int i=1; i < 8; i++) last[i] = 0;
Update32(x, (const BitSequence*)last);
Update32(x, last);
Final(x, (BitSequence*)Hash);
Final(x, Hash);
}
}
@ -332,12 +286,12 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]);
Update32(x, (const BitSequence*)message);
Update32(x, message);
// second 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[8]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]);
Update32(x, (const BitSequence*)message);
Update32(x, message);
// last 16 bytes + Padding
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]);
@ -346,9 +300,9 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
message[5] = 0;
message[6] = 0;
message[7] = 0;
Update32(x, (const BitSequence*)message);
Update32(x, message);
BitSequence* output = (BitSequence*) (&g_outhash[(size_t)8 * thread]);
uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]);
Final(x, output);
}
}