mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-25 14:04:34 +00:00
x11: echo and cubehash optimization
echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes
This commit is contained in:
parent
12fafd5687
commit
5be6811dcf
@ -319,49 +319,32 @@ static void aes_round(
|
|||||||
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
|
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
|
||||||
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
|
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
|
||||||
{
|
{
|
||||||
uint32_t idx0, idx1, idx2, idx3;
|
|
||||||
|
|
||||||
idx0 = __byte_perm(x0, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
|
|
||||||
y0 = xor4_32(
|
y0 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x0, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x1, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x2, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]);
|
sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]);
|
||||||
y0 ^= k0;
|
|
||||||
|
|
||||||
idx0 = __byte_perm(x1, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
|
|
||||||
|
|
||||||
y1 = xor4_32(
|
y1 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x1, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x2, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x3, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]);
|
sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]);
|
||||||
|
|
||||||
idx0 = __byte_perm(x2, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
|
|
||||||
y2 = xor4_32(
|
y2 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x2, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x3, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x0, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]); // ^k2
|
sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2
|
||||||
|
|
||||||
|
y0 ^= k0;
|
||||||
|
|
||||||
idx0 = __byte_perm(x3, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
|
|
||||||
y3 = xor4_32(
|
y3 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x3, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x0, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x1, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]); // ^k3
|
sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__
|
__device__
|
||||||
@ -370,46 +353,27 @@ static void aes_round(
|
|||||||
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
|
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
|
||||||
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
|
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
|
||||||
{
|
{
|
||||||
uint32_t idx0, idx1, idx2, idx3;
|
|
||||||
|
|
||||||
idx0 = __byte_perm(x0, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
|
|
||||||
y0 = xor4_32(
|
y0 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x0, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x1, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x2, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]);
|
sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]);
|
||||||
|
|
||||||
idx0 = __byte_perm(x1, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
|
|
||||||
|
|
||||||
y1 = xor4_32(
|
y1 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x1, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x2, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x3, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]);
|
sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]);
|
||||||
|
|
||||||
idx0 = __byte_perm(x2, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
|
|
||||||
y2 = xor4_32(
|
y2 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x2, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x3, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x0, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]); // ^k2
|
sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2
|
||||||
|
|
||||||
idx0 = __byte_perm(x3, 0, 0x4440);
|
|
||||||
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
|
|
||||||
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
|
|
||||||
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
|
|
||||||
y3 = xor4_32(
|
y3 = xor4_32(
|
||||||
sharedMemory[idx0],
|
sharedMemory[__byte_perm(x3, 0, 0x4440)],
|
||||||
sharedMemory[idx1],
|
sharedMemory[__byte_perm(x0, 0, 0x4441) + 256],
|
||||||
sharedMemory[idx2],
|
sharedMemory[__byte_perm(x1, 0, 0x4442) + 512],
|
||||||
sharedMemory[idx3]); // ^k3
|
sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3
|
||||||
}
|
}
|
||||||
|
@ -34,7 +34,8 @@ static const uint32_t c_IV_512[32] = {
|
|||||||
0x7795D246, 0xD43E3B44
|
0x7795D246, 0xD43E3B44
|
||||||
};
|
};
|
||||||
|
|
||||||
static __device__ void rrounds(uint32_t x[2][2][2][2][2])
|
__device__ __forceinline__
|
||||||
|
static void rrounds(uint32_t x[2][2][2][2][2])
|
||||||
{
|
{
|
||||||
int r;
|
int r;
|
||||||
int j;
|
int j;
|
||||||
@ -150,8 +151,8 @@ static __device__ void rrounds(uint32_t x[2][2][2][2][2])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__
|
||||||
static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
|
static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
|
||||||
{
|
{
|
||||||
int k;
|
int k;
|
||||||
int l;
|
int l;
|
||||||
@ -167,7 +168,8 @@ static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
|
|||||||
x[0][0][k][l][m] ^= *in++;
|
x[0][0][k][l][m] ^= *in++;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
|
__device__ __forceinline__
|
||||||
|
static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
|
||||||
{
|
{
|
||||||
int j;
|
int j;
|
||||||
int k;
|
int k;
|
||||||
@ -186,7 +188,8 @@ static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
|
|||||||
*out++ = x[0][j][k][l][m];
|
*out++ = x[0][j][k][l][m];
|
||||||
}
|
}
|
||||||
|
|
||||||
void __device__ Init(uint32_t x[2][2][2][2][2])
|
__device__
|
||||||
|
void Init(uint32_t x[2][2][2][2][2])
|
||||||
{
|
{
|
||||||
int i,j,k,l,m;
|
int i,j,k,l,m;
|
||||||
#if 0
|
#if 0
|
||||||
@ -227,7 +230,8 @@ void __device__ Init(uint32_t x[2][2][2][2][2])
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
|
__device__ __forceinline__
|
||||||
|
void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
|
||||||
{
|
{
|
||||||
/* "xor the block into the first b bytes of the state" */
|
/* "xor the block into the first b bytes of the state" */
|
||||||
/* "and then transform the state invertibly through r identical rounds" */
|
/* "and then transform the state invertibly through r identical rounds" */
|
||||||
@ -235,7 +239,8 @@ void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
|
|||||||
rrounds(x);
|
rrounds(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
|
__device__ __forceinline__
|
||||||
|
void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
@ -252,8 +257,9 @@ void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
|
|||||||
|
|
||||||
|
|
||||||
/***************************************************/
|
/***************************************************/
|
||||||
// Die Hash-Funktion
|
// GPU Hash Function
|
||||||
__global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
__global__
|
||||||
|
void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
|
@ -75,41 +75,38 @@ __device__ __forceinline__ void cuda_echo_round(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Mix Columns
|
// Mix Columns
|
||||||
#pragma unroll 4
|
#pragma unroll
|
||||||
for(int i=0;i<4;i++) // Schleife über je 2*uint32_t
|
for (int i = 0; i<4; i++) // Schleife über je 2*uint32_t
|
||||||
{
|
{
|
||||||
#pragma unroll 4
|
#pragma unroll 64
|
||||||
for(int j=0;j<4;j++) // Schleife über die elemnte
|
for (int idx = 0; idx<64; idx += 16) // Schleife über die elemnte
|
||||||
{
|
{
|
||||||
int idx = j<<2; // j*4
|
|
||||||
|
|
||||||
uint32_t a = W[ ((idx + 0)<<2) + i];
|
uint32_t a = W[idx + i];
|
||||||
uint32_t b = W[ ((idx + 1)<<2) + i];
|
uint32_t b = W[idx + i + 4];
|
||||||
uint32_t c = W[ ((idx + 2)<<2) + i];
|
uint32_t c = W[idx + i + 8];
|
||||||
uint32_t d = W[ ((idx + 3)<<2) + i];
|
uint32_t d = W[idx + i + 12];
|
||||||
|
|
||||||
uint32_t ab = a ^ b;
|
uint32_t ab = a ^ b;
|
||||||
uint32_t bc = b ^ c;
|
uint32_t bc = b ^ c;
|
||||||
uint32_t cd = c ^ d;
|
uint32_t cd = c ^ d;
|
||||||
|
|
||||||
uint32_t t;
|
uint32_t t, t2, t3;
|
||||||
t = ((ab & 0x80808080) >> 7);
|
t = (ab & 0x80808080);
|
||||||
uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t;
|
t2 = (bc & 0x80808080);
|
||||||
t = ((bc & 0x80808080) >> 7);
|
t3 = (cd & 0x80808080);
|
||||||
uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t;
|
|
||||||
t = ((cd & 0x80808080) >> 7);
|
|
||||||
uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t;
|
|
||||||
|
|
||||||
abx ^= ((ab & 0x7F7F7F7F) << 1);
|
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
|
||||||
bcx ^= ((bc & 0x7F7F7F7F) << 1);
|
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
|
||||||
cdx ^= ((cd & 0x7F7F7F7F) << 1);
|
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
|
||||||
|
|
||||||
W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d;
|
W[idx + i] = abx ^ bc ^ d;
|
||||||
W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd;
|
W[idx + i + 4] = bcx ^ a ^ cd;
|
||||||
W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d;
|
W[idx + i + 8] = cdx ^ ab ^ d;
|
||||||
W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c;
|
W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user