mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-09 22:38:05 +00:00
fdd5d29071
Previous echo commit was only increasing linux performance, and reducing windows perf compared to the 1.4.9, this one seems to give at least the 1.4.9 on windows, and the same on linux... Shavite optimisation seems ok on both (use now 64 registers) the launch_bounds will force the number of registers, so remove specific Makefile rules on linux... manual "cherry pick" with fixed line endings and some adaptations
320 lines
7.3 KiB
Plaintext
320 lines
7.3 KiB
Plaintext
#include <stdio.h>
|
|
#include <memory.h>
|
|
|
|
#include "cuda_helper.h"
|
|
|
|
// aus heavy.cu
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
|
|
|
#include "cuda_x11_aes.cu"
|
|
|
|
__device__ __forceinline__ void AES_2ROUND(
|
|
const uint32_t* __restrict__ sharedMemory,
|
|
uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
|
|
uint32_t &k0)
|
|
{
|
|
uint32_t y0, y1, y2, y3;
|
|
|
|
aes_round(sharedMemory,
|
|
x0, x1, x2, x3,
|
|
k0,
|
|
y0, y1, y2, y3);
|
|
|
|
aes_round(sharedMemory,
|
|
y0, y1, y2, y3,
|
|
x0, x1, x2, x3);
|
|
|
|
// hier werden wir ein carry brauchen (oder auch nicht)
|
|
k0++;
|
|
}
|
|
|
|
__device__ __forceinline__
|
|
void cuda_echo_round(
|
|
const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
|
|
{
|
|
const uint32_t P[48] = {
|
|
0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
//8-12
|
|
0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
//21-25
|
|
0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,
|
|
0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
//34-38
|
|
0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,
|
|
0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
|
|
0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
|
0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
|
|
//58-61
|
|
};
|
|
uint32_t k0;
|
|
uint32_t h[16];
|
|
|
|
#pragma unroll 16
|
|
for (int i = 0; i < 16; i++)
|
|
{
|
|
h[i] = hash[i];
|
|
}
|
|
|
|
k0 = 512 + 8;
|
|
|
|
#pragma unroll
|
|
for (int idx = 0; idx < 16; idx+= 4)
|
|
{
|
|
AES_2ROUND(sharedMemory,
|
|
h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
|
|
}
|
|
k0 += 4;
|
|
|
|
uint32_t W[64];
|
|
|
|
#pragma unroll 4
|
|
for (int i = 0; i < 4; i++)
|
|
{
|
|
uint32_t a = P[i];
|
|
uint32_t b = P[i + 4];
|
|
uint32_t c = h[i + 8];
|
|
uint32_t d = P[i + 8];
|
|
|
|
uint32_t ab = a ^ b;
|
|
uint32_t bc = b ^ c;
|
|
uint32_t cd = c ^ d;
|
|
|
|
|
|
uint32_t t = (ab & 0x80808080);
|
|
uint32_t t2 = (bc & 0x80808080);
|
|
uint32_t t3 = (cd & 0x80808080);
|
|
|
|
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
|
|
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
|
|
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
|
|
|
|
W[0 + i] = abx ^ bc ^ d;
|
|
W[0 + i + 4] = bcx ^ a ^ cd;
|
|
W[0 + i + 8] = cdx ^ ab ^ d;
|
|
W[0 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
|
|
|
|
a = P[12 + i];
|
|
b = h[i + 4];
|
|
c = P[12 + i + 4];
|
|
d = P[12 + i + 8];
|
|
|
|
ab = a ^ b;
|
|
bc = b ^ c;
|
|
cd = c ^ d;
|
|
|
|
|
|
t = (ab & 0x80808080);
|
|
t2 = (bc & 0x80808080);
|
|
t3 = (cd & 0x80808080);
|
|
|
|
abx = (t >> 7) * 27 ^ ((ab^t) << 1);
|
|
bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
|
|
cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
|
|
|
|
W[16 + i] = abx ^ bc ^ d;
|
|
W[16 + i + 4] = bcx ^ a ^ cd;
|
|
W[16 + i + 8] = cdx ^ ab ^ d;
|
|
W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
|
|
|
|
a = h[i];
|
|
b = P[24 + i + 0];
|
|
c = P[24 + i + 4];
|
|
d = P[24 + i + 8];
|
|
|
|
ab = a ^ b;
|
|
bc = b ^ c;
|
|
cd = c ^ d;
|
|
|
|
|
|
t = (ab & 0x80808080);
|
|
t2 = (bc & 0x80808080);
|
|
t3 = (cd & 0x80808080);
|
|
|
|
abx = (t >> 7) * 27 ^ ((ab^t) << 1);
|
|
bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
|
|
cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
|
|
|
|
W[32 + i] = abx ^ bc ^ d;
|
|
W[32 + i + 4] = bcx ^ a ^ cd;
|
|
W[32 + i + 8] = cdx ^ ab ^ d;
|
|
W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
|
|
|
|
a = P[36 + i ];
|
|
b = P[36 + i +4 ];
|
|
c = P[36 + i + 8];
|
|
d = h[i + 12];
|
|
|
|
ab = a ^ b;
|
|
bc = b ^ c;
|
|
cd = c ^ d;
|
|
|
|
t = (ab & 0x80808080);
|
|
t2 = (bc & 0x80808080);
|
|
t3 = (cd & 0x80808080);
|
|
|
|
abx = (t >> 7) * 27 ^ ((ab^t) << 1);
|
|
bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
|
|
cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
|
|
|
|
W[48 + i] = abx ^ bc ^ d;
|
|
W[48 + i + 4] = bcx ^ a ^ cd;
|
|
W[48 + i + 8] = cdx ^ ab ^ d;
|
|
W[48 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
|
|
|
|
}
|
|
|
|
for (int k = 1; k < 10; k++)
|
|
{
|
|
|
|
// Big Sub Words
|
|
#pragma unroll 4
|
|
for (int idx = 0; idx < 64; idx += 16)
|
|
{
|
|
AES_2ROUND(sharedMemory,
|
|
W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3],
|
|
k0);
|
|
AES_2ROUND(sharedMemory,
|
|
W[idx + 4], W[idx + 5], W[idx + 6], W[idx + 7],
|
|
k0);
|
|
AES_2ROUND(sharedMemory,
|
|
W[idx + 8], W[idx + 9], W[idx + 10], W[idx + 11],
|
|
k0);
|
|
AES_2ROUND(sharedMemory,
|
|
W[idx + 12], W[idx + 13], W[idx + 14], W[idx + 15],
|
|
k0);
|
|
}
|
|
|
|
// Shift Rows
|
|
#pragma unroll 4
|
|
for (int i = 0; i < 4; i++)
|
|
{
|
|
uint32_t t;
|
|
|
|
/// 1, 5, 9, 13
|
|
t = W[4 + i];
|
|
W[4 + i] = W[20 + i];
|
|
W[20 + i] = W[36 + i];
|
|
W[36 + i] = W[52 + i];
|
|
W[52 + i] = t;
|
|
|
|
// 2, 6, 10, 14
|
|
t = W[8 + i];
|
|
W[8 + i] = W[40 + i];
|
|
W[40 + i] = t;
|
|
t = W[24 + i];
|
|
W[24 + i] = W[56 + i];
|
|
W[56 + i] = t;
|
|
|
|
// 15, 11, 7, 3
|
|
t = W[60 + i];
|
|
W[60 + i] = W[44 + i];
|
|
W[44 + i] = W[28 + i];
|
|
W[28 + i] = W[12 + i];
|
|
W[12 + i] = t;
|
|
}
|
|
|
|
// Mix Columns
|
|
#pragma unroll 4
|
|
for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t
|
|
{
|
|
#pragma unroll 64
|
|
for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte
|
|
{
|
|
uint32_t a = W[idx + i];
|
|
uint32_t b = W[idx + i + 4];
|
|
uint32_t c = W[idx + i + 8];
|
|
uint32_t d = W[idx + i + 12];
|
|
|
|
uint32_t ab = a ^ b;
|
|
uint32_t bc = b ^ c;
|
|
uint32_t cd = c ^ d;
|
|
|
|
uint32_t t, t2, t3;
|
|
t = (ab & 0x80808080);
|
|
t2 = (bc & 0x80808080);
|
|
t3 = (cd & 0x80808080);
|
|
|
|
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
|
|
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
|
|
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
|
|
|
|
W[idx + i] = abx ^ bc ^ d;
|
|
W[idx + i + 4] = bcx ^ a ^ cd;
|
|
W[idx + i + 8] = cdx ^ ab ^ d;
|
|
W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
|
|
}
|
|
}
|
|
}
|
|
|
|
#pragma unroll
|
|
for (int i = 0; i<16; i += 4)
|
|
{
|
|
W[i] ^= W[32 + i] ^ 512;
|
|
W[i + 1] ^= W[32 + i + 1];
|
|
W[i + 2] ^= W[32 + i + 2];
|
|
W[i + 3] ^= W[32 + i + 3];
|
|
}
|
|
|
|
#pragma unroll
|
|
for (int i = 0; i<16; i++)
|
|
hash[i] ^= W[i];
|
|
}
|
|
|
|
__device__ __forceinline__
|
|
void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
|
|
{
|
|
/* each thread startup will fill a uint32 */
|
|
if (threadIdx.x < 128) {
|
|
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
|
|
sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
|
|
sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
|
|
sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
|
|
|
|
sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
|
|
sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
|
|
sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
|
|
sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
|
|
}
|
|
}
|
|
|
|
__global__ __launch_bounds__(128, 7) /* will force 72 registers */
|
|
void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
|
{
|
|
__shared__ uint32_t sharedMemory[1024];
|
|
|
|
echo_gpu_init(sharedMemory);
|
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
|
if (thread < threads)
|
|
{
|
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
|
|
|
int hashPosition = nounce - startNounce;
|
|
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
|
|
|
|
cuda_echo_round(sharedMemory, Hash);
|
|
}
|
|
}
|
|
|
|
__host__
|
|
void x11_echo512_cpu_init(int thr_id, int threads)
|
|
{
|
|
aes_cpu_init();
|
|
}
|
|
|
|
__host__
|
|
void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
|
{
|
|
const int threadsperblock = 128;
|
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
|
dim3 block(threadsperblock);
|
|
|
|
x11_echo512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
|
MyStreamSynchronize(NULL, order, thr_id);
|
|
}
|