
import bmw512 uint2 changes from sp

+ some cleanup... gained ~15 kH/s (750Ti)
Tanguy Pruvot, 9 years ago
commit 768b5ccb76

Changed files:
1. README.txt (4 lines changed)
2. cuda_helper.h (75 lines changed)
3. quark/cuda_bmw512.cu (671 lines changed)

README.txt (4 lines changed)

@@ -1,5 +1,5 @@
-ccMiner release 1.5.2-tpruvot (Jan 2015) - "Happy new Year!"
+ccMiner release 1.5.2-tpruvot (24 Jan 2015) - "Happy new Year!"
 ---------------------------------------------------------------
 ***************************************************************
@@ -175,7 +175,7 @@ features.
 >>> RELEASE HISTORY <<<
-	Jan. 2015 v1.5.2
+	Jan. 24th 2015 v1.5.2
 	Add process CPU priority and affinity mask parameters
 	Intelligent duplicate shares check feature (enabled if needed)
 	api: Fan RPM (windows), Cuda threads count, linux kernel ver.

cuda_helper.h (75 lines changed)

@@ -424,6 +424,17 @@ static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
 }
 static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
+static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b)
+{
+    uint2 result;
+    asm("{\n\t"
+        "sub.cc.u32 %0,%2,%4; \n\t"
+        "subc.u32 %1,%3,%5; \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+    return result;
+}
 /**
  * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b))
  * (what does uint64 "*" operator)
@@ -500,4 +511,68 @@ uint2 SWAPUINT2(uint2 value)
     return make_uint2(value.y, value.x);
 }
+
+/* uint2 for bmw512 - to double check later */
+__device__ __forceinline__
+static uint2 SHL2(uint2 a, int offset)
+{
+#if __CUDA_ARCH__ > 300
+    uint2 result;
+    if (offset < 32) {
+        asm("{\n\t"
+            "shf.l.clamp.b32 %1,%2,%3,%4; \n\t"
+            "shl.b32 %0,%2,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+    } else {
+        asm("{\n\t"
+            "shf.l.clamp.b32 %1,%2,%3,%4; \n\t"
+            "shl.b32 %0,%2,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+    }
+    return result;
+#else
+    if (offset <= 32) {
+        a.y = (a.y << offset) | (a.x >> (32 - offset));
+        a.x = (a.x << offset);
+    } else {
+        a.y = (a.x << (offset - 32));
+        a.x = 0;
+    }
+    return a;
+#endif
+}
+
+__device__ __forceinline__
+static uint2 SHR2(uint2 a, int offset)
+{
+#if __CUDA_ARCH__ > 300
+    uint2 result;
+    if (offset < 32) {
+        asm("{\n\t"
+            "shf.r.clamp.b32 %0,%2,%3,%4; \n\t"
+            "shr.b32 %1,%3,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+    } else {
+        asm("{\n\t"
+            "shf.l.clamp.b32 %0,%2,%3,%4; \n\t"
+            "shl.b32 %1,%3,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+    }
+    return result;
+#else
+    if (offset <= 32) {
+        a.x = (a.x >> offset) | (a.y << (32 - offset));
+        a.y = (a.y >> offset);
+    } else {
+        a.x = (a.y >> (offset - 32));
+        a.y = 0;
+    }
+    return a;
+#endif
+}
+
 #endif // #ifndef CUDA_HELPER_H
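Side note: the #else fallbacks above are plain 32-bit arithmetic, so their 64-bit shift semantics can be sanity-checked on the host without a GPU; only the inline-asm path (including the odd shf.l.clamp in SHR2's large-offset branch, which the "to double check later" comment already flags) needs device-side testing. A minimal sketch, assuming u2, glue64, sl2 and sr2 are hypothetical test helpers mirroring the fallback branches; offsets 0 and 32 are skipped because the fallback would then shift a 32-bit word by 32, which C leaves undefined:

#include <cstdint>
#include <cstdio>

struct u2 { uint32_t x, y; };                 // x = low word, y = high word

static uint64_t glue64(u2 a) { return ((uint64_t)a.y << 32) | a.x; }

static u2 sl2(u2 a, int offset)               // mirrors SHL2's #else branch
{
    if (offset <= 32) {
        a.y = (a.y << offset) | (a.x >> (32 - offset));
        a.x = (a.x << offset);
    } else {
        a.y = (a.x << (offset - 32));
        a.x = 0;
    }
    return a;
}

static u2 sr2(u2 a, int offset)               // mirrors SHR2's #else branch
{
    if (offset <= 32) {
        a.x = (a.x >> offset) | (a.y << (32 - offset));
        a.y = (a.y >> offset);
    } else {
        a.x = (a.y >> (offset - 32));
        a.y = 0;
    }
    return a;
}

int main()
{
    u2 v = { 0x84858687u, 0x80818283u };      // the uint2 view of 0x8081828384858687
    uint64_t ref = glue64(v);
    for (int n = 1; n < 64; n++) {
        if (n == 32) continue;                // shift-by-32 edge case, see above
        if (glue64(sl2(v, n)) != (ref << n)) printf("SHL2 fallback mismatch at %d\n", n);
        if (glue64(sr2(v, n)) != (ref >> n)) printf("SHR2 fallback mismatch at %d\n", n);
    }
    printf("done\n");
    return 0;
}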

quark/cuda_bmw512.cu (671 lines changed)

@@ -3,314 +3,473 @@
 #include "cuda_helper.h"

-// the message with padding, for computation on the GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)

-#define SHL(x, n) ((x) << (n))
-#define SHR(x, n) ((x) >> (n))
+//#define SHL(x, n) ((x) << (n))
+//#define SHR(x, n) ((x) >> (n))
+#define SHR(x, n) SHR2(x, n)
+#define SHL(x, n) SHL2(x, n)
+
+#undef ROTL64
+#define ROTL64 ROL2

-#define CONST_EXP2 \
-    q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
-    q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \
-    q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
-    q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
+#define CONST_EXP2(i) q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
+    q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPUINT2(q[i+7]) + \
+    q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
+    q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])

-__device__
-void Compression512(uint64_t *msg, uint64_t *hash)
+__device__ void Compression512_64_first(uint2 *msg, uint2 *hash)
 {
     // Compression ref. implementation
-    uint64_t tmp;
-    uint64_t q[32];
+    uint2 q[32];
+    uint2 tmp;

-    tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]);
+    tmp = (msg[5] ^ hash[5]) - (msg[7] ^ hash[7]) + (hash[10]) + (hash[13]) + (hash[14]);
     q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1];
-    tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]);
+    tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + (hash[11]) + (hash[14]) - (msg[15] ^ hash[15]);
     q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2];
-    tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]);
+    tmp = (msg[0] ^ hash[0]) + (msg[7] ^ hash[7]) + (hash[9]) - (hash[12]) + (msg[15] ^ hash[15]);
     q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3];
-    tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]);
+    tmp = (msg[0] ^ hash[0]) - (msg[1] ^ hash[1]) + (msg[8] ^ hash[8]) - (hash[10]) + (hash[13]);
     q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4];
-    tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]);
+    tmp = (msg[1] ^ hash[1]) + (msg[2] ^ hash[2]) + (hash[9]) - (hash[11]) - (hash[14]);
     q[4] = (SHR(tmp, 1) ^ tmp) + hash[5];
-    tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]);
+    tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (hash[10]) - (hash[12]) + (msg[15] ^ hash[15]);
     q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6];
-    tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]);
+    tmp = (msg[4] ^ hash[4]) - (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) - (hash[11]) + (hash[13]);
     q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7];
-    tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]);
+    tmp = (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[5] ^ hash[5]) - (hash[12]) - (hash[14]);
     q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8];
-    tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]);
+    tmp = (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) - (msg[6] ^ hash[6]) + (hash[13]) - (msg[15] ^ hash[15]);
     q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9];
-    tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]);
+    tmp = (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) + (msg[6] ^ hash[6]) - (msg[7] ^ hash[7]) + (hash[14]);
     q[9] = (SHR(tmp, 1) ^ tmp) + hash[10];
-    tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]);
+    tmp = (msg[8] ^ hash[8]) - (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[7] ^ hash[7]) + (msg[15] ^ hash[15]);
     q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11];
-    tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]);
+    tmp = (msg[8] ^ hash[8]) - (msg[0] ^ hash[0]) - (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) + (hash[9]);
     q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12];
-    tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]);
+    tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[10]);
     q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13];
-    tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]);
+    tmp = (msg[2] ^ hash[2]) + (msg[4] ^ hash[4]) + (msg[7] ^ hash[7]) + (hash[10]) + (hash[11]);
     q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14];
-    tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]);
+    tmp = (msg[3] ^ hash[3]) - (msg[5] ^ hash[5]) + (msg[8] ^ hash[8]) - (hash[11]) - (hash[12]);
     q[14] = (SHR(tmp, 1) ^ tmp) + hash[15];
-    tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]);
+    tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[13]);
     q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0];

-    // Expand 1
-#pragma unroll 2
-    for(int i=0;i<2;i++)
-    {
-        q[i+16] =
-        (SHR(q[i], 1) ^ SHL(q[i], 2) ^ ROTL64(q[i], 13) ^ ROTL64(q[i], 43)) +
-        (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ ROTL64(q[i+1], 19) ^ ROTL64(q[i+1], 53)) +
-        (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ ROTL64(q[i+2], 28) ^ ROTL64(q[i+2], 59)) +
-        (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ ROTL64(q[i+3], 4) ^ ROTL64(q[i+3], 37)) +
-        (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ ROTL64(q[i+4], 13) ^ ROTL64(q[i+4], 43)) +
-        (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ ROTL64(q[i+5], 19) ^ ROTL64(q[i+5], 53)) +
-        (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ ROTL64(q[i+6], 28) ^ ROTL64(q[i+6], 59)) +
-        (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ ROTL64(q[i+7], 4) ^ ROTL64(q[i+7], 37)) +
-        (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ ROTL64(q[i+8], 13) ^ ROTL64(q[i+8], 43)) +
-        (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ ROTL64(q[i+9], 19) ^ ROTL64(q[i+9], 53)) +
-        (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ ROTL64(q[i+10], 28) ^ ROTL64(q[i+10], 59)) +
-        (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ ROTL64(q[i+11], 4) ^ ROTL64(q[i+11], 37)) +
-        (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ ROTL64(q[i+12], 13) ^ ROTL64(q[i+12], 43)) +
-        (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ ROTL64(q[i+13], 19) ^ ROTL64(q[i+13], 53)) +
-        (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ ROTL64(q[i+14], 28) ^ ROTL64(q[i+14], 59)) +
-        (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ ROTL64(q[i+15], 4) ^ ROTL64(q[i+15], 37)) +
-        (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
-        ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]);
-    }
-
-#pragma unroll 4
-    for(int i=2;i<6;i++) {
-        q[i+16] = CONST_EXP2 +
-            (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
-            ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]);
-    }
+    q[0 + 16] =
+        (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) +
+        (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) +
+        (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) +
+        (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) +
+        (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) +
+        (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) +
+        (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) +
+        (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) +
+        (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) +
+        (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) +
+        (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) +
+        (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) +
+        (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) +
+        (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) +
+        (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) +
+        (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) +
+        ((make_uint2(0x55555550ul,0x55555555) + ROTL64(msg[0], 0 + 1) +
+        ROTL64(msg[0 + 3], 0 + 4)) ^ hash[0 + 7]);
+
+    q[1 + 16] =
+        (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) +
+        (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) +
+        (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) +
+        (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) +
+        (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) +
+        (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) +
+        (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) +
+        (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) +
+        (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) +
+        (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) +
+        (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) +
+        (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) +
+        (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) +
+        (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) +
+        (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) +
+        (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) +
+        ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROTL64(msg[1], 1 + 1) +
+        ROTL64(msg[1 + 3], 1 + 4)) ^ hash[1 + 7]);
+
+    q[2 + 16] = CONST_EXP2(2) +
+        ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROTL64(msg[2], 2 + 1) +
+        ROTL64(msg[2 + 3], 2 + 4) - ROTL64(msg[2 + 10], 2 + 11)) ^ hash[2 + 7]);
+    q[3 + 16] = CONST_EXP2(3) +
+        ((make_uint2(0x5555554F, 0x65555555) + ROTL64(msg[3], 3 + 1) +
+        ROTL64(msg[3 + 3], 3 + 4) - ROTL64(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]);
+    q[4 + 16] = CONST_EXP2(4) +
+        ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) + ROTL64(msg[4], 4 + 1) +
+        ROTL64(msg[4 + 3], 4 + 4) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]);
+    q[5 + 16] = CONST_EXP2(5) +
+        ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROTL64(msg[5], 5 + 1) +
+        ROTL64(msg[5 + 3], 5 + 4) - ROTL64(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]);
 #pragma unroll 3
-    for(int i=6;i<9;i++) {
-        q[i+16] = CONST_EXP2 +
-            (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
-            ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i+7]);
-    }
+    for (int i = 6; i<9; i++) {
+        q[i + 16] = CONST_EXP2(i) +
+            ((vectorize((i + 16)*(0x0555555555555555ull)) + ROTL64(msg[i], i + 1) -
+            ROTL64(msg[i - 6], (i - 6) + 1)) ^ hash[i + 7]);
+    }
 #pragma unroll 4
-    for(int i=9;i<13;i++) {
-        q[i+16] = CONST_EXP2 +
-            (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
-            ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]);
-    }
-#pragma unroll 3
-    for(int i=13;i<16;i++) {
-        q[i+16] = CONST_EXP2 +
-            (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
-            ROTL64(msg[i-13], (i-13)+1) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]);
-    }
+    for (int i = 9; i<13; i++) {
+        q[i + 16] = CONST_EXP2(i) +
+            ((vectorize((i + 16)*(0x0555555555555555ull)) +
+            ROTL64(msg[i + 3], i + 4) - ROTL64(msg[i - 6], (i - 6) + 1)) ^ hash[i - 9]);
+    }
+
+    q[13 + 16] = CONST_EXP2(13) +
+        ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROTL64(msg[13], 13 + 1) +
+        ROTL64(msg[13 - 13], (13 - 13) + 1) - ROTL64(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]);
+    q[14 + 16] = CONST_EXP2(14) +
+        ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROTL64(msg[14], 14 + 1) +
+        ROTL64(msg[14 - 13], (14 - 13) + 1) - ROTL64(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]);
+    q[15 + 16] = CONST_EXP2(15) +
+        ((make_uint2(0x5555554B, 0xA5555555) + ROTL64(msg[15], 15 + 1) +
+        ROTL64(msg[15 - 13], (15 - 13) + 1) - ROTL64(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]);

-    uint64_t XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
-    uint64_t XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
+    uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23];
+    uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31];

-    hash[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ msg[ 0]) + ( XL64 ^ q[24] ^ q[ 0]);
-    hash[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ msg[ 1]) + ( XL64 ^ q[25] ^ q[ 1]);
-    hash[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ msg[ 2]) + ( XL64 ^ q[26] ^ q[ 2]);
-    hash[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ msg[ 3]) + ( XL64 ^ q[27] ^ q[ 3]);
-    hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[ 4]) + ( XL64 ^ q[28] ^ q[ 4]);
-    hash[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ msg[ 5]) + ( XL64 ^ q[29] ^ q[ 5]);
-    hash[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ msg[ 6]) + ( XL64 ^ q[30] ^ q[ 6]);
-    hash[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ msg[ 7]) + ( XL64 ^ q[31] ^ q[ 7]);
-
-    hash[ 8] = ROTL64(hash[4], 9) + ( XH64 ^ q[24] ^ msg[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]);
-    hash[ 9] = ROTL64(hash[5],10) + ( XH64 ^ q[25] ^ msg[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]);
-    hash[10] = ROTL64(hash[6],11) + ( XH64 ^ q[26] ^ msg[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
-    hash[11] = ROTL64(hash[7],12) + ( XH64 ^ q[27] ^ msg[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
-    hash[12] = ROTL64(hash[0],13) + ( XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
-    hash[13] = ROTL64(hash[1],14) + ( XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
-    hash[14] = ROTL64(hash[2],15) + ( XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
-    hash[15] = ROTL64(hash[3],16) + ( XH64 ^ q[31] ^ msg[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
+    hash[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg[0]) + (XL64 ^ q[24] ^ q[0]);
+    hash[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg[1]) + (XL64 ^ q[25] ^ q[1]);
+    hash[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg[2]) + (XL64 ^ q[26] ^ q[2]);
+    hash[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg[3]) + (XL64 ^ q[27] ^ q[3]);
+    hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[4]) + (XL64 ^ q[28] ^ q[4]);
+    hash[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg[5]) + (XL64 ^ q[29] ^ q[5]);
+    hash[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg[6]) + (XL64 ^ q[30] ^ q[6]);
+    hash[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg[7]) + (XL64 ^ q[31] ^ q[7]);
+
+    hash[8] = ROTL64(hash[4], 9) + (XH64 ^ q[24] ^ msg[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]);
+    hash[9] = ROTL64(hash[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]);
+    hash[10] = ROTL64(hash[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]);
+    hash[11] = ROTL64(hash[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]);
+    hash[12] = ROTL64(hash[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]);
+    hash[13] = ROTL64(hash[1], 14) + (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]);
+    hash[14] = ROTL64(hash[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]);
+    hash[15] = ROTL64(hash[3], 16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]);
 }

-static __constant__ uint64_t d_constMem[16];
-static uint64_t h_constMem[16] = {
-    SPH_C64(0x8081828384858687),
-    SPH_C64(0x88898A8B8C8D8E8F),
-    SPH_C64(0x9091929394959697),
-    SPH_C64(0x98999A9B9C9D9E9F),
-    SPH_C64(0xA0A1A2A3A4A5A6A7),
-    SPH_C64(0xA8A9AAABACADAEAF),
-    SPH_C64(0xB0B1B2B3B4B5B6B7),
-    SPH_C64(0xB8B9BABBBCBDBEBF),
-    SPH_C64(0xC0C1C2C3C4C5C6C7),
-    SPH_C64(0xC8C9CACBCCCDCECF),
-    SPH_C64(0xD0D1D2D3D4D5D6D7),
-    SPH_C64(0xD8D9DADBDCDDDEDF),
-    SPH_C64(0xE0E1E2E3E4E5E6E7),
-    SPH_C64(0xE8E9EAEBECEDEEEF),
-    SPH_C64(0xF0F1F2F3F4F5F6F7),
-    SPH_C64(0xF8F9FAFBFCFDFEFF)
-};
+__device__ void Compression512(uint2 *msg, uint2 *hash)
+{
+    // Compression ref. implementation
+    uint2 q[32];
+    uint2 tmp;
+
+    tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]);
+    q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1];
+    tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]);
+    q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2];
+    tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]);
+    q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3];
+    tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]);
+    q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4];
+    tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]);
+    q[4] = (SHR(tmp, 1) ^ tmp) + hash[5];
+    tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]);
+    q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6];
+    tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]);
+    q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7];
+    tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]);
+    q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8];
+    tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]);
+    q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9];
+    tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]);
+    q[9] = (SHR(tmp, 1) ^ tmp) + hash[10];
+    tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]);
+    q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11];
+    tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]);
+    q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12];
+    tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]);
+    q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13];
+    tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]);
+    q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14];
+    tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]);
+    q[14] = (SHR(tmp, 1) ^ tmp) + hash[15];
+    tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]);
+    q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0];
+
+    q[0+16] =
+        (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) +
+        (SHR(q[0+1], 2) ^ SHL(q[0+1], 1) ^ ROTL64(q[0+1], 19) ^ ROTL64(q[0+1], 53)) +
+        (SHR(q[0+2], 2) ^ SHL(q[0+2], 2) ^ ROTL64(q[0+2], 28) ^ ROTL64(q[0+2], 59)) +
+        (SHR(q[0+3], 1) ^ SHL(q[0+3], 3) ^ ROTL64(q[0+3], 4) ^ ROTL64(q[0+3], 37)) +
+        (SHR(q[0+4], 1) ^ SHL(q[0+4], 2) ^ ROTL64(q[0+4], 13) ^ ROTL64(q[0+4], 43)) +
+        (SHR(q[0+5], 2) ^ SHL(q[0+5], 1) ^ ROTL64(q[0+5], 19) ^ ROTL64(q[0+5], 53)) +
+        (SHR(q[0+6], 2) ^ SHL(q[0+6], 2) ^ ROTL64(q[0+6], 28) ^ ROTL64(q[0+6], 59)) +
+        (SHR(q[0+7], 1) ^ SHL(q[0+7], 3) ^ ROTL64(q[0+7], 4) ^ ROTL64(q[0+7], 37)) +
+        (SHR(q[0+8], 1) ^ SHL(q[0+8], 2) ^ ROTL64(q[0+8], 13) ^ ROTL64(q[0+8], 43)) +
+        (SHR(q[0+9], 2) ^ SHL(q[0+9], 1) ^ ROTL64(q[0+9], 19) ^ ROTL64(q[0+9], 53)) +
+        (SHR(q[0+10], 2) ^ SHL(q[0+10], 2) ^ ROTL64(q[0+10], 28) ^ ROTL64(q[0+10], 59)) +
+        (SHR(q[0+11], 1) ^ SHL(q[0+11], 3) ^ ROTL64(q[0+11], 4) ^ ROTL64(q[0+11], 37)) +
+        (SHR(q[0+12], 1) ^ SHL(q[0+12], 2) ^ ROTL64(q[0+12], 13) ^ ROTL64(q[0+12], 43)) +
+        (SHR(q[0+13], 2) ^ SHL(q[0+13], 1) ^ ROTL64(q[0+13], 19) ^ ROTL64(q[0+13], 53)) +
+        (SHR(q[0+14], 2) ^ SHL(q[0+14], 2) ^ ROTL64(q[0+14], 28) ^ ROTL64(q[0+14], 59)) +
+        (SHR(q[0+15], 1) ^ SHL(q[0+15], 3) ^ ROTL64(q[0+15], 4) ^ ROTL64(q[0+15], 37)) +
+        ((make_uint2(0x55555550ul, 0x55555555) + ROTL64(msg[0], 0 + 1) +
+        ROTL64(msg[0+3], 0+4) - ROTL64(msg[0+10], 0+11)) ^ hash[0+7]);
+
+    q[1 + 16] =
+        (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) +
+        (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) +
+        (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) +
+        (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) +
+        (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) +
+        (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) +
+        (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) +
+        (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) +
+        (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) +
+        (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) +
+        (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) +
+        (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) +
+        (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) +
+        (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) +
+        (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) +
+        (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) +
+        ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROTL64(msg[1], 1 + 1) +
+        ROTL64(msg[1 + 3], 1 + 4) - ROTL64(msg[1 + 10], 1 + 11)) ^ hash[1 + 7]);
+
+    q[2 + 16] = CONST_EXP2(2) +
+        ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROTL64(msg[2], 2 + 1) +
+        ROTL64(msg[2+3], 2+4) - ROTL64(msg[2+10], 2+11)) ^ hash[2+7]);
+    q[3 + 16] = CONST_EXP2(3) +
+        ((make_uint2(0x5555554F, 0x65555555) + ROTL64(msg[3], 3 + 1) +
+        ROTL64(msg[3 + 3], 3 + 4) - ROTL64(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]);
+    q[4 + 16] = CONST_EXP2(4) +
+        ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) + ROTL64(msg[4], 4 + 1) +
+        ROTL64(msg[4 + 3], 4 + 4) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]);
+    q[5 + 16] = CONST_EXP2(5) +
+        ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROTL64(msg[5], 5 + 1) +
+        ROTL64(msg[5 + 3], 5 + 4) - ROTL64(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]);
+    q[6 + 16] = CONST_EXP2(6) +
+        ((make_uint2(0x5555554E, 0x75555555) + ROTL64(msg[6], 6 + 1) +
+        ROTL64(msg[6 + 3], 6 + 4) - ROTL64(msg[6 - 6], (6 - 6) + 1)) ^ hash[6 + 7]);
+    q[7 + 16] = CONST_EXP2(7) +
+        ((make_uint2(0xAAAAAAA3, 0x7AAAAAAA) + ROTL64(msg[7], 7 + 1) +
+        ROTL64(msg[7 + 3], 7 + 4) - ROTL64(msg[7 - 6], (7 - 6) + 1)) ^ hash[7 + 7]);
+    q[8 + 16] = CONST_EXP2(8) +
+        ((make_uint2(0xFFFFFFF8, 0x7FFFFFFF) + ROTL64(msg[8], 8 + 1) +
+        ROTL64(msg[8 + 3], 8 + 4) - ROTL64(msg[8 - 6], (8 - 6) + 1)) ^ hash[8 + 7]);
+    q[9 + 16] = CONST_EXP2(9) +
+        ((make_uint2(0x5555554D, 0x85555555) + ROTL64(msg[9], 9 + 1) +
+        ROTL64(msg[9 + 3], 9 + 4) - ROTL64(msg[9 - 6], (9 - 6) + 1)) ^ hash[9 - 9]);
+    q[10 + 16] = CONST_EXP2(10) +
+        ((make_uint2(0xAAAAAAA2, 0x8AAAAAAA) + ROTL64(msg[10], 10 + 1) +
+        ROTL64(msg[10 + 3], 10 + 4) - ROTL64(msg[10 - 6], (10 - 6) + 1)) ^ hash[10 - 9]);
+    q[11 + 16] = CONST_EXP2(11) +
+        ((make_uint2(0xFFFFFFF7, 0x8FFFFFFF) + ROTL64(msg[11], 11 + 1) +
+        ROTL64(msg[11 + 3], 11 + 4) - ROTL64(msg[11 - 6], (11 - 6) + 1)) ^ hash[11 - 9]);
+    q[12 + 16] = CONST_EXP2(12) +
+        ((make_uint2(0x5555554C, 0x95555555) + ROTL64(msg[12], 12 + 1) +
+        ROTL64(msg[12 + 3], 12 + 4) - ROTL64(msg[12 - 6], (12 - 6) + 1)) ^ hash[12 - 9]);
+    q[13 + 16] = CONST_EXP2(13) +
+        ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROTL64(msg[13], 13 + 1) +
+        ROTL64(msg[13 - 13], (13 - 13) + 1) - ROTL64(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]);
+    q[14 + 16] = CONST_EXP2(14) +
+        ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROTL64(msg[14], 14 + 1) +
+        ROTL64(msg[14 - 13], (14 - 13) + 1) - ROTL64(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]);
+    q[15 + 16] = CONST_EXP2(15) +
+        ((make_uint2(0x5555554B, 0xA5555555) + ROTL64(msg[15], 15 + 1) +
+        ROTL64(msg[15 - 13], (15 - 13) + 1) - ROTL64(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]);
+
+    uint2 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
+    uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31];
+
+    hash[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ msg[ 0]) + ( XL64 ^ q[24] ^ q[ 0]);
+    hash[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ msg[ 1]) + ( XL64 ^ q[25] ^ q[ 1]);
+    hash[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ msg[ 2]) + ( XL64 ^ q[26] ^ q[ 2]);
+    hash[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ msg[ 3]) + ( XL64 ^ q[27] ^ q[ 3]);
+    hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[ 4]) + ( XL64 ^ q[28] ^ q[ 4]);
+    hash[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ msg[ 5]) + ( XL64 ^ q[29] ^ q[ 5]);
+    hash[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ msg[ 6]) + ( XL64 ^ q[30] ^ q[ 6]);
+    hash[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ msg[ 7]) + ( XL64 ^ q[31] ^ q[ 7]);
+
+    hash[ 8] = ROTL64(hash[4], 9) + ( XH64 ^ q[24] ^ msg[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]);
+    hash[ 9] = ROTL64(hash[5],10) + ( XH64 ^ q[25] ^ msg[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]);
+    hash[10] = ROTL64(hash[6],11) + ( XH64 ^ q[26] ^ msg[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
+    hash[11] = ROTL64(hash[7],12) + ( XH64 ^ q[27] ^ msg[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
+    hash[12] = ROTL64(hash[0],13) + ( XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
+    hash[13] = ROTL64(hash[1],14) + ( XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
+    hash[14] = ROTL64(hash[2],15) + ( XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
+    hash[15] = ROTL64(hash[3], 16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]);
+}

-__global__ void quark_bmw512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__
+#if __CUDA_ARCH__ > 500
+__launch_bounds__(32, 16)
+#else
+__launch_bounds__(64, 8)
+#endif
+void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
     if (thread < threads)
     {
        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

        int hashPosition = nounce - startNounce;
        uint64_t *inpHash = &g_hash[8 * hashPosition];

        // Init
-       uint64_t h[16];
-/*
-       h[ 0] = SPH_C64(0x8081828384858687);
-       h[ 1] = SPH_C64(0x88898A8B8C8D8E8F);
-       h[ 2] = SPH_C64(0x9091929394959697);
-       h[ 3] = SPH_C64(0x98999A9B9C9D9E9F);
-       h[ 4] = SPH_C64(0xA0A1A2A3A4A5A6A7);
-       h[ 5] = SPH_C64(0xA8A9AAABACADAEAF);
-       h[ 6] = SPH_C64(0xB0B1B2B3B4B5B6B7);
-       h[ 7] = SPH_C64(0xB8B9BABBBCBDBEBF);
-       h[ 8] = SPH_C64(0xC0C1C2C3C4C5C6C7);
-       h[ 9] = SPH_C64(0xC8C9CACBCCCDCECF);
-       h[10] = SPH_C64(0xD0D1D2D3D4D5D6D7);
-       h[11] = SPH_C64(0xD8D9DADBDCDDDEDF);
-       h[12] = SPH_C64(0xE0E1E2E3E4E5E6E7);
-       h[13] = SPH_C64(0xE8E9EAEBECEDEEEF);
-       h[14] = SPH_C64(0xF0F1F2F3F4F5F6F7);
-       h[15] = SPH_C64(0xF8F9FAFBFCFDFEFF);
-*/
-#pragma unroll 16
-       for(int i=0;i<16;i++)
-           h[i] = d_constMem[i];
+       uint2 h[16] = {
+           { 0x84858687UL, 0x80818283UL },
+           { 0x8C8D8E8FUL, 0x88898A8BUL },
+           { 0x94959697UL, 0x90919293UL },
+           { 0x9C9D9E9FUL, 0x98999A9BUL },
+           { 0xA4A5A6A7UL, 0xA0A1A2A3UL },
+           { 0xACADAEAFUL, 0xA8A9AAABUL },
+           { 0xB4B5B6B7UL, 0xB0B1B2B3UL },
+           { 0xBCBDBEBFUL, 0xB8B9BABBUL },
+           { 0xC4C5C6C7UL, 0xC0C1C2C3UL },
+           { 0xCCCDCECFUL, 0xC8C9CACBUL },
+           { 0xD4D5D6D7UL, 0xD0D1D2D3UL },
+           { 0xDCDDDEDFUL, 0xD8D9DADBUL },
+           { 0xE4E5E6E7UL, 0xE0E1E2E3UL },
+           { 0xECEDEEEFUL, 0xE8E9EAEBUL },
+           { 0xF4F5F6F7UL, 0xF0F1F2F3UL },
+           { 0xFCFDFEFFUL, 0xF8F9FAFBUL }
+       };
        // copy the message (watch out: the message is 64 bytes,
        // but BMW works on 128 bytes!!!)
-       uint64_t message[16];
+       uint2 message[16];
 #pragma unroll 8
        for(int i=0;i<8;i++)
-           message[i] = inpHash[i];
+           message[i] = vectorize(inpHash[i]);
 #pragma unroll 6
        for(int i=9;i<15;i++)
-           message[i] = 0;
+           message[i] = make_uint2(0,0);
        // insert the padding (byte order?!?)
-       message[8] = SPH_C64(0x80);
+       message[8] = make_uint2(0x80,0);
        // length (in bits), i.e. 64 bytes * 8 = 512 bits
-       message[15] = SPH_C64(512);
+       message[15] = make_uint2(512,0);

        // Compression 1
-       Compression512(message, h);
+       Compression512_64_first(message, h);

        // Final
 #pragma unroll 16
        for(int i=0;i<16;i++)
-           message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i;
-
+       {
+           message[i].y = 0xaaaaaaaa;
+           message[i].x = 0xaaaaaaa0ul + (uint32_t)i;
+       }
        Compression512(h, message);

        // done
        uint64_t *outpHash = &g_hash[8 * hashPosition];
 #pragma unroll 8
        for(int i=0;i<8;i++)
-           outpHash[i] = message[i+8];
+           outpHash[i] = devectorize(message[i+8]);
     }
 }

-__global__ void quark_bmw512_gpu_hash_80(int threads, uint32_t startNounce, uint64_t *g_hash)
+__global__ __launch_bounds__(256, 2)
+void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
 {
-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
     if (thread < threads)
     {
        uint32_t nounce = startNounce + thread;

        // Init
-       uint64_t h[16];
-#pragma unroll 16
-       for(int i=0;i<16;i++)
-           h[i] = d_constMem[i];
+       uint2 h[16] = {
+           { 0x84858687UL, 0x80818283UL },
+           { 0x8C8D8E8FUL, 0x88898A8BUL },
+           { 0x94959697UL, 0x90919293UL },
+           { 0x9C9D9E9FUL, 0x98999A9BUL },
+           { 0xA4A5A6A7UL, 0xA0A1A2A3UL },
+           { 0xACADAEAFUL, 0xA8A9AAABUL },
+           { 0xB4B5B6B7UL, 0xB0B1B2B3UL },
+           { 0xBCBDBEBFUL, 0xB8B9BABBUL },
+           { 0xC4C5C6C7UL, 0xC0C1C2C3UL, },
+           { 0xCCCDCECFUL, 0xC8C9CACBUL, },
+           { 0xD4D5D6D7UL, 0xD0D1D2D3UL },
+           { 0xDCDDDEDFUL, 0xD8D9DADBUL },
+           { 0xE4E5E6E7UL, 0xE0E1E2E3UL },
+           { 0xECEDEEEFUL, 0xE8E9EAEBUL },
+           { 0xF4F5F6F7UL, 0xF0F1F2F3UL },
+           { 0xFCFDFEFFUL, 0xF8F9FAFBUL }
+       };
        // copy the message (watch out: the message is 64 bytes,
        // but BMW works on 128 bytes!!!)
-       uint64_t message[16];
+       uint2 message[16];
 #pragma unroll 16
        for(int i=0;i<16;i++)
-           message[i] = c_PaddedMessage80[i];
+           message[i] = vectorize(c_PaddedMessage80[i]);

        // replace the nonce with the thread-specific one
-       message[9] = REPLACE_HIWORD(message[9], cuda_swab32(nounce));
+       message[9].x = cuda_swab32(nounce); //REPLACE_HIWORD(message[9], cuda_swab32(nounce));

        // Compression 1
        Compression512(message, h);

        // Final
 #pragma unroll 16
        for(int i=0;i<16;i++)
-           message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i;
+           message[i] = make_uint2(0xaaaaaaa0+i,0xaaaaaaaa);

        Compression512(h, message);

        // done
        uint64_t *outpHash = &g_hash[8 * thread];
 #pragma unroll 8
        for(int i=0;i<8;i++)
-           outpHash[i] = message[i+8];
+           outpHash[i] = devectorize(message[i+8]);
     }
 }

-// setup functions
-__host__ void quark_bmw512_cpu_init(int thr_id, int threads)
+__host__
+void quark_bmw512_cpu_init(int thr_id, int threads)
 {
-    // nothing to do ;-)
-    // now there is :D
-    cudaMemcpyToSymbol( d_constMem,
-                        h_constMem,
-                        sizeof(h_constMem),
-                        0, cudaMemcpyHostToDevice);
 }

-// BMW512 for 80-byte input data
-__host__ void quark_bmw512_cpu_setBlock_80(void *pdata)
+__host__
+void quark_bmw512_cpu_setBlock_80(void *pdata)
 {
-    // prepare the message with padding;
-    // only the correct nonce still has to be inserted, starting at byte 76.
     unsigned char PaddedMessage[128];
     memcpy(PaddedMessage, pdata, 80);
     memset(PaddedMessage+80, 0, 48);
     uint64_t *message = (uint64_t*)PaddedMessage;
-    // insert the padding (byte order?!?)
     message[10] = SPH_C64(0x80);
-    // length (in bits), i.e. 80 bytes * 8 = 640 bits
     message[15] = SPH_C64(640);
-    // the message for computation on the GPU
     cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 }

-__host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 128;
-    // compute how many thread blocks we need
+    const uint32_t threadsperblock = 32;
     dim3 grid((threads + threadsperblock-1)/threadsperblock);
     dim3 block(threadsperblock);
-    // size of the dynamic shared memory region
-    size_t shared_size = 0;
-    quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-    MyStreamSynchronize(NULL, order, thr_id);
+    quark_bmw512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+    // MyStreamSynchronize(NULL, order, thr_id);
 }

-__host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
+__host__
+void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 128;
-    // compute how many thread blocks we need
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
-    // size of the dynamic shared memory region
-    size_t shared_size = 0;
-    quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
-    MyStreamSynchronize(NULL, order, thr_id);
+    const uint32_t threadsperblock = 128;
+
+    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 block(threadsperblock);
+
+    quark_bmw512_gpu_hash_80<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
 }
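Side note on the new constants: the make_uint2(lo, hi) literals in the unrolled expansion are just the old (i+16)*(0x0555555555555555ull) products precomputed at compile time and split into 32-bit halves (ccminer's uint2 convention keeps the low word in .x and the high word in .y, as vectorize()/devectorize() do). A minimal host-side check for the first three constants, assuming nothing beyond standard C++; the table values are copied from the diff above:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t K = 0x0555555555555555ull;
    // (lo, hi) pairs as they appear in q[0+16], q[1+16], q[2+16]
    const uint32_t lo[3] = { 0x55555550u, 0xAAAAAAA5u, 0xFFFFFFFAu };
    const uint32_t hi[3] = { 0x55555555u, 0x5AAAAAAAu, 0x5FFFFFFFu };
    for (int i = 0; i < 3; i++) {
        uint64_t expect = (uint64_t)(i + 16) * K;           // the old multiplication
        uint64_t given  = ((uint64_t)hi[i] << 32) | lo[i];  // the new literal
        printf("i=%d: %016llx %s\n", i, (unsigned long long)given,
               given == expect ? "ok" : "MISMATCH");
    }
    return 0;
}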
