@ -18,10 +18,10 @@
@@ -18,10 +18,10 @@
#define Nrow 4
#define Ncol 4
#define uint4x 2 uint28
#define uint2x4 uint28
#define memshift 3
__device__ uint4x 2 *DMatrix;
__device__ uint2x4 *DMatrix;
__device__ __forceinline__
void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
@ -33,7 +33,7 @@ void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
@@ -33,7 +33,7 @@ void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
}
__device__ __forceinline__
void round_lyra_v5(uint4x 2* s)
void round_lyra_v5(uint2x4 * s)
{
Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y);
@ -47,9 +47,9 @@ void round_lyra_v5(uint4x2* s)
@@ -47,9 +47,9 @@ void round_lyra_v5(uint4x2* s)
}
__device__ __forceinline__
void reduceDuplex(uint4x 2 state[4], const uint32_t thread)
void reduceDuplex(uint2x4 state[4], const uint32_t thread)
{
uint4x 2 state1[3];
uint2x4 state1[3];
const uint32_t ps1 = (Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
@ -80,7 +80,7 @@ void reduceDuplex(uint4x2 state[4], const uint32_t thread)
@@ -80,7 +80,7 @@ void reduceDuplex(uint4x2 state[4], const uint32_t thread)
}
__device__ __forceinline__
void reduceDuplex50(uint4x 2 state[4], const uint32_t thread)
void reduceDuplex50(uint2x4 state[4], const uint32_t thread)
{
const uint32_t ps1 = (Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
@ -104,9 +104,9 @@ void reduceDuplex50(uint4x2 state[4], const uint32_t thread)
@@ -104,9 +104,9 @@ void reduceDuplex50(uint4x2 state[4], const uint32_t thread)
}
__device__ __forceinline__
void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint4x 2 state[4], const uint32_t thread)
void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], const uint32_t thread)
{
uint4x 2 state2[3], state1[3];
uint2x4 state2[3], state1[3];
const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
@ -151,7 +151,7 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu
@@ -151,7 +151,7 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu
#pragma unroll
for (int j = 0; j < 3; j++)
{
uint4x 2 tmp = state1[j] + state2[j];
uint2x4 tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
@ -179,9 +179,9 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu
@@ -179,9 +179,9 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu
__device__ __forceinline__
void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint4x 2* state, const uint32_t thread)
void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4 * state, const uint32_t thread)
{
uint4x 2 state1[3], state2[3];
uint2x4 state1[3], state2[3];
const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
const uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread);
@ -259,7 +259,7 @@ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_
@@ -259,7 +259,7 @@ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint4x 2 blake2b_IV[2];
uint2x4 blake2b_IV[2];
if (threadIdx.x == 0) {
@ -273,7 +273,7 @@ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_
@@ -273,7 +273,7 @@ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_
if (thread < threads)
{
uint4x 2 state[4];
uint2x4 state[4];
((uint2*)state)[0] = __ldg(&g_hash[thread]);
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);