mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-24 21:45:04 +00:00
cuda: get rid of cuda 9 mask warnings
This commit is contained in:
parent
f1a7de4e9e
commit
3761774ccf
@@ -111,9 +111,10 @@ endif
 #ccminer_LDADD += -lsodium
 ccminer_LDADD += -lcuda
 
-nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
+nvcc_ARCH :=
+#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
 nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
+nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
 
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.2.4], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
@@ -669,4 +669,14 @@ static uint2 SHR2(uint2 a, int offset)
 #endif
 }
 
+// CUDA 9+ deprecated functions warnings (new mask param)
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#undef __shfl
+#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
+#undef __shfl_up
+#define __shfl_up(var, delta, width) __shfl_up_sync(0xFFFFFFFF, var, delta, width)
+#undef __any
+#define __any(p) __any_sync(0xFFFFFFFFu, p)
+#endif
+
 #endif // #ifndef CUDA_HELPER_H
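The wrappers above exist because, from CUDA 9, the warp shuffle and vote intrinsics take an explicit participation mask as their first argument and the unmasked forms emit deprecation warnings. Redefining __shfl, __shfl_up and __any to the *_sync forms with a full mask keeps the existing call sites compiling; the 0xFFFFFFFF mask is only equivalent to the old behaviour when the whole warp is active, which is what these kernels assume. A minimal sketch of the difference, illustrative and not part of the commit:

#include <cuda.h>
#include <cuda_runtime.h>

__device__ uint32_t broadcast_lane0(uint32_t v)
{
#if CUDA_VERSION >= 9000
	// explicit mask: all 32 lanes participate, matching the implicit
	// behaviour of the old intrinsic on a fully active warp
	return __shfl_sync(0xFFFFFFFFu, v, 0);
#else
	return __shfl(v, 0); // deprecated from CUDA 9 onwards
#endif
}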
@@ -65,7 +65,8 @@
 #define __CUDA_ARCH__ 520
 uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
 uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
-uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl2(uint32_t x, uint32_t y);
+uint32_t __shfl_sync(uint32_t mask, uint32_t x, uint32_t y);
 uint32_t atomicExch(uint32_t *x, uint32_t y);
 uint32_t atomicAdd(uint32_t *x, uint32_t y);
 void __syncthreads(void);

@@ -79,6 +80,14 @@ u32 umin(const u32, const u32);
 u32 umax(const u32, const u32);
 #endif
 
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#undef __any
+#define __any(p) __any_sync(0xFFFFFFFFu, p)
+#else
+#define __shfl2 __shfl
+#endif
+
 typedef u32 proof[PROOFSIZE];
 
 struct __align__(32) slot {
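The __shfl2 macro added above is the compatibility shim used by the rest of the commit: on CUDA 9+ it expands to the masked __shfl_sync, and on older toolkits it falls back to the plain __shfl, so kernel call sites only need the one-time rename from __shfl to __shfl2. A small usage sketch under those same #define lines (illustrative only; sum_lanes01 is not a function from the commit):

__device__ uint32_t sum_lanes01(uint32_t v)
{
	// identical source on CUDA 8 and CUDA 9+, only the macro expansion differs
	return __shfl2(v, 0) + __shfl2(v, 1);
}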
@@ -1844,7 +1853,7 @@ __global__ void digit_last_wdc(equi<RB, SM>* eq)
 }
 #if __CUDA_ARCH__ >= 300
 // all threads get the value from lane 0
-soli = __shfl(soli, 0);
+soli = __shfl2(soli, 0);
 #else
 __syncthreads();
 soli = eq->edata.srealcont.nsols;
@@ -16,6 +16,12 @@
 #define __shfl(x, y, z) (x)
 #endif
 
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#else
+#define __shfl2 __shfl
+#endif
+
 #if __CUDA_ARCH__ < 320 && !defined(__ldg4)
 #define __ldg4(x) (*(x))
 #endif

@@ -89,7 +95,7 @@ typedef struct __align__(16) uint28 {
 typedef uint2x4 uint28; /* name deprecated */
 
 typedef struct __builtin_align__(32) uint48 {
 uint4 s0,s1;
 } uint48;
 
 typedef struct __builtin_align__(128) uint4x16{

@@ -368,10 +374,10 @@ static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulong
 
 static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
 static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
 static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
 static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
 static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
 static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
 
 static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; }
 static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; }

@@ -551,14 +557,14 @@ static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane)
 {
 #if __CUDA_ARCH__ >= 300
 uint28 res;
-res.x.x = __shfl(var.x.x, lane);
-res.x.y = __shfl(var.x.y, lane);
-res.y.x = __shfl(var.y.x, lane);
-res.y.y = __shfl(var.y.y, lane);
-res.z.x = __shfl(var.z.x, lane);
-res.z.y = __shfl(var.z.y, lane);
-res.w.x = __shfl(var.w.x, lane);
-res.w.y = __shfl(var.w.y, lane);
+res.x.x = __shfl2(var.x.x, lane);
+res.x.y = __shfl2(var.x.y, lane);
+res.y.x = __shfl2(var.y.x, lane);
+res.y.y = __shfl2(var.y.y, lane);
+res.z.x = __shfl2(var.z.x, lane);
+res.z.y = __shfl2(var.z.y, lane);
+res.w.x = __shfl2(var.w.x, lane);
+res.w.y = __shfl2(var.w.y, lane);
 return res;
 #else
 return var;

@@ -569,22 +575,22 @@ static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane)
 {
 #if __CUDA_ARCH__ >= 300
 ulonglong4 res;
 uint2 temp;
 temp = vectorize(var.x);
-temp.x = __shfl(temp.x, lane);
-temp.y = __shfl(temp.y, lane);
+temp.x = __shfl2(temp.x, lane);
+temp.y = __shfl2(temp.y, lane);
 res.x = devectorize(temp);
 temp = vectorize(var.y);
-temp.x = __shfl(temp.x, lane);
-temp.y = __shfl(temp.y, lane);
+temp.x = __shfl2(temp.x, lane);
+temp.y = __shfl2(temp.y, lane);
 res.y = devectorize(temp);
 temp = vectorize(var.z);
-temp.x = __shfl(temp.x, lane);
-temp.y = __shfl(temp.y, lane);
+temp.x = __shfl2(temp.x, lane);
+temp.y = __shfl2(temp.y, lane);
 res.z = devectorize(temp);
 temp = vectorize(var.w);
-temp.x = __shfl(temp.x, lane);
-temp.y = __shfl(temp.y, lane);
+temp.x = __shfl2(temp.x, lane);
+temp.y = __shfl2(temp.y, lane);
 res.w = devectorize(temp);
 return res;
 #else
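shuffle4(ulonglong4, int) above moves 64-bit words between lanes by splitting each word into two 32-bit halves (vectorize/devectorize) and shuffling the halves separately, since the 32-bit shuffle is the primitive being wrapped. The same idea reduced to a single word, as a sketch; shfl64 is a hypothetical helper, not something the commit adds:

__device__ __forceinline__ unsigned long long shfl64(unsigned long long v, int lane)
{
	uint2 t = make_uint2((unsigned int)v, (unsigned int)(v >> 32)); // split into low/high halves
	t.x = __shfl_sync(0xFFFFFFFFu, t.x, lane); // shuffle each half with the masked intrinsic
	t.y = __shfl_sync(0xFFFFFFFFu, t.y, lane);
	return ((unsigned long long)t.y << 32) | t.x; // reassemble the 64-bit word
}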
@@ -10,6 +10,8 @@
 #include <map>
 
 #include <cuda_runtime.h>
+#include <cuda_helper.h>
+
 #include "miner.h"
 
 #include "salsa_kernel.h"

@@ -18,6 +20,12 @@
 #define TEXWIDTH 32768
 #define THREADS_PER_WU 4 // four threads per hash
 
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#else
+#define __shfl2 __shfl
+#endif
+
 typedef enum
 {
 ANDERSEN,

@@ -57,12 +65,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right)
 return left;
 }
 
-static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
+static __device__ uint4 shfl4(const uint4 bx, int target_thread) {
 return make_uint4(
-__shfl((int)bx.x, target_thread),
-__shfl((int)bx.y, target_thread),
-__shfl((int)bx.z, target_thread),
-__shfl((int)bx.w, target_thread)
+__shfl2((int)bx.x, target_thread),
+__shfl2((int)bx.y, target_thread),
+__shfl2((int)bx.z, target_thread),
+__shfl2((int)bx.w, target_thread)
 );
 }
 

@@ -97,8 +105,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
 
 if (SCHEME == ANDERSEN) {
 int target_thread = (threadIdx.x + 4)%32;
-uint4 t=b, t2=__shfl(bx, target_thread);
-int t2_start = __shfl((int)start, target_thread) + 4;
+uint4 t = b, t2 = shfl4(bx, target_thread);
+int t2_start = __shfl2((int)start, target_thread) + 4;
 bool c = (threadIdx.x & 0x4);
 *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
 *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);

@@ -115,7 +123,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
 
 if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
 if (SCHEME == ANDERSEN) {
-int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
+int t2_start = __shfl2((int)start, (threadIdx.x + 4)%32) + 4;
 if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
 bool c = (threadIdx.x & 0x4);
 if (TEX_DIM == 0) {

@@ -129,7 +137,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
 bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
 }
 uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
-bx = __shfl(bx, (threadIdx.x + 28)%32);
+bx = shfl4(bx, (threadIdx.x + 28)%32);
 } else {
 if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
 else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);

@@ -149,14 +157,15 @@ void primary_order_shuffle(uint4 &b, uint4 &bx)
 int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
 int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
 
-b.w = __shfl((int)b.w, x1);
-b.z = __shfl((int)b.z, x2);
-b.y = __shfl((int)b.y, x3);
+b.w = __shfl2((int)b.w, x1);
+b.z = __shfl2((int)b.z, x2);
+b.y = __shfl2((int)b.y, x3);
 
 uint32_t tmp = b.y; b.y = b.w; b.w = tmp;
 
-bx.w = __shfl((int)bx.w, x1);
-bx.z = __shfl((int)bx.z, x2);
-bx.y = __shfl((int)bx.y, x3);
+bx.w = __shfl2((int)bx.w, x1);
+bx.z = __shfl2((int)bx.z, x2);
+bx.y = __shfl2((int)bx.y, x3);
 tmp = bx.y; bx.y = bx.w; bx.w = tmp;
 }
 
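The x1, x2, x3 lane indices fed to the shuffles in primary_order_shuffle and in the salsa/chacha cores below are built from (threadIdx.x & 0x1c), the base lane of the thread's 4-lane group, plus ((threadIdx.x & 3) + n) & 3, a neighbour inside that group; x1 presumably follows the same pattern with n = 1. Each shuffle therefore rotates the state columns by n lanes within the group of four threads that share one hash. A sketch of the index computation (hypothetical helper name, not from the commit):

__device__ __forceinline__ int rotated_lane(int n)
{
	// base lane of this thread's 4-lane group + (own position + n) wrapped to 0..3
	return (threadIdx.x & 0x1c) + (((threadIdx.x & 3) + n) & 3);
}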
@@ -318,9 +327,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 /* Unclear if this optimization is needed: These are ordered based
 * upon the dependencies needed in the later xors. Compiler should be
 * able to figure this out, but might as well give it a hand. */
-x.y = __shfl((int)x.y, x3);
-x.w = __shfl((int)x.w, x1);
-x.z = __shfl((int)x.z, x2);
+x.y = __shfl2((int)x.y, x3);
+x.w = __shfl2((int)x.w, x1);
+x.z = __shfl2((int)x.z, x2);
 
 /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
 * but the register targets are rewritten here to swap x[1] and x[3] so that

@@ -333,9 +342,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
 XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-x.w = __shfl((int)x.w, x3);
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
 }
 
 b += x;

@@ -352,18 +361,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
 XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
 
-x.y = __shfl((int)x.y, x3);
-x.w = __shfl((int)x.w, x1);
-x.z = __shfl((int)x.z, x2);
+x.y = __shfl2((int)x.y, x3);
+x.w = __shfl2((int)x.w, x1);
+x.z = __shfl2((int)x.z, x2);
 
 XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
 XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
 XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
 XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-x.w = __shfl((int)x.w, x3);
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
 }
 
 // At the end of these iterations, the data is in primary order again.

@@ -407,9 +416,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
 
 // Diagonal Mixing phase of chacha
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)

@@ -417,9 +426,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x3);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x1);
+x.y = __shfl2((int)x.y, x3);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x1);
 }
 
 b += x;

@@ -436,9 +445,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
 
 // Diagonal Mixing phase of chacha
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)

@@ -446,9 +455,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x3);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x1);
+x.y = __shfl2((int)x.y, x3);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x1);
 }
 
 #undef CHACHA_PRIMITIVE

@@ -572,7 +581,7 @@ void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
 } else load_key<ALGO>(d_odata, b, bx);
 
 for (int i = begin; i < end; i++) {
-int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 uint4 t, tx; read_keys_direct<SCHEME, TEX_DIM>(t, tx, start+32*j);
 b ^= t; bx ^= tx;
 block_mixer<ALGO>(b, bx, x1, x2, x3);
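In the loop above, __shfl2((int)bx.x, (threadIdx.x & 0x1c)) broadcasts bx.x from the first lane of each 4-lane group: 0x1c clears the two low bits of the lane id, so lanes 4k..4k+3 all read from lane 4k and agree on the same scratchpad index j. A sketch of the pattern in isolation (hypothetical helper name):

__device__ __forceinline__ int broadcast_from_group_leader(int value)
{
	// threadIdx.x & 0x1c = first lane of this thread's group of four
	return __shfl_sync(0xFFFFFFFFu, value, threadIdx.x & 0x1c);
}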
@@ -604,7 +613,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 {
 // better divergent thread handling submitted by nVidia engineers, but
 // supposedly this does not run with the ANDERSEN memory access scheme
-int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 int pos = j/LOOKUP_GAP;
 int loop = -1;
 uint4 t, tx;

@@ -612,7 +621,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 int i = begin;
 while(i < end) {
 if (loop==-1) {
-j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 pos = j/LOOKUP_GAP;
 loop = j-pos*LOOKUP_GAP;
 read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);

@@ -634,7 +643,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 // this is my original implementation, now used with the ANDERSEN
 // memory access scheme only.
 for (int i = begin; i < end; i++) {
-int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
 uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
 while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);

@@ -644,7 +653,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 }
 
 //for (int i = begin; i < end; i++) {
-// int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+// int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 // int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
 // uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
 // while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);

@@ -11,7 +11,8 @@
 
 #include <map>
 
-#include "cuda_runtime.h"
+#include <cuda_runtime.h>
+#include <cuda_helper.h>
 
 #include "miner.h"
 #include "salsa_kernel.h"

@@ -176,13 +177,14 @@ static __device__ uint4& operator^=(uint4& left, const uint4& right)
 return left;
 }
 
-__device__ __forceinline__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width)
+__device__ __forceinline__ uint4 shfl4(const uint4 val, unsigned int lane, unsigned int width)
 {
 return make_uint4(
 (unsigned int)__shfl((int)val.x, lane, width),
 (unsigned int)__shfl((int)val.y, lane, width),
 (unsigned int)__shfl((int)val.z, lane, width),
-(unsigned int)__shfl((int)val.w, lane, width));
+(unsigned int)__shfl((int)val.w, lane, width)
+);
 }
 
 __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing)
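A plausible reason the uint4 overload is renamed to shfl4 rather than kept as __shfl: this file now includes cuda_helper.h, which under CUDA 9 turns __shfl into a function-like macro, and a declaration named __shfl followed by a parameter list would itself be rewritten by the preprocessor. Sketch of the clash (an assumption, not stated in the commit):

#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)

// With the macro active, a declaration spelled
//   __device__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width);
// would be expanded into a bogus __shfl_sync declaration, so a distinct name is needed:
__device__ uint4 shfl4(const uint4 val, unsigned int lane, unsigned int width);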
@@ -208,13 +210,13 @@ __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[
 
 // rotate rows
 T1[0] = B[0];
-T1[1] = __shfl(B[1], lane8 + 7, 8);
-T1[2] = __shfl(B[2], lane8 + 6, 8);
-T1[3] = __shfl(B[3], lane8 + 5, 8);
-T1[4] = __shfl(C[0], lane8 + 4, 8);
-T1[5] = __shfl(C[1], lane8 + 3, 8);
-T1[6] = __shfl(C[2], lane8 + 2, 8);
-T1[7] = __shfl(C[3], lane8 + 1, 8);
+T1[1] = shfl4(B[1], lane8 + 7, 8);
+T1[2] = shfl4(B[2], lane8 + 6, 8);
+T1[3] = shfl4(B[3], lane8 + 5, 8);
+T1[4] = shfl4(C[0], lane8 + 4, 8);
+T1[5] = shfl4(C[1], lane8 + 3, 8);
+T1[6] = shfl4(C[2], lane8 + 2, 8);
+T1[7] = shfl4(C[3], lane8 + 1, 8);
 
 /* Matrix after row rotates:
 
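The shuffles above pass a width of 8, so they operate independently on each aligned group of 8 lanes: lane8 is the thread's index within its group, and lane8 + k wraps modulo 8 inside that group, which is what the "rotate rows" step relies on. A reduced sketch of that behaviour (hypothetical helper; assumes lane8 = threadIdx.x & 7):

__device__ __forceinline__ unsigned int rotate_in_group8(unsigned int v, int lane8, int k)
{
	// width 8: the source lane is (lane8 + k) mod 8 within this thread's 8-lane group
	return __shfl_sync(0xFFFFFFFFu, v, lane8 + k, 8);
}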
@@ -301,13 +303,13 @@ template <int TEX_DIM> __device__ __forceinline__ void __transposed_read_BC(cons
 
 // rotate rows
 B[0] = T2[0];
-B[1] = __shfl(T2[1], lane8 + 1, 8);
-B[2] = __shfl(T2[2], lane8 + 2, 8);
-B[3] = __shfl(T2[3], lane8 + 3, 8);
-C[0] = __shfl(T2[4], lane8 + 4, 8);
-C[1] = __shfl(T2[5], lane8 + 5, 8);
-C[2] = __shfl(T2[6], lane8 + 6, 8);
-C[3] = __shfl(T2[7], lane8 + 7, 8);
+B[1] = shfl4(T2[1], lane8 + 1, 8);
+B[2] = shfl4(T2[2], lane8 + 2, 8);
+B[3] = shfl4(T2[3], lane8 + 3, 8);
+C[0] = shfl4(T2[4], lane8 + 4, 8);
+C[1] = shfl4(T2[5], lane8 + 5, 8);
+C[2] = shfl4(T2[6], lane8 + 6, 8);
+C[3] = shfl4(T2[7], lane8 + 7, 8);
 
 }
 

@@ -12,6 +12,7 @@
 #include <map>
 
 #include <cuda_runtime.h>
+#include <cuda_helper.h>
 #include "miner.h"
 
 #include "salsa_kernel.h"

@@ -117,13 +118,14 @@ static __device__ uint4& operator^=(uint4& left, const uint4& right)
 return left;
 }
 
-__device__ __forceinline__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width)
+__device__ __forceinline__ uint4 shfl4(const uint4 val, unsigned int lane, unsigned int width)
 {
 return make_uint4(
 (unsigned int)__shfl((int)val.x, lane, width),
 (unsigned int)__shfl((int)val.y, lane, width),
 (unsigned int)__shfl((int)val.z, lane, width),
-(unsigned int)__shfl((int)val.w, lane, width));
+(unsigned int)__shfl((int)val.w, lane, width)
+);
 }
 
 __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing)

@@ -149,13 +151,13 @@ __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[
 
 // rotate rows
 T1[0] = B[0];
-T1[1] = __shfl(B[1], lane8 + 7, 8);
-T1[2] = __shfl(B[2], lane8 + 6, 8);
-T1[3] = __shfl(B[3], lane8 + 5, 8);
-T1[4] = __shfl(C[0], lane8 + 4, 8);
-T1[5] = __shfl(C[1], lane8 + 3, 8);
-T1[6] = __shfl(C[2], lane8 + 2, 8);
-T1[7] = __shfl(C[3], lane8 + 1, 8);
+T1[1] = shfl4(B[1], lane8 + 7, 8);
+T1[2] = shfl4(B[2], lane8 + 6, 8);
+T1[3] = shfl4(B[3], lane8 + 5, 8);
+T1[4] = shfl4(C[0], lane8 + 4, 8);
+T1[5] = shfl4(C[1], lane8 + 3, 8);
+T1[6] = shfl4(C[2], lane8 + 2, 8);
+T1[7] = shfl4(C[3], lane8 + 1, 8);
 
 /* Matrix after row rotates:
 

@@ -233,13 +235,13 @@ __device__ __forceinline__ void __transposed_read_BC(const uint4 *S, uint4 (&B)[
 
 // rotate rows
 B[0] = T2[0];
-B[1] = __shfl(T2[1], lane8 + 1, 8);
-B[2] = __shfl(T2[2], lane8 + 2, 8);
-B[3] = __shfl(T2[3], lane8 + 3, 8);
-C[0] = __shfl(T2[4], lane8 + 4, 8);
-C[1] = __shfl(T2[5], lane8 + 5, 8);
-C[2] = __shfl(T2[6], lane8 + 6, 8);
-C[3] = __shfl(T2[7], lane8 + 7, 8);
+B[1] = shfl4(T2[1], lane8 + 1, 8);
+B[2] = shfl4(T2[2], lane8 + 2, 8);
+B[3] = shfl4(T2[3], lane8 + 3, 8);
+C[0] = shfl4(T2[4], lane8 + 4, 8);
+C[1] = shfl4(T2[5], lane8 + 5, 8);
+C[2] = shfl4(T2[6], lane8 + 6, 8);
+C[3] = shfl4(T2[7], lane8 + 7, 8);
 
 }
 
|
@ -10,6 +10,7 @@
|
|||||||
#include <map>
|
#include <map>
|
||||||
|
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
#include <cuda_helper.h>
|
||||||
#include "miner.h"
|
#include "miner.h"
|
||||||
|
|
||||||
#include "salsa_kernel.h"
|
#include "salsa_kernel.h"
|
||||||
@ -28,6 +29,12 @@ typedef enum
|
|||||||
#define __ldg(x) (*(x))
|
#define __ldg(x) (*(x))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
|
||||||
|
#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
|
||||||
|
#else
|
||||||
|
#define __shfl2 __shfl
|
||||||
|
#endif
|
||||||
|
|
||||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||||
|
|
||||||
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
|
// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes)
|
||||||
@ -59,8 +66,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right)
|
|||||||
return left;
|
return left;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
|
|
||||||
return make_uint4(__shfl((int)bx.x, target_thread), __shfl((int)bx.y, target_thread), __shfl((int)bx.z, target_thread), __shfl((int)bx.w, target_thread));
|
static __device__ uint4 shfl4(const uint4 bx, int target_thread) {
|
||||||
|
return make_uint4(
|
||||||
|
__shfl2((int)bx.x, target_thread), __shfl2((int)bx.y, target_thread),
|
||||||
|
__shfl2((int)bx.z, target_thread), __shfl2((int)bx.w, target_thread)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* write_keys writes the 8 keys being processed by a warp to the global
|
/* write_keys writes the 8 keys being processed by a warp to the global
|
||||||
@ -93,8 +104,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
|
|||||||
uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
|
uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
|
||||||
if (SCHEME == ANDERSEN) {
|
if (SCHEME == ANDERSEN) {
|
||||||
int target_thread = (threadIdx.x + 4)&31;
|
int target_thread = (threadIdx.x + 4)&31;
|
||||||
uint4 t=b, t2=__shfl(bx, target_thread);
|
uint4 t = b, t2 = shfl4(bx, target_thread);
|
||||||
int t2_start = __shfl((int)start, target_thread) + 4;
|
int t2_start = __shfl2((int)start, target_thread) + 4;
|
||||||
bool c = (threadIdx.x & 0x4);
|
bool c = (threadIdx.x & 0x4);
|
||||||
*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
|
*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
|
||||||
*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
|
*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
|
||||||
@ -109,12 +120,12 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
|
|||||||
{
|
{
|
||||||
uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
|
uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
|
||||||
if (SCHEME == ANDERSEN) {
|
if (SCHEME == ANDERSEN) {
|
||||||
int t2_start = __shfl((int)start, (threadIdx.x + 4)&31) + 4;
|
int t2_start = __shfl2((int)start, (threadIdx.x + 4)&31) + 4;
|
||||||
bool c = (threadIdx.x & 0x4);
|
bool c = (threadIdx.x & 0x4);
|
||||||
b = __ldg((uint4 *)(&scratch[c ? t2_start : start]));
|
b = __ldg((uint4 *)(&scratch[c ? t2_start : start]));
|
||||||
bx = __ldg((uint4 *)(&scratch[c ? start : t2_start]));
|
bx = __ldg((uint4 *)(&scratch[c ? start : t2_start]));
|
||||||
uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
|
uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
|
||||||
bx = __shfl(bx, (threadIdx.x + 28)&31);
|
bx = shfl4(bx, (threadIdx.x + 28)&31);
|
||||||
} else {
|
} else {
|
||||||
b = *((uint4 *)(&scratch[start]));
|
b = *((uint4 *)(&scratch[start]));
|
||||||
bx = *((uint4 *)(&scratch[start+16]));
|
bx = *((uint4 *)(&scratch[start+16]));
|
||||||
@ -128,14 +139,14 @@ void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) {
|
|||||||
int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+2)&3);
|
int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+2)&3);
|
||||||
int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+3)&3);
|
int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+3)&3);
|
||||||
|
|
||||||
b[3] = __shfl((int)b[3], x1);
|
b[3] = __shfl2((int)b[3], x1);
|
||||||
b[2] = __shfl((int)b[2], x2);
|
b[2] = __shfl2((int)b[2], x2);
|
||||||
b[1] = __shfl((int)b[1], x3);
|
b[1] = __shfl2((int)b[1], x3);
|
||||||
uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp;
|
uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp;
|
||||||
|
|
||||||
bx[3] = __shfl((int)bx[3], x1);
|
bx[3] = __shfl2((int)bx[3], x1);
|
||||||
bx[2] = __shfl((int)bx[2], x2);
|
bx[2] = __shfl2((int)bx[2], x2);
|
||||||
bx[1] = __shfl((int)bx[1], x3);
|
bx[1] = __shfl2((int)bx[1], x3);
|
||||||
tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp;
|
tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -146,14 +157,14 @@ void primary_order_shuffle(uint4 &b, uint4 &bx) {
|
|||||||
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+2)&3);
|
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+2)&3);
|
||||||
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+3)&3);
|
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+3)&3);
|
||||||
|
|
||||||
b.w = __shfl((int)b.w, x1);
|
b.w = __shfl2((int)b.w, x1);
|
||||||
b.z = __shfl((int)b.z, x2);
|
b.z = __shfl2((int)b.z, x2);
|
||||||
b.y = __shfl((int)b.y, x3);
|
b.y = __shfl2((int)b.y, x3);
|
||||||
uint32_t tmp = b.y; b.y = b.w; b.w = tmp;
|
uint32_t tmp = b.y; b.y = b.w; b.w = tmp;
|
||||||
|
|
||||||
bx.w = __shfl((int)bx.w, x1);
|
bx.w = __shfl2((int)bx.w, x1);
|
||||||
bx.z = __shfl((int)bx.z, x2);
|
bx.z = __shfl2((int)bx.z, x2);
|
||||||
bx.y = __shfl((int)bx.y, x3);
|
bx.y = __shfl2((int)bx.y, x3);
|
||||||
tmp = bx.y; bx.y = bx.w; bx.w = tmp;
|
tmp = bx.y; bx.y = bx.w; bx.w = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -327,9 +338,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 /* Unclear if this optimization is needed: These are ordered based
 * upon the dependencies needed in the later xors. Compiler should be
 * able to figure this out, but might as well give it a hand. */
-x.y = __shfl((int)x.y, x3);
-x.w = __shfl((int)x.w, x1);
-x.z = __shfl((int)x.z, x2);
+x.y = __shfl2((int)x.y, x3);
+x.w = __shfl2((int)x.w, x1);
+x.z = __shfl2((int)x.z, x2);
 
 /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
 * but the register targets are rewritten here to swap x[1] and x[3] so that

@@ -342,9 +353,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
 XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-x.w = __shfl((int)x.w, x3);
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
 }
 
 b += x;

@@ -362,18 +373,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
 XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
 
-x.y = __shfl((int)x.y, x3);
-x.w = __shfl((int)x.w, x1);
-x.z = __shfl((int)x.z, x2);
+x.y = __shfl2((int)x.y, x3);
+x.w = __shfl2((int)x.w, x1);
+x.z = __shfl2((int)x.z, x2);
 
 XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
 XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
 XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
 XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-x.w = __shfl((int)x.w, x3);
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
 }
 
 // At the end of these iterations, the data is in primary order again.

@@ -424,9 +435,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
 
 // Diagonal Mixing phase of chacha
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)

@@ -434,9 +445,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x3);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x1);
+x.y = __shfl2((int)x.y, x3);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x1);
 }
 
 b += x;

@@ -454,9 +465,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x1);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x3);
+x.y = __shfl2((int)x.y, x1);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x3);
 
 // Diagonal Mixing phase of chacha
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)

@@ -464,9 +475,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-x.y = __shfl((int)x.y, x3);
-x.z = __shfl((int)x.z, x2);
-x.w = __shfl((int)x.w, x1);
+x.y = __shfl2((int)x.y, x3);
+x.z = __shfl2((int)x.z, x2);
+x.w = __shfl2((int)x.w, x1);
 }
 
 #undef CHACHA_PRIMITIVE

@@ -589,7 +600,7 @@ void titan_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
 } else load_key<ALGO>(d_odata, b, bx);
 
 for (int i = begin; i < end; i++) {
-int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 uint4 t, tx; read_keys_direct<SCHEME>(t, tx, start+32*j);
 b ^= t; bx ^= tx;
 block_mixer<ALGO>(b, bx, x1, x2, x3);

@@ -623,7 +634,7 @@ void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigne
 {
 // better divergent thread handling submitted by nVidia engineers, but
 // supposedly this does not run with the ANDERSEN memory access scheme
-int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 int pos = j/LOOKUP_GAP;
 int loop = -1;
 uint4 t, tx;

@@ -632,7 +643,7 @@ void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigne
 while(i < end)
 {
 if (loop == -1) {
-j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 pos = j/LOOKUP_GAP;
 loop = j-pos*LOOKUP_GAP;
 read_keys_direct<SCHEME>(t, tx, start+32*pos);

@@ -655,7 +666,7 @@ void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigne
 // this is my original implementation, now used with the ANDERSEN
 // memory access scheme only.
 for (int i = begin; i < end; i++) {
-int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
 uint4 t, tx; read_keys_direct<SCHEME>(t, tx, start+32*pos);
 while (loop--)