// ccminer/cuda_helper.h

#ifndef CUDA_HELPER_H
#define CUDA_HELPER_H

#include <cuda.h>
#include <cuda_runtime.h>

#ifdef __INTELLISENSE__
/* reduce vstudio warnings (__byteperm, blockIdx...) */
#include <device_functions.h>
#include <device_launch_parameters.h>
#define __launch_bounds__(max_tpb, min_blocks)
#endif

#include <stdint.h>

// Per-device tables with 8 entries each; the definitions live in another
// translation unit.
// NOTE(review): presumably device_map maps a miner thread to a CUDA device
// ordinal and device_sm holds that device's compute capability — confirm
// against the definitions.
extern int device_map[8];
extern int device_sm[8];

// common functions
// Prototypes only; the implementations are not part of this header.
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Called by CUDA_CALL_OR_RET / CUDA_CALL_OR_RET_X below when a runtime call fails.
extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);

// Explicit declaration of the __syncthreads() builtin.
// NOTE(review): presumably present to quiet IDE/parser warnings — confirm.
extern __device__ __device_builtin__ void __syncthreads(void);

#ifndef __CUDA_ARCH__
// define blockDim and threadIdx for host
// (only declared when __CUDA_ARCH__ is not defined, i.e. outside device compilation)
extern const dim3 blockDim;
extern const uint3 threadIdx;
#endif

// SPH_C32(x): 32-bit constant — pastes a U suffix onto the literal x and casts
// it to uint32_t. Only defined here if no earlier header defined it.
#ifndef SPH_C32
#define SPH_C32(x) ((uint32_t)(x ## U))
#endif

// SPH_C64(x): 64-bit constant — pastes a ULL suffix onto the literal x and
// casts it to uint64_t. Only defined here if no earlier header defined it.
#ifndef SPH_C64
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#endif

// SPH_T32(x): keep only the low 32 bits of x (AND with 0xFFFFFFFF).
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))

// ROTL32(x, n): rotate x left by n bits.
// The shift/or form is selected when __CUDA_ARCH__ < 320; an undefined
// __CUDA_ARCH__ evaluates to 0 inside #if, so host passes take this branch too.
#if __CUDA_ARCH__ < 320
// Kepler (Compute 3.0)
// NOTE(review): assumes x is a 32-bit unsigned value and 0 < n < 32; with
// n == 0 the right shift would be by 32 bits — confirm callers never do that.
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5, 5.0)
// Uses the funnel-shift intrinsic with x supplied as both halves.
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif

// Assemble a 64-bit value from two 32-bit words: LO becomes bits 0..31 and
// HI becomes bits 32..63.
__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
	// Pack the two words through the double-precision register intrinsics.
	const double packed = __hiloint2double(HI, LO);
	return __double_as_longlong(packed);
#else
	// Portable shift/or form.
	const uint64_t upper = ((uint64_t)HI) << 32;
	return upper | (uint64_t)LO;
#endif
}

// Replace the high 32-bit word of a 64-bit value.
// Returns x with bits 32..63 set to y; bits 0..31 are kept.
__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
	const uint64_t kept_low = x & 0xFFFFFFFFULL;
	const uint64_t new_high = ((uint64_t)y) << 32U;
	return kept_low | new_high;
}

// Replace the low 32-bit word of a 64-bit value.
// Returns x with bits 0..31 set to y; bits 32..63 are kept.
__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
	const uint64_t kept_high = x & 0xFFFFFFFF00000000ULL;
	const uint64_t new_low = (uint64_t)y;
	return kept_high | new_low;
}

// Endianness swap (byte reversal) for 32-bit values.
// In device compilation this is a function built on __byte_perm; otherwise it
// is a macro built from shifts and masks.
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
	/* device */
	// Selector 0x0123 picks the source bytes in reverse order.
	return __byte_perm(x, x, 0x0123);
}
#else
	/* host */
	/* The argument is cast to uint32_t before shifting so that signed or wider
	 * operands are handled like the device function's uint32_t parameter, and
	 * the result is always a uint32_t. Note: x is evaluated four times. */
	#define cuda_swab32(x) \
	((uint32_t)((((uint32_t)(x) << 24) & 0xff000000u) | (((uint32_t)(x) << 8) & 0x00ff0000u) | \
		(((uint32_t)(x) >> 8) & 0x0000ff00u) | (((uint32_t)(x) >> 24) & 0x000000ffu)))
#endif

// Extract the low 32-bit word (bits 0..31) of a 64-bit value.
__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
	// Reinterpret as a double and read its low word via the intrinsic.
	const double reinterpreted = __longlong_as_double(x);
	return (uint32_t)__double2loint(reinterpreted);
#else
	const uint64_t masked = x & 0xFFFFFFFFULL;
	return (uint32_t)masked;
#endif
}

// Extract the high 32-bit word (bits 32..63) of a 64-bit value.
__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
	// Reinterpret as a double and read its high word via the intrinsic.
	const double reinterpreted = __longlong_as_double(x);
	return (uint32_t)__double2hiint(reinterpreted);
#else
	const uint64_t shifted = x >> 32;
	return (uint32_t)shifted;
#endif
}

#ifdef __CUDA_ARCH__
// Device version: reverse the byte order of a 64-bit value.
__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
{
	// Input:       77665544 33221100
	// Output:      00112233 44556677
	// Each 32-bit half is byte-reversed; the reversed low half then becomes
	// the high half of the result and vice versa.
	const uint64_t low_reversed = __byte_perm((uint32_t) x, 0, 0x0123);
	const uint32_t high_reversed = __byte_perm(_HIWORD(x), 0, 0x0123);
	return (low_reversed << 32) | high_reversed;
}
#else
	/* host */
	/* Macro version: each byte is masked out and moved to its mirrored
	 * position. Note: x is evaluated eight times. */
	#define cuda_swab64(x) \
		((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
			(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
			(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
			(((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
			(((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
			(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
			(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
			(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
#endif

/*********************************************************************/
// Macros to catch CUDA errors in CUDA runtime calls

// CUDA_SAFE_CALL(call): evaluate a CUDA runtime call exactly once. If the
// result is not cudaSuccess, print the enclosing function name, the line
// number and the CUDA error string to stderr, then terminate the process
// with EXIT_FAILURE.
// The macro expands to fprintf/exit, so <stdio.h> and <stdlib.h> must be
// visible where it is used. The temporary has a macro-specific name so that
// a caller variable named `err` used inside `call` is not shadowed.
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
	cudaError_t cuda_safe_call_err_ = (call);                         \
	if (cudaSuccess != cuda_safe_call_err_) {                         \
		fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
		         __FUNCTION__, __LINE__,                              \
		         cudaGetErrorString(cuda_safe_call_err_) );           \
		exit(EXIT_FAILURE);                                           \
	}                                                                 \
} while (0)

// CUDA_CALL_OR_RET(call): evaluate a CUDA runtime call exactly once. If the
// result is not cudaSuccess, report it through cudaReportHardwareFailure()
// and return from the enclosing function.
// Requirements at the expansion site: a variable named thr_id must be in
// scope, and the enclosing function must return void.
// The temporary has a macro-specific name so that a caller variable named
// `err` used inside `call` is not shadowed.
#define CUDA_CALL_OR_RET(call) do {                                   \
	cudaError_t cuda_call_or_ret_err_ = (call);                       \
	if (cudaSuccess != cuda_call_or_ret_err_) {                       \
		cudaReportHardwareFailure(thr_id, cuda_call_or_ret_err_,      \
		                          __FUNCTION__);                      \
		return;                                                       \
	}                                                                 \
} while (0)

// CUDA_CALL_OR_RET_X(call, ret): like CUDA_CALL_OR_RET, but for functions
// that return a value. Evaluates the CUDA runtime call exactly once; on any
// result other than cudaSuccess it reports the error through
// cudaReportHardwareFailure() and returns `ret` from the enclosing function.
// A variable named thr_id must be in scope at the expansion site.
// The temporary has a macro-specific name so that a caller variable named
// `err` used inside `call` or `ret` is not shadowed.
#define CUDA_CALL_OR_RET_X(call, ret) do {                            \
	cudaError_t cuda_call_or_ret_x_err_ = (call);                     \
	if (cudaSuccess != cuda_call_or_ret_x_err_) {                     \
		cudaReportHardwareFailure(thr_id, cuda_call_or_ret_x_err_,    \
		                          __FUNCTION__);                      \
		return ret;                                                   \
	}                                                                 \
} while (0)

/*********************************************************************/
// USE_XOR_ASM_OPTS selects the implementation of xor1/xor3/xor8 below:
// 1 = inline PTX device functions, 0 = plain C macros.
// The PTX variants are switched off when _WIN64 is defined.
#ifdef _WIN64
#define USE_XOR_ASM_OPTS 0
#else
#define USE_XOR_ASM_OPTS 1
#endif

#if USE_XOR_ASM_OPTS
// device asm for whirlpool
// Returns a ^ b using one 64-bit PTX xor instruction.
__device__ __forceinline__
uint64_t xor1(uint64_t a, uint64_t b)
{
	uint64_t result;
	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b));
	return result;
}
#else
// Plain C fallback. Each argument is parenthesised so that an expression
// argument such as `y | z` is xor-ed as a whole, matching the function form.
#define xor1(a,b) ((a) ^ (b))
#endif

#if USE_XOR_ASM_OPTS
// device asm for whirlpool
// Returns a ^ b ^ c: the first instruction computes b ^ c, the second xors
// that with a.
__device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t result;
	asm("xor.b64 %0, %2, %3;\n\t"
	    "xor.b64 %0, %0, %1;\n\t"
		/* output : input registers */
		: "=l"(result) : "l"(a), "l"(b), "l"(c));
	return result;
}
#else
// Plain C fallback. Each argument is parenthesised so that an expression
// argument such as `y | z` is xor-ed as a whole, matching the function form.
#define xor3(a,b,c) ((a) ^ (b) ^ (c))
#endif

#if USE_XOR_ASM_OPTS
// device asm for whirlpool
// Returns the xor of all eight arguments. The accumulator starts as g ^ h and
// the remaining operands are folded in one instruction at a time (f down to a).
__device__ __forceinline__
uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h)
{
	uint64_t result;
	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h));
	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
	return result;
}
#else
// Plain C fallback. Each argument is parenthesised so that an expression
// argument such as `y | z` is xor-ed as a whole, matching the function form.
#define xor8(a,b,c,d,e,f,g,h) (((a)^(b))^((c)^(d))^((e)^(f))^((g)^(h)))
#endif

// device asm for x17
// Returns ((b ^ c) & a) ^ c, i.e. for every bit position the result takes the
// bit of b where a is 1 and the bit of c where a is 0.
__device__ __forceinline__
uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t result;
	// Operands: %0 = result, %1 = a, %2 = b, %3 = c; n is a scratch register
	// local to the braces.
	asm("{\n\t"
		".reg .u64 n;\n\t"
		"xor.b64 %0, %2, %3;\n\t"
		"and.b64 n, %0, %1;\n\t"
		"xor.b64 %0, n, %3;"
	"}\n"
	: "=l"(result) : "l"(a), "l"(b), "l"(c));
	return result;
}

// device asm for x17
// Masks x with 0xFFFFFFFFFFFFFFFF. Since the mask has all 64 bits set, the
// returned value equals x.
__device__ __forceinline__
uint64_t sph_t64(uint64_t x)
{
	uint64_t result;
	asm("{\n\t"
		"and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t"
	"}\n"
	: "=l"(result) : "l"(x));
	return result;
}

// device asm for x17
// Returns ((a | b) & c) | (a & b): each result bit is set when at least two
// of the corresponding bits of a, b and c are set.
__device__ __forceinline__
uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t result;
	// Operands: %0 = result, %1 = a, %2 = b, %3 = c; m and n are scratch
	// registers local to the braces (m = a & b, n = a | b).
	asm("{\n\t"
		".reg .u64 m,n;\n\t"
		"and.b64 m,  %1, %2;\n\t"
		" or.b64 n,  %1, %2;\n\t"
		"and.b64 %0, n,  %3;\n\t"
		" or.b64 %0, %0, m ;\n\t"
	"}\n"
	: "=l"(result) : "l"(a), "l"(b), "l"(c));
	return result;
}

// device asm for x17
// Shifts x right by n bits with the PTX shr.b64 instruction, then ANDs the
// result with an all-ones 64-bit mask (which leaves the value unchanged).
// NOTE(review): behaviour for n >= 64 follows the PTX shift rules rather than
// C++ shift rules — confirm against the PTX ISA if callers can pass that.
__device__ __forceinline__
uint64_t shr_t64(uint64_t x, uint32_t n)
{
	uint64_t result;
	asm("shr.b64 %0,%1,%2;\n\t"
		"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
	: "=l"(result) : "l"(x), "r"(n));
	return result;
}

// device asm for ?
// Shifts x left by n bits with the PTX shl.b64 instruction, then ANDs the
// result with an all-ones 64-bit mask (which leaves the value unchanged).
// NOTE(review): behaviour for n >= 64 follows the PTX shift rules rather than
// C++ shift rules — confirm against the PTX ISA if callers can pass that.
__device__ __forceinline__
uint64_t shl_t64(uint64_t x, uint32_t n)
{
	uint64_t result;
	asm("shl.b64 %0,%1,%2;\n\t"
		"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
	: "=l"(result) : "l"(x), "r"(n));
	return result;
}

// USE_ROT_ASM_OPT selects which ROTR64/ROTL64 implementation below is used
// (values 1, 2 and 3 are tested; anything else falls back to the macros).
// Defaults to 1 unless defined before this point.
#ifndef USE_ROT_ASM_OPT
#define USE_ROT_ASM_OPT 1
#endif

// 64-bit ROTATE RIGHT
// One of three implementations is chosen at preprocessing time from
// __CUDA_ARCH__ and USE_ROT_ASM_OPT.
#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
/* complicated sm >= 3.2 one (accelerated by the funnel shifter), to bench */
// The value is split into its low and high 32-bit words and each result word
// is produced by one shf.r.wrap.b32 funnel shift. For offset >= 32 the two
// source words trade roles. result.x is the low word of the return value and
// result.y the high word (see the final __hiloint2double call).
__device__ __forceinline__
uint64_t ROTR64(const uint64_t value, const int offset) {
	uint2 result;
	if(offset < 32) {
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
	} else {
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
	}
	return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
// 64-bit shift variant: (x >> offset) combined with (x << (64 - offset)).
// The two parts are combined with add.u64 rather than or.
__device__ __forceinline__
uint64_t ROTR64(const uint64_t x, const int offset)
{
	uint64_t result;
	// Operands: %0 = result, %1 = x, %2 = offset; lhs and roff are scratch
	// registers local to the braces.
	asm("{\n\t"
		".reg .b64 lhs;\n\t"
		".reg .u32 roff;\n\t"
		"shr.b64 lhs, %1, %2;\n\t"
		"sub.u32 roff, 64, %2;\n\t"
		"shl.b64 %0, %1, roff;\n\t"
		"add.u64 %0, %0, lhs;\n\t"
	"}\n"
	: "=l"(result) : "l"(x), "r"(offset));
	return result;
}
#else
/* host */
// Also used for any device build that matches neither condition above.
// NOTE(review): assumes x is a 64-bit unsigned value and 0 < n < 64; n == 0
// would shift by 64 bits — confirm callers never do that.
#define ROTR64(x, n)  (((x) >> (n)) | ((x) << (64 - (n))))
#endif

// 64-bit ROTATE LEFT
// One of four implementations is chosen at preprocessing time from
// __CUDA_ARCH__ and USE_ROT_ASM_OPT.
#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
// Funnel-shift variant: the value is split into its low and high 32-bit words
// and each result word is produced by one shf.l.wrap.b32. For offset < 32 the
// two source words trade roles relative to the offset >= 32 case. result.x is
// the low word of the return value and result.y the high word.
__device__ __forceinline__
uint64_t ROTL64(const uint64_t value, const int offset) {
	uint2 result;
	if(offset >= 32) {
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
	} else {
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
	}
	return  __double_as_longlong(__hiloint2double(result.y, result.x));
}
#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
// 64-bit shift variant: (x << offset) combined with (x >> (64 - offset)).
// The two parts are combined with add.u64 rather than or.
__device__ __forceinline__
uint64_t ROTL64(const uint64_t x, const int offset)
{
	uint64_t result;
	// Operands: %0 = result, %1 = x, %2 = offset; lhs and roff are scratch
	// registers local to the braces.
	asm("{\n\t"
		".reg .b64 lhs;\n\t"
		".reg .u32 roff;\n\t"
		"shl.b64 lhs, %1, %2;\n\t"
		"sub.u32 roff, 64, %2;\n\t"
		"shr.b64 %0, %1, roff;\n\t"
		"add.u64 %0, lhs, %0;\n\t"
	"}\n"
	: "=l"(result) : "l"(x), "r"(offset));
	return result;
}
#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
// Predicated funnel-shift variant: both candidate words are computed
// unconditionally and a predicate on (offset < 32) picks their order.
// Marked __forceinline__ like the other variants: this definition lives in a
// header, so a non-inline __device__ function could be emitted once per
// including translation unit.
__device__ __forceinline__
uint64_t ROTL64(const uint64_t x, const int offset)
{
	uint64_t res;
	// Operands: %0 = res, %1 = x, %2 = offset; tl/th are the low/high words
	// of x, vl/vh the two funnel-shift results, p the offset < 32 predicate.
	asm("{\n\t"
		".reg .u32 tl,th,vl,vh;\n\t"
		".reg .pred p;\n\t"
		"mov.b64 {tl,th}, %1;\n\t"
		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
		"setp.lt.u32 p, %2, 32;\n\t"
		"@!p mov.b64 %0, {vl,vh};\n\t"
		"@p  mov.b64 %0, {vh,vl};\n\t"
		"}"
		: "=l"(res) : "l"(x) , "r"(offset)
	);
	return res;
}
#else
/* host */
// Also used for any device build that matches none of the conditions above.
// NOTE(review): assumes x is a 64-bit unsigned value and 0 < n < 64; n == 0
// would shift by 64 bits — confirm callers never do that.
#define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
#endif

// Exchange the low and high 32-bit words of a 64-bit value.
__device__ __forceinline__
uint64_t SWAPDWORDS(uint64_t value)
{
#if __CUDA_ARCH__ >= 320
	uint2 temp;
	// Unpack value into two 32-bit registers, then repack them in the
	// opposite order.
	asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
	asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
	return value;
#else
	// A rotation by 32 bits swaps the two words.
	return ROTL64(value, 32);
#endif
}

#endif // #ifndef CUDA_HELPER_H
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#ifndef CUDA_HELPER_H
 								#define CUDA_HELPER_H
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								#include <cuda.h>
-												cuda: check for errors on cuda mem alloc

											
										
										
											10 years ago
+								#include <cuda_runtime.h>
-												vstudio: fix launch_bounds intellisense warnings in ide

											
										
										
											10 years ago
+								#ifdef __INTELLISENSE__
 								/* reduce vstudio warnings (__byteperm, blockIdx...) */
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								#include <device_functions.h>
 								#include <device_launch_parameters.h>
-												vstudio: fix launch_bounds intellisense warnings in ide

											
										
										
											10 years ago
+								#define __launch_bounds__(max_tpb, min_blocks)
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								#endif
 								#include <stdint.h>
-												cuda: store device SM in a global var

sample usage made for blake and fugue (higher intensity for SM5.2)

add these to cuda_helper and clean unused code

											
										
										
											10 years ago
+								extern int device_map[8];
 								extern int device_sm[8];
-												Import djm34 qubit, deep and doom algos

Indent, and put commonly used functions proto. in cuda_helper.h

And add them to --cputest function

Also change the color option to --nocolor, -C is no more needed

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
(Which is tired to remove these german copy/pasted comments)

											
										
										
											10 years ago
+								// common functions
 								extern void cuda_check_cpu_init(int thr_id, int threads);
 								extern void cuda_check_cpu_setTarget(const void *ptarget);
 								extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
-												checkhash: some work on a faster variant (wip)

This should not be used for all algos... not enabled yet

todo: multiple nounces or blake32 style checkup

											
										
										
											10 years ago
+								extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
-												Remove debug rpc, already exists with -P

											
										
										
											10 years ago
+								extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-												Prepare trap of hardware/mem failures

											
										
										
											10 years ago
+								extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
-												Remove debug rpc, already exists with -P

											
										
										
											10 years ago
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								extern __device__ __device_builtin__ void __syncthreads(void);
 								#ifndef __CUDA_ARCH__
 								// define blockDim and threadIdx for host
 								extern const dim3 blockDim;
 								extern const uint3 threadIdx;
 								#endif
 								#ifndef SPH_C32
 								#define SPH_C32(x) ((uint32_t)(x ## U))
 								#endif
 								#ifndef SPH_C64
 								#define SPH_C64(x) ((uint64_t)(x ## ULL))
 								#endif
 								#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-												blake512: use a new SWAPDWORDS asm func (0.05ms)

small improvement, do it on pentablake and heavy variants too

based on sp commit (but SWAP32 is already used for 32bit ints)

											
										
										
											10 years ago
+								#if __CUDA_ARCH__ < 320
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								// Kepler (Compute 3.0)
 								#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#else
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								// Kepler (Compute 3.5, 5.0)
 								#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#endif
-												small reg tunes, rename whirlcoin to whirl

											
										
										
											10 years ago
+								__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								{
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#if __CUDA_ARCH__ >= 130
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+									return __double_as_longlong(__hiloint2double(HI, LO));
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#else
-												small reg tunes, rename whirlcoin to whirl

											
										
										
											10 years ago
+									return (uint64_t)LO | (((uint64_t)HI) << 32);
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#endif
 								}
 								// das Hi Word in einem 64 Bit Typen ersetzen
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
-												faster x15, +23KH or 4ms on whirpool (30ms vs 34ms)

tpruvot: i didnt pick the asm replace_hiword, slower on linux

											
										
										
											10 years ago
+									return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U);
-												forgot this file in previous commit

											
										
										
											11 years ago
+								}
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								// das Lo Word in einem 64 Bit Typen ersetzen
 								__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
 									return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
 								}
 								// Endian Drehung f<EFBFBD>r 32 Bit Typen
 								#ifdef __CUDA_ARCH__
 								__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
 								{
 									/* device */
 									return __byte_perm(x, x, 0x0123);
 								}
 								#else
 									/* host */
 									#define cuda_swab32(x) \
 									((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
 										(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
 								#endif
-												forgot this file in previous commit

											
										
										
											11 years ago
+								// das Lo Word aus einem 64 Bit Typen extrahieren
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#if __CUDA_ARCH__ >= 130
 									return (uint32_t)__double2loint(__longlong_as_double(x));
 								#else
 									return (uint32_t)(x & 0xFFFFFFFFULL);
 								#endif
 								}
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								// das Hi Word aus einem 64 Bit Typen extrahieren
 								__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
 								#if __CUDA_ARCH__ >= 130
 									return (uint32_t)__double2hiint(__longlong_as_double(x));
 								#else
 									return (uint32_t)(x >> 32);
 								#endif
-												forgot this file in previous commit

											
										
										
											11 years ago
+								}
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								#ifdef __CUDA_ARCH__
 								__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
-												forgot this file in previous commit

											
										
										
											11 years ago
+								{
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+									// Input:       77665544 33221100
 									// Output:      00112233 44556677
-												small reg tunes, rename whirlcoin to whirl

											
										
										
											10 years ago
+									uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
 									return (result << 32) | __byte_perm(_HIWORD(x), 0, 0x0123);
-												forgot this file in previous commit

											
										
										
											11 years ago
+								}
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								#else
 									/* host */
 									#define cuda_swab64(x) \
 										((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
 											(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
 											(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
 											(((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
 											(((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
 											(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
 											(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
 											(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
 								#endif
-												forgot this file in previous commit

											
										
										
											11 years ago
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								/*********************************************************************/
-												Prepare trap of hardware/mem failures

											
										
										
											10 years ago
+								// Macros to catch CUDA errors in CUDA runtime calls
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								#define CUDA_SAFE_CALL(call)                                          \
 								do {                                                                  \
 									cudaError_t err = call;                                           \
 									if (cudaSuccess != err) {                                         \
-												Prepare trap of hardware/mem failures

											
										
										
											10 years ago
+										fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
 										         __FUNCTION__, __LINE__, cudaGetErrorString(err) );   \
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+										exit(EXIT_FAILURE);                                           \
 									}                                                                 \
 								} while (0)
-												Prepare trap of hardware/mem failures

											
										
										
											10 years ago
+								#define CUDA_CALL_OR_RET(call) do {                                   \
 									cudaError_t err = call;                                           \
 									if (cudaSuccess != err) {                                         \
 										cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
 										return;                                                       \
 									}                                                                 \
 								} while (0)
 								#define CUDA_CALL_OR_RET_X(call, ret) do {                            \
 									cudaError_t err = call;                                           \
 									if (cudaSuccess != err) {                                         \
 										cudaReportHardwareFailure(thr_id, err, __FUNCTION__);         \
 										return ret;                                                   \
 									}                                                                 \
 								} while (0)
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								/*********************************************************************/
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#ifdef _WIN64
 								#define USE_XOR_ASM_OPTS 0
 								#else
 								#define USE_XOR_ASM_OPTS 1
 								#endif
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#if USE_XOR_ASM_OPTS
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								// device asm for whirpool
 								__device__ __forceinline__
 								uint64_t xor1(uint64_t a, uint64_t b)
 								{
 									uint64_t result;
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+									asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b));
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+									return result;
 								}
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#else
 								#define xor1(a,b) (a ^ b)
 								#endif
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#if USE_XOR_ASM_OPTS
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								// device asm for whirpool
 								__device__ __forceinline__
 								uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
 								{
 									uint64_t result;
-												add x17 algo, cleaned djm34 commit

todo: visual studio...

											
										
										
											10 years ago
+									asm("xor.b64 %0, %2, %3;\n\t"
 									    "xor.b64 %0, %0, %1;\n\t"
 										/* output : input registers */
 										: "=l"(result) : "l"(a), "l"(b), "l"(c));
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+									return result;
 								}
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#else
 								#define xor3(a,b,c) (a ^ b ^ c)
 								#endif
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#if USE_XOR_ASM_OPTS
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								// device asm for whirpool
 								__device__ __forceinline__
 								uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h)
 								{
 									uint64_t result;
 									asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h));
 									asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
 									asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
 									asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
 									asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
 									asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
 									asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
 									return result;
 								}
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#else
-												Prepare trap of hardware/mem failures

											
										
										
											10 years ago
+								#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h))
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								#endif
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
-												add x17 algo, cleaned djm34 commit

todo: visual studio...

											
										
										
											10 years ago
+								// device asm for x17
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
 								{
 									uint64_t result;
 									asm("{\n\t"
-												add x17 algo, cleaned djm34 commit

todo: visual studio...

											
										
										
											10 years ago
+										".reg .u64 n;\n\t"
 										"xor.b64 %0, %2, %3;\n\t"
 										"and.b64 n, %0, %1;\n\t"
 										"xor.b64 %0, n, %3;"
 									"}\n"
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+									: "=l"(result) : "l"(a), "l"(b), "l"(c));
 									return result;
 								}
-												add x17 algo, cleaned djm34 commit

todo: visual studio...

											
										
										
											10 years ago
+								// device asm for x17
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t sph_t64(uint64_t x)
 								{
 									// Truncates x to 64 bits. On a uint64_t the all-ones mask is an
 									// identity operation, so it is written as a plain C expression
 									// that the compiler can fold away instead of an opaque PTX block.
 									return x & 0xFFFFFFFFFFFFFFFFULL;
 								}
-												add x17 algo, cleaned djm34 commit

todo: visual studio...

											
										
										
											10 years ago
+								// device asm for x17
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
 								{
 									// Returns (a & b) | ((a | b) & c): a result bit is set when at
 									// least two of the three inputs have that bit set.
 									// Same operations as the former inline PTX, expressed in C so the
 									// compiler is free to fold and fuse them.
 									const uint64_t both   = a & b;
 									const uint64_t either = a | b;
 									return both | (either & c);
 								}
-												add x17 algo, cleaned djm34 commit

todo: visual studio...

											
										
										
											10 years ago
+								// device asm for x17
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t shr_t64(uint64_t x, uint32_t n)
 								{
 									// Shifts x right by n bits with the PTX shr.b64 instruction.
 									// The former follow-up "and" with an all-ones mask was an
 									// identity operation and has been dropped.
 									// NOTE(review): the PTX ISA documents shift amounts above the
 									// register width as clamped (n >= 64 gives 0), which is why the
 									// instruction is kept rather than the C >> operator.
 									uint64_t result;
 									asm("shr.b64 %0,%1,%2;\n\t"
 									: "=l"(result) : "l"(x), "r"(n));
 									return result;
 								}
-												whirlpool: x64 asm is very slow (30ms win32 vs 90)

											
										
										
											10 years ago
+								// device asm for ?
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t shl_t64(uint64_t x, uint32_t n)
 								{
 									// Shifts x left by n bits with the PTX shl.b64 instruction.
 									// The former follow-up "and" with an all-ones mask was an
 									// identity operation and has been dropped.
 									// NOTE(review): the PTX ISA documents shift amounts above the
 									// register width as clamped (n >= 64 gives 0), which is why the
 									// instruction is kept rather than the C << operator.
 									uint64_t result;
 									asm("shl.b64 %0,%1,%2;\n\t"
 									: "=l"(result) : "l"(x), "r"(n));
 									return result;
 								}
-												blake: remove int cudaMemcpyToSymbol for MSVC

use clz (leading zeros) asm func for a fast gpu compare of ptarget[6]:[7]

add also missing windows ctz/clz host functions

New NEOS speed: 227MH to 270MH (Gigabyte 750Ti Black Edition)

											
										
										
											10 years ago
+								#ifndef USE_ROT_ASM_OPT
 								#define USE_ROT_ASM_OPT 1
 								#endif
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
 								// 64-bit ROTATE RIGHT
-												blake512: use a new SWAPDWORDS asm func (0.05ms)

small improvement, do it on pentablake and heavy variants too

based on sp commit (but SWAP32 is already used for 32bit ints)

											
										
										
											10 years ago
+								#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								/* sm >= 3.2 variant accelerated by the hardware funnel shifter (shf.r.wrap); still to be benchmarked */
 								__device__ __forceinline__
 								uint64_t ROTR64(const uint64_t value, const int offset) {
 									// 64-bit rotate right built from two 32-bit funnel shifts.
 									// Split the operand into its 32-bit halves once.
 									const int lo = __double2loint(__longlong_as_double(value));
 									const int hi = __double2hiint(__longlong_as_double(value));
 									// Funnel-shift both operand orders.
 									int lo_hi, hi_lo;
 									asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(lo_hi) : "r"(lo), "r"(hi), "r"(offset));
 									asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(hi_lo) : "r"(hi), "r"(lo), "r"(offset));
 									// The two shifted words trade places once the rotation reaches
 									// 32 bits or more.
 									if (offset < 32)
 										return __double_as_longlong(__hiloint2double(hi_lo, lo_hi));
 									return __double_as_longlong(__hiloint2double(lo_hi, hi_lo));
 								}
-												blake: remove int cudaMemcpyToSymbol for MSVC

use clz (leading zeros) asm func for a fast gpu compare of ptarget[6]:[7]

add also missing windows ctz/clz host functions

New NEOS speed: 227MH to 270MH (Gigabyte 750Ti Black Edition)

											
										
										
											10 years ago
+								#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t ROTR64(const uint64_t x, const int offset)
 								{
 									// 64-bit rotate right assembled from a right shift by offset and
 									// a left shift by (64 - offset); the two parts are merged with
 									// an add.
 									uint64_t rotated;
 									asm(
 										"{\n\t"
 										".reg .b64 lhs;\n\t"
 										".reg .u32 roff;\n\t"
 										"shr.b64 lhs, %1, %2;\n\t"   /* lhs  = x >> offset */
 										"sub.u32 roff, 64, %2;\n\t"  /* roff = 64 - offset */
 										"shl.b64 %0, %1, roff;\n\t"  /* out  = x << roff   */
 										"add.u64 %0, %0, lhs;\n\t"   /* out += lhs         */
 										"}\n"
 										: "=l"(rotated)
 										: "l"(x), "r"(offset)
 									);
 									return rotated;
 								}
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#else
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								/* host */
 								// Generic 64-bit rotate right. Both shift counts are reduced modulo 64
 								// so that n == 0 no longer expands to an undefined shift by 64 bits;
 								// results for n in 1..63 are unchanged. Arguments are evaluated more
 								// than once, so do not pass expressions with side effects.
 								#define ROTR64(x, n)  (((x) >> ((n) & 63)) | ((x) << ((64 - (n)) & 63)))
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#endif
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								// 64-bit ROTATE LEFT
-												blake512: use a new SWAPDWORDS asm func (0.05ms)

small improvement, do it on pentablake and heavy variants too

based on sp commit (but SWAP32 is already used for 32bit ints)

											
										
										
											10 years ago
+								#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t ROTL64(const uint64_t value, const int offset) {
 									// 64-bit rotate left built from two 32-bit funnel shifts.
 									// Split the operand into its 32-bit halves once.
 									const int lo = __double2loint(__longlong_as_double(value));
 									const int hi = __double2hiint(__longlong_as_double(value));
 									// Funnel-shift both operand orders.
 									int lo_hi, hi_lo;
 									asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(lo_hi) : "r"(lo), "r"(hi), "r"(offset));
 									asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(hi_lo) : "r"(hi), "r"(lo), "r"(offset));
 									// The two shifted words trade places once the rotation reaches
 									// 32 bits or more.
 									if (offset >= 32)
 										return __double_as_longlong(__hiloint2double(hi_lo, lo_hi));
 									return __double_as_longlong(__hiloint2double(lo_hi, hi_lo));
 								}
-												blake: return to ptarget 6:7 compare

clz can be erroneous, ex 0xE0 vs 0xF0

											
										
										
											10 years ago
+								#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								__device__ __forceinline__
 								uint64_t ROTL64(const uint64_t x, const int offset)
 								{
 									// 64-bit rotate left assembled from a left shift by offset and
 									// a right shift by (64 - offset); the two parts are merged with
 									// an add.
 									uint64_t rotated;
 									asm(
 										"{\n\t"
 										".reg .b64 lhs;\n\t"
 										".reg .u32 roff;\n\t"
 										"shl.b64 lhs, %1, %2;\n\t"   /* lhs  = x << offset */
 										"sub.u32 roff, 64, %2;\n\t"  /* roff = 64 - offset */
 										"shr.b64 %0, %1, roff;\n\t"  /* out  = x >> roff   */
 										"add.u64 %0, lhs, %0;\n\t"   /* out += lhs         */
 										"}\n"
 										: "=l"(rotated)
 										: "l"(x), "r"(offset)
 									);
 									return rotated;
 								}
-												prepare next version

											
										
										
											10 years ago
+								#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
-												Tune quark part of Xn funcs

based on klaus commits, will increase a bit speed of most algos

PS: main increase is due to the register count tuning in Makefile

and for skein512 on linux, its the ROTL64

but almost no changes on X11 : 2648MH/s vs 2630 before

											
										
										
											10 years ago
+								__device__
 								__forceinline__ uint64_t ROTL64(const uint64_t x, const int offset)
 								{
 									// 64-bit rotate left: unpack x into two 32-bit words, funnel-shift
 									// them in both orders, then use a predicate on (offset < 32) to
 									// decide which shifted word becomes the low half of the result.
 									// __forceinline__ was added to match the other ROTL64/ROTR64
 									// variants in this header, which are all force-inlined.
 									uint64_t res;
 									asm("{\n\t"
 										".reg .u32 tl,th,vl,vh;\n\t"
 										".reg .pred p;\n\t"
 										"mov.b64 {tl,th}, %1;\n\t"
 										"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
 										"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
 										"setp.lt.u32 p, %2, 32;\n\t"
 										"@!p mov.b64 %0, {vl,vh};\n\t"
 										"@p  mov.b64 %0, {vh,vl};\n\t"
 										"}"
 										: "=l"(res) : "l"(x) , "r"(offset)
 									);
 									return res;
 								}
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#else
-												x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration

											
										
										
											10 years ago
+								/* host */
 								// Generic 64-bit rotate left. Both shift counts are reduced modulo 64
 								// so that n == 0 no longer expands to an undefined shift by 64 bits;
 								// results for n in 1..63 are unchanged. Arguments are evaluated more
 								// than once, so do not pass expressions with side effects.
 								#define ROTL64(x, n)  (((x) << ((n) & 63)) | ((x) >> ((64 - (n)) & 63)))
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#endif
-												blake512: use a new SWAPDWORDS asm func (0.05ms)

small improvement, do it on pentablake and heavy variants too

based on sp commit (but SWAP32 is already used for 32bit ints)

											
										
										
											10 years ago
+								__device__ __forceinline__
-												Fix left value warning in SWAPDWORDS + groestl change

											
										
										
											10 years ago
+								uint64_t SWAPDWORDS(uint64_t value)
 								{
 									// Exchanges the upper and lower 32-bit halves of value.
 								#if __CUDA_ARCH__ >= 320
 									// Unpack into two 32-bit registers, then repack them in the
 									// opposite order.
 									uint2 halves;
 									asm("mov.b64 {%0, %1}, %2; ": "=r"(halves.x), "=r"(halves.y) : "l"(value));
 									asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(halves.y), "r"(halves.x));
 									return value;
 								#else
 									// A rotation by half the width performs the same exchange.
 									return ROTL64(value, 32);
 								#endif
 								}
-												forgot this file in previous commit

											
										
										
											11 years ago
+								#endif // #ifndef CUDA_HELPER_H