ccminer/cuda_vectors.h

#include "cuda_helper.h"

/* Inline helper functions for uint2 operations (used by skein) */

/*
 * ROR8: rotate the 64-bit quantity stored in a uint2 right by 8 bits,
 * treating .x as the low 32 bits and .y as the high 32 bits.
 *
 * __byte_perm(lo, hi, 0x4321) selects bytes 1..4 of the 8-byte pool
 * {lo = bytes 0-3, hi = bytes 4-7}, i.e. (lo >> 8) | (hi << 24).
 * Applying it once per word with the operands swapped yields both halves.
 */
__device__ __forceinline__
uint2 ROR8(const uint2 a) {
	return make_uint2(
		__byte_perm(a.x, a.y, 0x4321),  /* (a.x >> 8) | (a.y << 24) */
		__byte_perm(a.y, a.x, 0x4321)); /* (a.y >> 8) | (a.x << 24) */
}

/*
 * ROL24: rotate the 64-bit quantity stored in a uint2 left by 24 bits,
 * treating .x as the low 32 bits and .y as the high 32 bits.
 *
 * __byte_perm(p, q, 0x0765) takes bytes 5,6,7 of the 8-byte pool
 * {p = bytes 0-3, q = bytes 4-7} followed by byte 0,
 * i.e. (q >> 8) | (p << 24).
 */
__device__ __forceinline__
uint2 ROL24(const uint2 a) {
	return make_uint2(
		__byte_perm(a.x, a.y, 0x0765),  /* (a.y >> 8) | (a.x << 24) */
		__byte_perm(a.y, a.x, 0x0765)); /* (a.x >> 8) | (a.y << 24) */
}

/*
 * Adds a 32-bit unsigned value to a uint2 and returns the uint2 result.
 *
 * Two implementations are present, selected by the preprocessor:
 *  - an inline-PTX version, currently compiled out by the leading `0 &&`;
 *  - the active version, which round-trips through devectorize()/vectorize().
 *
 * NOTE(review): devectorize()/vectorize() are not defined in this file
 * (presumably they come from cuda_helper.h and convert uint2 <-> 64-bit
 * integer with .x as the low word) -- confirm against that header.
 */
static __device__ __forceinline__ uint2 operator+ (const uint2 a, const uint32_t b)
{
#if 0 && defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
	/*
	 * Disabled path (condition starts with 0). Operand mapping:
	 *   %0 = result.x, %1 = result.y, %2 = a.x, %3 = a.y, %4 = b, %5 = 0
	 * add.cc.u32 computes result.x = a.x + b and records the carry-out;
	 * addc.u32 computes result.y = a.y + 0 + carry-in.
	 */
	uint2 result;
	asm(
		"add.cc.u32 %0,%2,%4; \n\t"
		"addc.u32 %1,%3,%5;   \n\t"
	: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0));
	return result;
#else
	/* Active path: convert to a scalar, add b, convert back to uint2. */
	return vectorize(devectorize(a) + b);
#endif
}

/* whirlpool ones */
#ifdef __CUDA_ARCH__
/*
 * ROL16 (device pass): rotate the 64-bit quantity stored in a uint2 left by
 * 16 bits, treating .x as the low 32 bits and .y as the high 32 bits.
 *
 * __byte_perm(p, q, 0x1076) takes bytes 6,7 of the 8-byte pool
 * {p = bytes 0-3, q = bytes 4-7} followed by bytes 0,1,
 * i.e. (q >> 16) | (p << 16).
 */
__device__ __forceinline__
uint2 ROL16(const uint2 a) {
	uint2 result;
	result.x = __byte_perm(a.x, a.y, 0x1076); /* (a.y >> 16) | (a.x << 16) */
	result.y = __byte_perm(a.y, a.x, 0x1076); /* (a.x >> 16) | (a.y << 16) */
	return result;
}
#else
/*
 * ROL16 (host pass): kept as a macro so the name stays defined when
 * __CUDA_ARCH__ is not set. It now performs the same 16-bit left rotation as
 * the device version using plain shifts (the previous fallback returned its
 * argument unchanged), and the argument is parenthesized so expressions such
 * as ROL16(*p) or ROL16(c ? u : v) expand correctly.
 * The argument is evaluated more than once: do not pass expressions with
 * side effects.
 */
#define ROL16(a) make_uint2( \
	(((a).x << 16) | ((a).y >> 16)), \
	(((a).y << 16) | ((a).x >> 16)))
#endif
skein: merge the double implementations in one based on alexis skein kernels, tested ok on SM 2.1 and 3.0 code is a bit hard to read but... well... users dont care :p 8 years ago			`#include "cuda_helper.h"`

			`/* Macros for uint2 operations (used by skein) */`

			`__device__ __forceinline__`
			`uint2 ROR8(const uint2 a) {`
			`uint2 result;`
			`result.x = __byte_perm(a.x, a.y, 0x4321);`
			`result.y = __byte_perm(a.y, a.x, 0x4321);`
			`return result;`
			`}`

			`__device__ __forceinline__`
			`uint2 ROL24(const uint2 a) {`
			`uint2 result;`
			`result.x = __byte_perm(a.x, a.y, 0x0765);`
			`result.y = __byte_perm(a.y, a.x, 0x0765);`
			`return result;`
			`}`

			`static __device__ __forceinline__ uint2 operator+ (const uint2 a, const uint32_t b)`
			`{`
			`#if 0 && defined(__CUDA_ARCH__) && CUDA_VERSION < 7000`
			`uint2 result;`
			`asm(`
			`"add.cc.u32 %0,%2,%4; \n\t"`
			`"addc.u32 %1,%3,%5; \n\t"`
			`: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0));`
			`return result;`
			`#else`
			`return vectorize(devectorize(a) + b);`
			`#endif`
			`}`

whirlpool: upgrade with alexis variant (2x) to check on older cards... maybe breaks some 8 years ago			`/* whirlpool ones */`
			`#ifdef __CUDA_ARCH__`
			`__device__ __forceinline__`
			`uint2 ROL16(const uint2 a) {`
			`uint2 result;`
			`result.x = __byte_perm(a.x, a.y, 0x1076);`
			`result.y = __byte_perm(a.y, a.x, 0x1076);`
			`return result;`
			`}`
			`#else`
			`#define ROL16(a) make_uint2(a.x, a.y) /* bad, just to define it */`
			`#endif`