// ccminer/crypto/cryptonight.h

#pragma once
#include <cuda_runtime.h>
#include <miner.h>

/*
 * Editor-only shims. Everything in this block is compiled only when
 * __INTELLISENSE__ is defined, so it has no effect on a build where that
 * macro is absent. It supplies placeholder definitions for CUDA built-ins
 * so that an IDE parser does not flag them as unknown.
 */
#ifdef __INTELLISENSE__
/* avoid red underlining */
/* Pretend to target compute capability 5.2 so the ">= 350" code paths below are the ones parsed. */
#define __CUDA_ARCH__ 520
/* Stand-ins for the built-in index variables. */
struct uint3 {
	unsigned int x, y, z;
};
struct uint3  threadIdx;
struct uint3  blockIdx;
struct uint3  blockDim;
/* Placeholder expansions only; they do not reproduce the real intrinsics' semantics. */
#define atomicExch(p,y) (*p) = y
#define __funnelshift_r(a,b,c) 1
#define __syncthreads()
#define __threadfence_block()
#define asm(x)
#define __shfl(a,b,c) 1
#define __umul64hi(a,b) a*b
/* NOTE(review): __funnelshift_l (used by ROTL32 when __CUDA_ARCH__ >= 350) has no stub here — confirm whether one is needed. */
#endif


/* Size constants. MEMORY is 2^21 bytes and ITER is 2^20, as the trailing comments state. */
#define MEMORY         (1U << 21) // 2 MiB / 2097152 B
#define ITER           (1U << 20) // 1048576
/* 0x1FFFF0 == 0x1ffff << 4 == MEMORY - 16: a 16-byte-aligned byte offset below MEMORY. */
#define E2I_MASK       0x1FFFF0u

/* AES-related sizes, in bytes unless noted. INIT_SIZE_BYTE evaluates to 8 * 16 = 128. */
#define AES_BLOCK_SIZE  16U
#define AES_KEY_SIZE    32
#define INIT_SIZE_BLK   8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B

/* NOTE(review): the three values below are not used inside this header; their meaning is presumably fixed by the AES kernels — confirm there. */
#define AES_RKEY_LEN 4
#define AES_COL_LEN 4
#define AES_ROUND_BASE 7

/* HASH_SIZE and HASH_DATA_AREA may be pre-defined by an earlier header; these are fallbacks. */
#ifndef HASH_SIZE
#define HASH_SIZE 32
#endif

/* NOTE(review): 136 == 200 - 2 * 32, presumably the Keccak rate for a 32-byte digest — confirm against the hashing code. */
#ifndef HASH_DATA_AREA
#define HASH_DATA_AREA 136
#endif

/*
 * hi_dword(x): x shifted right by 32 bits. The result keeps the type of x
 *              (it is not narrowed), and x must be wider than 32 bits for
 *              the shift to be defined.
 * lo_dword(x): x masked to its low 32 bits.
 *
 * The argument is parenthesized so that compound expressions such as
 * hi_dword(a | b) bind as ((a | b) >> 32) rather than (a | (b >> 32)).
 */
#define hi_dword(x) ((x) >> 32)
#define lo_dword(x) ((x) & 0xFFFFFFFF)

/*
 * C32(x): appends the U suffix to the literal token x and casts to uint32_t.
 *         Because of the token paste, x must be a bare integer literal.
 * T32(x): masks x to its low 32 bits.
 */
#define C32(x)    ((uint32_t)(x ## U))
#define T32(x) ((x) & C32(0xFFFFFFFF))

/*
 * ROTL64(x, n): rotate the 64-bit value x left by n bits.
 *
 * Skipped entirely if ROTL64 is already defined. When __CUDA_ARCH__ >= 350
 * the rotate is built from two 32-bit funnel shifts in inline PTX; otherwise
 * (including passes where __CUDA_ARCH__ is undefined and therefore evaluates
 * to 0 in the #if) a plain shift/or macro is used.
 */
#ifndef ROTL64
    #if __CUDA_ARCH__ >= 350
        // NOTE(review): the branch on offset >= 32 combined with the wrap-mode shift
        // implies offset is expected to be in [0, 63] — confirm against callers.
        __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) {
            // result.x collects the low 32 bits of the output, result.y the high 32 bits
            // (see the __hiloint2double(result.y, result.x) call at the end).
            uint2 result;
            // The 64-bit input is reinterpreted as a double only so that
            // __double2loint / __double2hiint can extract its two 32-bit halves.
            if(offset >= 32) {
                // Same instructions as the else-branch with the lo/hi operands exchanged.
                // Per the PTX ISA, shf.*.wrap uses the shift amount modulo 32, so this
                // rotates by (offset - 32) after swapping the halves.
                asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
                asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
            } else {
                asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
                asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
            }
            // Reassemble the two halves (hi = result.y, lo = result.x) into one 64-bit value.
            return  __double_as_longlong(__hiloint2double(result.y, result.x));
        }
        #define ROTL64(x, n) (cuda_ROTL64(x, n))
    #else
        // Fallback: evaluates x and n twice each. n == 0 would shift right by 64,
        // which is undefined for a 64-bit operand, so n should be in [1, 63].
        #define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
    #endif
#endif

/*
 * ROTL32(x, n) / ROTR32(x, n): 32-bit rotate left / right.
 *
 * Each is skipped if already defined. When __CUDA_ARCH__ >= 350 they map to
 * the __funnelshift_l / __funnelshift_r intrinsics with x passed as both the
 * low and high word; otherwise a shift/or expression is used, which evaluates
 * x and n twice and is undefined for n == 0 on a 32-bit operand (shift by 32).
 */
#ifndef ROTL32
    #if __CUDA_ARCH__ < 350
        // The T32 mask discards bits above bit 31 in case x is wider than 32 bits.
        #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
    #else
        #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
    #endif
#endif

#ifndef ROTR32
    #if __CUDA_ARCH__ < 350
        // NOTE(review): unlike ROTL32 above, this fallback is not masked with T32 — confirm x is always a 32-bit type.
        #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
    #else
        #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
    #endif
#endif

/*
 * MEMSET8(dst, what, cnt): store `what` into `cnt` consecutive 64-bit words
 * starting at dst.
 *
 *   dst  - destination pointer; cast to uint64_t*, so it must be suitably aligned.
 *   what - value expression; re-evaluated on every iteration.
 *   cnt  - number of 64-bit words (not bytes). Parenthesized in the loop
 *          condition so expressions like `a & b` or `c ? x : y` compare as a whole.
 *
 * Expands to a brace-enclosed block; the loop index is a signed int.
 */
#define MEMSET8(dst,what,cnt) { \
    int i_memset8; \
    uint64_t *out_memset8 = (uint64_t *)(dst); \
    for( i_memset8 = 0; i_memset8 < (cnt); i_memset8++ ) \
        out_memset8[i_memset8] = (what); }

/*
 * MEMSET4(dst, what, cnt): store `what` into `cnt` consecutive 32-bit words
 * starting at dst.
 *
 *   dst  - destination pointer; cast to uint32_t*, so it must be suitably aligned.
 *   what - value expression; re-evaluated on every iteration.
 *   cnt  - number of 32-bit words (not bytes). Parenthesized in the loop
 *          condition so expressions like `a & b` or `c ? x : y` compare as a whole.
 *
 * Expands to a brace-enclosed block; the loop index is a signed int.
 */
#define MEMSET4(dst,what,cnt) { \
    int i_memset4; \
    uint32_t *out_memset4 = (uint32_t *)(dst); \
    for( i_memset4 = 0; i_memset4 < (cnt); i_memset4++ ) \
        out_memset4[i_memset4] = (what); }

/*
 * MEMCPY8(dst, src, cnt): copy `cnt` 64-bit words from src to dst, lowest
 * index first.
 *
 *   dst - destination pointer; cast to uint64_t*.
 *   src - source pointer; cast to const uint64_t* (it is only read).
 *   cnt - number of 64-bit words (not bytes). Parenthesized in the loop
 *         condition so expressions like `a & b` or `c ? x : y` compare as a whole.
 *
 * Both pointers must be suitably aligned for 64-bit access. Expands to a
 * brace-enclosed block; the loop index is a signed int.
 */
#define MEMCPY8(dst,src,cnt) { \
    int i_memcpy8; \
    const uint64_t *in_memcpy8 = (const uint64_t *)(src); \
    uint64_t *out_memcpy8 = (uint64_t *)(dst); \
    for( i_memcpy8 = 0; i_memcpy8 < (cnt); i_memcpy8++ ) \
        out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }

/*
 * MEMCPY4(dst, src, cnt): copy `cnt` 32-bit words from src to dst, lowest
 * index first.
 *
 *   dst - destination pointer; cast to uint32_t*.
 *   src - source pointer; cast to const uint32_t* (it is only read).
 *   cnt - number of 32-bit words (not bytes). Parenthesized in the loop
 *         condition so expressions like `a & b` or `c ? x : y` compare as a whole.
 *
 * Both pointers must be suitably aligned for 32-bit access. Expands to a
 * brace-enclosed block; the loop index is a signed int.
 */
#define MEMCPY4(dst,src,cnt) { \
    int i_memcpy4; \
    const uint32_t *in_memcpy4 = (const uint32_t *)(src); \
    uint32_t *out_memcpy4 = (uint32_t *)(dst); \
    for( i_memcpy4 = 0; i_memcpy4 < (cnt); i_memcpy4++ ) \
        out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }

/*
 * XOR_BLOCKS_DST(x, y, z): z[0..15] = x[0..15] ^ y[0..15], performed as two
 * 64-bit XORs.
 *
 *   x, y - source pointers (16 bytes each are read).
 *   z    - destination pointer (16 bytes are written).
 *
 * All three are cast to uint64_t*, so each must be suitably aligned for
 * 64-bit access. Every argument is parenthesized before the cast so that
 * pointer expressions such as `buf + 8` are offset in their own element
 * type first and only then reinterpreted.
 */
#define XOR_BLOCKS_DST(x,y,z) { \
    ((uint64_t *)(z))[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
    ((uint64_t *)(z))[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }

/*
 * E2I(x): read the first 64-bit word at pointer x, drop its low 4 bits and
 * keep the next 17 bits, yielding a size_t in [0, 0x1ffff].
 * NOTE(review): 0x20000 entries * 16 bytes == MEMORY, so this is presumably
 * a 16-byte-block index into a MEMORY-sized buffer — confirm at the call sites.
 */
#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))

/* 200 bytes of state, addressable either as bytes or as 25 64-bit words (both views are the same size). */
union hash_state {
  uint8_t b[200];  /* byte view */
  uint64_t w[25];  /* 64-bit word view of the same storage */
};

/*
 * Overlay of a hash_state with two named byte ranges: the anonymous struct
 * aliases bytes 0..63 of the state as `k` and the following INIT_SIZE_BYTE
 * bytes as `init`.
 */
union cn_slow_hash_state {
    union hash_state hs;               /* full state */
    struct {
        uint8_t k[64];                 /* first 64 bytes of the state */
        uint8_t init[INIT_SIZE_BYTE];  /* next INIT_SIZE_BYTE bytes of the state */
    };
};

/*
 * Check the CUDA runtime's last-error slot and abort the process on failure.
 *
 * cudaGetLastError() both returns and clears the pending error. If it reports
 * anything other than cudaSuccess, the error string is logged through gpulog
 * together with the caller-supplied location, and the process exits with
 * status 1. Returns normally only when no error was pending.
 *
 *   thr_id - identifier forwarded to gpulog.
 *   src    - caller-supplied text printed before the line number.
 *   line   - caller-supplied line number.
 */
static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
{
	const cudaError_t status = cudaGetLastError();

	/* Nothing pending: fall straight back to the caller. */
	if (status == cudaSuccess)
		return;

	gpulog(LOG_ERR, thr_id, "%s %s line %d", cudaGetErrorString(status), src, line);
	exit(1);
}
/*
 * Host-side entry points. Only the declarations are visible in this header;
 * the notes below are inferred from parameter names and should be confirmed
 * against the definitions.
 */

/* NOTE(review): the d_ prefix suggests these pointers refer to device memory; blocks/threads are presumably the launch configuration — confirm. */
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);

/* NOTE(review): setData / init / free / prepare / final appear to be per-thr_id setup, teardown and pre/post-processing steps around the core call — confirm ordering requirements in the caller. */
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
import xmr, to finish todo: fix jh cuda and wrong decimal diff (0xffff problem ?) 8 years ago			`#pragma once`
			`#include <cuda_runtime.h>`
			`#include <miner.h>`

			`#ifdef __INTELLISENSE__`
			`/* avoid red underlining */`
			`#define __CUDA_ARCH__ 520`
			`struct uint3 {`
			`unsigned int x, y, z;`
			`};`
			`struct uint3 threadIdx;`
			`struct uint3 blockIdx;`
			`struct uint3 blockDim;`
xmr: stabilize the final kernel 8 years ago			`#define atomicExch(p,y) (*p) = y`
import xmr, to finish todo: fix jh cuda and wrong decimal diff (0xffff problem ?) 8 years ago			`#define __funnelshift_r(a,b,c) 1`
			`#define __syncthreads()`
xmr: dont use shared mem hack, windows dont like 8 years ago			`#define __threadfence_block()`
import xmr, to finish todo: fix jh cuda and wrong decimal diff (0xffff problem ?) 8 years ago			`#define asm(x)`
			`#define __shfl(a,b,c) 1`
xmr: make it more smooth on windows with defaults also improve a bit the 750 ti on linux... 8 years ago			`#define __umul64hi(a,b) a*b`
import xmr, to finish todo: fix jh cuda and wrong decimal diff (0xffff problem ?) 8 years ago			`#endif`


xmr: vectors rewrite, now the phase2 is using only 40 regs no more constant memory used for aes. tested only on linux cuda 8 for now... wip 8 years ago			`#define MEMORY (1U << 21) // 2 MiB / 2097152 B`
			`#define ITER (1U << 20) // 1048576`
xmr: prefer 32bit uint4 and smaller offsets in core also prefer ulong2 shared load to be closer to the ptx 8 years ago			`#define E2I_MASK 0x1FFFF0u`
handle cryptonight light variant Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 8 years ago
xmr: vectors rewrite, now the phase2 is using only 40 regs no more constant memory used for aes. tested only on linux cuda 8 for now... wip 8 years ago			`#define AES_BLOCK_SIZE 16U`
import xmr, to finish todo: fix jh cuda and wrong decimal diff (0xffff problem ?) 8 years ago			`#define AES_KEY_SIZE 32`
			`#define INIT_SIZE_BLK 8`
			`#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B`

			`#define AES_RKEY_LEN 4`
			`#define AES_COL_LEN 4`
			`#define AES_ROUND_BASE 7`

			`#ifndef HASH_SIZE`
			`#define HASH_SIZE 32`
			`#endif`

			`#ifndef HASH_DATA_AREA`
			`#define HASH_DATA_AREA 136`
			`#endif`

			`#define hi_dword(x) (x >> 32)`
			`#define lo_dword(x) (x & 0xFFFFFFFF)`

			`#define C32(x) ((uint32_t)(x ## U))`
			`#define T32(x) ((x) & C32(0xFFFFFFFF))`

			`#ifndef ROTL64`
			`#if __CUDA_ARCH__ >= 350`
			`__forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) {`
			`uint2 result;`
			`if(offset >= 32) {`
			`asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));`
			`asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));`
			`} else {`
			`asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));`
			`asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));`
			`}`
			`return __double_as_longlong(__hiloint2double(result.y, result.x));`
			`}`
			`#define ROTL64(x, n) (cuda_ROTL64(x, n))`
			`#else`
			`#define ROTL64(x, n) (((x) << (n)) \| ((x) >> (64 - (n))))`
			`#endif`
			`#endif`

			`#ifndef ROTL32`
			`#if __CUDA_ARCH__ < 350`
			`#define ROTL32(x, n) T32(((x) << (n)) \| ((x) >> (32 - (n))))`
			`#else`
			`#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )`
			`#endif`
			`#endif`

			`#ifndef ROTR32`
			`#if __CUDA_ARCH__ < 350`
			`#define ROTR32(x, n) (((x) >> (n)) \| ((x) << (32 - (n))))`
			`#else`
			`#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )`
			`#endif`
			`#endif`

			`#define MEMSET8(dst,what,cnt) { \`
			`int i_memset8; \`
			`uint64_t *out_memset8 = (uint64_t *)(dst); \`
			`for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \`
			`out_memset8[i_memset8] = (what); }`

			`#define MEMSET4(dst,what,cnt) { \`
			`int i_memset4; \`
			`uint32_t *out_memset4 = (uint32_t *)(dst); \`
			`for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \`
			`out_memset4[i_memset4] = (what); }`

			`#define MEMCPY8(dst,src,cnt) { \`
			`int i_memcpy8; \`
			`uint64_t *in_memcpy8 = (uint64_t *)(src); \`
			`uint64_t *out_memcpy8 = (uint64_t *)(dst); \`
			`for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \`
			`out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }`

			`#define MEMCPY4(dst,src,cnt) { \`
			`int i_memcpy4; \`
			`uint32_t *in_memcpy4 = (uint32_t *)(src); \`
			`uint32_t *out_memcpy4 = (uint32_t *)(dst); \`
			`for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \`
			`out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }`

			`#define XOR_BLOCKS_DST(x,y,z) { \`
			`((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \`
			`((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }`

			`#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))`

			`union hash_state {`
			`uint8_t b[200];`
			`uint64_t w[25];`
			`};`

			`union cn_slow_hash_state {`
			`union hash_state hs;`
			`struct {`
			`uint8_t k[64];`
			`uint8_t init[INIT_SIZE_BYTE];`
			`};`
			`};`

			`static inline void exit_if_cudaerror(int thr_id, const char *src, int line)`
			`{`
			`cudaError_t err = cudaGetLastError();`
			`if(err != cudaSuccess) {`
			`gpulog(LOG_ERR, thr_id, "%s %s line %d", cudaGetErrorString(err), src, line);`
			`exit(1);`
			`}`
			`}`
xmr: prefer 32bit uint4 and smaller offsets in core also prefer ulong2 shared load to be closer to the ptx 8 years ago			`void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);`
import xmr, to finish todo: fix jh cuda and wrong decimal diff (0xffff problem ?) 8 years ago
			`void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);`
			`void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);`
			`void cryptonight_extra_cpu_free(int thr_id);`
xmr: vectors rewrite, now the phase2 is using only 40 regs no more constant memory used for aes. tested only on linux cuda 8 for now... wip 8 years ago			`void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);`
			`void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);`