ccminer-gostd-lite/scrypt/salsa_kernel.h

#ifndef SALSA_KERNEL_H
#define SALSA_KERNEL_H

#include <stdio.h>
#include <stdbool.h>
#include <malloc.h>
#include <string.h>
#include <cuda_runtime.h>

#include "miner.h"

#define MAX_DEVICES MAX_GPUS

// from ccminer.cpp
extern short device_map[MAX_GPUS];
extern int device_batchsize[MAX_GPUS]; // cudaminer -b
extern int device_interactive[MAX_GPUS]; // cudaminer -i
extern int device_texturecache[MAX_GPUS]; // cudaminer -C
extern int device_singlememory[MAX_GPUS]; // cudaminer -m
extern int device_lookup_gap[MAX_GPUS]; // -L
extern int device_backoff[MAX_GPUS]; // WIN32/LINUX var
extern char *device_config[MAX_GPUS]; // -l
extern char *device_name[MAX_GPUS];

extern bool opt_autotune;
extern int opt_nfactor;
extern char *jane_params;
extern bool abort_flag;
extern int parallel;

extern void get_currentalgo(char* buf, int sz);

typedef unsigned int uint32_t; // define this as 32 bit type derived from int

// scrypt variants
#define A_SCRYPT 0
#define A_SCRYPT_JANE 1
static char algo[64] = { 0 };
static int scrypt_algo = -1;
// Resolve which scrypt variant is active and cache the answer in scrypt_algo.
// The algo name is fetched through get_currentalgo() into the file-local
// `algo` buffer and compared case-insensitively by prefix.
// Returns A_SCRYPT_JANE, A_SCRYPT, or -1 when the name matches neither prefix;
// a -1 result is not cached, so the lookup is repeated on the next call.
static __inline int get_scrypt_type() {
	if (scrypt_algo == -1) {
		get_currentalgo(algo, (int)sizeof(algo));
		// Test the longer name first: "scrypt-jane" also starts with "scrypt".
		if (strncasecmp(algo, "scrypt-jane", 11) == 0) {
			scrypt_algo = A_SCRYPT_JANE;
		} else if (strncasecmp(algo, "scrypt", 6) == 0) {
			scrypt_algo = A_SCRYPT;
		}
	}
	return scrypt_algo;
}
// True when the resolved scrypt variant is plain scrypt (A_SCRYPT).
static __inline bool IS_SCRYPT() {
	// get_scrypt_type() returns the (possibly freshly resolved) scrypt_algo value
	return get_scrypt_type() == A_SCRYPT;
}
// True when the resolved scrypt variant is scrypt-jane (A_SCRYPT_JANE).
static __inline bool IS_SCRYPT_JANE() {
	// get_scrypt_type() returns the (possibly freshly resolved) scrypt_algo value
	return get_scrypt_type() == A_SCRYPT_JANE;
}

// CUDA externals
extern int cuda_throughput(int thr_id);
extern uint32_t *cuda_transferbuffer(int thr_id, int stream);
extern uint32_t *cuda_hashbuffer(int thr_id, int stream);

extern void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream);
extern void cuda_scrypt_serialize(int thr_id, int stream);
extern void cuda_scrypt_core(int thr_id, int stream, unsigned int N);
extern void cuda_scrypt_done(int thr_id, int stream);
extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA);
extern bool cuda_scrypt_sync(int thr_id, int stream);
extern void cuda_scrypt_flush(int thr_id, int stream);

extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);

extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);

extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);

#ifdef __NVCC__
extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
#endif

// If we're in C++ mode, we're either compiling .cu files or scrypt.cpp

#ifdef __NVCC__

/**
 * A pure virtual interface for a CUDA kernel implementation.
 * TODO: encapsulate the kernel launch parameters in some kind of wrapper.
 *
 * Implementations must provide set_scratchbuf_constants(), run_kernel(),
 * get_identifier(), max_warps_per_block() and get_texel_width(). Every other
 * member has a default implementation that a subclass may override.
 */
class KernelInterface
{
public:
	// Virtual destructor: lets a concrete kernel be destroyed correctly
	// through a KernelInterface pointer.
	virtual ~KernelInterface() {}

	// --- mandatory kernel entry points ---
	virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) = 0;
	virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) = 0;

	// --- texture binding hooks; the defaults do nothing and report success ---
	virtual bool bindtexture_1D(uint32_t *d_V, size_t size) { return true; }
	virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) { return true; }
	virtual bool unbindtexture_1D() { return true; }
	virtual bool unbindtexture_2D() { return true; }

	// --- kernel identification and capabilities ---
	virtual char get_identifier() = 0;
	virtual int get_major_version() { return 1; }   // default version is 1.0
	virtual int get_minor_version() { return 0; }
	virtual int max_warps_per_block() = 0;
	virtual int get_texel_width() = 0;
	virtual bool no_textures() { return false; }
	virtual bool single_memory() { return false; }
	virtual int threads_per_wu() { return 1; }
	virtual bool support_lookup_gap() { return false; }
	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; }
	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; }

	// --- keccak256 hooks; the defaults forward to the default_* free functions ---
	virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
		return default_prepare_keccak256(thr_id, host_pdata, ptarget);
	}
	virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
		default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
	}

	// --- blake256 hooks; the defaults forward to the default_* free functions ---
	virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
		return default_prepare_blake256(thr_id, host_pdata, ptarget);
	}
	virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
		default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
	}
};

// Not performing error checking is actually bad, but...
// Both macros are deliberate no-ops: checkCudaErrors(x) only evaluates x, and
// getLastCudaError(x) expands to nothing, so its argument is never evaluated.
// The expansion of checkCudaErrors is parenthesized so that operators around
// the macro call cannot regroup an argument that itself contains operators.
#define checkCudaErrors(x) (x)
#define getLastCudaError(x)

#endif // #ifdef __NVCC__

// Define work unit size
// These macros expand textually: THREADS_PER_WU, WARPS_PER_BLOCK and
// GRID_BLOCKS are not defined in the visible part of this header and must be
// in scope wherever the macros are used.
#define TOTAL_WARP_LIMIT 4096
// work units per warp (the 32 is presumably the warp size -- TODO confirm)
#define WU_PER_WARP (32 / THREADS_PER_WU)
// work units per thread block
#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK)
// work units per kernel launch
#define WU_PER_LAUNCH (GRID_BLOCKS*WU_PER_BLOCK)

// make scratchpad size dependent on N and LOOKUP_GAP
// Expands to N/LOOKUP_GAP rounded up (for integer operands), times 32.
// N and LOOKUP_GAP are taken from the scope of the expansion site.
#define SCRATCH   (((N+LOOKUP_GAP-1)/LOOKUP_GAP)*32)

#endif // #ifndef SALSA_KERNEL_H
import and adapt scrypt from cudaminer project scrypt-jane under work... 10 years ago			`#ifndef SALSA_KERNEL_H`
			`#define SALSA_KERNEL_H`

			`#include <stdio.h>`
			`#include <stdbool.h>`
			`#include <malloc.h>`
			`#include <string.h>`
			`#include <cuda_runtime.h>`

			`#include "miner.h"`

			`#define MAX_DEVICES MAX_GPUS`

			`// from ccminer.cpp`
			`extern short device_map[MAX_GPUS];`
scrypt: autotune, launch-config and lookup-gap options also enhance scrypt variants algo checks to be case insensitive 10 years ago			`extern int device_batchsize[MAX_GPUS]; // cudaminer -b`
scrypt: add --interactive option also tested batchsize and texturecache but seems useless, they can be tested/tuned directly in kernel variants... Note: -i cuduminer param is already used in ccminer (--intensity) 10 years ago			`extern int device_interactive[MAX_GPUS]; // cudaminer -i`
scrypt: autotune, launch-config and lookup-gap options also enhance scrypt variants algo checks to be case insensitive 10 years ago			`extern int device_texturecache[MAX_GPUS]; // cudaminer -C`
			`extern int device_singlememory[MAX_GPUS]; // cudaminer -m`
			`extern int device_lookup_gap[MAX_GPUS]; // -L`
			`extern int device_backoff[MAX_GPUS]; // WIN32/LINUX var`
			`extern char *device_config[MAX_GPUS]; // -l`
import and adapt scrypt from cudaminer project scrypt-jane under work... 10 years ago			`extern char *device_name[MAX_GPUS];`

scrypt: autotune, launch-config and lookup-gap options also enhance scrypt variants algo checks to be case insensitive 10 years ago			`extern bool opt_autotune;`
import and adapt scrypt from cudaminer project scrypt-jane under work... 10 years ago			`extern int opt_nfactor;`
			`extern char *jane_params;`
			`extern bool abort_flag;`
			`extern int parallel;`

			`extern void get_currentalgo(char* buf, int sz);`

			`typedef unsigned int uint32_t; // define this as 32 bit type derived from int`

scrypt: autotune, launch-config and lookup-gap options also enhance scrypt variants algo checks to be case insensitive 10 years ago			`// scrypt variants`
			`#define A_SCRYPT 0`
			`#define A_SCRYPT_JANE 1`
import and adapt scrypt from cudaminer project scrypt-jane under work... 10 years ago			`static char algo[64] = { 0 };`
scrypt: autotune, launch-config and lookup-gap options also enhance scrypt variants algo checks to be case insensitive 10 years ago			`static int scrypt_algo = -1;`
			`static __inline int get_scrypt_type() {`
			`if (scrypt_algo != -1) return scrypt_algo;`
			`get_currentalgo(algo, 64);`
scrypt: finish scrypt-jane algo import 10 years ago			`if (!strncasecmp(algo,"scrypt-jane",11)) scrypt_algo = A_SCRYPT_JANE;`
			`else if (!strncasecmp(algo,"scrypt",6)) scrypt_algo = A_SCRYPT;`
scrypt: autotune, launch-config and lookup-gap options also enhance scrypt variants algo checks to be case insensitive 10 years ago			`return scrypt_algo;`
			`}`
			`static __inline bool IS_SCRYPT() { get_scrypt_type(); return (scrypt_algo == A_SCRYPT); }`
			`static __inline bool IS_SCRYPT_JANE() { get_scrypt_type(); return (scrypt_algo == A_SCRYPT_JANE); }`
import and adapt scrypt from cudaminer project scrypt-jane under work... 10 years ago
			`// CUDA externals`
			`extern int cuda_throughput(int thr_id);`
			`extern uint32_t *cuda_transferbuffer(int thr_id, int stream);`
			`extern uint32_t *cuda_hashbuffer(int thr_id, int stream);`

			`extern void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream);`
			`extern void cuda_scrypt_serialize(int thr_id, int stream);`
			`extern void cuda_scrypt_core(int thr_id, int stream, unsigned int N);`
			`extern void cuda_scrypt_done(int thr_id, int stream);`
			`extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA);`
			`extern bool cuda_scrypt_sync(int thr_id, int stream);`
			`extern void cuda_scrypt_flush(int thr_id, int stream);`

			`extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);`
			`extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);`

			`extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);`
			`extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);`

			`extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);`
			`extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);`

			`#ifdef __NVCC__`
			`extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);`
			`extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);`
			`#endif`

			`// If we're in C++ mode, we're either compiling .cu files or scrypt.cpp`

			`#ifdef __NVCC__`

			`/**`
			`* An pure virtual interface for a CUDA kernel implementation.`
			`* TODO: encapsulate the kernel launch parameters in some kind of wrapper.`
			`*/`
			`class KernelInterface`
			`{`
			`public:`
			`virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) = 0;`
			`virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) = 0;`
			`virtual bool bindtexture_1D(uint32_t *d_V, size_t size) { return true; }`
			`virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) { return true; }`
			`virtual bool unbindtexture_1D() { return true; }`
			`virtual bool unbindtexture_2D() { return true; }`

			`virtual char get_identifier() = 0;`
			`virtual int get_major_version() { return 1; }`
			`virtual int get_minor_version() { return 0; }`
			`virtual int max_warps_per_block() = 0;`
			`virtual int get_texel_width() = 0;`
			`virtual bool no_textures() { return false; };`
			`virtual bool single_memory() { return false; };`
			`virtual int threads_per_wu() { return 1; }`
			`virtual bool support_lookup_gap() { return false; }`
			`virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; }`
			`virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; }`

			`virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {`
			`return default_prepare_keccak256(thr_id, host_pdata, ptarget);`
			`}`
			`virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {`
			`default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);`
			`}`

			`virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {`
			`return default_prepare_blake256(thr_id, host_pdata, ptarget);`
			`}`
			`virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {`
			`default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);`
			`}`
			`};`

			`// Not performing error checking is actually bad, but...`
			`#define checkCudaErrors(x) x`
			`#define getLastCudaError(x)`

			`#endif // #ifdef __NVCC__`

			`// Define work unit size`
			`#define TOTAL_WARP_LIMIT 4096`
			`#define WU_PER_WARP (32 / THREADS_PER_WU)`
			`#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK)`
			`#define WU_PER_LAUNCH (GRID_BLOCKS*WU_PER_BLOCK)`

			`// make scratchpad size dependent on N and LOOKUP_GAP`
			`#define SCRATCH (((N+LOOKUP_GAP-1)/LOOKUP_GAP)*32)`

			`#endif // #ifndef SALSA_KERNEL_H`