/*
 * m7 algorithm
 *
 */
extern "C"
{
#include "sph/sph_sha2.h"
#include "sph/sph_keccak.h"
#include "sph/sph_ripemd.h"
#include "sph/sph_haval.h"
#include "sph/sph_tiger.h"
#include "sph/sph_whirlpool.h"
#include "sph/sph_blake.h"
#include "miner.h"
}
#include "cuda_helper.h"

// configure with --with-mpir-src=...
#include "mpir.h"

// from cpu-miner.c
extern int device_map[8];
extern bool opt_benchmark;

// Per-thread device buffers, indexed by thr_id (the tables hold 8 entries).
// Each entry is allocated with cudaMalloc the first time scanhash_m7 runs for
// that thr_id; the sizes quoted below are the uint64_t words reserved per
// element of the batch (`throughput`).
//static uint64_t *d_hash[8];
static uint64_t *FinalHash[8];  // 8 words; allocated in scanhash_m7 but not otherwise referenced in this file
static uint64_t *KeccakH[8];    // 38 words; keccak512 output, operand of the first multiplication
static uint64_t *WhirlpoolH[8]; // 8 words; whirlpool512 output
static uint64_t *Sha512H[8];    // 8 words; sha512 output
static uint64_t *Sha256H[8];    // 8 words; sha256 output
static uint64_t *HavalH[8];     // 8 words; haval256 output
static uint64_t *TigerH[8];     // 8 words; tiger192 output
static uint64_t *RipemdH[8];    // 8 words; ripemd160 output
static uint64_t *d_prod0[8];    // 38 words; used alternately with d_prod1 as input/output of the multiplication chain
static uint64_t *d_prod1[8];    // 38 words; holds the final product passed to m7_sha256_cpu_hash_300

extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#if 0
/*
 * Loads the 32 bytes at `u` into `r` as one unsigned integer.
 * The buffer is consumed as `unsigned long` sized words, least significant
 * word first, each word little-endian, without nail bits.
 */
static void mpz_set_uint256(mpz_t r, uint8_t *u)
{
	const size_t word_size = sizeof(unsigned long);
	const size_t word_count = 32 / word_size;

	mpz_import(r, word_count, -1, word_size, -1, 0, u);
}

/*
 * Stores `r` into the 32-byte buffer `u` as a little-endian unsigned integer.
 *
 * The buffer is zero-filled first because mpz_export only writes as many
 * bytes as the value needs. A value that does not fit in 256 bits cannot be
 * represented; in that case the buffer is left zeroed instead of being
 * overrun.
 *
 * The previous version reset `u` to NULL before exporting, so mpz_export
 * allocated a fresh buffer (which leaked) and the caller's buffer was never
 * written.
 */
static void mpz_get_uint256(mpz_t r, uint8_t *u)
{
	memset(u, 0, 32);

	// mpz_sizeinbase(.., 256) is the number of bytes the value occupies.
	if (mpz_sizeinbase(r, 256) > 32)
		return;

	// Byte-sized words, least significant first: plain little-endian output.
	mpz_export(u, NULL, -1, 1, 0, 0, r);
}
#endif

/*
 * Loads the 64 bytes at `u` into `r` as one unsigned integer.
 * The buffer is consumed as `unsigned long` sized words, least significant
 * word first, each word little-endian, without nail bits.
 */
static void mpz_set_uint512(mpz_t r, uint8_t *u)
{
	const size_t word_size = sizeof(unsigned long);
	const size_t word_count = 64 / word_size;

	mpz_import(r, word_count, -1, word_size, -1, 0, u);
}

/*
 * Forces a digest to be non-zero: when the first 32 bytes of `hash512` are
 * all zero, byte 0 is set to 1; otherwise the buffer is left untouched.
 *
 * NOTE(review): only 32 bytes are inspected even though the parameter name
 * suggests a 64-byte digest. This affects the resulting hash values, so
 * confirm against the reference implementation before changing it.
 */
static void set_one_if_zero(uint8_t *hash512) {
	int idx = 0;

	// Skip leading zero bytes, stopping at the first non-zero one.
	while (idx < 32 && hash512[idx] == 0) {
		++idx;
	}

	if (idx == 32) {
		hash512[0] = 1;
	}
}

extern uint32_t m7_sha256_cpu_hash_300(int thr_id, int threads, uint32_t startNounce, uint64_t *d_nonceVector, uint64_t *d_hash, int order);

extern void m7_sha256_setBlock_120(void *data,const void *ptarget);
extern void m7_sha256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
extern void m7_sha256_cpu_init(int thr_id, int threads);

extern void m7_sha512_cpu_init(int thr_id, int threads);
extern void m7_sha512_setBlock_120(void *pdata);
extern void m7_sha512_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_hash, int order);

extern void ripemd160_cpu_init(int thr_id, int threads);
extern void ripemd160_setBlock_120(void *pdata);
extern void m7_ripemd160_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_hash, int order);

extern void tiger192_cpu_init(int thr_id, int threads);
extern void tiger192_setBlock_120(void *pdata);
extern void m7_tiger192_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_hash, int order);

extern void m7_bigmul_cpu(int thr_id, int threads, uint64_t* Hash1, uint64_t* Hash2, uint64_t* Hash3, uint64_t* Hash4,
												   uint64_t *Hash5, uint64_t* Hash6, uint64_t *Hash7, uint32_t foundNonce, uint32_t StartNonce,int order);

extern void m7_bigmul1_cpu(int thr_id, int threads, int len1, int len2, uint64_t* Hash1, uint64_t* Hash2, uint64_t *finalHash, int order);
extern void m7_bigmul_init(int thr_id, int threads);
extern void m7_bigmul_unroll1_cpu(int thr_id, int threads,uint64_t* Hash1, uint64_t* Hash2,uint64_t *finalHash,int order);
extern void m7_bigmul_unroll2_cpu(int thr_id, int threads,uint64_t* Hash1, uint64_t* Hash2,uint64_t *finalHash,int order);

extern void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order);
extern void cpu_mulT4(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p, int order);
extern void mul_init();

extern void m7_keccak512_setBlock_120(void *pdata);
extern void m7_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint64_t *d_hash, int order);
extern void m7_keccak512_cpu_init(int thr_id, int threads);

extern void m7_whirlpool512_cpu_init(int thr_id, int threads, int flag);
extern void m7_whirlpool512_setBlock_120(void *pdata);
extern void m7_whirlpool512_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order);

extern void m7_haval256_cpu_init(int thr_id, int threads);
extern void m7_haval256_setBlock_120(void *data);
extern void m7_haval256_cpu_hash_120(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order);

extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);

// m7 hash function (CPU implementation, used to validate GPU results)
//
// Computes
//   sha256( sha256 * sha512 * keccak512 * whirlpool * haval256_5 * tiger * ripemd160 )
// where every inner digest is taken over the same 122-byte message, loaded
// as a little-endian integer (zero-padded to 64 bytes) and the seven
// integers are multiplied with arbitrary precision. The product is hashed
// as a little-endian byte string.
//
// state:    receives the 32-byte result
// input:    block data; the first 122 bytes are read
// TheNonce: stored in 32-bit word 29 (bytes 116..119) of a local copy of the
//           input before hashing; `input` itself is not modified
// debug:    when 1, every intermediate digest is logged through applog
extern "C" void m7_hash(void *state, const void *input,uint32_t TheNonce, int debug)
{
	const size_t M7_DATA_LEN = 122;

	uint32_t data[32];
	uint8_t bhash[7][64];
	// Seven factors below 2^512 each: the product never exceeds 7 * 64 bytes,
	// so a fixed buffer replaces the heap block that used to be leaked.
	uint8_t bdata[7 * 64];
	uint32_t hash[8];
	mpz_t bns[7];

	memcpy(data, input, M7_DATA_LEN);
	data[29] = TheNonce;

	// Digests shorter than 64 bytes keep zero padding in their upper bytes.
	memset(bhash, 0, sizeof(bhash));

	sph_sha256_context ctx_sha256;
	sph_sha256_init(&ctx_sha256);
	sph_sha256(&ctx_sha256, data, M7_DATA_LEN);
	sph_sha256_close(&ctx_sha256, (void*)(bhash[0]));

	sph_sha512_context ctx_sha512;
	sph_sha512_init(&ctx_sha512);
	sph_sha512(&ctx_sha512, data, M7_DATA_LEN);
	sph_sha512_close(&ctx_sha512, (void*)(bhash[1]));

	sph_keccak512_context ctx_keccak;
	sph_keccak512_init(&ctx_keccak);
	sph_keccak512(&ctx_keccak, data, M7_DATA_LEN);
	sph_keccak512_close(&ctx_keccak, (void*)(bhash[2]));

	sph_whirlpool_context ctx_whirlpool;
	sph_whirlpool_init(&ctx_whirlpool);
	sph_whirlpool(&ctx_whirlpool, data, M7_DATA_LEN);
	sph_whirlpool_close(&ctx_whirlpool, (void*)(bhash[3]));

	sph_haval256_5_context ctx_haval;
	sph_haval256_5_init(&ctx_haval);
	sph_haval256_5(&ctx_haval, data, M7_DATA_LEN);
	sph_haval256_5_close(&ctx_haval, (void*)(bhash[4]));

	sph_tiger_context ctx_tiger;
	sph_tiger_init(&ctx_tiger);
	sph_tiger(&ctx_tiger, data, M7_DATA_LEN);
	sph_tiger_close(&ctx_tiger, (void*)(bhash[5]));

	sph_ripemd160_context ctx_ripemd;
	sph_ripemd160_init(&ctx_ripemd);
	sph_ripemd160(&ctx_ripemd, data, M7_DATA_LEN);
	sph_ripemd160_close(&ctx_ripemd, (void*)(bhash[6]));

	if (debug == 1) {
		for (int i=0; i<16; i++) {
				applog(LOG_INFO,"sha256[%d]=%02x %02x %02x %02x sha512[%d]=%02x %02x %02x %02x keccak[%d]=%02x %02x %02x %02x whirlpool[2][%d]=%02x %02x %02x %02x "
					"haval[%d]=%02x %02x %02x %02x tiger[%d]=%02x %02x %02x %02x ripemd[%d]=%02x %02x %02x %02x\n",
				i,bhash[0][4*i+3],bhash[0][4*i+2],bhash[0][4*i+1],bhash[0][4*i+0],
				i,bhash[1][4*i+3],bhash[1][4*i+2],bhash[1][4*i+1],bhash[1][4*i+0],
				i,bhash[2][4*i+3],bhash[2][4*i+2],bhash[2][4*i+1],bhash[2][4*i+0],
				i,bhash[3][4*i+3],bhash[3][4*i+2],bhash[3][4*i+1],bhash[3][4*i+0],
				i,bhash[4][4*i+3],bhash[4][4*i+2],bhash[4][4*i+1],bhash[4][4*i+0],
				i,bhash[5][4*i+3],bhash[5][4*i+2],bhash[5][4*i+1],bhash[5][4*i+0],
				i,bhash[6][4*i+3],bhash[6][4*i+2],bhash[6][4*i+1],bhash[6][4*i+0]
			);
		}
	}

	// Each factor is initialised exactly once and released below.
	for (int i = 0; i < 7; i++) {
		mpz_init(bns[i]);
		set_one_if_zero(bhash[i]);
		mpz_set_uint512(bns[i], bhash[i]);
	}

	// Fold the factors from the back; bns[0] ends up holding the full product.
	for (int i = 6; i > 0; i--) {
		mpz_mul(bns[i-1], bns[i-1], bns[i]);
	}

	// No factor is zero (set_one_if_zero), so the product is non-zero and
	// mpz_export writes exactly `bytes` bytes, least significant byte first.
	const size_t bytes = mpz_sizeinbase(bns[0], 256);
	mpz_export((void *)bdata, NULL, -1, 1, 0, 0, bns[0]);

	sph_sha256_context ctx_final_sha256;
	sph_sha256_init(&ctx_final_sha256);
	sph_sha256(&ctx_final_sha256, bdata, bytes);
	sph_sha256_close(&ctx_final_sha256, (void*)(hash));

	for (int i = 0; i < 7; i++) {
		mpz_clear(bns[i]);
	}

	memcpy(state, hash, 32);
}


// Scans a nonce range for the m7 algorithm on the GPU mapped to thr_id.
//
// thr_id:      index into the per-thread tables (device_map, buffers, init flags)
// pdata:       block data; word 29 is the nonce. It is advanced by one batch
//              per iteration and holds the winning nonce when 1 is returned
// ptarget:     target; word 7 is compared against word 7 of the CPU hash
// max_nonce:   scanning stops once the nonce reaches this value
// hashes_done: receives the number of nonces covered by this call
//
// Returns 1 when a nonce reported by the GPU also validates with m7_hash on
// the CPU, 0 when the range is exhausted or a work restart is requested.
extern "C" int scanhash_m7(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce,
	unsigned long  *hashes_done)
{
	const int throughput = 256*256*8*2;
	const uint32_t FirstNonce = pdata[29];

	static bool init[8] = {0,0,0,0,0,0,0,0};

	// Benchmark mode overwrites the caller's target word despite the const.
	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0x0000ff;

	if (!init[thr_id])
	{
		// One-time setup for this thread: select the device, allocate buffers.
		bool setup_ok = true;

		setup_ok &= (cudaSetDevice(device_map[thr_id]) == cudaSuccess);
		setup_ok &= (cudaMalloc(&d_prod0[thr_id],   38 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&d_prod1[thr_id],   38 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&FinalHash[thr_id],  8 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&KeccakH[thr_id],   38 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&WhirlpoolH[thr_id], 8 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&Sha256H[thr_id],    8 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&Sha512H[thr_id],    8 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&HavalH[thr_id],     8 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&RipemdH[thr_id],    8 *sizeof(uint64_t) * throughput) == cudaSuccess);
		setup_ok &= (cudaMalloc(&TigerH[thr_id],     8 *sizeof(uint64_t) * throughput) == cudaSuccess);

		// Failures used to be dropped silently. Report them so an out-of-memory
		// device is diagnosable; control flow is deliberately left as it was.
		if (!setup_ok)
			applog(LOG_ERR, "GPU #%d: m7 device setup failed: %s", thr_id, cudaGetErrorString(cudaPeekAtLastError()));

		m7_sha256_cpu_init(thr_id, throughput);
		m7_sha512_cpu_init(thr_id, throughput);
		m7_keccak512_cpu_init(thr_id, throughput);
		m7_haval256_cpu_init(thr_id, throughput);
		tiger192_cpu_init(thr_id, throughput);
		m7_whirlpool512_cpu_init(thr_id, throughput,0);
		ripemd160_cpu_init(thr_id, throughput);

		cuda_check_cpu_init(thr_id, throughput);

		m7_bigmul_init(thr_id, throughput);
		//mul_init();

		init[thr_id] = true;
	}

	uint32_t Htarg = ptarget[7];

	// Hand the block data to every hashing module before the scan starts.
	m7_whirlpool512_setBlock_120((void*)pdata);
	m7_sha256_setBlock_120((void*)pdata, ptarget);
	m7_sha512_setBlock_120((void*)pdata);
	m7_haval256_setBlock_120((void*)pdata);
	m7_keccak512_setBlock_120((void*)pdata);
	ripemd160_setBlock_120((void*)pdata);
	tiger192_setBlock_120((void*)pdata);

	cuda_check_cpu_setTarget(ptarget);

	do {
		int order = 0;
		uint32_t foundNonce;

		// Seven digests for the batch starting at nonce pdata[29].
		m7_sha256_cpu_hash_120(thr_id, throughput, pdata[29], Sha256H[thr_id], order++);

		m7_sha512_cpu_hash_120(thr_id, throughput, pdata[29], Sha512H[thr_id], order++);

		m7_keccak512_cpu_hash(thr_id, throughput, pdata[29], KeccakH[thr_id], order++);

		m7_haval256_cpu_hash_120(thr_id, throughput, pdata[29], HavalH[thr_id], order++);

		m7_tiger192_cpu_hash_120(thr_id, throughput, pdata[29], TigerH[thr_id], order++);

		m7_ripemd160_cpu_hash_120(thr_id, throughput, pdata[29], RipemdH[thr_id], order++);

		m7_whirlpool512_cpu_hash_120(thr_id, throughput, pdata[29], WhirlpoolH[thr_id], order++);

		// Multiplication chain; d_prod0 and d_prod1 alternate as input and output.
		// NOTE(review): the helpers below receive a literal 0 where their
		// prototypes name the parameter thr_id - confirm this is intended
		// when more than one GPU thread is running.
		cpu_mulT4(0, throughput, 8, 8, Sha512H[thr_id], KeccakH[thr_id], d_prod0[thr_id],order); //64
		MyStreamSynchronize(0,order++,thr_id);

		cpu_mulT4(0, throughput,8, 16, WhirlpoolH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order); //128
		MyStreamSynchronize(0,order++,thr_id);

		cpu_mulT4(0, throughput, 4, 24, Sha256H[thr_id], d_prod1[thr_id], d_prod0[thr_id],order); //96
		MyStreamSynchronize(0,order++,thr_id);

		cpu_mulT4(0, throughput, 4, 28, HavalH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);  //112
		MyStreamSynchronize(0,order++,thr_id);

		m7_bigmul_unroll1_cpu(0, throughput, TigerH[thr_id], d_prod1[thr_id], d_prod0[thr_id],order);
		MyStreamSynchronize(0,order++,thr_id);

		m7_bigmul_unroll2_cpu(0, throughput, RipemdH[thr_id], d_prod0[thr_id], d_prod1[thr_id],order);
		MyStreamSynchronize(0,order++,thr_id);

		// 0xffffffff is treated as "no candidate in this batch".
		foundNonce = m7_sha256_cpu_hash_300(thr_id, throughput, pdata[29], NULL, d_prod1[thr_id], order);
		if (foundNonce != 0xffffffff)
		{
			// Re-check the GPU candidate on the CPU before reporting it.
			uint32_t vhash64[8];
			m7_hash(vhash64, pdata, foundNonce, 0);

			if (vhash64[7] <= Htarg)
			{
				pdata[29] = foundNonce;
				*hashes_done = foundNonce - FirstNonce + 1;
				return 1;
			} else {
				applog(LOG_INFO, "GPU #%d: result for nonce %08x does not validate on CPU! vhash64 %08x and htarg %08x", thr_id, foundNonce, vhash64[7], Htarg);
				////////////////////////////////////////////
//				m7_bigmul_cpu(thr_id,throughput,Sha256H[thr_id],Sha512H[thr_id],KeccakH[thr_id],WhirlpoolH[thr_id],HavalH[thr_id],TigerH[thr_id],RipemdH[thr_id],foundNonce,FirstNonce,order++);
//				m7_hash(vhash64, pdata, foundNonce, 1);
				////////////////////////////////////////////
			}
		}

		// Advance to the next batch using 64-bit arithmetic. A 32-bit add can
		// wrap to a small nonce when max_nonce is close to 0xffffffff, which
		// would keep the loop running from the start of the nonce space and
		// corrupt hashes_done.
		const uint64_t next_nonce = (uint64_t)pdata[29] + (uint64_t)throughput;
		if (next_nonce > 0xffffffffULL) {
			pdata[29] = max_nonce;
			break;
		}
		pdata[29] = (uint32_t)next_nonce;

	} while (pdata[29] < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = pdata[29] - FirstNonce + 1;
	return 0;
}