scrypt: finish scrypt-jane algo import

2025-03-13 05:51:19 +00:00 · 2015-04-21 13:20:24 +02:00 · 2015-04-21 13:20:24 +02:00 · a6d88abbc9
commit a6d88abbc9
parent 9208888c57
9 changed files with 111 additions and 80 deletions
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -652,7 +652,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)

 	/* discard if a newer bloc was received */
 	stale_work = work->height && work->height < g_work.height;
-	if (have_stratum && !stale_work && opt_algo != ALGO_ZR5) {
+	if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) {
 		pthread_mutex_lock(&g_work_lock);
 		if (strlen(work->job_id + 8))
 			stale_work = strncmp(work->job_id + 8, g_work.job_id + 8, 4);
--- a/miner.h
+++ b/miner.h
@ -695,6 +695,7 @@ void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const in
 void quarkhash(void *state, const void *input);
 void qubithash(void *state, const void *input);
 void scrypthash(void* output, const void* input);
+void scryptjane_hash(void* output, const void* input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
 void s3hash(void *output, const void *input);
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@ -8,7 +8,7 @@

 #include "scrypt/scrypt-jane.h"
 #include "scrypt/code/scrypt-jane-portable.h"
-#include "scrypt/code/scrypt-jane-romix.h"
+#include "scrypt/code/scrypt-jane-chacha.h"
 #include "scrypt/keccak.h"

 #include "scrypt/salsa_kernel.h"
@ -434,6 +434,7 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 	uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
 {
 	const uint32_t Htarg = ptarget[7];
+	uint64_t N;

 	if (s_Nfactor == 0 && strlen(jane_params) > 0)
 		applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params);
@ -442,14 +443,12 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 	if (Nfactor > scrypt_maxN) {
 		scrypt_fatal_error("scrypt: N out of range");
 	}
+	N = (1 << (Nfactor + 1));

 	if (Nfactor != s_Nfactor)
 	{
-		// all of this isn't very thread-safe...
-		opt_nfactor = (1 << (Nfactor + 1));
-
-		applog(LOG_INFO, "Nfactor is %d (N=%d)!", Nfactor, opt_nfactor);
-
+		opt_nfactor = Nfactor;
+		applog(LOG_INFO, "N-factor is %d (%d)!", Nfactor, N);
 		if (s_Nfactor != 0) {
 			// handle N-factor increase at runtime
 			// by adjusting the lookup_gap by factor 2
@ -480,7 +479,7 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 	if (parallel == 2) prepare_keccak512(thr_id, pdata);

 	scrypt_aligned_alloc Xbuf[2] = { scrypt_alloc(128 * throughput), scrypt_alloc(128 * throughput) };
-	scrypt_aligned_alloc Vbuf = scrypt_alloc((uint64_t)opt_nfactor * 128);
+	scrypt_aligned_alloc Vbuf = scrypt_alloc(N * 128);
 	scrypt_aligned_alloc Ybuf = scrypt_alloc(128);

 	uint32_t nonce[2];
@ -498,6 +497,8 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u

 		if (parallel < 2)
 		{
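+			// hybrid path: X is staged from the host buffer (memcpy into cuda_X below) and only the scrypt core runs on the GPU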
+
 			for(int i=0;i<throughput;++i) {
 				uint32_t tmp_nonce = n++;
 				data[nxt][20*i + 19] = bswap_32x4(tmp_nonce);
@ -509,15 +510,13 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 			memcpy(cuda_X[nxt], Xbuf[nxt].ptr, 128 * throughput);
 			cuda_scrypt_serialize(thr_id, nxt);
 			cuda_scrypt_HtoD(thr_id, cuda_X[nxt], nxt);
-			cuda_scrypt_core(thr_id, nxt, opt_nfactor);
+			cuda_scrypt_core(thr_id, nxt, N);
 			cuda_scrypt_done(thr_id, nxt);

 			cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false);
-
 			cuda_scrypt_flush(thr_id, nxt);

-			if(!cuda_scrypt_sync(thr_id, cur))
-			{
+			if(!cuda_scrypt_sync(thr_id, cur)) {
 				return -1;
 			}

@ -553,21 +552,25 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 			}
 #endif
 		} else {
+
+			// all on gpu
+
 			n += throughput;
+			if (opt_debug && (iteration % 64 == 0))
+				applog(LOG_DEBUG, "GPU #%d: n=%x", device_map[thr_id], n);

 			cuda_scrypt_serialize(thr_id, nxt);
 			pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
-			cuda_scrypt_core(thr_id, nxt, opt_nfactor);
-
-			cuda_scrypt_flush(thr_id, nxt);
+			cuda_scrypt_core(thr_id, nxt, N);
+			cuda_scrypt_flush(thr_id, nxt); // required

 			post_keccak512(thr_id, nxt, nonce[nxt], throughput);
 			cuda_scrypt_done(thr_id, nxt);

 			cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
+			cuda_scrypt_flush(thr_id, nxt); // seems required here

-			if(!cuda_scrypt_sync(thr_id, cur))
-			{
+			if (!cuda_scrypt_sync(thr_id, cur)) {
 				return -1;
 			}
 		}
@ -587,7 +590,7 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 					tdata[19] = bswap_32x4(tmp_nonce);

 					scrypt_pbkdf2_1((unsigned char *)tdata, 80, (unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128);
-					scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), opt_nfactor);
+					scrypt_ROMix_1((scrypt_mix_word_t *)(Xbuf[cur].ptr + 128 * i), (scrypt_mix_word_t *)(Ybuf.ptr), (scrypt_mix_word_t *)(Vbuf.ptr), N);
 					scrypt_pbkdf2_1((unsigned char *)tdata, 80, Xbuf[cur].ptr + 128 * i, 128, (unsigned char *)thash, 32);

 					if (memcmp(thash, &hash[cur][8*i], 32) == 0)
@ -624,3 +627,55 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 	gettimeofday(tv_end, NULL);
 	return 0;
 }
+
+
+/**
+ * One scrypt-jane pass over caller-provided scratch buffers.
+ *
+ * Three stages: PBKDF2(password, salt) fills X, every chunk of X is run
+ * through scrypt_ROMix_1 (Y and V are its work buffers), then
+ * PBKDF2(password, X) writes `bytes` bytes to `out`.
+ *
+ * @param password,password_len  PBKDF2 password
+ * @param salt,salt_len          PBKDF2 salt for the first stage
+ * @param N                      value forwarded unchanged to scrypt_ROMix_1
+ * @param out,bytes              destination and length of the final digest
+ * @param X  SCRYPT_P chunks of (2 * SCRYPT_R * SCRYPT_BLOCK_BYTES) bytes
+ * @param Y  work buffer handed to scrypt_ROMix_1
+ * @param V  work buffer handed to scrypt_ROMix_1
+ *
+ * NOTE(review): when SCRYPT_PREVENT_STATE_LEAK is defined, (SCRYPT_P + 1)
+ * chunks are wiped starting at Y, which presumes X sits directly after Y
+ * in one allocation (scryptjane_hash lays them out that way) - confirm
+ * for any other caller.
+ */
+static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, const uchar*salt, size_t salt_len, uint32_t N,
+	uchar *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V)
+{
+	const uint32_t lanes = SCRYPT_P;
+	const uint32_t chunk_len = SCRYPT_BLOCK_BYTES * SCRYPT_R * 2;
+	const uint32_t state_len = chunk_len * lanes;
+	uint32_t lane = 0;
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+	/* NOTE(review): fetched but not referenced below - confirm whether it is still needed */
+	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
+#endif
+
+	/* stage 1: derive the initial state into X */
+	scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, state_len);
+
+	/* stage 2: mix each chunk of X in place */
+	while (lane < lanes) {
+		uint8_t *chunk = X + (chunk_len * lane);
+		scrypt_ROMix_1((scrypt_mix_word_t *)chunk, (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N);
+		lane++;
+	}
+
+	/* stage 3: fold the mixed state (used as salt) into the output */
+	scrypt_pbkdf2_1(password, password_len, X, state_len, out, bytes);
+
+#ifdef SCRYPT_PREVENT_STATE_LEAK
+	/* optional scrub of the Y/X scratch area */
+	scrypt_ensure_zero(Y, (lanes + 1) * chunk_len);
+#endif
+}
+
+/**
+ * CPU-side scrypt-jane hash, used for the cpu hash test.
+ *
+ * N is taken from the global opt_nfactor as 2^(opt_nfactor + 1). The
+ * 80-byte input serves as both password and salt; 32 bytes are written
+ * to output. Scratch memory (V plus a combined Y/X area) is allocated
+ * and released on every call.
+ *
+ * @param output  destination, receives 32 bytes
+ * @param input   source, 80 bytes are read
+ */
+void scryptjane_hash(void* output, const void* input)
+{
+	uint64_t Nsize;
+	uint64_t chunk_bytes;
+	uint8_t *X, *Y;
+	scrypt_aligned_alloc YX, V;
+
+	/* Apply the same bound scanhash_scrypt_jane() enforces, and make sure
+	 * N fits the uint32_t parameter of the helper: otherwise V would be
+	 * sized from the 64-bit N while the mix silently ran with N truncated
+	 * to 32 bits. */
+	if (opt_nfactor > scrypt_maxN || opt_nfactor + 1 >= 32) {
+		scrypt_fatal_error("scrypt: N out of range");
+		return; /* only reached if the fatal handler returns */
+	}
+	Nsize = 1ULL << (opt_nfactor + 1);
+
+	chunk_bytes = 2ULL * SCRYPT_BLOCK_BYTES * SCRYPT_R;
+	V  = scrypt_alloc(Nsize * chunk_bytes);
+	YX = scrypt_alloc((SCRYPT_P + 1) * chunk_bytes);
+
+	memset(V.ptr, 0, Nsize * chunk_bytes);
+
+	/* Y occupies the first chunk, X the SCRYPT_P chunks right after it */
+	Y = YX.ptr;
+	X = Y + chunk_bytes;
+
+	/* narrowing is explicit and safe after the range check above */
+	scrypt_jane_hash_1_1((uchar*)input, 80, (uchar*)input, 80, (uint32_t)Nsize, (uchar*)output, 32, X, Y, V.ptr);
+
+	scrypt_free(&V);
+	scrypt_free(&YX);
+}
--- a/scrypt.cpp
+++ b/scrypt.cpp
@ -682,12 +682,13 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
 }

 static int lastFactor = 0;
-//
+
+static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad);
+
 // Scrypt proof of work algorithm
 // using SSE2 vectorized HMAC SHA256 on CPU and
 // a salsa core implementation on GPU with CUDA
 //
-
 int scanhash_scrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
 	uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
 {
@ -989,9 +990,9 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
 /**
 * @param X input/ouput
 * @param V scratch buffer
- * @param N factor
+ * @param N factor (def. 1024)
 */
-static void scrypt_core(uint32_t *X, uint32_t *V, int N)
+static void scrypt_core(uint32_t *X, uint32_t *V, uint32_t N)
 {
 	for (int i = 0; i < N; i++) {
 		memcpy(&V[i * 32], X, 128);
@ -1013,11 +1014,11 @@ static void scrypt_core(uint32_t *X, uint32_t *V, int N)
 * @param reference  reference data, computed but preallocated
 * @param scratchpad scrypt scratchpad
 **/
-void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad)
+static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad)
 {
 	uint32_t X[32] = { 0 };
 	uint32_t *V = (uint32_t*) scratchpad;
-	int N = (1<<(opt_nfactor+1)); // default 9 = 1024
+	uint32_t N = (1<<(opt_nfactor+1)); // default 9 = 1024

 	for (int k = 0; k < 32; k++)
 		X[k] = input[k];
@ -1028,32 +1029,18 @@ void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad)
 		reference[k] = X[k];
 }

-static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
-	uint32_t *midstate, unsigned char *scratchpad, int N)
-{
-	uint32_t tstate[8], ostate[8];
-	uint32_t X[32] = { 0 };
-	uint32_t *V = (uint32_t *) scratchpad;
-
-	memcpy(tstate, midstate, 32);
-	HMAC_SHA256_80_init(input, tstate, ostate);
-	PBKDF2_SHA256_80_128(tstate, ostate, input, X);
-
-	scrypt_core(X, V, N);
-
-	PBKDF2_SHA256_128_32(tstate, ostate, X, output);
-}
-
 /* cputest */
 void scrypthash(void* output, const void* input)
 {
 	uint32_t _ALIGN(64) X[32], ref[32] = { 0 }, tstate[8], ostate[8], midstate[8];
 	uint32_t _ALIGN(64) data[20];
-	uchar *scratchbuf = (uchar *) calloc(4 * 128 + 63, 1024);
+	uchar *scratchbuf;

 	// no default set with --cputest
 	if (opt_nfactor == 0) opt_nfactor = 9;

+	scratchbuf = (uchar*) calloc(4 * 128 + 63, 1UL << (opt_nfactor+1));
+
 	memcpy(data, input, 80);

 	sha256_init(midstate);
@ -1072,26 +1059,3 @@ void scrypthash(void* output, const void* input)

 	free(scratchbuf);
 }
-
-#define SCRYPT_MAX_WAYS 1
-/* cputest */
-void scrypthash2(void* output, const void* input)
-{
-	uint32_t midstate[8] = { 0 };
-	uint32_t data[SCRYPT_MAX_WAYS * 20] = { 0 };
-	uint32_t hash[SCRYPT_MAX_WAYS * 8] = { 0 };
-	uint32_t N = 1U << ((opt_nfactor ? opt_nfactor : 9) + 1); // default 1024
-
-	uchar* scratch = (uchar*) calloc(4 * 128 + 63, N); // scrypt_buffer_alloc(N);
-
-	memcpy(data, input, 80);
-
-	sha256_init(midstate);
-	sha256_transform(midstate, data, 0);
-
-	scrypt_1024_1_1_256(data, hash, midstate, scratch, N);
-
-	memcpy(output, hash, 32);
-
-	free(scratch);
-}
--- a/scrypt/code/scrypt-jane-chacha.h
+++ b/scrypt/code/scrypt-jane-chacha.h
@ -5,6 +5,8 @@ typedef uint32_t scrypt_mix_word_t;
 #define SCRYPT_WORDTO8_LE U32TO8_LE
 #define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP

+#define SCRYPT_P 1
+#define SCRYPT_R 1
 #define SCRYPT_BLOCK_BYTES 64
 #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))

--- a/scrypt/code/scrypt-jane-romix.h
+++ b/scrypt/code/scrypt-jane-romix.h
@ -1 +0,0 @@
-#include "scrypt-jane-chacha.h"
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@ -61,7 +61,7 @@
 }

 // some globals containing pointers to device memory (for chunked allocation)
-// [MAX_DEVICES] indexes up to MAX_DEVICES threads (0...MAX_DEVICES-1)
+// [MAX_GPUS] indexes up to MAX_GPUS threads (0...MAX_GPUS-1)
 int       MAXWARPS[MAX_GPUS];
 uint32_t* h_V[MAX_GPUS][TOTAL_WARP_LIMIT*64];          // NOTE: the *64 prevents buffer overflow for --keccak
 uint32_t  h_V_extra[MAX_GPUS][TOTAL_WARP_LIMIT*64];    //       with really large kernel launch configurations
@ -69,7 +69,7 @@ uint32_t  h_V_extra[MAX_GPUS][TOTAL_WARP_LIMIT*64];    //       with really larg
 KernelInterface *Best_Kernel_Heuristics(cudaDeviceProp *props)
 {
 	KernelInterface *kernel = NULL;
-	uint32_t N = (1UL << opt_nfactor+1); // not sure
+	uint64_t N = 1UL << (opt_nfactor+1);

 	if (IS_SCRYPT() || (IS_SCRYPT_JANE() && N <= 8192))
 	{
@ -83,7 +83,7 @@ KernelInterface *Best_Kernel_Heuristics(cudaDeviceProp *props)
 	}
 	else
 	{
-	   // low register count kernels (high N-factor scrypt-jane)
+	   // high N-factor scrypt-jane = low registers count kernels
 	   if (props->major > 3 || (props->major == 3 && props->minor >= 5))
 			kernel = new TitanKernel();
 		else if (props->major == 3 && props->minor == 0)
@ -161,7 +161,7 @@ int cuda_throughput(int thr_id)
 #else
 		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield));
 		checkCudaErrors(cudaSetDevice(device_map[thr_id]));
-		checkCudaErrors(cudaFree(0));
+		// checkCudaErrors(cudaFree(0));
 #endif

 		KernelInterface *kernel;
@ -599,8 +599,9 @@ int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurre
 							}
 						}
 					}
-skip2:              ;
+skip2:
 					if (opt_debug) {
+
 						if (GRID_BLOCKS == MINB) {
 							char line[512] = "    ";
 							for (int i=1; i<=kernel->max_warps_per_block(); ++i) {
@ -811,17 +812,20 @@ void cuda_scrypt_core(int thr_id, int stream, unsigned int N)
 	unsigned int LOOKUP_GAP = device_lookup_gap[thr_id];

 	// setup execution parameters
-	dim3  grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
-	dim3  threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
+	dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
+	dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);

-	context_kernel[thr_id]->run_kernel(grid, threads, WARPS_PER_BLOCK, thr_id, context_streams[stream][thr_id], context_idata[stream][thr_id], context_odata[stream][thr_id], N, LOOKUP_GAP, device_interactive[thr_id], opt_benchmark, device_texturecache[thr_id]);
+	context_kernel[thr_id]->run_kernel(grid, threads, WARPS_PER_BLOCK, thr_id,
+		context_streams[stream][thr_id], context_idata[stream][thr_id], context_odata[stream][thr_id],
+		N, LOOKUP_GAP, device_interactive[thr_id], opt_benchmark, device_texturecache[thr_id]
+	);
 }

 bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
 {
 	return context_kernel[thr_id]->prepare_keccak256(thr_id, host_pdata, ptarget);
 }
-
+#if 0
 void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
 {
 	unsigned int GRID_BLOCKS = context_blocks[thr_id];
@ -834,12 +838,13 @@ void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, i

 	context_kernel[thr_id]->do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
 }
-
+#endif
 bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
 {
 	return context_kernel[thr_id]->prepare_blake256(thr_id, host_pdata, ptarget);
 }

+#if 0
 void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
 {
 	unsigned int GRID_BLOCKS = context_blocks[thr_id];
@ -852,6 +857,7 @@ void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, in

 	context_kernel[thr_id]->do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
 }
+#endif

 void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA)
 {
@ -859,7 +865,6 @@ void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA)
 	unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
 	unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
 	unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * (postSHA ? 8 : 32);
-
 	// copy result from device to host (asynchronously)
 	checkCudaErrors(cudaMemcpyAsync(X, postSHA ? context_hash[stream][thr_id] : context_odata[stream][thr_id], mem_size, cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 }
--- a/scrypt/salsa_kernel.h
+++ b/scrypt/salsa_kernel.h
@ -40,8 +40,8 @@ static int scrypt_algo = -1;
 static __inline int get_scrypt_type() {
 	if (scrypt_algo != -1) return scrypt_algo;
 	get_currentalgo(algo, 64);
-	if (!strcasecmp(algo,"scrypt-jane")) scrypt_algo = A_SCRYPT_JANE;
-	else if (!strcasecmp(algo,"scrypt")) scrypt_algo = A_SCRYPT;
+	if (!strncasecmp(algo,"scrypt-jane",11)) scrypt_algo = A_SCRYPT_JANE;
+	else if (!strncasecmp(algo,"scrypt",6)) scrypt_algo = A_SCRYPT;
 	return scrypt_algo;
 }
 static __inline bool IS_SCRYPT() { get_scrypt_type(); return (scrypt_algo == A_SCRYPT); }
@ -66,8 +66,6 @@ extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t n
 extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);

-extern void computeGold(uint32_t *idata, uint32_t *reference, uchar *scratchpad);
-
 extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);

--- a/util.cpp
+++ b/util.cpp
@ -1703,6 +1703,10 @@ void do_gpu_tests(void)
 	//memcpy(buf, zrtest, 80);
 	//scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);

+	struct timeval tv;
+	memset(buf, 0, sizeof buf);
+	scanhash_scrypt_jane(0, (uint32_t*)buf, tgt, NULL, 1, &done, &tv, &tv);
+
 	memset(buf, 0, sizeof buf);
 	scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);

@ -1791,6 +1795,9 @@ void print_hash_tests(void)
 	scrypthash(&hash[0], &buf[0]);
 	printpfx("scrypt", hash);

+	scryptjane_hash(&hash[0], &buf[0]);
+	printpfx("scrypt-jane", hash);
+
 	skeincoinhash(&hash[0], &buf[0]);
 	printpfx("skein", hash);