phi2: handle stratum sc hashes

2025-03-13 05:51:19 +00:00 · 2018-06-18 08:31:55 +02:00 · 2018-06-18 08:31:55 +02:00 · 9fd5b04af6
commit 9fd5b04af6
parent 07859f93ce
10 changed files with 386 additions and 21 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -81,7 +81,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
 			  x16/cuda_x16_echo512_64.cu \
 			  x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
-			  phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \
+			  phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \
 			  x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu

 # scrypt
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -103,6 +103,7 @@ bool submit_old = false;
 bool use_syslog = false;
 bool use_colors = true;
 int use_pok = 0;
+int use_roots = 0;
 static bool opt_background = false;
 bool opt_quiet = false;
 int opt_maxlograte = 3;
@ -698,6 +699,10 @@ static bool work_decode(const json_t *val, struct work *work)
 		data_size = 192;
 		adata_sz = 180/4;
 		break;
+	case ALGO_PHI2:
+		data_size = 144;
+		adata_sz = data_size / 4;
+		break;
 	case ALGO_NEOSCRYPT:
 	case ALGO_ZR5:
 		data_size = 80;
@ -743,6 +748,12 @@ static bool work_decode(const json_t *val, struct work *work)
 	for (i = 0; i < atarget_sz; i++)
 		work->target[i] = le32dec(work->target + i);

+	if (opt_algo == ALGO_PHI2) {
+		for (i = 20; i < 36; i++) if (work->data[i]) {
+			use_roots = 1; break;
+		}
+	}
+
 	if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo)
 		calc_network_diff(work);

@ -1066,6 +1077,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		else if (opt_algo == ALGO_DECRED) {
 			data_size = 192; adata_sz = 180/4;
 		}
+		else if (opt_algo == ALGO_PHI2 && use_roots) {
+			data_size = 144; adata_sz = 36;
+		}
 		else if (opt_algo == ALGO_SIA) {
 			return sia_submit(curl, pool, work);
 		}
@ -1629,10 +1643,17 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		for (i = 0; i < 8; i++)
 			work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
 		for (i = 0; i < 8; i++)
-			work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
+			work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i];
 		work->data[25] = le32dec(sctx->job.ntime);
 		work->data[26] = le32dec(sctx->job.nbits);
 		work->data[28] = 0x80000000;
+	} else if (opt_algo == ALGO_PHI2) {
+		for (i = 0; i < 8; i++)
+			work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
+		work->data[17] = le32dec(sctx->job.ntime);
+		work->data[18] = le32dec(sctx->job.nbits);
+		for (i = 0; i < 16; i++)
+			work->data[20 + i] = be32dec((uint32_t*)sctx->job.extra + i);
 	} else if (opt_algo == ALGO_SIA) {
 		uint32_t extra = 0;
 		memcpy(&extra, &sctx->job.coinbase[32], 2);
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -541,6 +541,7 @@
    <CudaCompile Include="phi\phi.cu" />
    <CudaCompile Include="phi\phi2.cu" />
    <CudaCompile Include="phi\cuda_phi2.cu" />
+    <CudaCompile Include="phi\cuda_phi2_cubehash512.cu" />
    <CudaCompile Include="skunk\skunk.cu" />
    <CudaCompile Include="skunk\cuda_skunk.cu">
      <CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -796,6 +796,9 @@
    <CudaCompile Include="phi\cuda_phi2.cu">
      <Filter>Source Files\CUDA\phi</Filter>
    </CudaCompile>
+    <CudaCompile Include="phi\cuda_phi2_cubehash512.cu">
+      <Filter>Source Files\CUDA\phi</Filter>
+    </CudaCompile>
    <CudaCompile Include="skunk\skunk.cu">
      <Filter>Source Files\CUDA\skunk</Filter>
    </CudaCompile>
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.2.6], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2.7], [], [ccminer], [http://github.com/tpruvot/ccminer])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/equi/equi-stratum.cpp
+++ b/equi/equi-stratum.cpp
@ -101,7 +101,7 @@ bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params)
 		target_be[31-i] = target_bin[i];
 		if (target_bin[i]) filled++;
 	}
-	memcpy(sctx->job.claim, target_be, 32); // hack, unused struct field
+	memcpy(sctx->job.extra, target_be, 32);

 	pthread_mutex_lock(&stratum_work_lock);
 	sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be);
--- a/miner.h
+++ b/miner.h
@ -669,7 +669,7 @@ struct stratum_job {
 	unsigned char version[4];
 	unsigned char nbits[4];
 	unsigned char ntime[4];
-	unsigned char claim[32]; // lbry
+	unsigned char extra[64]; // like lbry claimtrie
 	bool clean;
 	unsigned char nreward[2];
 	uint32_t height;
--- a/phi/cuda_phi2_cubehash512.cu
+++ b/phi/cuda_phi2_cubehash512.cu
@ -0,0 +1,319 @@
+/* phi2 cubehash-512 for a 144-byte input: the 80-byte header followed by 64 bytes of state/utxo roots */
+
+#include <cuda_helper.h>
+#include <cuda_vectors.h>
+
+#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b: rounds performed by one rrounds() call */
+#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b (informational: block_tox hard-codes the 32-byte width) */
+
+#if __CUDA_ARCH__ < 350 // also taken when __CUDA_ARCH__ is undefined (an undefined name evaluates as 0 in #if)
+#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) // shift/or rotate-left; x and bits are not parenthesized, pass simple expressions with 0 < bits < 32
+#else
+#define LROT(x, bits) __funnelshift_l(x, x, bits) // rotate-left through the funnel-shift intrinsic
+#endif
+
+#define ROTATEUPWARDS7(a)  LROT(a,7)
+#define ROTATEUPWARDS11(a) LROT(a,11)
+
+#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } // expands to a braced block (call sites omit the ';'); declares a temporary named u
+
+#ifdef NO_MIDSTATE // variant in which the kernel starts from the IV instead of loading c_midstate128
+
+__device__ __constant__
+static const uint32_t c_IV_512[32] = { // 32-word initial state copied into x by the Init() macro
+	0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
+	0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
+	0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
+	0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
+	0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
+	0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
+	0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
+	0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
+};
+
+#endif
+
+__device__ __forceinline__
+static void rrounds(uint32_t x[2][2][2][2][2]) // applies CUBEHASH_ROUNDS rounds in place to the 32-word state x
+{
+    int r;
+    int j;
+    int k;
+    int l;
+    int m;
+
+//#pragma unroll 16
+    for (r = 0;r < CUBEHASH_ROUNDS;++r) { // one iteration = one round made of the ten steps below, in this order
+
+        /* "add x_0jklm into x_1jklm modulo 2^32" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (l = 0;l < 2;++l)
+#pragma unroll 2
+                    for (m = 0;m < 2;++m)
+                        x[1][j][k][l][m] += x[0][j][k][l][m];
+
+        /* "rotate x_0jklm upwards by 7 bits" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (l = 0;l < 2;++l)
+#pragma unroll 2
+                    for (m = 0;m < 2;++m)
+                        x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
+
+        /* "swap x_00klm with x_01klm" */
+#pragma unroll 2
+        for (k = 0;k < 2;++k)
+#pragma unroll 2
+            for (l = 0;l < 2;++l)
+#pragma unroll 2
+                for (m = 0;m < 2;++m)
+                    SWAP(x[0][0][k][l][m],x[0][1][k][l][m])
+
+        /* "xor x_1jklm into x_0jklm" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (l = 0;l < 2;++l)
+#pragma unroll 2
+                    for (m = 0;m < 2;++m)
+                        x[0][j][k][l][m] ^= x[1][j][k][l][m];
+
+        /* "swap x_1jk0m with x_1jk1m" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (m = 0;m < 2;++m)
+                    SWAP(x[1][j][k][0][m],x[1][j][k][1][m])
+
+        /* "add x_0jklm into x_1jklm modulo 2^32" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (l = 0;l < 2;++l)
+#pragma unroll 2
+                    for (m = 0;m < 2;++m)
+                        x[1][j][k][l][m] += x[0][j][k][l][m];
+
+        /* "rotate x_0jklm upwards by 11 bits" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (l = 0;l < 2;++l)
+#pragma unroll 2
+                    for (m = 0;m < 2;++m)
+                        x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
+
+        /* "swap x_0j0lm with x_0j1lm" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (l = 0;l < 2;++l)
+#pragma unroll 2
+                for (m = 0;m < 2;++m)
+                    SWAP(x[0][j][0][l][m],x[0][j][1][l][m])
+
+        /* "xor x_1jklm into x_0jklm" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (l = 0;l < 2;++l)
+#pragma unroll 2
+                    for (m = 0;m < 2;++m)
+                        x[0][j][k][l][m] ^= x[1][j][k][l][m];
+
+        /* "swap x_1jkl0 with x_1jkl1" */
+#pragma unroll 2
+        for (j = 0;j < 2;++j)
+#pragma unroll 2
+            for (k = 0;k < 2;++k)
+#pragma unroll 2
+                for (l = 0;l < 2;++l)
+                    SWAP(x[1][j][k][l][0],x[1][j][k][l][1])
+
+    }
+}
+
+__device__ __forceinline__
+static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2])
+{
+	// xor 32 bytes (block[0..7]) into state words x[0][0][*][*][*], two words per access; callers in this file pass a per-thread message[] array
+	AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]);
+	AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]);
+	AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]);
+	AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]);
+}
+
+__device__ __forceinline__
+static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2])
+{
+	// copies the 16 words x[0][*][*][*][*] to hash[0..15], two words per store; the kernel in this file passes a pointer into its g_outhash buffer
+	AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]);
+	AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]);
+	AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]);
+	AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]);
+	AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]);
+	AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]);
+	AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]);
+	AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]);
+}
+
+#define Init(x) /* loads all 32 words of c_IV_512 into x; c_IV_512 only exists when NO_MIDSTATE is defined */ \
+	AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \
+	AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \
+	AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \
+	AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \
+	AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \
+	AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \
+	AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \
+	AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \
+	AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \
+	AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \
+	AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \
+	AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \
+	AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \
+	AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \
+	AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \
+	AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]);
+
+__device__ __forceinline__
+static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data) // absorbs one 32-byte block (data[0..7]) into the state x
+{
+	/* "xor the block into the first b bytes of the state" */
+	block_tox(data, x);
+	/* "and then transform the state invertibly through r identical rounds" */
+	rrounds(x);
+}
+
+__device__ __forceinline__
+static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) // finalizes the state x and writes 16 words to hashval
+{
+	/* "the integer 1 is xored into the last state word x_11111" */
+	x[1][1][1][1][1] ^= 1;
+
+	/* "the state is then transformed invertibly through 10r identical rounds" */
+	#pragma unroll 10
+	for (int i = 0; i < 10; i++) rrounds(x); // each rrounds() call is r = CUBEHASH_ROUNDS rounds, so 10 calls give 10r
+
+	/* "output the first h/8 bytes of the state" */
+	hash_fromx(hashval, x);
+}
+
+__host__ void phi2_cubehash512_cpu_init(int thr_id, uint32_t threads) { } // intentionally empty: both parameters are ignored and nothing is set up
+
+/***************************************************/
+
+/**
+ * Timetravel and x16 CUBEHASH-80 CUDA implementation, used here with a 144-byte input
+ *  by tpruvot@github - Jan 2017 / May 2018
+ */
+
+__constant__ static uint32_t c_midstate128[32]; // 128 bytes copied from the host cubehash context after its first 64 input bytes (see cubehash512_setBlock_144)
+__constant__ static uint32_t c_PaddedMessage_144[36]; // the 36-word (144-byte) input, uploaded as-is by cubehash512_setBlock_144
+
+#undef SPH_C32
+#undef SPH_C64
+#undef SPH_T32
+#undef SPH_T64
+#include "sph/sph_cubehash.h"
+
+__host__
+void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata) // uploads the 144-byte input (and a midstate) to constant memory; thr_id is unused, endiandata must hold at least 36 words
+{
+	sph_cubehash512_context ctx_cubehash;
+	sph_cubehash512_init(&ctx_cubehash);
+	sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); // only the first 64 bytes are absorbed on the CPU; the kernel overwrites word 19 (the nonce), which lies after them
+#ifndef NO_MIDSTATE
+	cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); // presumably state holds the full 32-word state with nothing buffered at this point - TODO confirm against sph_cubehash; NOTE(review): return value not checked
+#endif
+	cudaMemcpyToSymbol(c_PaddedMessage_144, endiandata, sizeof(c_PaddedMessage_144), 0, cudaMemcpyHostToDevice); // copies 36 words; NOTE(review): return value not checked
+}
+
+__global__
+void cubehash512_gpu_hash_144(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) // one thread per nonce: hashes the 144-byte input and writes a 64-byte result
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); // flat index on a 1D launch
+	if (thread < threads) // threads past the requested count do nothing
+	{
+		const uint32_t nonce = startNounce + thread;
+		uint32_t message[8];
+		uint32_t x[2][2][2][2][2];
+#ifdef NO_MIDSTATE
+		Init(x);
+
+		// first 32 bytes
+		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[0]);
+		AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[4]);
+		Update32(x, message);
+
+		// second 32 bytes
+		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[8]);
+		AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[12]);
+		Update32(x, message);
+#else
+		AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); // start from the state uploaded by cubehash512_setBlock_144 (first 64 bytes already absorbed)
+		AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]);
+		AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]);
+		AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]);
+		AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]);
+		AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]);
+		AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]);
+		AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]);
+
+		AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]);
+		AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]);
+		AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]);
+		AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]);
+		AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]);
+		AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]);
+		AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]);
+		AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]);
+#endif
+		// third 32 bytes (input words 16..23): nonce + state root
+		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[16]);
+		message[3] = cuda_swab32(nonce); // input word 19 is replaced by this thread's byte-swapped nonce
+		AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[20]); // state
+		Update32(x, message);
+
+		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[24]); // state
+		AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[28]); // utxo
+		Update32(x, message);
+
+		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[32]); // utxo
+		message[4] = 0x80; // padding word placed right after the last four input words; the rest of the block is zero
+		message[5] = 0;
+		message[6] = 0;
+		message[7] = 0;
+		Update32(x, message);
+
+		uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]); // 8 uint64_t = 64 bytes of output per thread
+		Final(x, output);
+	}
+}
+
+__host__
+void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) // launches the 144-byte cubehash kernel for `threads` nonces; thr_id is unused
+{
+	const uint32_t threadsperblock = 256;
+	dim3 grid((threads + threadsperblock-1)/threadsperblock); // ceil-div so a partial last block is still launched
+	dim3 block(threadsperblock);
+
+	cubehash512_gpu_hash_144 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash); // no stream argument given; NOTE(review): no cudaGetLastError() check after the launch
+}
+
--- a/phi/phi2.cu
+++ b/phi/phi2.cu
@ -1,5 +1,5 @@
 //
-//  PHI2 algo
+//  PHI2 algo (with smart contracts header)
 //  CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein
 //
 //  Implemented by tpruvot in May 2018
@ -24,6 +24,9 @@ extern "C" {
 extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
 extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);

+extern void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata);
+extern void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
+
 extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
 extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti);

@ -41,11 +44,13 @@ static uint64_t* d_hash_256[MAX_GPUS];
 static uint32_t* d_hash_br2[MAX_GPUS];
 static uint32_t* d_nonce_br[MAX_GPUS];

+static bool has_roots;
+
 extern "C" void phi2_hash(void *output, const void *input)
 {
-	unsigned char _ALIGN(128) hash[128] = { 0 };
-	unsigned char _ALIGN(128) hashA[64] = { 0 };
-	unsigned char _ALIGN(128) hashB[64] = { 0 };
+	unsigned char _ALIGN(128) hash[64];
+	unsigned char _ALIGN(128) hashA[64];
+	unsigned char _ALIGN(128) hashB[64];

 	sph_cubehash512_context ctx_cubehash;
 	sph_jh512_context ctx_jh;
@ -54,7 +59,7 @@ extern "C" void phi2_hash(void *output, const void *input)
 	sph_skein512_context ctx_skein;

 	sph_cubehash512_init(&ctx_cubehash);
-	sph_cubehash512(&ctx_cubehash, input, 80);
+	sph_cubehash512(&ctx_cubehash, input, has_roots ? 144 : 80);
 	sph_cubehash512_close(&ctx_cubehash, (void*)hashB);

 	LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8);
@ -137,7 +142,6 @@ extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce,
 			CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1);
 		}

-		x11_cubehash512_cpu_init(thr_id, throughput);
 		lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
@ -147,17 +151,26 @@ extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce,
 		init[thr_id] = true;
 	}

-	uint32_t endiandata[20];
-	for (int k = 0; k < 20; k++)
+	has_roots = false;
+	uint32_t endiandata[36];
+	for (int k = 0; k < 36; k++) {
 		be32enc(&endiandata[k], pdata[k]);
+		if (k >= 20 && pdata[k]) has_roots = true;
+	}

 	cuda_check_cpu_setTarget(ptarget);
-	cubehash512_setBlock_80(thr_id, endiandata);
+	if (has_roots)
+		cubehash512_setBlock_144(thr_id, endiandata);
+	else
+		cubehash512_setBlock_80(thr_id, endiandata);

 	do {
 		int order = 0;
-
-		cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++;
+		if (has_roots)
+			cubehash512_cuda_hash_144(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
+		else
+			cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
+		order++;
 		TRACE("cube   ");

 		lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti);
--- a/util.cpp
+++ b/util.cpp
@ -1442,7 +1442,7 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
 static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
 	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
-	const char *claim = NULL, *nreward = NULL;
+	const char *extradata = NULL, *nreward = NULL;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
 	int merkle_count, i, p=0;
@ -1452,7 +1452,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	int ntime;
 	char algo[64] = { 0 };
 	get_currentalgo(algo, sizeof(algo));
-	bool has_claim = !strcasecmp(algo, "lbry");
+	bool has_claim = !strcmp(algo, "lbry");
+	bool has_roots = !strcmp(algo, "phi2") && json_array_size(params) == 10;

 	if (sctx->is_equihash) {
 		return equi_stratum_notify(sctx, params);
@ -1461,11 +1462,17 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	job_id = json_string_value(json_array_get(params, p++));
 	prevhash = json_string_value(json_array_get(params, p++));
 	if (has_claim) {
-		claim = json_string_value(json_array_get(params, p++));
-		if (!claim || strlen(claim) != 64) {
+		extradata = json_string_value(json_array_get(params, p++));
+		if (!extradata || strlen(extradata) != 64) {
 			applog(LOG_ERR, "Stratum notify: invalid claim parameter");
 			goto out;
 		}
+	} else if (has_roots) {
+		extradata = json_string_value(json_array_get(params, p++));
+		if (!extradata || strlen(extradata) != 128) {
+			applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter");
+			goto out;
+		}
 	}
 	coinb1 = json_string_value(json_array_get(params, p++));
 	coinb2 = json_string_value(json_array_get(params, p++));
@ -1529,7 +1536,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	free(sctx->job.job_id);
 	sctx->job.job_id = strdup(job_id);
 	hex2bin(sctx->job.prevhash, prevhash, 32);
-	if (has_claim) hex2bin(sctx->job.claim, claim, 32);
+	if (has_claim) hex2bin(sctx->job.extra, extradata, 32);
+	if (has_roots) hex2bin(sctx->job.extra, extradata, 64);

 	sctx->job.height = getblocheight(sctx);