diff --git a/Makefile.am b/Makefile.am
index f2af2d1..79f521d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -89,13 +89,6 @@ qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
-x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
-	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
-
-# Shavite compiles faster with 128 regs
-x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $<
-
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu
index 1d1ae07..51f407c 100644
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@@ -311,7 +311,7 @@ void aes_gpu_init(uint32_t *sharedMemory)
 }
 
 /* tried with 3 xor.b32 asm, not faster */
-#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d);
+#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d));
 
 __device__ static
 void aes_round(
diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index 29505da..5fc2a0f 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -28,7 +28,11 @@ __device__ __forceinline__ void AES_2ROUND(
 	k0++;
 }
 
-__constant__ uint32_t P[48] = {
+__device__ __forceinline__
+void cuda_echo_round(
+	const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
+{
+	const uint32_t P[48] = {
 	0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	//8-12
@@ -45,14 +49,11 @@ __constant__ uint32_t P[48] = {
 	0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
 	//58-61
-};
-
-__device__ __forceinline__
-void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
-{
+	};
 	uint32_t k0;
 	uint32_t h[16];
-	#pragma unroll
+
+	#pragma unroll 16
 	for (int i = 0; i < 16; i++)
 	{
 		h[i] = hash[i];
@@ -170,13 +171,21 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 	{
 		// Big Sub Words
-		#pragma unroll 16
-		for (int i = 0; i < 16; i++)
+		#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16)
 		{
-			int idx = i << 2; // *4
 			AES_2ROUND(sharedMemory,
 				W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3],
 				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 4], W[idx + 5], W[idx + 6], W[idx + 7],
+				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 8], W[idx + 9], W[idx + 10], W[idx + 11],
+				k0);
+			AES_2ROUND(sharedMemory,
+				W[idx + 12], W[idx + 13], W[idx + 14], W[idx + 15],
+				k0);
 		}
 
 		// Shift Rows
@@ -241,8 +250,8 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		}
 	}
 
-	#pragma unroll 8
-	for (int i = 0; i<32; i += 4)
+	#pragma unroll
+	for (int i = 0; i<16; i += 4)
 	{
 		W[i] ^= W[32 + i] ^ 512;
 		W[i + 1] ^= W[32 + i + 1];
@@ -255,12 +264,29 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		hash[i] ^= W[i];
 }
 
-__global__ /* __launch_bounds__(320, 3) will force 64 registers on the 750 Ti */
+__device__ __forceinline__
+void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
+__global__ __launch_bounds__(128, 7) /* will force 72 registers */
 void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	echo_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -283,7 +309,7 @@ void x11_echo512_cpu_init(int thr_id, int threads)
 __host__
 void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu
index f87562a..dfe6a1e 100644
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@@ -1,9 +1,8 @@
 #include "cuda_helper.h"
 
-#define TPB 256
+#include
 
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+#define TPB 128
 
 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
@@ -1294,12 +1293,30 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u
 	state[0xF] ^= p7;
 }
 
+__device__ __forceinline__
+void shavite_gpu_init(uint32_t *sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
 // GPU Hash
-__global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(TPB, 8) /* 64 registers if TPB 128 (fast), 80 with 92 (medium), 32 if 256 (slow) */
+void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -1344,11 +1361,12 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 	}
 }
 
-__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+__global__ __launch_bounds__(TPB, 8)
+void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -1397,9 +1415,9 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	size_t shared_size = 0;
+	cudaFuncSetCacheConfig(x11_shavite512_gpu_hash_64, cudaFuncCachePreferL1);
 
-	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 }
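
Note on the shared-memory AES tables (illustrative sketch, not part of the patch):
echo_gpu_init() and shavite_gpu_init() above have 128 threads of the block
cooperatively copy the four 256-entry AES tables (d_AES0..d_AES3) into a
1024-word shared array, each thread writing two entries per table. A minimal
standalone version of the same pattern is shown below; t0..t3 are placeholder
table names, and the __syncthreads() barrier is added here for clarity (the
patch's init helpers do not contain one).

#include <cuda_runtime.h>
#include <stdint.h>

__device__ uint32_t t0[256], t1[256], t2[256], t3[256]; /* stand-ins for d_AES0..d_AES3 */

__global__ void table_fill_demo(uint32_t *out)
{
	__shared__ uint32_t sharedMemory[1024];

	if (threadIdx.x < 128) {
		/* first 128 entries of each table */
		sharedMemory[threadIdx.x +   0] = t0[threadIdx.x];
		sharedMemory[threadIdx.x + 256] = t1[threadIdx.x];
		sharedMemory[threadIdx.x + 512] = t2[threadIdx.x];
		sharedMemory[threadIdx.x + 768] = t3[threadIdx.x];
		/* remaining 128 entries (the "64 * 2" offset used in the patch) */
		sharedMemory[threadIdx.x + 128 +   0] = t0[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 256] = t1[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 512] = t2[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 768] = t3[threadIdx.x + 128];
	}
	__syncthreads(); /* make the filled tables visible to every warp in the block */

	/* trivial use of the table so the copy is not optimized away */
	out[blockDim.x * blockIdx.x + threadIdx.x] = sharedMemory[threadIdx.x & 1023];
}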
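
Note on the launch configuration (sketch under stated assumptions):
__launch_bounds__(128, 7) tells the compiler to keep register use low enough
for 7 resident blocks of 128 threads, i.e. roughly 65536 / (128 * 7) ≈ 73
registers per thread on a 64K-register SM, rounded down by the allocation
granularity, which is presumably where the "will force 72 registers" comment
comes from. The host side pairs the 128-thread blocks with a 1D grid covering
the full work-item count and an L1-preferred cache split. A simplified sketch
of that pairing follows; my_kernel and launch_my_kernel are placeholder names.

#include <cuda_runtime.h>
#include <stdint.h>

__global__ __launch_bounds__(128, 7) /* cap registers so 7 blocks can be resident per SM */
void my_kernel(int threads, uint32_t startNounce, uint64_t *g_hash)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads) {
		/* per-thread hashing work would go here */
	}
}

void launch_my_kernel(int threads, uint32_t startNounce, uint64_t *d_hash)
{
	const int threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	/* prefer L1 cache over shared memory, as the patch does for
	   x11_shavite512_gpu_hash_64 */
	cudaFuncSetCacheConfig(my_kernel, cudaFuncCachePreferL1);

	my_kernel<<<grid, block>>>(threads, startNounce, d_hash);
}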