diff --git a/Makefile.am b/Makefile.am
index 635a530..cb4ba62 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -46,15 +46,16 @@ ccminer_LDFLAGS		= $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
 ccminer_LDADD		= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
 ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
 
-nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . -Xptxas "-v" --ptxas-options=-v
+nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . --ptxas-options=-v --use_fast_math
 nvcc_FLAGS += $(JANSSON_INCLUDES)
 
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
 	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
 
-# Luffa is faster with 80 registers than 128
+# Luffa and Echo are faster with 80 registers than 128
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
+x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
 	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
 
 # Shavite compiles faster with 128 regs
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 689b017..a0edd97 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -479,6 +479,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
     </CudaCompile>
     <CudaCompile Include="x11\cuda_x11_echo.cu">
+      <MaxRegCount>80</MaxRegCount>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
@@ -577,4 +578,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
   <ImportGroup Label="ExtensionTargets">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/config.sh b/config.sh
index 04ef6f9..9955155 100755
--- a/config.sh
+++ b/config.sh
@@ -3,7 +3,7 @@
 # Simple script to create the Makefile
 # then type 'make'
 
-# export PATH="$PATH:/usr/local/cuda-6.5/bin/"
+# export PATH="$PATH:/usr/local/cuda/bin/"
 
 make clean || echo clean
 
@@ -11,4 +11,4 @@ rm -f Makefile.in
 rm -f config.status
 ./autogen.sh || echo done
 
-CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure
+CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2" ./configure
diff --git a/cpu-miner.c b/cpu-miner.c
index 4dcb71b..8295d1a 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -153,7 +153,7 @@ static const char *algo_names[] = {
 	"anime",
 	"fresh",
 	"nist5",
-	"whirlcoin",
+	"whirl",
 	"x11",
 	"x13",
 	"x14",
@@ -231,7 +231,7 @@ Options:\n\
                         anime     Animecoin hash\n\
                         fresh     Freshcoin hash (shavite 80)\n\
                         nist5     NIST5 (TalkCoin) hash\n\
-                        whirlcoin Whirlcoin hash\n\
+                        whirl     Whirlcoin (old whirlpool)\n\
                         x11       X11 (DarkCoin) hash\n\
                         x13       X13 (MaruCoin) hash\n\
                         x14       X14 hash\n\
@@ -1526,17 +1526,17 @@ int main(int argc, char *argv[])
 	int i;
 
 	printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n");
-	printf("\t This is version "PROGRAM_VERSION" (tpruvot@github)\n");
+	printf("\t This is the forked version "PROGRAM_VERSION" (tpruvot@github)\n");
 #ifdef WIN32
-	printf("\t  Built with VC++ 2013 and nVidia CUDA SDK 6.5 RC (DC 5.0)\n\n");
+	printf("\t  Built with VC++ 2013 and nVidia CUDA SDK 6.5\n\n");
 #else
-	printf("\t  Built with the nVidia CUDA SDK 6.5 RC\n\n");
+	printf("\t  Built with the nVidia CUDA SDK 6.5\n\n");
 #endif
 	printf("\t  based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n");
-	printf("\t  based on pooler-cpuminer extension for HVC from http://hvc.1gh.com/" "\n\n");
+	printf("\t    and HVC extension from http://hvc.1gh.com/" "\n\n");
 	printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n");
 	printf("\t  BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n");
-	printf("\tCuda X14 and X15 added by Tanguy Pruvot (also in cpuminer-multi)\n");
+	printf("\tCleaned and optimized by Tanguy Pruvot\n");
 	printf("\t  BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n");
 
 	rpc_user = strdup("");
diff --git a/cuda_helper.h b/cuda_helper.h
index c2c9f3e..8563737 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -38,12 +38,12 @@ extern const uint3 threadIdx;
 #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
 #endif
 
-__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
 {
 #if __CUDA_ARCH__ >= 130
 	return __double_as_longlong(__hiloint2double(HI, LO));
 #else
-	return (unsigned long long)LO | (((unsigned long long)HI) << 32);
+	return (uint64_t)LO | (((uint64_t)HI) << 32);
 #endif
 }
 
@@ -94,11 +94,8 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
 {
 	// Input:       77665544 33221100
 	// Output:      00112233 44556677
-	uint64_t temp[2];
-	temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
-	temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);
-
-	return temp[0] | (temp[1]<<32);
+	uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
+	return (result << 32) | __byte_perm(_HIWORD(x), 0, 0x0123);
 }
 #else
 	/* host */
@@ -132,7 +129,7 @@ __device__ __forceinline__
 uint64_t xor1(uint64_t a, uint64_t b)
 {
 	uint64_t result;
-	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b));
+	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a),"l"(b));
 	return result;
 }
 
@@ -141,10 +138,10 @@ __device__ __forceinline__
 uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
 {
 	uint64_t result;
-	asm("{\n\t"
-		" .reg .u64 t1;\n\t"
-		"xor.b64 t1, %2, %3;\n\t"
-		"xor.b64 %0, %1, t1;\n\t"
+	asm("{"
+		".reg .u64 lt;\n\t"
+		"xor.b64 lt, %2, %3;\n\t"
+		"xor.b64 %0, %1, lt;\n\t"
 		"}"
 	: "=l"(result) : "l"(a) ,"l"(b),"l"(c));
 	return result;
diff --git a/util.c b/util.c
index 81cc85b..9ccbbdf 100644
--- a/util.c
+++ b/util.c
@@ -1366,7 +1366,7 @@ void print_hash_tests(void)
 
 	memset(hash, 0, sizeof hash);
 	wcoinhash(&hash[0], &buf[0]);
-	printf("\nwhirlc:  "); print_hash(hash);
+	printf("\nwhirl:   "); print_hash(hash);
 
 	memset(hash, 0, sizeof hash);
 	x11hash(&hash[0], &buf[0]);
diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu
index 97cd1dd..fca1b05 100644
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@@ -298,10 +298,11 @@ static void aes_cpu_init()
                         0, cudaMemcpyHostToDevice);
 }
 
-static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
+__device__ __forceinline__
+void aes_gpu_init(uint32_t *sharedMemory)
 {
-	if(threadIdx.x < 256)
-	{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 256) {
 		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
 		sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x];
 		sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x];
@@ -309,10 +310,13 @@ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
 	}
 }
 
-static __device__ __forceinline__ void aes_round(
+/* tried with 3 xor.b32 asm, not faster */
+#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d);
+
+__device__
+static void aes_round(
 	const uint32_t *sharedMemory,
-	uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 
-	uint32_t k0, 
+	uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
 	uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
 {
 	uint32_t idx0, idx1, idx2, idx3;
@@ -321,42 +325,47 @@ static __device__ __forceinline__ void aes_round(
 	idx1 = __byte_perm(x1, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x2, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x3, 0, 0x4443) + 768;
-	y0 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3] ^ 
-		k0;
+	y0 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]);
+	y0 ^= k0;
 
 	idx0 = __byte_perm(x1, 0, 0x4440);
 	idx1 = __byte_perm(x2, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x3, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x0, 0, 0x4443) + 768;
 
-	y1 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3]; // ^k3
+	y1 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]);
 
 	idx0 = __byte_perm(x2, 0, 0x4440);
 	idx1 = __byte_perm(x3, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x0, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x1, 0, 0x4443) + 768;
-	y2 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3]; // ^k2
+	y2 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]); // ^k2
 
 	idx0 = __byte_perm(x3, 0, 0x4440);
 	idx1 = __byte_perm(x0, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x1, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x2, 0, 0x4443) + 768;
-	y3 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3]; // ^k3
+	y3 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]); // ^k3
 }
 
-static __device__ __forceinline__ void aes_round(
+__device__
+static void aes_round(
 	const uint32_t *sharedMemory,
 	uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 
 	uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
@@ -367,36 +376,40 @@ static __device__ __forceinline__ void aes_round(
 	idx1 = __byte_perm(x1, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x2, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x3, 0, 0x4443) + 768;
-	y0 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3];
+	y0 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]);
 
 	idx0 = __byte_perm(x1, 0, 0x4440);
 	idx1 = __byte_perm(x2, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x3, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x0, 0, 0x4443) + 768;
 
-	y1 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3]; // ^k3
+	y1 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]);
 
 	idx0 = __byte_perm(x2, 0, 0x4440);
 	idx1 = __byte_perm(x3, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x0, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x1, 0, 0x4443) + 768;
-	y2 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3]; // ^k2
+	y2 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]); // ^k2
 
 	idx0 = __byte_perm(x3, 0, 0x4440);
 	idx1 = __byte_perm(x0, 0, 0x4441) + 256;
 	idx2 = __byte_perm(x1, 0, 0x4442) + 512;
 	idx3 = __byte_perm(x2, 0, 0x4443) + 768;
-	y3 =sharedMemory[idx0] ^ 
-		sharedMemory[idx1] ^ 
-		sharedMemory[idx2] ^ 
-		sharedMemory[idx3]; // ^k3
+	y3 = xor4_32(
+		sharedMemory[idx0],
+		sharedMemory[idx1],
+		sharedMemory[idx2],
+		sharedMemory[idx3]); // ^k3
 }