diff --git a/Makefile.am b/Makefile.am
index 1bc4663..3a3caa2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,7 +17,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  compat/inttypes.h compat/stdbool.h compat/unistd.h \
 			  compat/sys/time.h compat/getopt/getopt.h \
 			  cpu-miner.c util.c crc32.c hefty1.c scrypt.c \
-			  hashlog.cpp \
+			  hashlog.cpp stats.cpp cuda.cu \
 			  heavy/heavy.cu \
 			  heavy/cuda_blake512.cu heavy/cuda_blake512.h \
 			  heavy/cuda_combine.cu heavy/cuda_combine.h \
diff --git a/README.txt b/README.txt
index bea7d7c..caac95d 100644
--- a/README.txt
+++ b/README.txt
@@ -155,6 +155,7 @@ features.
 >>> RELEASE HISTORY <<<
 
   Nov. 11th 2014  v1.4.7
+                  Average hashrate (based on the last 50 scans)
                   Rewrite blake algo
                   Add the -i (gpu threads/intensity parameter)
                   Add some X11 optimisations based on sp_ commits
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 8eb0499..554b88f 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -239,6 +239,7 @@
     <ClCompile Include="fuguecoin.cpp" />
     <ClCompile Include="groestlcoin.cpp" />
     <ClCompile Include="hashlog.cpp" />
+    <ClCompile Include="stats.cpp" />
     <ClCompile Include="hefty1.c" />
     <ClCompile Include="myriadgroestl.cpp" />
     <ClCompile Include="scrypt.c">
@@ -317,6 +318,7 @@
     <ClInclude Include="uint256.h" />
   </ItemGroup>
   <ItemGroup>
+    <CudaCompile Include="cuda.cu" />
     <CudaCompile Include="bitslice_transformations_quad.cu">
       <ExcludedFromBuild>true</ExcludedFromBuild>
     </CudaCompile>
@@ -599,4 +601,4 @@
   <Target Name="AfterClean">
     <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
   </Target>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 334ca64..95543a5 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -189,6 +189,9 @@
     <ClCompile Include="hashlog.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="stats.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="compat.h">
@@ -307,6 +310,9 @@
     </ClInclude>
   </ItemGroup>
   <ItemGroup>
+    <CudaCompile Include="cuda.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
     <CudaCompile Include="cuda_fugue256.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
@@ -473,4 +479,4 @@
       <Filter>Source Files\CUDA\x11</Filter>
     </CudaCompile>
   </ItemGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/cpu-miner.c b/cpu-miner.c
index 8d3f632..e4c6d1a 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -391,11 +391,20 @@ static struct work _ALIGN(64) g_work;
 static time_t g_work_time;
 static pthread_mutex_t g_work_lock;
 
-
+/**
+ * Reset the CUDA device, purge the hashlog and stats caches, then exit with `reason`
+ */
 void proper_exit(int reason)
 {
 	cuda_devicereset();
+
 	hashlog_purge_all();
+	stats_purge_all();
+
+#ifdef WIN32
+	timeEndPeriod(1); // else never executed; presumably pairs with a timeBeginPeriod(1) - TODO confirm
+#endif
+
 	exit(reason);
 }
 
@@ -487,12 +496,14 @@ static void calc_diff(struct work *work, int known)
 static int share_result(int result, const char *reason)
 {
 	char s[345];
-	double hashrate;
+	double hashrate = 0.;
 
-	hashrate = 0.;
 	pthread_mutex_lock(&stats_lock);
-	for (int i = 0; i < opt_n_threads; i++)
-		hashrate += thr_hashrates[i];
+	hashrate = stats_get_speed(-1);
+	if (hashrate == 0.) {
+		for (int i = 0; i < opt_n_threads; i++)
+			hashrate += thr_hashrates[i];
+	}
 	result ? accepted_count++ : rejected_count++;
 	pthread_mutex_unlock(&stats_lock);
 
@@ -1297,6 +1308,7 @@ continue_scan:
 				thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
 				if (rc > 1)
 					thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec);
+				stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id]);
 			}
 			pthread_mutex_unlock(&stats_lock);
 		}
@@ -1307,15 +1319,18 @@ continue_scan:
 				device_map[thr_id], device_name[device_map[thr_id]], s);
 		}
 		if (thr_id == opt_n_threads - 1) {
-			double hashrate = 0.;
-			for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
-				hashrate += thr_hashrates[i];
-
-			global_hashrate = llround(hashrate);
+			double hashrate = stats_get_speed(-1);
+			if (hashrate == 0.) {
+				for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
+					hashrate += thr_hashrates[i];
+			}
 			if (opt_benchmark) {
 				sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.);
 				applog(LOG_NOTICE, "Total: %s kH/s", s);
 			}
+
+			// X-Mining-Hashrate
+			global_hashrate = llround(hashrate);
 		}
 
 		if (rc) {
@@ -1520,6 +1535,7 @@ static void *stratum_thread(void *userdata)
 						stratum.bloc_height);
 				restart_threads();
 				hashlog_purge_old();
+				stats_purge_old();
 			} else if (opt_debug && !opt_quiet) {
 					applog(LOG_BLUE, "%s asks job %d for block %d", short_url,
 						strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
@@ -2096,5 +2112,7 @@ int main(int argc, char *argv[])
 
 	applog(LOG_INFO, "workio thread dead, exiting.");
 
+	proper_exit(0);
+
 	return 0;
 }
diff --git a/cuda.cu b/cuda.cu
new file mode 100644
index 0000000..3758335
--- /dev/null
+++ b/cuda.cu
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <memory.h>
+#include <string.h>
+
+#include <map>
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+// include thrust
+#include <thrust/version.h>
+#include <thrust/remove.h>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include "miner.h"
+
+#include "cuda_helper.h"
+
+extern char *device_name[8]; // filled by cuda_devicenames() with strdup'ed device names
+extern int device_map[8];    // entries are passed as the device ordinal to cudaGetDeviceProperties()
+
+// Number of CUDA devices in the system; exits when the driver is missing or older than CUDA 5.5
+extern "C" int cuda_num_devices()
+{
+	int version;
+	cudaError_t err = cudaDriverGetVersion(&version);
+	if (err != cudaSuccess)
+	{
+		applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?");
+		exit(1);
+	}
+	// version is 1000*major + 10*minor: divide by 10 so min is the real minor (as in deviceQuery)
+	int maj = version / 1000, min = (version % 100) / 10;
+	if (maj < 5 || (maj == 5 && min < 5))
+	{
+		applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5);
+		exit(1);
+	}
+
+	int GPU_N;
+	err = cudaGetDeviceCount(&GPU_N);
+	if (err != cudaSuccess)
+	{
+		applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
+		exit(1);
+	}
+	return GPU_N;
+}
+
+// Fill device_name[i] with a strdup'ed copy of the name of CUDA device device_map[i]
+extern "C" void cuda_devicenames()
+{
+	int GPU_N;
+	cudaError_t err = cudaGetDeviceCount(&GPU_N);
+	if (err != cudaSuccess)
+	{
+		applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
+		exit(1);
+	}
+	// device_name[] and device_map[] are declared with 8 slots: never index past them
+	for (int i=0; i < GPU_N && i < 8; i++)
+	{
+		cudaDeviceProp props;
+		// a failed query leaves props unset: store a placeholder instead of garbage
+		const bool ok = (cudaGetDeviceProperties(&props, device_map[i]) == cudaSuccess);
+		device_name[i] = strdup(ok ? props.name : "unknown");
+	}
+}
+
+// Calls cudaDeviceReset(); C-linkage wrapper because it can't be called directly in cpu-miner.c
+extern "C" void cuda_devicereset()
+{
+	cudaDeviceReset();
+}
+
+// Case-insensitive, space-ignoring search of needle in haystack; "#N" in needle accepts only the N-th hit counted in match
+static bool substringsearch(const char *haystack, const char *needle, int &match)
+{
+	const int hlen = (int) strlen(haystack), nlen = (int) strlen(needle);
+	for (int i=0; i < hlen; ++i)
+	{
+		if (haystack[i] == ' ') continue;
+		int j=0, x = 0;
+		while(j < nlen)
+		{
+			if (haystack[i+x] == ' ') {++x; continue;}
+			if (needle[j] == ' ') {++j; continue;}
+			if (needle[j] == '#') return ++match == needle[j+1]-'0';
+			if (tolower((unsigned char) haystack[i+x]) != tolower((unsigned char) needle[j])) break; // tolower() is undefined for negative char
+			++j; ++x;
+		}
+		if (j == nlen) return true;
+	}
+	return false;
+}
+
+// Find a CUDA device by name (returns the device index, or -1 when no device matches)
+extern "C" int cuda_finddevice(char *name)
+{
+	const int count = cuda_num_devices();
+	int hits = 0; // running hit counter handed to every substringsearch() call
+	for (int dev = 0; dev < count; ++dev)
+	{
+		cudaDeviceProp props;
+		if (cudaGetDeviceProperties(&props, dev) != cudaSuccess) continue;
+		if (substringsearch(props.name, name, hits)) return dev;
+	}
+	return -1;
+}
+
+// Time synchronization routine from cudaminer with CPU sleep: sleeps for most of the
+// averaged wait of this (situation, thr_id) pair before blocking on the stream
+typedef struct { double value[8]; } tsumarray;
+cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
+{
+	// no timing slot to use (negative situation, or thr_id outside value[8]): plain blocking wait
+	if (situation < 0 || thr_id < 0 || thr_id >= 8)
+		return cudaStreamSynchronize(stream);
+	// NOTE(review): static map shared by all callers without locking - confirm calls are serialized
+	static std::map<int, tsumarray> tsum;
+
+	double a = 0.95, b = 0.05;
+	if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence
+	cudaError_t result = cudaSuccess;
+	double &avg = tsum[situation].value[thr_id]; // operator[] inserts a zeroed entry on first use
+	double tsync = 0.0;
+	const double tsleep = 0.95 * avg;
+	if (cudaStreamQuery(stream) == cudaErrorNotReady)
+	{
+		usleep((useconds_t)(1e6*tsleep));
+		struct timeval tv_start, tv_end;
+		gettimeofday(&tv_start, NULL);
+		result = cudaStreamSynchronize(stream);
+		gettimeofday(&tv_end, NULL);
+		tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
+	}
+	if (tsync >= 0) avg = a * avg + b * (tsleep+tsync);
+	return result;
+}
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index 3ae6f51..f733ef2 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -139,147 +139,23 @@ struct check_nonce_for_remove
     uint32_t  m_startNonce;
 };
 
-// Zahl der CUDA Devices im System bestimmen
-extern "C" int cuda_num_devices()
-{
-    int version;
-    cudaError_t err = cudaDriverGetVersion(&version);
-    if (err != cudaSuccess)
-    {
-        applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?");
-        exit(1);
-    }
-
-    int maj = version / 1000, min = version % 100; // same as in deviceQuery sample
-    if (maj < 5 || (maj == 5 && min < 5))
-    {
-        applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5);
-        exit(1);
-    }
-
-    int GPU_N;
-    err = cudaGetDeviceCount(&GPU_N);
-    if (err != cudaSuccess)
-    {
-        applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
-        exit(1);
-    }
-    return GPU_N;
-}
-
-// Gerätenamen holen
-extern char *device_name[8];
-extern int device_map[8];
-
-extern "C" void cuda_devicenames()
-{
-    cudaError_t err;
-    int GPU_N;
-    err = cudaGetDeviceCount(&GPU_N);
-    if (err != cudaSuccess)
-    {
-        applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
-        exit(1);
-    }
-
-    for (int i=0; i < GPU_N; i++)
-    {
-        cudaDeviceProp props;
-        cudaGetDeviceProperties(&props, device_map[i]);
-
-        device_name[i] = strdup(props.name);
-    }
-}
-
-// Can't be called directly in cpu-miner
-extern "C" void cuda_devicereset()
-{
-    cudaDeviceReset();
-}
-
-static bool substringsearch(const char *haystack, const char *needle, int &match)
-{
-    int hlen = (int) strlen(haystack);
-    int nlen = (int) strlen(needle);
-    for (int i=0; i < hlen; ++i)
-    {
-        if (haystack[i] == ' ') continue;
-        int j=0, x = 0;
-        while(j < nlen)
-        {
-            if (haystack[i+x] == ' ') {++x; continue;}
-            if (needle[j] == ' ') {++j; continue;}
-            if (needle[j] == '#') return ++match == needle[j+1]-'0';
-            if (tolower(haystack[i+x]) != tolower(needle[j])) break;
-            ++j; ++x;
-        }
-        if (j == nlen) return true;
-    }
-    return false;
-}
-
-// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1)
-extern "C" int cuda_finddevice(char *name)
-{
-    int num = cuda_num_devices();
-    int match = 0;
-    for (int i=0; i < num; ++i)
-    {
-        cudaDeviceProp props;
-        if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
-            if (substringsearch(props.name, name, match)) return i;
-    }
-    return -1;
-}
-
-// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
-typedef struct { double value[8]; } tsumarray;
-cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
-{
-    cudaError_t result = cudaSuccess;
-    if (situation >= 0)
-    {   
-        static std::map<int, tsumarray> tsum;
-
-        double a = 0.95, b = 0.05;
-        if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence
-
-        double tsync = 0.0;
-        double tsleep = 0.95 * tsum[situation].value[thr_id];
-        if (cudaStreamQuery(stream) == cudaErrorNotReady)
-        {
-            usleep((useconds_t)(1e6*tsleep));
-            struct timeval tv_start, tv_end;
-            gettimeofday(&tv_start, NULL);
-            result = cudaStreamSynchronize(stream);
-            gettimeofday(&tv_end, NULL);
-            tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
-        }
-        if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
-    }
-    else
-        result = cudaStreamSynchronize(stream);
-    return result;
-}
-
 int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
- const uint32_t *ptarget, uint32_t max_nonce,
- unsigned long *hashes_done, uint32_t maxvote, int blocklen);
+    const uint32_t *ptarget, uint32_t max_nonce,
+    unsigned long *hashes_done, uint32_t maxvote, int blocklen);
 
 extern "C"
 int scanhash_heavy(int thr_id, uint32_t *pdata,
- const uint32_t *ptarget, uint32_t max_nonce,
- unsigned long *hashes_done, uint32_t maxvote, int blocklen)
+    const uint32_t *ptarget, uint32_t max_nonce,
+    unsigned long *hashes_done, uint32_t maxvote, int blocklen)
 {
- return scanhash_heavy_cpp(thr_id, pdata,
-  ptarget, max_nonce, hashes_done, maxvote, blocklen);
+    return scanhash_heavy_cpp(thr_id, pdata,
+    ptarget, max_nonce, hashes_done, maxvote, blocklen);
 }
 
-extern bool opt_benchmark;
 
 int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
- const uint32_t *ptarget, uint32_t max_nonce,
- unsigned long *hashes_done, uint32_t maxvote, int blocklen)
+    const uint32_t *ptarget, uint32_t max_nonce,
+    unsigned long *hashes_done, uint32_t maxvote, int blocklen)
 {
     const uint32_t first_nonce = pdata[19]; /* to check */
     // CUDA will process thousands of threads.
diff --git a/miner.h b/miner.h
index 13faecd..3671044 100644
--- a/miner.h
+++ b/miner.h
@@ -470,6 +470,11 @@ void hashlog_purge_job(char* jobid);
 void hashlog_purge_all(void);
 void hashlog_dump_job(char* jobid);
 
+void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate); /* record one scan */
+double stats_get_speed(int thr_id); /* thr_id -1 = all threads */
+void stats_purge_old(void); /* drop records older than the purge timeout */
+void stats_purge_all(void); /* drop every record */
+
 struct thread_q;
 
 extern struct thread_q *tq_new(void);
diff --git a/stats.cpp b/stats.cpp
new file mode 100644
index 0000000..c67e647
--- /dev/null
+++ b/stats.cpp
@@ -0,0 +1,103 @@
+/**
+ * Stats place holder
+ *
+ * Note: this source is C++ (requires std::map)
+ *
+ * tpruvot@github 2014
+ */
+#include <stdlib.h>
+#include <memory.h>
+#include <map>
+
+#include "miner.h"
+
+struct stats_data {
+	uint32_t tm_stat;   // time(NULL) at recording, truncated to 32 bits
+	uint32_t hashcount; // hashcount passed to stats_remember_speed()
+	double hashrate;    // hashrate passed to stats_remember_speed()
+	uint8_t thr_id;     // thread tag, same value as the key prefix built in stats_remember_speed()
+};
+
+static std::map<uint64_t, stats_data> tlastscans; // recorded scans, ordered by key
+static uint64_t uid = 0; // insertion counter used to build the keys
+
+#define STATS_PURGE_TIMEOUT (5*60) // seconds; parenthesized so it expands safely inside any expression
+
+/**
+ * Store one scan result for a thread; scans under 1000 hashes or with a zero rate are skipped.
+ * Key layout: bits 56-63 = thread id, low 32 bits = global insertion counter.
+ */
+extern "C" void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate)
+{
+	uint64_t thr = (uint64_t) (0xff & thr_id); // bitwise mask: logical && collapsed ids to 0/1
+	uint64_t key = (thr << 56) + (uid++ % UINT_MAX);
+	stats_data data;
+
+	if (hashcount < 1000 || !hashrate)
+		return;
+	memset(&data, 0, sizeof(data));
+	data.thr_id = (uint8_t) thr;
+	data.tm_stat = (uint32_t) time(NULL);
+	data.hashcount = hashcount;
+	data.hashrate = hashrate;
+	tlastscans[key] = data;
+}
+
+/**
+ * Average of the last 50 usable rates of thread thr_id; with thr_id -1 the per-thread
+ * averages are summed to give the total rate. Returns 0 when no record qualifies.
+ */
+extern "C" double stats_get_speed(int thr_id)
+{
+	const uint64_t wanted = (uint64_t) (0xff & thr_id);
+	double total = 0., sum = 0.;
+	uint64_t cur = 0;
+	int records = 0;
+
+	// newest first within each thread; reverse iterators never dereference end()
+	std::map<uint64_t, stats_data>::reverse_iterator i = tlastscans.rbegin();
+	for (; i != tlastscans.rend(); ++i) {
+		const uint64_t thr = i->first >> 56;
+		if (thr != cur) { // keys are grouped by thread prefix: close the previous group
+			if (records) total += sum / records;
+			cur = thr; sum = 0.; records = 0;
+		}
+		if (records >= 50 || (thr_id != -1 && thr != wanted))
+			continue;
+		if ((i->first & UINT_MAX) > 3 && i->second.hashcount > 1000) { /* ignore firsts */
+			sum += i->second.hashrate;
+			records++;
+		}
+	}
+	return records ? total + sum / records : total;
+}
+
+/**
+ * Drop records older than STATS_PURGE_TIMEOUT seconds to reduce memory usage.
+ * NOTE(review): no locking in here; callers must serialize access to tlastscans.
+ */
+extern "C" void stats_purge_old(void)
+{
+	int deleted = 0;
+	uint32_t now = (uint32_t) time(NULL);
+	uint32_t sz = (uint32_t) tlastscans.size();
+	std::map<uint64_t, stats_data>::iterator i = tlastscans.begin();
+	while (i != tlastscans.end()) {
+		if ((now - i->second.tm_stat) > STATS_PURGE_TIMEOUT) {
+			deleted++;
+			tlastscans.erase(i++);
+		}
+		else ++i;
+	}
+	if (opt_debug && deleted) // tagged "stats" (was "hashlog"); %u because sz is unsigned
+		applog(LOG_DEBUG, "stats: %d/%u purged", deleted, sz);
+}
+
+/**
+ * Remove every recorded scan from tlastscans (called from proper_exit)
+ */
+extern "C" void stats_purge_all(void)
+{
+	tlastscans.clear();
+}
+