diff --git a/ccminer.cpp b/ccminer.cpp
index 8db1685..fe99248 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -128,6 +128,7 @@ int active_gpus;
 char * device_name[MAX_GPUS];
 short device_map[MAX_GPUS] = { 0 };
 long  device_sm[MAX_GPUS] = { 0 };
+short device_mpcount[MAX_GPUS] = { 0 };
 uint32_t gpus_intensity[MAX_GPUS] = { 0 };
 uint32_t device_gpu_clocks[MAX_GPUS] = { 0 };
 uint32_t device_mem_clocks[MAX_GPUS] = { 0 };
diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu
index 5d5b1c6..5a78639 100644
--- a/crypto/cryptonight.cu
+++ b/crypto/cryptonight.cu
@@ -3,8 +3,13 @@
 
 extern char *device_config[MAX_GPUS]; // -l 32x16
 
-static __thread uint32_t cn_blocks  = 32;
-static __thread uint32_t cn_threads = 16;
+static __thread uint32_t cn_blocks;
+static __thread uint32_t cn_threads;
+
+// used for gpu intensity on algo init
+static __thread bool gpu_init_shown = false;
+#define gpulog_init(p,thr,fmt, ...) if (!gpu_init_shown) \
+	gpulog(p, thr, fmt, ##__VA_ARGS__)
 
 static uint32_t *d_long_state[MAX_GPUS];
 static uint64_t *d_ctx_state[MAX_GPUS];
@@ -26,6 +31,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 	uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
 	const uint32_t first_nonce = *nonceptr;
 	uint32_t nonce = first_nonce;
+	int dev_id = device_map[thr_id];
 
 	if(opt_benchmark) {
 		ptarget[7] = 0x00ff;
@@ -33,19 +39,29 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 
 	if(!init[thr_id])
 	{
+		int mem = cuda_available_memory(thr_id);
+		int mul = device_sm[dev_id] >= 300 ? 4 : 1; // see cryptonight-core.cu
+		cn_threads = device_sm[dev_id] >= 600 ? 16 : 8; // real TPB is x4 on SM3+
+		cn_blocks = device_mpcount[dev_id] * 4;
+		if (cn_blocks*cn_threads*2.2 > mem) cn_blocks = device_mpcount[dev_id] * 2;
+
+		if (!opt_quiet)
+			gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
+				mem, device_mpcount[dev_id]);
+
 		if (device_config[thr_id]) {
-			sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
+			int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
-			gpulog(LOG_INFO, thr_id, "Using %u x %u kernel launch config, %u threads",
-				cn_blocks, cn_threads, throughput);
+			gpulog_init(LOG_INFO, thr_id, "Using %ux%u(x%d) kernel launch config, %u threads",
+				cn_blocks, cn_threads, mul, throughput);
 		} else {
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
 			if (throughput != cn_blocks*cn_threads && cn_threads) {
 				cn_blocks = throughput / cn_threads;
 				throughput = cn_threads * cn_blocks;
 			}
-			gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u threads (%ux%u)",
-				throughput2intensity(throughput), throughput, cn_blocks, cn_threads);
+			gpulog_init(LOG_INFO, thr_id, "%u threads (%g) with %u blocks",// of %ux%d",
+				throughput, throughput2intensity(throughput), cn_blocks);//, cn_threads, mul);
 		}
 
 		if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) {
@@ -67,7 +83,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 
 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 200 is aligned 8, not 16
+		cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16)
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -80,6 +96,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 
+		gpu_init_shown = true;
 		init[thr_id] = true;
 	}
 
diff --git a/cuda.cpp b/cuda.cpp
index a2d19c4..9b65adc 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -78,6 +78,7 @@ void cuda_devicenames()
 		cudaGetDeviceProperties(&props, dev_id);
 
 		device_sm[dev_id] = (props.major * 100 + props.minor * 10);
+		device_mpcount[dev_id] = (short) props.multiProcessorCount;
 
 		if (device_name[dev_id]) {
 			free(device_name[dev_id]);
@@ -105,8 +106,10 @@ void cuda_print_devices()
 		cudaDeviceProp props;
 		cudaGetDeviceProperties(&props, dev_id);
 		if (!opt_n_threads || n < opt_n_threads) {
-			fprintf(stderr, "GPU #%d: SM %d.%d %s @ %.0f MHz (MEM %.0f)\n", dev_id, props.major, props.minor,
-				device_name[dev_id], (double) props.clockRate/1000, (double) props.memoryClockRate/1000);
+			fprintf(stderr, "GPU #%d: SM %d.%d %s @ %.0f MHz (MEM %.0f)\n", dev_id,
+				props.major, props.minor, device_name[dev_id],
+				(double) props.clockRate/1000,
+				(double) props.memoryClockRate/1000);
 #ifdef USE_WRAPNVML
 			if (opt_debug) nvml_print_device_info(dev_id);
 #ifdef WIN32
@@ -224,7 +227,7 @@ int cuda_available_memory(int thr_id)
 	uint64_t tot64 = 0, free64 = 0;
 	// cuda (6.5) one can crash on pascal and dont handle 8GB
 	nvapiMemGetInfo(dev_id, &free64, &tot64);
-	return (int) (free64 / (1024 * 1024));
+	return (int) (free64 / (1024));
 #else
 	size_t mtotal = 0, mfree = 0;
 	cudaSetDevice(dev_id);
diff --git a/cuda_helper.h b/cuda_helper.h
index da0fc2a..5d6b6cd 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -25,7 +25,7 @@
 
 extern "C" short device_map[MAX_GPUS];
 extern "C"  long device_sm[MAX_GPUS];
-
+extern short device_mpcount[MAX_GPUS];
 extern int cuda_arch[MAX_GPUS];
 
 // common functions
diff --git a/miner.h b/miner.h
index cd1ae92..9446760 100644
--- a/miner.h
+++ b/miner.h
@@ -519,6 +519,7 @@ extern double stratum_diff;
 //#define MAX_THREADS 32 todo
 extern char* device_name[MAX_GPUS];
 extern short device_map[MAX_GPUS];
+extern short device_mpcount[MAX_GPUS];
 extern long  device_sm[MAX_GPUS];
 extern uint32_t gpus_intensity[MAX_GPUS];
 extern int opt_cudaschedule;
@@ -576,6 +577,7 @@ void cuda_clear_lasterror();
 extern void format_hashrate(double hashrate, char *output);
 extern void applog(int prio, const char *fmt, ...);
 extern void gpulog(int prio, int thr_id, const char *fmt, ...);
+
 void get_defconfig_path(char *out, size_t bufsize, char *argv0);
 extern void cbin2hex(char *out, const char *in, size_t len);
 extern char *bin2hex(const unsigned char *in, size_t len);