diff --git a/README.md b/README.md
index 1040e56ea271405656af8b2ec93c988a226e7d8e..d19b1a77d692247a1bd1fbbda1969bfa615ac9a2 100644
--- a/README.md
+++ b/README.md
@@ -44,10 +44,13 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [Graft](https://www.graft.network)
 - [Haven](https://havenprotocol.com)
 - [Lethean](https://lethean.io)
+- [Masari](https://getmasari.org)
+- [Plenteum](https://www.plenteum.com/)
 - [QRL](https://theqrl.org)
 - **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
+- [Stellite](https://stellite.cash/)
 - [TurtleCoin](https://turtlecoin.lol)
-- [Plenteum](https://www.plenteum.com/)
+- [Zelerius](https://zelerius.org/)
 
 Ryo currency is a way for us to implement the ideas that we were unable to in
 Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
@@ -61,10 +64,13 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_lite_v7_xor (algorithm used by ipbc)
 - 2MiB scratchpad memory
     - cryptonight
-    - cryptonight_masari
+    - cryptonight_gpu (for Ryo's 14th of Feb fork)
+    - cryptonight_masari (used in 2018)
     - cryptonight_v7
     - cryptonight_v7_stellite
     - cryptonight_v8
+    - cryptonight_v8_half (used by masari and stellite)
+    - cryptonight_v8_zelerius
 - 4MiB scratchpad memory
     - cryptonight_haven
     - cryptonight_heavy
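
Any of the algorithm names above can also be set directly as the `currency` value in `pools.txt` when a coin is not listed by name. A minimal sketch of such an entry, assuming the usual 2.x config layout (the pool address and wallet below are placeholders, not real endpoints):

```
"pool_list" :
[
	{"pool_address" : "pool.example.com:3333", "wallet_address" : "YOUR_WALLET_ADDRESS", "rig_id" : "", "pool_password" : "", "use_nicehash" : false, "use_tls" : false, "tls_fingerprint" : "", "pool_weight" : 1 },
],
"currency" : "cryptonight_v8_half",
```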
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index f80c37a8b5eca84ba094d1f04fd2a21f456f3345..8713784c274e56a25213304c32959c1fc5d94f1c 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -300,6 +300,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		MaximumWorkSize /= 8;
 	}
 	printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
+
+	if(ctx->workSize > MaximumWorkSize)
+	{
+		ctx->workSize = MaximumWorkSize;
+		printer::inst()->print_msg(L1,"Device %lu work size too large, reducing to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
+	}
+
+	const std::string backendName = xmrstak::params::inst().openCLVendor;
+	if( (ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0)
+	{
+		size_t reduced_intensity = (ctx->rawIntensity / ctx->workSize) * ctx->workSize;
+		ctx->rawIntensity = reduced_intensity;
+		printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx->deviceIdx, int(reduced_intensity));
+	}
+
 #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
 	const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 };
 	ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret);
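
The intensity check moved into `InitOpenCLGpu` from `InitOpenCL` (its removal is shown further down) applies when strided index mode 2 or 3 is used, where threads address the scratchpad in groups of the work size, so the raw intensity is rounded down to the nearest multiple of the work size before the kernels are built. A standalone sketch of that rounding with hypothetical tuning values (the real ones come from `amd.txt`):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
	// Hypothetical tuning values for illustration only.
	size_t rawIntensity = 1000;
	size_t workSize = 16;

	// Largest multiple of workSize not exceeding rawIntensity,
	// mirroring the reduction performed in InitOpenCLGpu above.
	size_t reduced = (rawIntensity / workSize) * workSize;

	std::printf("intensity %zu -> %zu\n", rawIntensity, reduced); // prints 1000 -> 992
	return 0;
}
```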
@@ -330,7 +345,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	size_t scratchPadSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		scratchPadSize = std::max(scratchPadSize, cn_select_memory(algo));
+		scratchPadSize = std::max(scratchPadSize, algo.Mem());
 	}
 
 	size_t g_thd = ctx->rawIntensity;
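
`algo.Mem()` above, together with `Mask()`, `Iter()`, and `Id()` further down, assumes that `xmrstak_algo` is now a small value type rather than a bare enum (the old `cn_select_*` helpers are gone). Its real definition is not part of this diff; the following is only a rough sketch of the interface these call sites imply, with guessed member layout:

```cpp
#include <cstddef>
#include <cstdint>

// Stand-in for the real enum declared in the cryptonight headers.
using xmrstak_algo_id = int;

// Sketch only: approximates the accessors used by InitOpenCLGpu;
// the actual class in the xmr-stak tree may differ in detail.
struct xmrstak_algo_sketch
{
	xmrstak_algo_id id; // numeric algorithm id (keys the Program/Kernels maps)
	size_t mem;         // scratchpad size in bytes
	uint32_t iter;      // main-loop iteration count
	uint32_t mask;      // scratchpad address mask

	xmrstak_algo_id Id() const { return id; }
	size_t Mem() const { return mem; }
	uint32_t Iter() const { return iter; }
	uint32_t Mask() const { return mask; }
};
```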
@@ -405,9 +420,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	for(const auto miner_algo : neededAlgorithms)
 	{
 		// scratchpad size for the selected mining algorithm
-		size_t hashMemSize = cn_select_memory(miner_algo);
-		int threadMemMask = cn_select_mask(miner_algo);
-		int hashIterations = cn_select_iter(miner_algo);
+		size_t hashMemSize = miner_algo.Mem();
+		int threadMemMask = miner_algo.Mask();
+		int hashIterations = miner_algo.Iter();
 
 		size_t mem_chunk_exp = 1u << ctx->memChunk;
 		size_t strided_index = ctx->stridedIndex;
@@ -415,7 +430,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 * this is required if the dev pool is mining monero
 		 * but the user tuned there settings for another currency
 		 */
-		if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_turtle)
+		if(miner_algo == cryptonight_monero_v8)
 		{
 			if(ctx->memChunk < 2)
 				mem_chunk_exp = 1u << 2;
@@ -438,7 +453,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U";
 		options += " -DCOMP_MODE=" + std::to_string(needCompMode);
 		options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU";
-		options += " -DALGO=" + std::to_string(miner_algo);
+		options += " -DALGO=" + std::to_string(miner_algo.Id());
 		options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
 		/* AMD driver output is something like: `1445.5 (VM)`
 		 * and is mapped to `14` only. The value is only used for a compiler
@@ -611,6 +626,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		for(int k = 0; k < 3; k++)
 			KernelNames[k] += std::to_string(miner_algo);
 
+		if(miner_algo == cryptonight_gpu)
+		{
+			KernelNames.push_back(std::string("cn00_cn_gpu") + std::to_string(miner_algo));
+		}
+
 		for(int i = 0; i < KernelNames.size(); ++i)
 		{
 			ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret);
@@ -984,14 +1004,6 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
 		ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
 
-		const std::string backendName = xmrstak::params::inst().openCLVendor;
-		if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
-		{
-			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
-			ctx[i].rawIntensity = reduced_intensity;
-			printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
-		}
-
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{
 			return ret;
@@ -1001,10 +1013,10 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	return ERR_SUCCESS;
 }
 
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo)
 {
 
-	const auto & Kernels = ctx->Kernels[miner_algo];
+	const auto & Kernels = ctx->Kernels[miner_algo.Id()];
 
 	cl_int ret;
 
@@ -1049,6 +1061,24 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		return(ERR_OCL_API);
 	}
 
+	if(miner_algo == cryptonight_gpu)
+	{
+		// we use an additional cn0 kernel to prepare the scratchpad
+		// Scratchpads
+		if((ret = clSetKernelArg(Kernels[7], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
+			return ERR_OCL_API;
+		}
+
+		// States
+		if((ret = clSetKernelArg(Kernels[7], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
+			return ERR_OCL_API;
+		}
+	}
+
 	// CN1 Kernel
 
 	// Scratchpads
@@ -1289,9 +1319,9 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 	return t0;
 }
 
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo)
 {
-	const auto & Kernels = ctx->Kernels[miner_algo];
+	const auto & Kernels = ctx->Kernels[miner_algo.Id()];
 
 	cl_int ret;
 	cl_uint zero = 0;
@@ -1336,6 +1366,14 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 
 	if(miner_algo == cryptonight_gpu)
 	{
+		size_t thd = 64;
+		size_t intens = g_intensity * thd;
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[7], 1, 0, &intens, &thd, 0, NULL, NULL)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7);
+			return ERR_OCL_API;
+		}
+
 		size_t w_size_cn_gpu = w_size * 16;
 		size_t g_thd_cn_gpu = g_thd * 16;
 
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 5b95e9865b17717913fa195c9b68c57d0f0adba6..d665dff1b2f3d5a5296178fb37467caf64001c59 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -52,8 +52,8 @@ struct GpuContext
 	cl_mem InputBuffer;
 	cl_mem OutputBuffer;
 	cl_mem ExtraBuffers[6];
-	std::map<xmrstak_algo, cl_program> Program;
-	std::map<xmrstak_algo, std::array<cl_kernel,7>> Kernels;
+	std::map<xmrstak_algo_id, cl_program> Program;
+	std::map<xmrstak_algo_id, std::array<cl_kernel,8>> Kernels;
 	size_t freeMem;
 	size_t maxMemPerAlloc;
 	int computeUnits;
@@ -72,7 +72,7 @@ int getAMDPlatformIdx();
 std::vector<GpuContext> getAMDDevices(int index);
 
 size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo);
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo);
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo);
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo);
 uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment = true);
 uint64_t updateTimings(GpuContext* ctx, const uint64_t t);
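
With the header changes above, both entry points take the algorithm by const reference, and the per-algorithm kernel array grows from 7 to 8 slots to make room for the cryptonight_gpu scratchpad-prep kernel registered in `InitOpenCLGpu`. A schematic of how a backend thread might drive these entry points per job; the names and setup are simplified assumptions, not copied from the actual minethd code:

```cpp
#include <cstdint>
#include "xmrstak/backend/amd/amd_gpu/gpu.hpp"

// Schematic only: simplified driver loop body, assuming ctx was prepared by
// InitOpenCL and `algo` describes the currently mined algorithm.
void mine_one_job(GpuContext* ctx, const xmrstak_algo& algo,
	uint8_t* blob, size_t blobLen, uint64_t target)
{
	// Bind the job to all kernels; for cryptonight_gpu this also wires the
	// scratchpad/state buffers into the extra prep kernel (slot 7).
	if(XMRSetJob(ctx, blob, blobLen, target, algo) != ERR_SUCCESS)
		return;

	cl_uint results[0x100] = {0};
	// Enqueue the kernel chain (prep kernel first when algo == cryptonight_gpu,
	// then cn0/cn1/cn2 and the final-hash branches) and read back candidates.
	if(XMRRunJob(ctx, results, algo) != ERR_SUCCESS)
		return;

	// results now holds the candidate nonces produced by this round.
}
```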
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 2c775e77e1dca5fd4cd33e2b1de88b235dc114d9..eac4dadb089ce5313b2513c3d682a1bb8d6cceb1 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -51,10 +51,10 @@ R"===(
  */
 inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
 {
-    uint2 result;
+	uint2 result;
 	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
 	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
-    return result;
+	return result;
 }
 #endif
 
@@ -78,256 +78,256 @@ inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
  */
 inline int amd_bfe(const uint src0, const uint offset, const uint width)
 {
-    /* casts are removed because we can implement everything as uint
-     * int offset = src1;
-     * int width = src2;
-     * remove check for edge case, this function is always called with
-     * `width==8`
-     * @code
-     *   if ( width == 0 )
-     *      return 0;
-     * @endcode
-     */
-    if ( (offset + width) < 32u )
-        return (src0 << (32u - offset - width)) >> (32u - width);
-
-    return src0 >> offset;
+	/* casts are removed because we can implement everything as uint
+	 * int offset = src1;
+	 * int width = src2;
+	 * remove check for edge case, this function is always called with
+	 * `width==8`
+	 * @code
+	 *   if ( width == 0 )
+	 *      return 0;
+	 * @endcode
+	 */
+	if ( (offset + width) < 32u )
+		return (src0 << (32u - offset - width)) >> (32u - width);
+
+	return src0 >> offset;
 }
 #endif
 
 static const __constant ulong keccakf_rndc[24] =
 {
-    0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
-    0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
-    0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
-    0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-    0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-    0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
-    0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
-    0x8000000000008080, 0x0000000080000001, 0x8000000080008008
+	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
 };
 
 static const __constant uchar sbox[256] =
 {
-    0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
-    0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
-    0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
-    0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
-    0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
-    0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
-    0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
-    0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
-    0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
-    0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
-    0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
-    0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
-    0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
-    0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
-    0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
-    0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
+	0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+	0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+	0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+	0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+	0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+	0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+	0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+	0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+	0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+	0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+	0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+	0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+	0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+	0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+	0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+	0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
 };
 
 
 void keccakf1600(ulong *s)
 {
-    for(int i = 0; i < 24; ++i)
-    {
-        ulong bc[5], tmp1, tmp2;
-        bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL);
-        bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL);
-        bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL);
-        bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL);
-        bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL);
-
-        tmp1 = s[1] ^ bc[0];
-
-        s[0] ^= bc[4];
-        s[1] = rotate(s[6] ^ bc[0], 44UL);
-        s[6] = rotate(s[9] ^ bc[3], 20UL);
-        s[9] = rotate(s[22] ^ bc[1], 61UL);
-        s[22] = rotate(s[14] ^ bc[3], 39UL);
-        s[14] = rotate(s[20] ^ bc[4], 18UL);
-        s[20] = rotate(s[2] ^ bc[1], 62UL);
-        s[2] = rotate(s[12] ^ bc[1], 43UL);
-        s[12] = rotate(s[13] ^ bc[2], 25UL);
-        s[13] = rotate(s[19] ^ bc[3], 8UL);
-        s[19] = rotate(s[23] ^ bc[2], 56UL);
-        s[23] = rotate(s[15] ^ bc[4], 41UL);
-        s[15] = rotate(s[4] ^ bc[3], 27UL);
-        s[4] = rotate(s[24] ^ bc[3], 14UL);
-        s[24] = rotate(s[21] ^ bc[0], 2UL);
-        s[21] = rotate(s[8] ^ bc[2], 55UL);
-        s[8] = rotate(s[16] ^ bc[0], 35UL);
-        s[16] = rotate(s[5] ^ bc[4], 36UL);
-        s[5] = rotate(s[3] ^ bc[2], 28UL);
-        s[3] = rotate(s[18] ^ bc[2], 21UL);
-        s[18] = rotate(s[17] ^ bc[1], 15UL);
-        s[17] = rotate(s[11] ^ bc[0], 10UL);
-        s[11] = rotate(s[7] ^ bc[1], 6UL);
-        s[7] = rotate(s[10] ^ bc[4], 3UL);
-        s[10] = rotate(tmp1, 1UL);
-
-        tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
-        tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
-        tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
-        tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
-        tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
-        s[0] ^= keccakf_rndc[i];
-    }
+	for(int i = 0; i < 24; ++i)
+	{
+		ulong bc[5], tmp1, tmp2;
+		bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL);
+		bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL);
+		bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL);
+		bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL);
+		bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL);
+
+		tmp1 = s[1] ^ bc[0];
+
+		s[0] ^= bc[4];
+		s[1] = rotate(s[6] ^ bc[0], 44UL);
+		s[6] = rotate(s[9] ^ bc[3], 20UL);
+		s[9] = rotate(s[22] ^ bc[1], 61UL);
+		s[22] = rotate(s[14] ^ bc[3], 39UL);
+		s[14] = rotate(s[20] ^ bc[4], 18UL);
+		s[20] = rotate(s[2] ^ bc[1], 62UL);
+		s[2] = rotate(s[12] ^ bc[1], 43UL);
+		s[12] = rotate(s[13] ^ bc[2], 25UL);
+		s[13] = rotate(s[19] ^ bc[3], 8UL);
+		s[19] = rotate(s[23] ^ bc[2], 56UL);
+		s[23] = rotate(s[15] ^ bc[4], 41UL);
+		s[15] = rotate(s[4] ^ bc[3], 27UL);
+		s[4] = rotate(s[24] ^ bc[3], 14UL);
+		s[24] = rotate(s[21] ^ bc[0], 2UL);
+		s[21] = rotate(s[8] ^ bc[2], 55UL);
+		s[8] = rotate(s[16] ^ bc[0], 35UL);
+		s[16] = rotate(s[5] ^ bc[4], 36UL);
+		s[5] = rotate(s[3] ^ bc[2], 28UL);
+		s[3] = rotate(s[18] ^ bc[2], 21UL);
+		s[18] = rotate(s[17] ^ bc[1], 15UL);
+		s[17] = rotate(s[11] ^ bc[0], 10UL);
+		s[11] = rotate(s[7] ^ bc[1], 6UL);
+		s[7] = rotate(s[10] ^ bc[4], 3UL);
+		s[10] = rotate(tmp1, 1UL);
+
+		tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+		tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+		tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+		tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+		tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+		s[0] ^= keccakf_rndc[i];
+	}
 }
 
 static const __constant uint keccakf_rotc[24] =
 {
-    1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
-    27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
+	1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
+	27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
 };
 
 static const __constant uint keccakf_piln[24] =
 {
-    10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
-    15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
+	10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
+	15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
 };
 
 inline void keccakf1600_1(ulong st[25])
 {
-    int i, round;
-    ulong t, bc[5];
-
-    #pragma unroll 1
-    for (round = 0; round < 24; ++round)
-    {
-        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
-        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
-        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
-        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
-        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
-
-        st[0] ^= bc[4];
-        st[5] ^= bc[4];
-        st[10] ^= bc[4];
-        st[15] ^= bc[4];
-        st[20] ^= bc[4];
-
-        st[1] ^= bc[0];
-        st[6] ^= bc[0];
-        st[11] ^= bc[0];
-        st[16] ^= bc[0];
-        st[21] ^= bc[0];
-
-        st[2] ^= bc[1];
-        st[7] ^= bc[1];
-        st[12] ^= bc[1];
-        st[17] ^= bc[1];
-        st[22] ^= bc[1];
-
-        st[3] ^= bc[2];
-        st[8] ^= bc[2];
-        st[13] ^= bc[2];
-        st[18] ^= bc[2];
-        st[23] ^= bc[2];
-
-        st[4] ^= bc[3];
-        st[9] ^= bc[3];
-        st[14] ^= bc[3];
-        st[19] ^= bc[3];
-        st[24] ^= bc[3];
-
-        // Rho Pi
-        t = st[1];
-        #pragma unroll
-        for (i = 0; i < 24; ++i) {
-            bc[0] = st[keccakf_piln[i]];
-            st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
-            t = bc[0];
-        }
-
-        #pragma unroll
-        for(int i = 0; i < 25; i += 5)
-        {
-            ulong tmp1 = st[i], tmp2 = st[i + 1];
-
-            st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
-            st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
-            st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
-            st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
-            st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
-        }
-
-        //  Iota
-        st[0] ^= keccakf_rndc[round];
-    }
+	int i, round;
+	ulong t, bc[5];
+
+	#pragma unroll 1
+	for (round = 0; round < 24; ++round)
+	{
+		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
+		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
+		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
+		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
+		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
+
+		st[0] ^= bc[4];
+		st[5] ^= bc[4];
+		st[10] ^= bc[4];
+		st[15] ^= bc[4];
+		st[20] ^= bc[4];
+
+		st[1] ^= bc[0];
+		st[6] ^= bc[0];
+		st[11] ^= bc[0];
+		st[16] ^= bc[0];
+		st[21] ^= bc[0];
+
+		st[2] ^= bc[1];
+		st[7] ^= bc[1];
+		st[12] ^= bc[1];
+		st[17] ^= bc[1];
+		st[22] ^= bc[1];
+
+		st[3] ^= bc[2];
+		st[8] ^= bc[2];
+		st[13] ^= bc[2];
+		st[18] ^= bc[2];
+		st[23] ^= bc[2];
+
+		st[4] ^= bc[3];
+		st[9] ^= bc[3];
+		st[14] ^= bc[3];
+		st[19] ^= bc[3];
+		st[24] ^= bc[3];
+
+		// Rho Pi
+		t = st[1];
+		#pragma unroll
+		for (i = 0; i < 24; ++i) {
+			bc[0] = st[keccakf_piln[i]];
+			st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
+			t = bc[0];
+		}
+
+		#pragma unroll
+		for(int i = 0; i < 25; i += 5)
+		{
+			ulong tmp1 = st[i], tmp2 = st[i + 1];
+
+			st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
+			st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
+			st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
+			st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
+			st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
+		}
+
+		//  Iota
+		st[0] ^= keccakf_rndc[round];
+	}
 }
 )==="
 R"===(
 
 void keccakf1600_2(__local ulong *st)
 {
-    int i, round;
-    ulong t, bc[5];
-
-    #pragma unroll 1
-    for (round = 0; round < 24; ++round)
-    {
-        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
-        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
-        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
-        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
-        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
-
-        st[0] ^= bc[4];
-        st[5] ^= bc[4];
-        st[10] ^= bc[4];
-        st[15] ^= bc[4];
-        st[20] ^= bc[4];
-
-        st[1] ^= bc[0];
-        st[6] ^= bc[0];
-        st[11] ^= bc[0];
-        st[16] ^= bc[0];
-        st[21] ^= bc[0];
-
-        st[2] ^= bc[1];
-        st[7] ^= bc[1];
-        st[12] ^= bc[1];
-        st[17] ^= bc[1];
-        st[22] ^= bc[1];
-
-        st[3] ^= bc[2];
-        st[8] ^= bc[2];
-        st[13] ^= bc[2];
-        st[18] ^= bc[2];
-        st[23] ^= bc[2];
-
-        st[4] ^= bc[3];
-        st[9] ^= bc[3];
-        st[14] ^= bc[3];
-        st[19] ^= bc[3];
-        st[24] ^= bc[3];
-
-        // Rho Pi
-        t = st[1];
-        #pragma unroll
-        for (i = 0; i < 24; ++i) {
-            bc[0] = st[keccakf_piln[i]];
-            st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
-            t = bc[0];
-        }
-
-        #pragma unroll
-        for(int i = 0; i < 25; i += 5)
-        {
-            ulong tmp1 = st[i], tmp2 = st[i + 1];
-
-            st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
-            st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
-            st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
-            st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
-            st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
-        }
-
-        //  Iota
-        st[0] ^= keccakf_rndc[round];
-    }
+	int i, round;
+	ulong t, bc[5];
+
+	#pragma unroll 1
+	for (round = 0; round < 24; ++round)
+	{
+		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
+		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
+		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
+		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
+		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
+
+		st[0] ^= bc[4];
+		st[5] ^= bc[4];
+		st[10] ^= bc[4];
+		st[15] ^= bc[4];
+		st[20] ^= bc[4];
+
+		st[1] ^= bc[0];
+		st[6] ^= bc[0];
+		st[11] ^= bc[0];
+		st[16] ^= bc[0];
+		st[21] ^= bc[0];
+
+		st[2] ^= bc[1];
+		st[7] ^= bc[1];
+		st[12] ^= bc[1];
+		st[17] ^= bc[1];
+		st[22] ^= bc[1];
+
+		st[3] ^= bc[2];
+		st[8] ^= bc[2];
+		st[13] ^= bc[2];
+		st[18] ^= bc[2];
+		st[23] ^= bc[2];
+
+		st[4] ^= bc[3];
+		st[9] ^= bc[3];
+		st[14] ^= bc[3];
+		st[19] ^= bc[3];
+		st[24] ^= bc[3];
+
+		// Rho Pi
+		t = st[1];
+		#pragma unroll
+		for (i = 0; i < 24; ++i) {
+			bc[0] = st[keccakf_piln[i]];
+			st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]);
+			t = bc[0];
+		}
+
+		#pragma unroll
+		for(int i = 0; i < 25; i += 5)
+		{
+			ulong tmp1 = st[i], tmp2 = st[i + 1];
+
+			st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
+			st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
+			st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
+			st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
+			st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
+		}
+
+		//  Iota
+		st[0] ^= keccakf_rndc[round];
+	}
 }
 
 #define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)
@@ -347,7 +347,7 @@ void keccakf1600_2(__local ulong *st)
 
 inline uint getIdx()
 {
-    return get_global_id(0) - get_global_offset(0);
+	return get_global_id(0) - get_global_offset(0);
 }
 
 //#include "opencl/fast_int_math_v2.cl"
@@ -375,24 +375,24 @@ R"===(
 
 void CNKeccak(ulong *output, ulong *input)
 {
-    ulong st[25];
+	ulong st[25];
 
-    // Copy 72 bytes
-    for(int i = 0; i < 9; ++i) st[i] = input[i];
+	// Copy 72 bytes
+	for(int i = 0; i < 9; ++i) st[i] = input[i];
 
-    // Last four and '1' bit for padding
-    //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
+	// Last four and '1' bit for padding
+	//st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
 
-    st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
+	st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
 
-    for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
+	for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
 
-    // Last bit of padding
-    st[16] = 0x8000000000000000UL;
+	// Last bit of padding
+	st[16] = 0x8000000000000000UL;
 
-    keccakf1600_1(st);
+	keccakf1600_1(st);
 
-    for(int i = 0; i < 25; ++i) output[i] = st[i];
+	for(int i = 0; i < 25; ++i) output[i] = st[i];
 }
 
 static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };
@@ -403,17 +403,17 @@ static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x
 
 void AESExpandKey256(uint *keybuf)
 {
-    //#pragma unroll 4
-    for(uint c = 8, i = 1; c < 40; ++c)
-    {
-        // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
-        uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
-
-        // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant,
-        // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation
-        // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done.
-        keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t);
-    }
+	//#pragma unroll 4
+	for(uint c = 8, i = 1; c < 40; ++c)
+	{
+		// For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
+		uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
+
+		// If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant,
+		// then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation
+		// is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done.
+		keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t);
+	}
 }
 
 )==="
@@ -424,159 +424,159 @@ R"===(
 __attribute__((reqd_work_group_size(8, 8, 1)))
 __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
 {
-    uint ExpandedKey1[40];
-    __local uint AES0[256], AES1[256], AES2[256], AES3[256];
-    uint4 text;
+	uint ExpandedKey1[40];
+	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
+	uint4 text;
 
-    const uint gIdx = getIdx();
+	const uint gIdx = getIdx();
 
 	for(int i = get_local_id(1) * 8 + get_local_id(0);
 		i < 256;
 		i += 8 * 8)
 	{
-        const uint tmp = AES0_C[i];
-        AES0[i] = tmp;
-        AES1[i] = rotate(tmp, 8U);
-        AES2[i] = rotate(tmp, 16U);
-        AES3[i] = rotate(tmp, 24U);
-    }
+		const uint tmp = AES0_C[i];
+		AES0[i] = tmp;
+		AES1[i] = rotate(tmp, 8U);
+		AES2[i] = rotate(tmp, 16U);
+		AES3[i] = rotate(tmp, 24U);
+	}
 
-    __local ulong State_buf[8 * 25];
+	__local ulong State_buf[8 * 25];
 
-    barrier(CLK_LOCAL_MEM_FENCE);
+	barrier(CLK_LOCAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
-    // do not use early return here
+	// do not use early return here
 	if(gIdx < Threads)
 #endif
-    {
-        states += 25 * gIdx;
+	{
+		states += 25 * gIdx;
 
 #if(STRIDED_INDEX==0)
-        Scratchpad += gIdx * (MEMORY >> 4);
+		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
-        Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
 #elif(STRIDED_INDEX==3)
 		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
-        if (get_local_id(1) == 0)
-        {
-            __local ulong* State = State_buf + get_local_id(0) * 25;
+		if (get_local_id(1) == 0)
+		{
+			__local ulong* State = State_buf + get_local_id(0) * 25;
 // NVIDIA
 #ifdef __NV_CL_C_VERSION
 			for(uint i = 0; i < 8; ++i)
 				State[i] = input[i];
 #else
-            ((__local ulong8 *)State)[0] = vload8(0, input);
+			((__local ulong8 *)State)[0] = vload8(0, input);
 #endif
-            State[8]  = input[8];
-            State[9]  = input[9];
-            State[10] = input[10];
-
-            ((__local uint *)State)[9]  &= 0x00FFFFFFU;
-            ((__local uint *)State)[9]  |= (((uint)get_global_id(0)) & 0xFF) << 24;
-            ((__local uint *)State)[10] &= 0xFF000000U;
-            /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
-             * handle get_global_id and get_global_offset as signed long long int and add
-             * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
-             * (even if it is correct casted to unsigned on the host)
-             */
-            ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
-
-            for (int i = 11; i < 25; ++i) {
-                State[i] = 0x00UL;
-            }
-
-            // Last bit of padding
-            State[16] = 0x8000000000000000UL;
-
-            keccakf1600_2(State);
-
-            #pragma unroll
-            for (int i = 0; i < 25; ++i) {
-                states[i] = State[i];
-            }
-        }
-    }
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
+			State[8]  = input[8];
+			State[9]  = input[9];
+			State[10] = input[10];
+
+			((__local uint *)State)[9]  &= 0x00FFFFFFU;
+			((__local uint *)State)[9]  |= (((uint)get_global_id(0)) & 0xFF) << 24;
+			((__local uint *)State)[10] &= 0xFF000000U;
+			/* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
+			 * handle get_global_id and get_global_offset as signed long long int and add
+			 * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
+			 * (even if it is correct casted to unsigned on the host)
+			 */
+			((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
+
+			for (int i = 11; i < 25; ++i) {
+				State[i] = 0x00UL;
+			}
+
+			// Last bit of padding
+			State[16] = 0x8000000000000000UL;
+
+			keccakf1600_2(State);
+
+			#pragma unroll
+			for (int i = 0; i < 25; ++i) {
+				states[i] = State[i];
+			}
+		}
+	}
+
+	barrier(CLK_GLOBAL_MEM_FENCE);
 
 #   if (COMP_MODE == 1)
-    // do not use early return here
-    if (gIdx < Threads)
+	// do not use early return here
+	if (gIdx < Threads)
 #   endif
-    {
-        text = vload4(get_local_id(1) + 4, (__global uint *)(states));
+	{
+		text = vload4(get_local_id(1) + 4, (__global uint *)(states));
 
-        #pragma unroll
-        for (int i = 0; i < 4; ++i) {
-            ((ulong *)ExpandedKey1)[i] = states[i];
-        }
+		#pragma unroll
+		for (int i = 0; i < 4; ++i) {
+			((ulong *)ExpandedKey1)[i] = states[i];
+		}
 
-        AESExpandKey256(ExpandedKey1);
-    }
+		AESExpandKey256(ExpandedKey1);
+	}
 
-    mem_fence(CLK_LOCAL_MEM_FENCE);
+	mem_fence(CLK_LOCAL_MEM_FENCE);
 
 #if (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-    __local uint4 xin[8][8];
-    {
-
-        /* Also left over threads perform this loop.
-         * The left over thread results will be ignored
-         */
-        #pragma unroll 16
-        for (size_t i = 0; i < 16; i++) {
-            #pragma unroll 10
-            for (int j = 0; j < 10; ++j) {
-                uint4 t = ((uint4 *)ExpandedKey1)[j];
-                t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)];
-                t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)];
-                t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)];
-                t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)];
-                text = t;
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-            xin[get_local_id(1)][get_local_id(0)] = text;
-            barrier(CLK_LOCAL_MEM_FENCE);
-            text = mix_and_propagate(xin);
-        }
-    }
+	__local uint4 xin[8][8];
+	{
+
+		/* Also left over threads perform this loop.
+		 * The left over thread results will be ignored
+		 */
+		#pragma unroll 16
+		for (size_t i = 0; i < 16; i++) {
+			#pragma unroll 10
+			for (int j = 0; j < 10; ++j) {
+				uint4 t = ((uint4 *)ExpandedKey1)[j];
+				t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)];
+				t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)];
+				t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)];
+				t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)];
+				text = t;
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+			xin[get_local_id(1)][get_local_id(0)] = text;
+			barrier(CLK_LOCAL_MEM_FENCE);
+			text = mix_and_propagate(xin);
+		}
+	}
 #endif
 
 #if(COMP_MODE==1)
-    // do not use early return here
+	// do not use early return here
 	if(gIdx < Threads)
 #endif
-    {
-
-        #pragma unroll 2
-        for(int i = 0; i < (MEMORY >> 4); i += 8) {
-            #pragma unroll 10
-            for (int j = 0; j < 10; ++j) {
-                uint4 t = ((uint4 *)ExpandedKey1)[j];
-                t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)];
-                t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)];
-                t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)];
-                t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)];
-                text = t;
-            }
-
-            Scratchpad[IDX(i + get_local_id(1))] = text;
-        }
-    }
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
+	{
+
+		#pragma unroll 2
+		for(int i = 0; i < (MEMORY >> 4); i += 8) {
+			#pragma unroll 10
+			for (int j = 0; j < 10; ++j) {
+				uint4 t = ((uint4 *)ExpandedKey1)[j];
+				t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)];
+				t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)];
+				t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)];
+				t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)];
+				text = t;
+			}
+
+			Scratchpad[IDX(i + get_local_id(1))] = text;
+		}
+	}
+	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 )==="
 R"===(
 
 // __NV_CL_C_VERSION checks if NVIDIA opencl is used
-#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && defined(__NV_CL_C_VERSION))
+#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
 #	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
 #else
@@ -593,7 +593,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 {
 	ulong a[2];
 
-#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+#if(ALGO == cryptonight_monero_v8)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
@@ -605,62 +605,62 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	ulong b[2];
 	uint4 b_x[1];
 #endif
-    __local uint AES0[256], AES1[256];
+	__local uint AES0[256], AES1[256];
 
-#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+#if(ALGO == cryptonight_monero_v8)
 #	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
-    __local uint RCP[256];
+	__local uint RCP[256];
 #	endif
 
 	uint2 division_result;
 	uint sqrt_result;
 #endif
-    const uint gIdx = getIdx();
+	const uint gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
-        const uint tmp = AES0_C[i];
-        AES0[i] = tmp;
-        AES1[i] = rotate(tmp, 8U);
+		const uint tmp = AES0_C[i];
+		AES0[i] = tmp;
+		AES1[i] = rotate(tmp, 8U);
 
-#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
+#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
-    }
+	}
 
-    barrier(CLK_LOCAL_MEM_FENCE);
+	barrier(CLK_LOCAL_MEM_FENCE);
 
 #if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-    uint2 tweak1_2;
+	uint2 tweak1_2;
 #endif
 
 #if(COMP_MODE==1)
-    // do not use early return here
+	// do not use early return here
 	if(gIdx < Threads)
 #endif
-    {
-        states += 25 * gIdx;
+	{
+		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
-        Scratchpad += gIdx * (MEMORY >> 4);
+		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
-        Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #elif(STRIDED_INDEX==3)
 		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
-        a[0] = states[0] ^ states[4];
-        b[0] = states[2] ^ states[6];
-        a[1] = states[1] ^ states[5];
-        b[1] = states[3] ^ states[7];
+		a[0] = states[0] ^ states[4];
+		b[0] = states[2] ^ states[6];
+		a[1] = states[1] ^ states[5];
+		b[1] = states[3] ^ states[7];
 
 		b_x[0] = ((uint4 *)b)[0];
 
-#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
-        a[1] = states[1] ^ states[5];
-        b[2] = states[8] ^ states[10];
-        b[3] = states[9] ^ states[11];
+#if(ALGO == cryptonight_monero_v8)
+		a[1] = states[1] ^ states[5];
+		b[2] = states[8] ^ states[10];
+		b[3] = states[9] ^ states[11];
 		b_x[1] = ((uint4 *)b)[1];
 		division_result = as_uint2(states[12]);
 		sqrt_result = as_uint2(states[13]).s0;
@@ -673,23 +673,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		tweak1_2.s1 = (uint)get_global_id(0);
 		tweak1_2 ^= as_uint2(states[24]);
 #endif
-    }
+	}
 
-    mem_fence(CLK_LOCAL_MEM_FENCE);
+	mem_fence(CLK_LOCAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
-    // do not use early return here
+	// do not use early return here
 	if(gIdx < Threads)
 #endif
-    {
+	{
 		uint idx0 = as_uint2(a[0]).s0 & MASK;
 
 		#pragma unroll CN_UNROLL
-    for(int i = 0; i < ITERATIONS; ++i)
-    {
+	for(int i = 0; i < ITERATIONS; ++i)
+	{
 			ulong c[2];
 
-#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && defined(__NV_CL_C_VERSION))
+#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
 			uint idxS = idx0 & 0x30U;
  			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
@@ -702,15 +702,15 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #endif
 
-#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
-        {
+#if(ALGO == cryptonight_monero_v8)
+		{
 			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
 			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
 			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
 			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
-            SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
-        }
+			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+		}
 #endif
 
 #if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
@@ -726,7 +726,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(0) = b_x[0];
 			idx0 = as_uint2(c[0]).s0 & MASK;
 
-#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+#elif(ALGO == cryptonight_monero_v8)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
@@ -745,10 +745,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
 
-#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+#if(ALGO == cryptonight_monero_v8)
 			// Use division and square root results from the _previous_ iteration to hide the latency
-            tmp.s0 ^= division_result.s0;
-            tmp.s1 ^= division_result.s1 ^ sqrt_result;
+			tmp.s0 ^= division_result.s0;
+			tmp.s1 ^= division_result.s1 ^ sqrt_result;
  			// Most and least significant bits in the divisor are set to 1
 			// to make sure we don't divide by a small or even number,
 			// so there are no shortcuts for such cases
@@ -799,9 +799,9 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 #endif
 
-        ((uint4 *)a)[0] ^= tmp;
+		((uint4 *)a)[0] ^= tmp;
 
-#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+#if (ALGO == cryptonight_monero_v8)
 #	if defined(__NV_CL_C_VERSION)
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
@@ -814,7 +814,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if (ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-            long q = fast_div_heavy(n, d | 0x5);
+			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = (d ^ as_int2(q).s0) & MASK;
 #elif (ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
@@ -825,9 +825,9 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			idx0 = ((~d) ^ as_int2(q).s0) & MASK;
 #endif
 
-    }
-    }
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
+	}
+	}
+	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 )==="
@@ -842,156 +842,156 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	__global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
 #endif
 {
-    __local uint AES0[256], AES1[256], AES2[256], AES3[256];
-    uint ExpandedKey2[40];
-    uint4 text;
+	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
+	uint ExpandedKey2[40];
+	uint4 text;
 
-    const uint gIdx = getIdx();
+	const uint gIdx = getIdx();
 
-    for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) {
-        const uint tmp = AES0_C[i];
-        AES0[i] = tmp;
-        AES1[i] = rotate(tmp, 8U);
-        AES2[i] = rotate(tmp, 16U);
-        AES3[i] = rotate(tmp, 24U);
-    }
+	for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) {
+		const uint tmp = AES0_C[i];
+		AES0[i] = tmp;
+		AES1[i] = rotate(tmp, 8U);
+		AES2[i] = rotate(tmp, 16U);
+		AES3[i] = rotate(tmp, 24U);
+	}
 
-    barrier(CLK_LOCAL_MEM_FENCE);
+	barrier(CLK_LOCAL_MEM_FENCE);
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2  || ALGO == cryptonight_superfast)
-    __local uint4 xin1[8][8];
-    __local uint4 xin2[8][8];
+	__local uint4 xin1[8][8];
+	__local uint4 xin2[8][8];
 #endif
 
 #if(COMP_MODE==1)
-    // do not use early return here
-    if(gIdx < Threads)
+	// do not use early return here
+	if(gIdx < Threads)
 #endif
-    {
-        states += 25 * gIdx;
+	{
+		states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
-        Scratchpad += gIdx * (MEMORY >> 4);
+		Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
 		Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
-        Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
+		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
 #elif(STRIDED_INDEX==3)
 		Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE);
 #endif
 
-        #if defined(__Tahiti__) || defined(__Pitcairn__)
+		#if defined(__Tahiti__) || defined(__Pitcairn__)
 
-        for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
-        text = vload4(get_local_id(1) + 4, (__global uint *)states);
+		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
+		text = vload4(get_local_id(1) + 4, (__global uint *)states);
 
-        #else
+		#else
 
-        text = vload4(get_local_id(1) + 4, (__global uint *)states);
-        ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
+		text = vload4(get_local_id(1) + 4, (__global uint *)states);
+		((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
 
-        #endif
+		#endif
 
-        AESExpandKey256(ExpandedKey2);
-    }
+		AESExpandKey256(ExpandedKey2);
+	}
 
-    barrier(CLK_LOCAL_MEM_FENCE);
+	barrier(CLK_LOCAL_MEM_FENCE);
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-    __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
-    __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
-    __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
-    __local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)];
-    *xin2_store = (uint4)(0, 0, 0, 0);
+	__local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
+	__local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
+	__local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
+	__local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)];
+	*xin2_store = (uint4)(0, 0, 0, 0);
 #endif
 
 #if(COMP_MODE == 1)
-    // do not use early return here
-    if (gIdx < Threads)
+	// do not use early return here
+	if (gIdx < Threads)
 #endif
-    {
+	{
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-        #pragma unroll 2
-        for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
-        {
-            text ^= Scratchpad[IDX((uint)i1)];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            text ^= *xin2_load;
+		#pragma unroll 2
+		for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
+		{
+			text ^= Scratchpad[IDX((uint)i1)];
+			barrier(CLK_LOCAL_MEM_FENCE);
+			text ^= *xin2_load;
 
-            #pragma unroll 10
-            for(int j = 0; j < 10; ++j)
-                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+			#pragma unroll 10
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
 
-            *xin1_store = text;
+			*xin1_store = text;
 
-            text ^= Scratchpad[IDX((uint)i1 + 8u)];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            text ^= *xin1_load;
+			text ^= Scratchpad[IDX((uint)i1 + 8u)];
+			barrier(CLK_LOCAL_MEM_FENCE);
+			text ^= *xin1_load;
 
-            #pragma unroll 10
-            for(int j = 0; j < 10; ++j)
-                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+			#pragma unroll 10
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
 
-            *xin2_store = text;
-        }
+			*xin2_store = text;
+		}
 
-        barrier(CLK_LOCAL_MEM_FENCE);
-        text ^= *xin2_load;
+		barrier(CLK_LOCAL_MEM_FENCE);
+		text ^= *xin2_load;
 
 #else
-        #pragma unroll 2
-        for (int i = 0; i < (MEMORY >> 7); ++i) {
-            text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))];
-
-            #pragma unroll 10
-            for(int j = 0; j < 10; ++j)
-                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-        }
+		#pragma unroll 2
+		for (int i = 0; i < (MEMORY >> 7); ++i) {
+			text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))];
+
+			#pragma unroll 10
+			for(int j = 0; j < 10; ++j)
+				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+		}
 #endif
-    }
+	}
 
 #if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-    /* Also left over threads performe this loop.
-     * The left over thread results will be ignored
-     */
-    #pragma unroll 16
-    for(size_t i = 0; i < 16; i++)
-    {
-        #pragma unroll 10
-        for (int j = 0; j < 10; ++j) {
-            text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-        *xin1_store = text;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        text ^= *xin1_load;
-    }
+	/* Left-over threads also perform this loop;
+	 * their results are ignored.
+	 */
+	#pragma unroll 16
+	for(size_t i = 0; i < 16; i++)
+	{
+		#pragma unroll 10
+		for (int j = 0; j < 10; ++j) {
+			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+		*xin1_store = text;
+		barrier(CLK_LOCAL_MEM_FENCE);
+		text ^= *xin1_load;
+	}
 #endif
 
-    __local ulong State_buf[8 * 25];
+	__local ulong State_buf[8 * 25];
 #if(COMP_MODE==1)
-    // do not use early return here
-    if(gIdx < Threads)
+	// do not use early return here
+	if(gIdx < Threads)
 #endif
-    {
-        vstore2(as_ulong2(text), get_local_id(1) + 4, states);
-    }
+	{
+		vstore2(as_ulong2(text), get_local_id(1) + 4, states);
+	}
 
-    barrier(CLK_GLOBAL_MEM_FENCE);
+	barrier(CLK_GLOBAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
-    // do not use early return here
-    if(gIdx < Threads)
+	// do not use early return here
+	if(gIdx < Threads)
 #endif
-    {
-        if(!get_local_id(1))
-        {
-            __local ulong* State = State_buf + get_local_id(0) * 25;
+	{
+		if(!get_local_id(1))
+		{
+			__local ulong* State = State_buf + get_local_id(0) * 25;
 
-            for(int i = 0; i < 25; ++i) State[i] = states[i];
+			for(int i = 0; i < 25; ++i) State[i] = states[i];
 
-            keccakf1600_2(State);
+			keccakf1600_2(State);
 
 #if (ALGO == cryptonight_gpu)
 			if(State[3] <= Target)
@@ -1001,297 +1001,326 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 					output[outIdx] = get_global_id(0);
 			}
 #else
-            for(int i = 0; i < 25; ++i) states[i] = State[i];
+			for(int i = 0; i < 25; ++i) states[i] = State[i];
 
-            uint StateSwitch = State[0] & 3;
-            __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
-            __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
-            __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
-            destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
+			uint StateSwitch = State[0] & 3;
+			__global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
+			__global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
+			__global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
+			destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
 #endif
-        }
-    }
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
+		}
+	}
+	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 )==="
 R"===(
 
 #define VSWAP8(x)	(((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \
-          | (((x) >>  8) & 0x00000000FF000000UL) | (((x) <<  8) & 0x000000FF00000000UL) \
-          | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL))
+		  | (((x) >>  8) & 0x00000000FF000000UL) | (((x) <<  8) & 0x000000FF00000000UL) \
+		  | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL))
 
 #define VSWAP4(x)	((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U))
 
 __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
 {
-    const ulong idx = get_global_id(0) - get_global_offset(0);
+	const uint idx = get_global_id(0) - get_global_offset(0);
 
-    // do not use early return here
-    if(idx < BranchBuf[Threads])
-    {
-        states += 25 * BranchBuf[idx];
-
-        // skein
-        ulong8 h = vload8(0, SKEIN512_256_IV);
+	// do not use early return here
+	if(idx < BranchBuf[Threads])
+	{
+		states += 25 * BranchBuf[idx];
 
-        // Type field begins with final bit, first bit, then six bits of type; the last 96
-        // bits are input processed (including in the block to be processed with that tweak)
-        // The output transform is only one run of UBI, since we need only 256 bits of output
-        // The tweak for the output transform is Type = Output with the Final bit set
-        // T[0] for the output is 8, and I don't know why - should be message size...
-        ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
-        ulong8 p, m;
+		// skein
+		ulong8 h = vload8(0, SKEIN512_256_IV);
 
-        for(uint i = 0; i < 4; ++i)
-        {
-            t[0] += i < 3 ? 0x40UL : 0x08UL;
+		// Type field begins with final bit, first bit, then six bits of type; the last 96
+		// bits are input processed (including in the block to be processed with that tweak)
+		// The output transform is only one run of UBI, since we need only 256 bits of output
+		// The tweak for the output transform is Type = Output with the Final bit set
+		// T[0] for the output is 8, and I don't know why - should be message size...
+		ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
+		ulong8 p, m;
 
-            t[2] = t[0] ^ t[1];
+		#pragma unroll 1
+		for (uint i = 0; i < 4; ++i)
+		{
+			t[0] += i < 3 ? 0x40UL : 0x08UL;
 
-            m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
-            const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
-            p = Skein512Block(m, h, h8, t);
+			t[2] = t[0] ^ t[1];
 
-            h = m ^ p;
+			m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
+			const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
+			p = Skein512Block(m, h, h8, t);
 
-            t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL;
-        }
+			h = m ^ p;
 
-        t[0] = 0x08UL;
-        t[1] = 0xFF00000000000000UL;
-        t[2] = t[0] ^ t[1];
+			t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL;
+		}
 
-        p = (ulong8)(0);
-        const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
+		t[0] = 0x08UL;
+		t[1] = 0xFF00000000000000UL;
+		t[2] = t[0] ^ t[1];
 
-        p = Skein512Block(p, h, h8, t);
+		p = (ulong8)(0);
+		const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
 
-        //vstore8(p, 0, output);
+		p = Skein512Block(p, h, h8, t);
 
-        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-        // and expect an accurate result for target > 32-bit without implementing carries
-		if(p.s3 <= Target)
+		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+		// and expect an accurate result for target > 32-bit without implementing carries
+		if (p.s3 <= Target)
 		{
-            ulong outIdx = atomic_inc(output + 0xFF);
+			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-        }
-    }
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
+		}
+	}
+	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 #define SWAP8(x)	as_ulong(as_uchar8(x).s76543210)
 
 #define JHXOR \
-    h0h ^= input[0]; \
-    h0l ^= input[1]; \
-    h1h ^= input[2]; \
-    h1l ^= input[3]; \
-    h2h ^= input[4]; \
-    h2l ^= input[5]; \
-    h3h ^= input[6]; \
-    h3l ^= input[7]; \
+	h0h ^= input[0]; \
+	h0l ^= input[1]; \
+	h1h ^= input[2]; \
+	h1l ^= input[3]; \
+	h2h ^= input[4]; \
+	h2l ^= input[5]; \
+	h3h ^= input[6]; \
+	h3l ^= input[7]; \
 \
-    E8; \
+	E8; \
 \
-    h4h ^= input[0]; \
-    h4l ^= input[1]; \
-    h5h ^= input[2]; \
-    h5l ^= input[3]; \
-    h6h ^= input[4]; \
-    h6l ^= input[5]; \
-    h7h ^= input[6]; \
-    h7l ^= input[7]
+	h4h ^= input[0]; \
+	h4l ^= input[1]; \
+	h5h ^= input[2]; \
+	h5l ^= input[3]; \
+	h6h ^= input[4]; \
+	h6l ^= input[5]; \
+	h7h ^= input[6]; \
+	h7l ^= input[7]
 
 __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
 {
-    const uint idx = get_global_id(0) - get_global_offset(0);
-
-    // do not use early return here
-    if(idx < BranchBuf[Threads])
-    {
-        states += 25 * BranchBuf[idx];
-
-        sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
-        sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
-        sph_u64 tmp;
-
-        for(int i = 0; i < 3; ++i)
-        {
-            ulong input[8];
-
-            const int shifted = i << 3;
-            for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]);
-            JHXOR;
-        }
-        {
-            ulong input[8];
-            input[0] = (states[24]);
-            input[1] = 0x80UL;
-            #pragma unroll 6
-            for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
-            JHXOR;
-        }
-        {
-            ulong input[8];
-            for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
-            input[7] = 0x4006000000000000UL;
-            JHXOR;
-        }
-
-        //output[0] = h6h;
-        //output[1] = h6l;
-        //output[2] = h7h;
-        //output[3] = h7l;
-
-        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-        // and expect an accurate result for target > 32-bit without implementing carries
+	const uint idx = get_global_id(0) - get_global_offset(0);
+
+	// do not use early return here
+	if(idx < BranchBuf[Threads])
+	{
+		states += 25 * BranchBuf[idx];
+
+		sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
+		sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
+		sph_u64 tmp;
+
+		#pragma unroll 1
+		for(uint i = 0; i < 3; ++i)
+		{
+			ulong input[8];
+
+			const int shifted = i << 3;
+			for (uint x = 0; x < 8; ++x)
+			{
+				input[x] = (states[shifted + x]);
+			}
+
+			JHXOR;
+		}
+
+		{
+			ulong input[8] = { (states[24]), 0x80UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL };
+			JHXOR;
+		}
+
+		{
+			ulong input[8] = { 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x4006000000000000UL };
+			JHXOR;
+		}
+
+		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+		// and expect an accurate result for target > 32-bit without implementing carries
 		if(h7l <= Target)
 		{
-            ulong outIdx = atomic_inc(output + 0xFF);
+			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-            }
-        }
-    }
+		}
+	}
+}
 
 #define SWAP4(x)	as_uint(as_uchar4(x).s3210)
 
 __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
 {
-    const uint idx = get_global_id(0) - get_global_offset(0);
-
-    // do not use early return here
-    if(idx < BranchBuf[Threads])
-    {
-        states += 25 * BranchBuf[idx];
-
-        unsigned int m[16];
-        unsigned int v[16];
-        uint h[8];
-
-        ((uint8 *)h)[0] = vload8(0U, c_IV256);
-
-        for(uint i = 0, bitlen = 0; i < 4; ++i)
-        {
-            if(i < 3)
-            {
-                ((uint16 *)m)[0] = vload16(i, (__global uint *)states);
-                for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]);
-                bitlen += 512;
-            }
-            else
-            {
-                m[0] = SWAP4(((__global uint *)states)[48]);
-                m[1] = SWAP4(((__global uint *)states)[49]);
-                m[2] = 0x80000000U;
-
-                for(int i = 3; i < 13; ++i) m[i] = 0x00U;
-
-                m[13] = 1U;
-                m[14] = 0U;
-                m[15] = 0x640;
-                bitlen += 64;
-            }
-
-            ((uint16 *)v)[0].lo = ((uint8 *)h)[0];
-            ((uint16 *)v)[0].hi = vload8(0U, c_u256);
-
-            //v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-            //v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-
-            v[12] ^= bitlen;
-            v[13] ^= bitlen;
-
-            for(int r = 0; r < 14; r++)
-            {
-                GS(0, 4, 0x8, 0xC, 0x0);
-                GS(1, 5, 0x9, 0xD, 0x2);
-                GS(2, 6, 0xA, 0xE, 0x4);
-                GS(3, 7, 0xB, 0xF, 0x6);
-                GS(0, 5, 0xA, 0xF, 0x8);
-                GS(1, 6, 0xB, 0xC, 0xA);
-                GS(2, 7, 0x8, 0xD, 0xC);
-                GS(3, 4, 0x9, 0xE, 0xE);
-            }
-
-            ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
-        }
-
-        for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]);
-
-        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-        // and expect an accurate result for target > 32-bit without implementing carries
-        uint2 t = (uint2)(h[6],h[7]);
-		if( as_ulong(t) <= Target)
+	const uint idx = get_global_id(0) - get_global_offset(0);
+
+	// do not use early return here
+	if(idx < BranchBuf[Threads])
+	{
+		states += 25 * BranchBuf[idx];
+
+		unsigned int m[16];
+		unsigned int v[16];
+		uint h[8];
+		uint bitlen = 0;
+
+		((uint8 *)h)[0] = vload8(0U, c_IV256);
+
+		#pragma unroll 1
+		for (uint i = 0; i < 3; ++i)
 		{
-            ulong outIdx = atomic_inc(output + 0xFF);
+			((uint16 *)m)[0] = vload16(i, (__global uint *)states);
+			for (uint x = 0; x < 16; ++x)
+			{
+				m[x] = SWAP4(m[x]);
+			}
+
+			bitlen += 512;
+
+			((uint16 *)v)[0].lo = ((uint8 *)h)[0];
+			((uint16 *)v)[0].hi = vload8(0U, c_u256);
+
+			v[12] ^= bitlen;
+			v[13] ^= bitlen;
+
+			for (uint r = 0; r < 14; r++) {
+				GS(0, 4, 0x8, 0xC, 0x0);
+				GS(1, 5, 0x9, 0xD, 0x2);
+				GS(2, 6, 0xA, 0xE, 0x4);
+				GS(3, 7, 0xB, 0xF, 0x6);
+				GS(0, 5, 0xA, 0xF, 0x8);
+				GS(1, 6, 0xB, 0xC, 0xA);
+				GS(2, 7, 0x8, 0xD, 0xC);
+				GS(3, 4, 0x9, 0xE, 0xE);
+			}
+
+			((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
+		}
+
+		m[0]  = SWAP4(((__global uint *)states)[48]);
+		m[1]  = SWAP4(((__global uint *)states)[49]);
+		m[2]  = 0x80000000U;
+		m[3]  = 0x00U;
+		m[4]  = 0x00U;
+		m[5]  = 0x00U;
+		m[6]  = 0x00U;
+		m[7]  = 0x00U;
+		m[8]  = 0x00U;
+		m[9]  = 0x00U;
+		m[10] = 0x00U;
+		m[11] = 0x00U;
+		m[12] = 0x00U;
+		m[13] = 1U;
+		m[14] = 0U;
+		m[15] = 0x640;
+
+		bitlen += 64;
+
+		((uint16 *)v)[0].lo = ((uint8 *)h)[0];
+		((uint16 *)v)[0].hi = vload8(0U, c_u256);
+
+		v[12] ^= bitlen;
+		v[13] ^= bitlen;
+
+		for (uint r = 0; r < 14; r++) {
+			GS(0, 4, 0x8, 0xC, 0x0);
+			GS(1, 5, 0x9, 0xD, 0x2);
+			GS(2, 6, 0xA, 0xE, 0x4);
+			GS(3, 7, 0xB, 0xF, 0x6);
+			GS(0, 5, 0xA, 0xF, 0x8);
+			GS(1, 6, 0xB, 0xC, 0xA);
+			GS(2, 7, 0x8, 0xD, 0xC);
+			GS(3, 4, 0x9, 0xE, 0xE);
+		}
+
+		((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
+
+		for (uint i = 0; i < 8; ++i) {
+			h[i] = SWAP4(h[i]);
+		}
+
+		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+		// and expect an accurate result for target > 32-bit without implementing carries
+		uint2 t = (uint2)(h[6],h[7]);
+		if(as_ulong(t) <= Target)
+		{
+			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-            }
-        }
-    }
-
-__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
-{
-    const uint idx = get_global_id(0) - get_global_offset(0);
+		}
+	}
+}
 
-    // do not use early return here
-    if(idx < BranchBuf[Threads])
-    {
-        states += 25 * BranchBuf[idx];
+#undef SWAP4
 
-        ulong State[8];
 
-        for(int i = 0; i < 7; ++i) State[i] = 0UL;
+__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
+{
+	const uint idx = get_global_id(0) - get_global_offset(0);
 
-        State[7] = 0x0001000000000000UL;
+	// do not use early return here
+	if(idx < BranchBuf[Threads])
+	{
+		states += 25 * BranchBuf[idx];
 
-        #pragma unroll 4
-        for(uint i = 0; i < 4; ++i)
-        {
-            volatile ulong H[8], M[8];
+		ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL };
+		volatile ulong H[8], M[8];
 
-            if(i < 3)
-            {
-                ((ulong8 *)M)[0] = vload8(i, states);
-            }
-            else
-            {
-                M[0] = states[24];
-                M[1] = 0x80UL;
+		for (uint i = 0; i < 3; ++i) {
+			((ulong8 *)M)[0] = vload8(i, states);
 
-                for(int x = 2; x < 7; ++x) M[x] = 0UL;
+			for (uint x = 0; x < 8; ++x) {
+				H[x] = M[x] ^ State[x];
+			}
 
-                M[7] = 0x0400000000000000UL;
-            }
+			PERM_SMALL_P(H);
+			PERM_SMALL_Q(M);
 
-            for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x];
+			for (uint x = 0; x < 8; ++x)
+			{
+				State[x] ^= H[x] ^ M[x];
+			}
+		}
 
-            PERM_SMALL_P(H);
-            PERM_SMALL_Q(M);
+		M[0] = states[24];
+		M[1] = 0x80UL;
+		M[2] = 0UL;
+		M[3] = 0UL;
+		M[4] = 0UL;
+		M[5] = 0UL;
+		M[6] = 0UL;
+		M[7] = 0x0400000000000000UL;
 
-            for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x];
-        }
+		for (uint x = 0; x < 8; ++x) {
+			H[x] = M[x] ^ State[x];
+		}
 
-        ulong tmp[8];
+		PERM_SMALL_P(H);
+		PERM_SMALL_Q(M);
 
-        for(int i = 0; i < 8; ++i) tmp[i] = State[i];
+		ulong tmp[8];
+		for (uint i = 0; i < 8; ++i) {
+			tmp[i] = State[i] ^= H[i] ^ M[i];
+		}
 
-        PERM_SMALL_P(State);
+		PERM_SMALL_P(State);
 
-        for(int i = 0; i < 8; ++i) State[i] ^= tmp[i];
+		for (uint i = 0; i < 8; ++i) {
+			State[i] ^= tmp[i];
+		}
 
-        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-        // and expect an accurate result for target > 32-bit without implementing carries
+		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+		// and expect an accurate result for target > 32-bit without implementing carries
 		if(State[7] <= Target)
 		{
-            ulong outIdx = atomic_inc(output + 0xFF);
+			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-            }
-        }
-    }
+		}
+	}
+}
 
-)==="
\ No newline at end of file
+)==="
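A note on the four finalisation kernels above (Skein, JH, Blake, Groestl): they deliberately avoid early returns because every work-item of a group must reach the barriers, which is why the "do not use early return here" guards are kept; and they all share one result convention, namely that the 64-bit word at byte offset 24 of the 32-byte hash is compared against the pool target and matching indices are appended to a 255-slot buffer whose counter lives at slot 0xFF. A minimal host-side sketch of that convention follows; the helper names are illustrative and the counter is shown non-atomic, whereas the kernels use atomic_inc.

```cpp
#include <cstdint>
#include <cstring>

// Mirrors the device-side checks (p.s3, h7l, as_ulong(t), State[7] <= Target):
// bytes 24..31 of the 32-byte hash are compared as one 64-bit little-endian word.
bool meets_target(const uint8_t hash[32], uint64_t target)
{
	uint64_t top;
	std::memcpy(&top, hash + 24, sizeof(top));
	return top <= target;
}

// Mirrors `output[atomic_inc(output + 0xFF)] = ...`: slots 0..0xFE hold the
// candidate indices, slot 0xFF is the shared counter (atomic on the device).
void push_result(uint32_t* output, uint32_t candidate)
{
	const uint32_t outIdx = output[0xFF]++;
	if(outIdx < 0xFF)
		output[outIdx] = candidate;
}
```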
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
index a99243e4402724057780df88ae64a98cf097933f..880aa85849b659b09c5eb35ddea61042bcdb87fb 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -13,8 +13,6 @@ inline float4 _mm_sub_ps(float4 a, float4 b)
 
 inline float4 _mm_mul_ps(float4 a, float4 b)
 {
-
-	//#pragma OPENCL SELECT_ROUNDING_MODE rte
 	return a * b;
 }
 
@@ -195,13 +193,19 @@ static const __constant float ccnt[16] = {
 	1.4609375f
 };
 
+struct SharedMemChunk
+{
+	int4 out[16];
+	float4 va[16];
+};
+
 __attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1)))
 __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads)
 {
 	const uint gIdx = getIdx();
 
 #if(COMP_MODE==1)
-	if(gIdx < Threads)
+	if(gIdx/16 >= numThreads)
 		return;
 #endif
 
@@ -211,13 +215,8 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
 	__global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16));
 #endif
 
-	__local int4 smem2[1 * 4 * WORKSIZE];
-	__local int4 smemOut2[1 * 16 * WORKSIZE];
-	__local float4 smemVa2[1 * 16 * WORKSIZE];
-
-	__local int4* smem = smem2 + 4 * chunk;
-	__local int4* smemOut = smemOut2 + 16 * chunk;
-	__local float4* smemVa = smemVa2 + 16 * chunk;
+	__local struct SharedMemChunk smem_in[WORKSIZE];
+	__local struct SharedMemChunk* smem = smem_in + chunk;
 
 	uint tid = get_local_id(0) % 16;
 
@@ -225,67 +224,72 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
 	uint s = ((__global uint*)spad)[idxHash * 50] >> 8;
 	float4 vs = (float4)(0);
 
+	// tid / 4: index of this lane's group of four
+	const uint tidd = tid / 4;
+	// tid % 4: position within that group
+	const uint tidm = tid % 4;
+	const uint block = tidd * 16 + tidm;
+
+	#pragma unroll CN_UNROLL
 	for(size_t i = 0; i < ITERATIONS; i++)
 	{
 		mem_fence(CLK_LOCAL_MEM_FENCE);
-		((__local int*)smem)[tid] = ((__global int*)scratchpad_ptr(s, tid/4, lpad))[tid%4];
+		int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm];
+		((__local int*)smem)[tid] = tmp;
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 
-		float4 rc = vs;
-
 		{
 			single_comupte_wrap(
-				tid%4,
-				*(smem + look[tid][0]),
-				*(smem + look[tid][1]),
-				*(smem + look[tid][2]),
-				*(smem + look[tid][3]),
-				ccnt[tid], rc, smemVa + tid,
-				smemOut + tid
+				tidm,
+				*(smem->out + look[tid][0]),
+				*(smem->out + look[tid][1]),
+				*(smem->out + look[tid][2]),
+				*(smem->out + look[tid][3]),
+				ccnt[tid], vs, smem->va + tid,
+				smem->out + tid
 			);
 		}
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 
-		int4 tmp2;
-		if(tid % 4 == 0)
-		{
-			int4 out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]);
-			int4 out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]);
-			out = _mm_xor_si128(out, out2);
-			tmp2 = out;
-			*scratchpad_ptr(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out);
-		}
+		int outXor = ((__local int*)smem->out)[block];
+		for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
+			outXor ^= ((__local int*)smem->out)[dd];
+
+		((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp;
+		((__local int*)smem->out)[tid] = outXor;
+
+		float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
+		float va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12];
+		((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2;
+
 		mem_fence(CLK_LOCAL_MEM_FENCE);
-		if(tid % 4 == 0)
-			smemOut[tid] = tmp2;
+
+		int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12];
+		va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
+		va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12];
+		va_tmp1 = va_tmp1 + va_tmp2;
+		va_tmp1 = fabs(va_tmp1);
+
+		float xx = va_tmp1 * 16777216.0f;
+		int xx_int = (int)xx;
+		((__local int*)smem->out)[tid] = out2 ^ xx_int;
+		((__local float*)smem->va)[tid] = va_tmp1 / 64.0f;
+
 		mem_fence(CLK_LOCAL_MEM_FENCE);
-		int4 out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12];
-
-		if(tid%2 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 1];
-		if(tid%4 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 2];
-		if(tid%8 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 4];
-		if(tid%16 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 8];
-		vs = smemVa[0];
-
-		vs = fabs(vs); // take abs(va) by masking the float sign bit
-		float4 xx = _mm_mul_ps(vs, (float4)(16777216.0f));
-		// vs range 0 - 64
-		int4 tmp = convert_int4_rte(xx);
-		tmp = _mm_xor_si128(tmp, out2);
-		// vs is now between 0 and 1
-		vs = _mm_div_ps(vs, (float4)(64.0f));
-		s = tmp.x ^ tmp.y ^ tmp.z ^ tmp.w;
+
+		vs = smem->va[0];
+		s = smem->out->x ^ smem->out->y ^ smem->out->z ^ smem->out->w;
 	}
 }
 
 )==="
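For readers following the cn1_cn_gpu rewrite above: the old code let only the lanes with tid % 4 == 0 combine the four int4 results of their group and then broadcast them, while the new code keeps all 16 lanes busy, each lane XOR-reducing exactly one 32-bit component of its group (block = (tid / 4) * 16 + tid % 4, then strides of 4). A plain C++ reference of that per-component step, with illustrative names only:

```cpp
#include <array>
#include <cstdint>

// out holds the 16 int4 vectors of one hash, flattened to 64 ints, exactly
// like ((__local int*)smem->out) in the kernel. Lane tid reduces component
// tid % 4 of group tid / 4; the four lanes of a group together produce the
// group's full int4 XOR, one component each.
std::array<int32_t, 16> reduce_groups(const std::array<int32_t, 64>& out)
{
	std::array<int32_t, 16> lane_result{};
	for(uint32_t tid = 0; tid < 16; ++tid)
	{
		const uint32_t block = (tid / 4) * 16 + (tid % 4);
		int32_t outXor = out[block];
		for(uint32_t dd = block + 4; dd < (tid / 4 + 1) * 16; dd += 4)
			outXor ^= out[dd];
		lane_result[tid] = outXor;
	}
	return lane_result;
}
```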
 R"===(
 
-inline void generate_512(ulong idx, __local ulong* in, __global ulong* out)
+static const __constant uint skip[3] = {
+	20,22,22
+};
+
+inline void generate_512(uint idx, __local ulong* in, __global ulong* out)
 {
 	ulong hash[25];
 
@@ -293,19 +297,13 @@ inline void generate_512(ulong idx, __local ulong* in, __global ulong* out)
 	for(int i = 1; i < 25; ++i)
 		hash[i] = in[i];
 
-	keccakf1600_1(hash);
-	for(int i = 0; i < 20; ++i)
-		out[i] = hash[i];
-	out+=160/8;
-
-	keccakf1600_1(hash);
-	for(int i = 0; i < 22; ++i)
-		out[i] = hash[i];
-	out+=176/8;
-
-	keccakf1600_1(hash);
-	for(int i = 0; i < 22; ++i)
-		out[i] = hash[i];
+	for(int a = 0; a < 3;++a)
+	{
+		keccakf1600_1(hash);
+		for(int i = 0; i < skip[a]; ++i)
+			out[i] = hash[i];
+		out+=skip[a];
+	}
 }
 
 __attribute__((reqd_work_group_size(8, 8, 1)))
@@ -365,18 +363,29 @@ __kernel void JOIN(cn0_cn_gpu,ALGO)(__global ulong *input, __global int *Scratch
             }
         }
 	}
+}
 
-	barrier(CLK_LOCAL_MEM_FENCE);
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void JOIN(cn00_cn_gpu,ALGO)(__global int *Scratchpad, __global ulong *states)
+{
+	const uint gIdx = getIdx() / 64;
+	__local ulong State[25];
 
-#if(COMP_MODE==1)
-    // do not use early return here
-	if(gIdx < Threads)
+	states += 25 * gIdx;
+
+#if(STRIDED_INDEX==0)
+	Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx);
 #endif
+
+	for(int i = get_local_id(0); i < 25; i+=get_local_size(0))
+		State[i] = states[i];
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+
+	for(uint i = get_local_id(0); i < MEMORY / 512; i += get_local_size(0))
 	{
-		for(ulong i = get_local_id(1); i < MEMORY / 512; i += get_local_size(1))
-		{
-			generate_512(i, State, (__global ulong*)((__global uchar*)Scratchpad + i*512));
-		}
+		generate_512(i, State, (__global ulong*)((__global uchar*)Scratchpad + i*512));
 	}
 }
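A note on the generate_512 / cn00 split above: each generate_512 call copies 20, 22 and 22 ulongs out of three successive keccakf1600_1 permutations, i.e. 160 + 176 + 176 = 512 bytes, exactly one scratchpad chunk, and the new cn00 kernel lets its 64 work-items fill the MEMORY / 512 chunks of one hash. The CPU path in this patch (cn_explode_scratchpad_gpu) does the same thing; a condensed host-side reference, reusing the miner's existing keccakf helper (function and variable names other than keccakf are illustrative):

```cpp
#include <cstdint>
#include <cstring>

extern void keccakf(uint64_t st[25], int rounds); // existing helper used by the CPU backend

// Fill one 512-byte scratchpad chunk from the 200-byte hash state, mirroring
// the cn_explode_scratchpad_gpu loop in this patch: xor in the chunk index,
// then three permutations writing 20 + 22 + 22 ulongs (160 + 176 + 176 = 512 bytes).
void generate_512_ref(uint64_t idx, const uint64_t in[25], uint8_t* out)
{
	static const uint32_t written[3] = { 20, 22, 22 };

	uint64_t hash[25];
	std::memcpy(hash, in, sizeof(hash));
	hash[0] ^= idx;

	for(int pass = 0; pass < 3; ++pass)
	{
		keccakf(hash, 24);
		std::memcpy(out, hash, written[pass] * sizeof(uint64_t));
		out += written[pass] * sizeof(uint64_t);
	}
}
```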
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index b34e68294defba3dd0a7f88dfd3ae5c20b7485e6..8878db6187db037ef2a6596f36ca361cf4701c71 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -3,7 +3,7 @@ R"===(
  * @author SChernykh
  */
 
-#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+#if(ALGO == cryptonight_monero_v8)
 
 static const __constant uint RCP_C[256] =
 {
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 7ca072c95abf93513264d478dc2f6807e252a9fe..48f4ca49aa53019507d6ea7350f911ae2740e981 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -88,7 +88,7 @@ private:
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+			hashMemSize = std::max(hashMemSize, algo.Mem());
 		}
 
 		std::string conf;
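The hashMemSize loop above (repeated in the CPU back-end and in cryptonight_common.cpp further down) sizes buffers for the largest scratchpad among every algorithm the miner can switch to, so a user-pool/dev-pool or fork switch never forces a reallocation. A self-contained sketch of the pattern; the container and function names are illustrative, only the Mem() accessor is taken from the patch:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Any type with a Mem() accessor returning the scratchpad size in bytes works here.
template<typename Algo>
size_t required_scratchpad(const std::vector<Algo>& neededAlgorithms)
{
	size_t hashMemSize = 0;
	for(const auto& algo : neededAlgorithms)
		hashMemSize = std::max<size_t>(hashMemSize, algo.Mem());
	return hashMemSize;
}
```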
@@ -131,14 +131,13 @@ private:
 			}
 
 			// check if cryptonight_monero_v8 is selected for the user or dev pool
-			bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() ||
-			                          std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end());
+			bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());
 
 			// true for all cryptonight_heavy derivates since we check the user and dev pool
 			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
 
-			// true for all cryptonight_gpu derivates since we check the user and dev pool
-			bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
+			// true if cryptonight_gpu is the main algorithm of the user pool
+			bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
 
 			// set strided index to default
 			ctx.stridedIndex = 1;
@@ -153,13 +152,14 @@ private:
 			else if(useCryptonight_heavy)
 				ctx.stridedIndex = 3;
 
-			// increase all intensity limits by two if scratchpad is only 1 MiB
-			if(hashMemSize <= CRYPTONIGHT_LITE_MEMORY)
-				maxThreads *= 2u;
+			if(hashMemSize < CN_MEMORY)
+			{
+				size_t factor = CN_MEMORY / hashMemSize;
+				// scale the intensity limit up in proportion to the smaller scratchpad
+				maxThreads *= factor;
+			}
 
-			// increase all intensity limits by eight for turtle (*2u shadowed from lite)
-			if (hashMemSize <= CRYPTONIGHT_TURTLE_MEMORY)
-				maxThreads *= 4u;
+			uint32_t numUnroll = 8;
 
 			if(useCryptonight_gpu)
 			{
@@ -167,6 +167,7 @@ private:
 				// @todo check again after all optimizations
 				maxThreads = ctx.computeUnits * 6 * 8;
 				ctx.stridedIndex = 0;
+				numUnroll = 1;
 			}
 
 			// keep 128MiB memory free (value is randomly chosen) from the max available memory
@@ -210,7 +211,7 @@ private:
 					conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 						"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
 						"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-						"    \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
+						"    \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
 						"  },\n";
 				}
 			}
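The new intensity rule above replaces the two hard-coded bumps (x2 for the 1 MiB lite scratchpad, a further x4 for the turtle scratchpad) with one proportional factor, CN_MEMORY / hashMemSize. Assuming CN_MEMORY is the 2 MiB baseline scratchpad, the old limits fall out as special cases; a quick sketch with illustrative constants:

```cpp
#include <cstddef>

// Illustrative constants; the miner reads hashMemSize from algo.Mem() and
// CN_MEMORY from its algorithm definitions (assumed 2 MiB here).
constexpr size_t CN_BASE_MEMORY   = 2u * 1024 * 1024;
constexpr size_t CN_LITE_MEMORY   = 1u * 1024 * 1024;
constexpr size_t CN_TURTLE_MEMORY = 256u * 1024;

constexpr size_t scale_intensity(size_t maxThreads, size_t hashMemSize)
{
	return hashMemSize < CN_BASE_MEMORY
		? maxThreads * (CN_BASE_MEMORY / hashMemSize)
		: maxThreads;
}

static_assert(scale_intensity(1000, CN_LITE_MEMORY)   == 2000, "lite keeps its x2");
static_assert(scale_intensity(1000, CN_TURTLE_MEMORY) == 8000, "turtle keeps its x8");
```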
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index b0f4e6ecd3f3429274a122c5ad2c842e84fc9618..1c9eb62797bf491e84cb3ca38a5cd82428fa7e7a 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -275,7 +275,7 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = results[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 74ab5fb6030bf6fc83b36d60a136296264e8c165..402d63cd693e55b0882751b8c38921094b7aa3e6 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -24,7 +24,7 @@ public:
 	static bool init_gpus();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 	minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg);
 
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 91da7a6ee1ac8d21a0f14f19e010f17d1a43b9d6..ba0e6984f0bc2b538b727f8c13990228933a8154 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -33,7 +33,7 @@ public:
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+			hashMemSize = std::max(hashMemSize, algo.Mem());
 		}
 		const size_t hashMemSizeKB = hashMemSize / 1024u;
 
@@ -49,7 +49,14 @@ public:
 
 		std::string conf;
 
+		// if cryptonight_gpu is used we disable CPU mining but still provide an inactive config
+		bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
 
+		if(useCryptonight_gpu)
+		{
+			printer::inst()->print_msg(L0, "WARNING: CPU mining will be disabled because cryptonight_gpu is not suitable for CPU mining. You can uncomment the auto-generated config in %s to enable CPU mining.", params::inst().configFileCPU.c_str());
+			conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n";
+		}
 		if(!detectL3Size() || L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048u))
 		{
 			if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048))
@@ -100,6 +107,9 @@ public:
 			}
 		}
 
+		if(useCryptonight_gpu)
+			conf += "*/\n";
+
 		configTpl.replace("CPUCONFIG",conf);
 		configTpl.write(params::inst().configFileCPU);
 		printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str());
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 39e80a376f13abda4c96ddbbfe7515647731feec..f09b1ebc046f4a50f6123be4107e4bb49ae963c9 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -32,7 +32,7 @@ public:
 
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+			hashMemSize = std::max(hashMemSize, algo.Mem());
 		}
 		halfHashMemSize = hashMemSize / 2u;
 	}
@@ -53,6 +53,15 @@ public:
 		;
 		configTpl.set( std::string(tpl) );
 
+		// if cryptonight_gpu is used we disable CPU mining but still provide an inactive config
+		bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
+
+		if(useCryptonight_gpu)
+		{
+			printer::inst()->print_msg(L0, "WARNING: CPU mining will be disabled because cryptonight_gpu is not suitable for CPU mining. You can uncomment the auto-generated config in %s to enable CPU mining.", params::inst().configFileCPU.c_str());
+			conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n";
+		}
+
 		try
 		{
 			std::vector<hwloc_obj_t> tlcs;
@@ -85,6 +94,9 @@ public:
 			printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what());
 		}
 
+		if(useCryptonight_gpu)
+			conf += "*/\n";
+
 		configTpl.replace("CPUCONFIG",conf);
 		configTpl.write(params::inst().configFileCPU);
 		printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str());
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
index 4a7697b028b6ad9ff72c1c02bdade2b03142aae6..5844d381461915090db552d22a47bf1f40b2ed10 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu.hpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "xmrstak/backend/cryptonight.hpp"
 #include <stdint.h>
 
 #if defined(_WIN32) || defined(_WIN64)
@@ -36,8 +37,6 @@ inline bool cngpu_check_avx2()
 	return (cpu_info[1] & (1 << 5)) != 0;
 }
 
-template<size_t ITER, uint32_t MASK>
-void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad);
+void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo);
 
-template<size_t ITER, uint32_t MASK>
-void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad);
+void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo);
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
index e46705fd0920d68834c8f97423cae343c3df9e3e..8b4aefe13b5df9c163681d7cd620773a229d0177 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
@@ -9,11 +9,11 @@ inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01)
 	n01 = _mm256_cvtepi32_ps(v);
 }
 
-inline __m256 fma_break(const __m256& x) 
-{ 
-	// Break the dependency chain by setitng the exp to ?????01 
-	__m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x); 
-	return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx); 
+inline __m256 fma_break(const __m256& x)
+{
+	// Break the dependency chain by setting the exp to ?????01
+	__m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x);
+	return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx);
 }
 
 // 14
@@ -60,7 +60,7 @@ inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2,
 
 // 112×4 = 448
 template <bool add>
-inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, 
+inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
 							  float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
 {
 	__m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1);
@@ -85,7 +85,7 @@ inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256&
 }
 
 template <size_t rot>
-inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, 
+inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
 								float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
 {
 	__m256i r = double_comupte<rot % 2 != 0>(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum);
@@ -95,15 +95,18 @@ inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256
 	out = _mm256_xor_si256(out, r);
 }
 
-template<uint32_t MASK>
-inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m256i*>(lpad + (idx & MASK) + n*16); }
 
-template<size_t ITER, uint32_t MASK>
-void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
+inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n*16); }
+
+
+void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo)
 {
+	const uint32_t ITER = algo.Iter();
+	const uint32_t mask = algo.Mask();
+
 	uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
-	__m256i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
-	__m256i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
+	__m256i* idx0 = scratchpad_ptr(lpad, s, 0, mask);
+	__m256i* idx2 = scratchpad_ptr(lpad, s, 2, mask);
 	__m256 sum0 = _mm256_setzero_ps();
 
 	for(size_t i = 0; i < ITER; i++)
@@ -116,13 +119,13 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
 		__m256 d01, d23;
 		prep_dv_avx(idx0, v01, n01);
 		prep_dv_avx(idx2, v23, n23);
-		
+
 		__m256i out, out2;
 		__m256 n10, n22, n33;
 		n10 = _mm256_permute2f128_ps(n01, n01, 0x01);
 		n22 = _mm256_permute2f128_ps(n23, n23, 0x00);
 		n33 = _mm256_permute2f128_ps(n23, n23, 0x11);
-		
+
 		out = _mm256_setzero_si256();
 		double_comupte_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out);
 		double_comupte_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out);
@@ -131,7 +134,7 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
 		_mm256_store_si256(idx0, _mm256_xor_si256(v01, out));
 		sum0 = _mm256_add_ps(suma, sumb);
 		out2 = out;
-		
+
 		__m256 n11, n02, n30;
 		n11 = _mm256_permute2f128_ps(n01, n01, 0x11);
 		n02 = _mm256_permute2f128_ps(n01, n23, 0x20);
@@ -156,7 +159,7 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
 		__m128 sum = _mm256_castps256_ps128(sum0);
 
 		sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit
-		// vs range 0 - 64 
+		// vs range 0 - 64
 		__m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f)));
 		v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2));
 		__m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3));
@@ -168,9 +171,7 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
 		sum = _mm_div_ps(sum, _mm_set1_ps(64.0f));
 		sum0 = _mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1);
 		uint32_t n = _mm_cvtsi128_si32(v0);
-		idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
-		idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+		idx0 = scratchpad_ptr(lpad, n, 0, mask);
+		idx2 = scratchpad_ptr(lpad, n, 2, mask);
 	}
 }
-
-template void cn_gpu_inner_avx<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
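The change to cn_gpu_inner_avx (and to cn_gpu_inner_ssse3 below) trades compile-time constants for runtime ones: ITER and the scratchpad mask now come from the algorithm descriptor, so one compiled function covers every cryptonight_gpu variant and the explicit template instantiation at the end of the file can go. A condensed before/after of the pattern, with an illustrative stand-in for the descriptor:

```cpp
#include <cstddef>
#include <cstdint>

// Before: the mask is a template parameter, folded into the address math at
// compile time, and every (ITER, MASK) pair needs its own instantiation.
template<uint32_t MASK>
inline uint8_t* scratchpad_ptr_old(uint8_t* lpad, uint32_t idx, size_t n)
{
	return lpad + (idx & MASK) + n * 16;
}

// After: the mask travels with the algorithm at runtime. Slightly less
// constant folding, but no per-fork instantiation is required.
struct algo_view { uint32_t mask; }; // illustrative stand-in for xmrstak_algo

inline uint8_t* scratchpad_ptr_new(uint8_t* lpad, uint32_t idx, size_t n, const algo_view& algo)
{
	return lpad + (idx & algo.mask) + n * 16;
}
```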
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
index bde34162a13f8cd1914632e140573771dfff78f4..c8627d8b848adf3fb041ed2ebe9c2196f4dc1dbe 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
@@ -9,11 +9,11 @@ inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
 	n = _mm_cvtepi32_ps(v);
 }
 
-inline __m128 fma_break(__m128 x) 
-{ 
-	// Break the dependency chain by setitng the exp to ?????01 
-	x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x); 
-	return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x); 
+inline __m128 fma_break(__m128 x)
+{
+	// Break the dependency chain by setting the exp to ?????01
+	x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x);
+	return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x);
 }
 
 // 14
@@ -94,25 +94,26 @@ inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, flo
 	out = _mm_xor_si128(out, r);
 }
 
-template<uint32_t MASK>
-inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m128i*>(lpad + (idx & MASK) + n*16); }
+inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n*16); }
 
-template<size_t ITER, uint32_t MASK>
-void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad)
+void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo)
 {
+	const uint32_t ITER = algo.Iter();
+	const uint32_t mask = algo.Mask();
+
 	uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
-	__m128i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
-	__m128i* idx1 = scratchpad_ptr<MASK>(lpad, s, 1);
-	__m128i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
-	__m128i* idx3 = scratchpad_ptr<MASK>(lpad, s, 3);
+	__m128i* idx0 = scratchpad_ptr(lpad, s, 0, mask);
+	__m128i* idx1 = scratchpad_ptr(lpad, s, 1, mask);
+	__m128i* idx2 = scratchpad_ptr(lpad, s, 2, mask);
+	__m128i* idx3 = scratchpad_ptr(lpad, s, 3, mask);
 	__m128 sum0 = _mm_setzero_ps();
-	
+
 	for(size_t i = 0; i < ITER; i++)
 	{
 		__m128 n0, n1, n2, n3;
 		__m128i v0, v1, v2, v3;
 		__m128 suma, sumb, sum1, sum2, sum3;
-		
+
 		prep_dv(idx0, v0, n0);
 		prep_dv(idx1, v1, n1);
 		prep_dv(idx2, v2, n2);
@@ -128,7 +129,7 @@ void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad)
 		sum0 = _mm_add_ps(suma, sumb);
 		_mm_store_si128(idx0, _mm_xor_si128(v0, out));
 		out2 = out;
-	
+
 		out = _mm_setzero_si128();
 		single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
 		single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
@@ -160,7 +161,7 @@ void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad)
 		sum0 = _mm_add_ps(sum0, sum2);
 
 		sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit
-		// vs range 0 - 64 
+		// vs range 0 - 64
 		n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f));
 		v0 = _mm_cvttps_epi32(n0);
 		v0 = _mm_xor_si128(v0, out2);
@@ -172,11 +173,9 @@ void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad)
 		// vs is now between 0 and 1
 		sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f));
 		uint32_t n = _mm_cvtsi128_si32(v0);
-		idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
-		idx1 = scratchpad_ptr<MASK>(lpad, n, 1);
-		idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
-		idx3 = scratchpad_ptr<MASK>(lpad, n, 3);
+		idx0 = scratchpad_ptr(lpad, n, 0, mask);
+		idx1 = scratchpad_ptr(lpad, n, 1, mask);
+		idx2 = scratchpad_ptr(lpad, n, 2, mask);
+		idx3 = scratchpad_ptr(lpad, n, 3, mask);
 	}
 }
-
-template void cn_gpu_inner_ssse3<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index c75eff8ffc87154f25d1d9f7f49357cc477f8d66..7ba9e2fe89ea20829084c0ee7234896867ca06a4 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -165,11 +165,11 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3
 	x7 = _mm_xor_si128(x7, tmp0);
 }
 
-template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
-void cn_explode_scratchpad(const __m128i* input, __m128i* output)
+template<bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
+void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo)
 {
 	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast;
-	
+
 	// This is more than we have registers, compiler will assign 2 keys on the stack
 	__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
@@ -219,6 +219,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 		}
 	}
 
+	const size_t MEM = algo.Mem();
 	for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 	{
 		if(SOFT_AES)
@@ -266,13 +267,14 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	}
 }
 
-template<size_t MEM, bool PREFETCH, xmrstak_algo ALGO>
-void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output)
+template<bool PREFETCH, xmrstak_algo_id ALGO>
+void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrstak_algo& algo)
 {
 	constexpr size_t hash_size = 200; // 25x8 bytes
 	alignas(128) uint64_t hash[25];
+	const size_t mem = algo.Mem();
 
-	for (uint64_t i = 0; i < MEM / 512; i++)
+	for (uint64_t i = 0; i < mem / 512; i++)
 	{
 		memcpy(hash, input, hash_size);
 		hash[0] ^= i;
@@ -288,7 +290,7 @@ void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output)
 		keccakf(hash, 24);
 		memcpy(output, hash, 176);
 		output+=176;
-		
+
 		if(PREFETCH)
 		{
 			_mm_prefetch((const char*)output - 512, _MM_HINT_T2);
@@ -299,10 +301,10 @@ void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output)
 	}
 }
 
-template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
-void cn_implode_scratchpad(const __m128i* input, __m128i* output)
+template<bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
+void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo)
 {
-	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || 
+	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
 		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu;
 
 	// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -320,6 +322,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 	xout6 = _mm_load_si128(output + 10);
 	xout7 = _mm_load_si128(output + 11);
 
+	const size_t MEM = algo.Mem();
 	for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 	{
 		if(PREFETCH)
@@ -504,7 +507,7 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 	return _mm_load_si128((__m128i*)k);
 }
 
-template<xmrstak_algo ALGO>
+template<xmrstak_algo_id ALGO>
 inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 {
 	mem_out[0] = _mm_cvtsi128_si64(tmp);
@@ -584,7 +587,7 @@ inline void set_float_rounding_mode()
 
 #define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \
 	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
+	if(ALGO == cryptonight_monero_v8) \
 	{ \
 		const uint64_t idx1 = idx0 & MASK; \
 		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
@@ -597,7 +600,7 @@ inline void set_float_rounding_mode()
 
 #define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
 	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
+	if(ALGO == cryptonight_monero_v8) \
 	{ \
 		const uint64_t idx1 = idx0 & MASK; \
 		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
@@ -611,7 +614,7 @@ inline void set_float_rounding_mode()
 	}
 
 #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
+	if(ALGO == cryptonight_monero_v8) \
 	{ \
 		uint64_t sqrt_result_tmp; \
 		assign(sqrt_result_tmp, sqrt_result); \
@@ -650,7 +653,7 @@ inline void set_float_rounding_mode()
 		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24); \
 	} \
 	/* Optim - 99% time boundary */ \
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \
+	cn_explode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \
 	\
 	__m128i ax0; \
 	uint64_t idx0; \
@@ -666,7 +669,7 @@ inline void set_float_rounding_mode()
 		idx0 = h0[0] ^ h0[4]; \
 		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
 		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
-		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
+		if(ALGO == cryptonight_monero_v8) \
 		{ \
 			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
 			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
@@ -703,7 +706,7 @@ inline void set_float_rounding_mode()
 	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
-	if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_turtle) \
+	if(ALGO != cryptonight_monero_v8) \
 		bx0 = cx
 
 #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
@@ -720,7 +723,7 @@ inline void set_float_rounding_mode()
 		ah0 += lo; \
 		al0 += hi; \
 	} \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
+	if(ALGO == cryptonight_monero_v8) \
 	{ \
 		bx1 = bx0; \
 		bx0 = cx; \
@@ -768,7 +771,7 @@ inline void set_float_rounding_mode()
 
 #define CN_FINALIZE(n) \
 	/* Optim - 90% time boundary */ \
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \
+	cn_implode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state, algo); \
 	/* Optim - 99% time boundary */ \
 	keccakf((uint64_t*)ctx[n]->hash_state, 24); \
 	extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n)
@@ -837,12 +840,12 @@ struct Cryptonight_hash<1>
 {
 	static constexpr size_t N = 1;
 
-	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MASK = cn_select_mask<ALGO>();
-		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-		constexpr size_t MEM = cn_select_memory<ALGO>();
+		const uint32_t MASK = algo.Mask();
+		const uint32_t ITERATIONS = algo.Iter();
+		const size_t MEM = algo.Mem();
 
 		CN_INIT_SINGLE;
 		REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
@@ -866,12 +869,12 @@ struct Cryptonight_hash<2>
 {
 	static constexpr size_t N = 2;
 
-	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MASK = cn_select_mask<ALGO>();
-		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-		constexpr size_t MEM = cn_select_memory<ALGO>();
+		const uint32_t MASK = algo.Mask();
+		const uint32_t ITERATIONS = algo.Iter();
+		const size_t MEM = algo.Mem();
 
 		CN_INIT_SINGLE;
 		REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
@@ -895,12 +898,12 @@ struct Cryptonight_hash<3>
 {
 	static constexpr size_t N = 3;
 
-	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MASK = cn_select_mask<ALGO>();
-		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-		constexpr size_t MEM = cn_select_memory<ALGO>();
+		const uint32_t MASK = algo.Mask();
+		const uint32_t ITERATIONS = algo.Iter();
+		const size_t MEM = algo.Mem();
 
 		CN_INIT_SINGLE;
 		REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
@@ -924,12 +927,12 @@ struct Cryptonight_hash<4>
 {
 	static constexpr size_t N = 4;
 
-	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MASK = cn_select_mask<ALGO>();
-		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-		constexpr size_t MEM = cn_select_memory<ALGO>();
+		const uint32_t MASK = algo.Mask();
+		const uint32_t ITERATIONS = algo.Iter();
+		const size_t MEM = algo.Mem();
 
 		CN_INIT_SINGLE;
 		REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
@@ -953,12 +956,12 @@ struct Cryptonight_hash<5>
 {
 	static constexpr size_t N = 5;
 
-	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MASK = cn_select_mask<ALGO>();
-		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-		constexpr size_t MEM = cn_select_memory<ALGO>();
+		const uint32_t MASK = algo.Mask();
+		const uint32_t ITERATIONS = algo.Iter();
+		const size_t MEM = algo.Mem();
 
 		CN_INIT_SINGLE;
 		REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
@@ -990,20 +993,19 @@ struct Cryptonight_hash_asm<1, asm_version>
 {
 	static constexpr size_t N = 1;
 
-	template<xmrstak_algo ALGO>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
-		cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+		cn_explode_scratchpad<false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state, algo);
 
 		if(asm_version == 0)
 			cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
 		else if(asm_version == 1)
 			cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
 
-		cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+		cn_implode_scratchpad<false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state, algo);
 		keccakf((uint64_t*)ctx[0]->hash_state, 24);
 		extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
 	}
@@ -1015,16 +1017,16 @@ struct Cryptonight_hash_asm<2, 0>
 {
 	static constexpr size_t N = 2;
 
-	template<xmrstak_algo ALGO>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MEM = cn_select_memory<ALGO>();
+		const size_t MEM = algo.Mem();
 
 		for(size_t i = 0; i < N; ++i)
 		{
 			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
 			/* Optim - 99% time boundary */
-			cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+			cn_explode_scratchpad<false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo);
 		}
 
 		cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
@@ -1032,7 +1034,7 @@ struct Cryptonight_hash_asm<2, 0>
 		for(size_t i = 0; i < N; ++i)
 		{
 			/* Optim - 90% time boundary */
-			cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+			cn_implode_scratchpad<false, false, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state, algo);
 			/* Optim - 99% time boundary */
 			keccakf((uint64_t*)ctx[i]->hash_state, 24);
 			extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
@@ -1044,22 +1046,19 @@ struct Cryptonight_hash_gpu
 {
 	static constexpr size_t N = 1;
 
-	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
-		constexpr size_t MASK = cn_select_mask<ALGO>();
-		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
-		cn_explode_scratchpad_gpu<MEM, PREFETCH, ALGO>(ctx[0]->hash_state, ctx[0]->long_state);
+		cn_explode_scratchpad_gpu<PREFETCH, ALGO>(ctx[0]->hash_state, ctx[0]->long_state, algo);
 
 		if(cngpu_check_avx2())
-			cn_gpu_inner_avx<ITERATIONS, MASK>(ctx[0]->hash_state, ctx[0]->long_state);
+			cn_gpu_inner_avx(ctx[0]->hash_state, ctx[0]->long_state, algo);
 		else
-			cn_gpu_inner_ssse3<ITERATIONS, MASK>(ctx[0]->hash_state, ctx[0]->long_state);
+			cn_gpu_inner_ssse3(ctx[0]->hash_state, ctx[0]->long_state, algo);
 
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+		cn_implode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state, algo);
 		keccakf((uint64_t*)ctx[0]->hash_state, 24);
 		memcpy(output, ctx[0]->hash_state, 32);
 	}
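// Illustrative sketch of the pattern introduced by the hunks above: the
// algorithm tunables are no longer compile-time constants selected via
// cn_select_*<ALGO>(), they are read at runtime from the xmrstak_algo object
// that is now passed into every hash entry point. The function name
// `example_hash` and its empty body are hypothetical; only the parameter
// handling mirrors the patch.
#include <cstddef>
#include <cstdint>
#include "xmrstak/backend/cryptonight.hpp"

struct cryptonight_ctx; // defined in cryptonight.h, opaque for this sketch

template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
static void example_hash(const void* input, size_t len, void* output,
	cryptonight_ctx** ctx, const xmrstak_algo& algo)
{
	// formerly: constexpr values from cn_select_mask/iter/memory<ALGO>()
	const uint32_t MASK = algo.Mask();
	const uint32_t ITERATIONS = algo.Iter();
	const size_t MEM = algo.Mem();

	// ... the scratchpad loop would use MASK/ITERATIONS/MEM exactly as before ...
	(void)input; (void)len; (void)output; (void)ctx;
	(void)MASK; (void)ITERATIONS; (void)MEM;
}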
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index ee1ff2386c371d37e89e8aaf1343931b9134668e..a065abe0192ab06b7413b73e31fda44991dad9cf 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -208,7 +208,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 	size_t hashMemSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+		hashMemSize = std::max(hashMemSize, algo.Mem());
 	}
 
 	cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096);
@@ -292,7 +292,7 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx)
 	size_t hashMemSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+		hashMemSize = std::max(hashMemSize, algo.Mem());
 	}
 
 	if(ctx->ctx_info[0] != 0)
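// Illustrative, stand-alone version of the scratchpad sizing above: the
// allocation and free paths take the maximum Mem() over every algorithm the
// current coin selection may switch to. Assumes the POW() helper added to
// cryptonight.hpp later in this patch; the algorithm list below is made-up
// example input.
#include <algorithm>
#include <cstdio>
#include <vector>
#include "xmrstak/backend/cryptonight.hpp"

int main()
{
	const std::vector<xmrstak_algo> neededAlgorithms = {
		POW(cryptonight_monero_v8), POW(cryptonight_turtle)
	};

	size_t hashMemSize = 0;
	for(const auto& algo : neededAlgorithms)
		hashMemSize = std::max(hashMemSize, algo.Mem());

	// 2 MiB (cryptonight_v8) wins over turtle's 256 KiB scratchpad
	std::printf("scratchpad bytes: %zu\n", hashMemSize);
	return 0;
}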
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index e1af701e8d7ab2b59ab500ebedaa6a39576cdfc6..440732210a450ad228dbea10177b7fdbe9bd5f2c 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -239,190 +239,187 @@ bool minethd::self_test()
 	cn_hash_fun hashf;
 	cn_hash_fun hashf_multi;
 
-	if(xmrstak_algo::invalid_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() ||
-		xmrstak_algo::invalid_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-	{
-		printer::inst()->print_msg(L0, "Root algorithm is not allowed to be invalid");
-		return false;
-	}
-
 	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
 
 	for(const auto algo : neededAlgorithms)
 	{
-		if(algo == cryptonight)
+		if(algo == POW(cryptonight))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-			hashf("This is a test", 14, out, ctx);
+			std::cout << algo.Name() << " test cn" << std::endl;
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test", 14, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
-			hashf("This is a test", 14, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test", 14, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
-			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
 					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
 
-			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
-			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
 					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
 
-			hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-			hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
+			hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
 
-			hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-			hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
 
-			hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-			hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
 					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
 		}
-		else if(algo == cryptonight_lite)
+		else if(algo == POW(cryptonight_lite))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 		}
-		else if(algo == cryptonight_monero)
+		else if(algo == POW(cryptonight_monero))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 		}
-		else if(algo == cryptonight_monero_v8)
+		else if(algo == POW(cryptonight_monero_v8))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
 		}
-		else if(algo == cryptonight_aeon)
+		else if(algo == POW(cryptonight_aeon))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 		}
-		else if(algo == cryptonight_ipbc)
+		else if(algo == POW(cryptonight_ipbc))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
 		}
-		else if(algo == cryptonight_stellite)
+		else if(algo == POW(cryptonight_stellite))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 		}
-		else if(algo == cryptonight_masari)
+		else if(algo == POW(cryptonight_masari))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 		}
-		else if(algo == cryptonight_heavy)
+		else if(algo == POW(cryptonight_heavy))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 		}
-		else if(algo == cryptonight_haven)
+		else if(algo == POW(cryptonight_haven))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 		}
-		else if(algo == cryptonight_bittube2)
+		else if(algo == POW(cryptonight_bittube2))
 		{
 			unsigned char out[32 * MAX_N];
 			cn_hash_fun hashf;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
 
-			hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx);
+			hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
 
-			hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx);
+			hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
 
-			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
+			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
 		}
-		else if(algo == cryptonight_superfast)
+		else if(algo == POW(cryptonight_superfast))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast);
-			hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
 		}
-		else if(algo == cryptonight_gpu)
+		else if(algo == POW(cryptonight_gpu))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_gpu);
-			hashf("", 0, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("", 0, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_gpu);
-			hashf("", 0, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("", 0, out, ctx, algo);
 			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 		}
-		else if (algo == cryptonight_turtle)
+		else if (algo == POW(cryptonight_turtle))
 		{
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
-		
-			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle);
-			hashf("This is a test This is a test This is a test", 44, out, ctx);
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
 		}
+		else
+			printer::inst()->print_msg(L0,
+				"Cryptonight hash self-test NOT defined for POW %s", algo.Name().c_str());
 
 		if(!bResult)
 			printer::inst()->print_msg(L0,
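// Note on the dispatch above: operator== on xmrstak_algo (added to
// cryptonight.hpp below) also compares Mem/Iter/Mask, so derived algorithms
// such as cryptonight_turtle do not match the POW(cryptonight_monero_v8)
// branch even though they share the same base implementation; they fall
// through to the "self-test NOT defined" warning instead. A stand-alone
// demonstration (the main() is hypothetical):
#include <cassert>
#include "xmrstak/backend/cryptonight.hpp"

int main()
{
	const xmrstak_algo turtle = POW(cryptonight_turtle);

	// same base implementation as cryptonight_v8 ...
	assert(turtle.Id() == cryptonight_monero_v8);
	// ... but different tunables, so the v8 test-vector branch is skipped
	assert(!(turtle == POW(cryptonight_monero_v8)));
	return 0;
}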
@@ -504,7 +501,7 @@ static std::string getAsmName(const uint32_t num_hashes)
 }
 
 template<size_t N>
-minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str)
 {
 	static_assert(N >= 1, "number of threads must be >= 1" );
 
@@ -513,7 +510,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	// function as a two digit binary
 
 	uint8_t algv;
-	switch(algo)
+	switch(algo.Id())
 	{
 	case cryptonight:
 		algv = 2;
@@ -554,9 +551,6 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_gpu:
 		algv = 12;
 		break;
-	case cryptonight_turtle:
-		algv = 13;
-		break;
 	default:
 		algv = 2;
 		break;
@@ -622,16 +616,11 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, false, true>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, true>,
-		
+
 		Cryptonight_hash_gpu::template hash<cryptonight_gpu, false, false>,
 		Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, false>,
 		Cryptonight_hash_gpu::template hash<cryptonight_gpu, false, true>,
-		Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, true>,
-
-		Cryptonight_hash<N>::template hash<cryptonight_turtle, false, false>,
-		Cryptonight_hash<N>::template hash<cryptonight_turtle, true, false>,
-		Cryptonight_hash<N>::template hash<cryptonight_turtle, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_turtle, true, true>
+		Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, true>
 	};
 
 	std::bitset<2> digit;
@@ -642,7 +631,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 
 
 	// check for asm optimized version for cryptonight_v8
-	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
+	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes && algo.Mem() == CN_MEMORY && algo.Iter() == CN_ITER)
 	{
 		std::string selected_asm = asm_version_str;
 		if(selected_asm == "auto")
@@ -671,39 +660,10 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		}
 	}
 
-	if (N <= 2 && (algo == cryptonight_turtle) && bHaveAes)
-	{
-		std::string selected_asm = asm_version_str;
-		if (selected_asm == "auto")
-			selected_asm = cpu::getAsmName(N);
-
-		if (selected_asm != "off")
-		{
-			if (selected_asm == "intel_avx" && asm_version_str != "auto")
-			{
-				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
-				if (N == 1)
-					selected_function = Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_turtle>;
-				else if (N == 2)
-					selected_function = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_turtle>;
-			}
-			// supports only 1 thread per hash
-			if (N == 1 && selected_asm == "amd_avx")
-			{
-				// AMD Ryzen (1xxx and 2xxx series)
-				selected_function = Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_turtle>;
-			}
-			if (asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx"))
-				printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str());
-			else if (selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type
-				printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
-		}
-	}
-
 	return selected_function;
 }
 
-minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo)
 {
 	return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo);
 }
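// The assembler fast path above is now additionally gated on the default
// tunables, so v8-derived variants with changed parameters
// (cryptonight_v8_half, cryptonight_v8_zelerius, cryptonight_turtle) fall
// back to the generic C++ implementation. Stand-alone form of that
// predicate; the helper name `can_use_v8_asm` is hypothetical.
#include "xmrstak/backend/cryptonight.hpp"

inline bool can_use_v8_asm(const xmrstak_algo& algo, bool bHaveAes)
{
	return bHaveAes && algo == cryptonight_monero_v8 &&
		algo.Mem() == CN_MEMORY && algo.Iter() == CN_ITER;
}

// can_use_v8_asm(POW(cryptonight_monero_v8), true) -> true
// can_use_v8_asm(POW(cryptonight_v8_half), true)   -> false (half iterations)
// can_use_v8_asm(POW(cryptonight_turtle), true)    -> false (256 KiB scratchpad)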
@@ -853,7 +813,7 @@ void minethd::multiway_work_main()
 			for (size_t i = 0; i < N; i++)
 				*piNonce[i] = iNonce++;
 
-			hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx);
+			hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx, miner_algo);
 
 			for (size_t i = 0; i < N; i++)
 			{
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index eb77749f6236735ab0f5132016cf6c0b29181542..41315269bac2390a11b5d9a3c240885a39e0c9f1 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -22,9 +22,9 @@ public:
 	static std::vector<iBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
-	static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
+	static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo);
 	static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id);
 
 	static cryptonight_ctx* minethd_alloc_ctx();
@@ -32,7 +32,7 @@ public:
 private:
 
 	template<size_t N>
-	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off");
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off");
 
 	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
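// Shape of a call through the widened function-pointer type above: the
// routine is still selected once, but the tunables now travel with the algo
// argument at every invocation. The wrapper `run_one_hash` and its parameter
// names are hypothetical.
#include <cstddef>
#include "xmrstak/backend/cryptonight.hpp"

struct cryptonight_ctx; // opaque here, defined in cryptonight.h

typedef void (*cn_hash_fun)(const void*, size_t, void*,
	cryptonight_ctx**, const xmrstak_algo&);

inline void run_one_hash(cn_hash_fun hashf, const void* blob, size_t len,
	void* out, cryptonight_ctx** ctx)
{
	// the same pointer can serve cryptonight_v8 and its derived variants
	hashf(blob, len, out, ctx, POW(cryptonight_monero_v8));
}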
 
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index ae862abae9d816bd4aec0913464f38fb9da2e60e..b75adf217eb180f6edf15d68ce5bfdb403f285ab 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -2,8 +2,12 @@
 #include <stddef.h>
 #include <inttypes.h>
 #include <type_traits>
+#include <string>
+#include <array>
 
-enum xmrstak_algo
+constexpr size_t start_derived_algo_id = 1000;
+
+enum xmrstak_algo_id
 {
 	invalid_algo = 0,
 	cryptonight = 1,
@@ -19,249 +23,182 @@ enum xmrstak_algo
 	cryptonight_monero_v8 = 11,
 	cryptonight_superfast = 12,
 	cryptonight_gpu = 13,
-	cryptonight_turtle = 14
-};
-
-// define aeon settings
-constexpr size_t CRYPTONIGHT_LITE_MEMORY = 1 * 1024 * 1024;
-constexpr uint32_t CRYPTONIGHT_LITE_MASK = 0xFFFF0;
-constexpr uint32_t CRYPTONIGHT_LITE_ITER = 0x40000;
-
-constexpr size_t CRYPTONIGHT_MEMORY = 2 * 1024 * 1024;
-constexpr uint32_t CRYPTONIGHT_MASK = 0x1FFFF0;
-constexpr uint32_t CRYPTONIGHT_ITER = 0x80000;
-
-constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024;
-constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0;
-constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
-
-constexpr uint32_t CRYPTONIGHT_GPU_MASK = 0x1FFFC0;
-constexpr uint32_t CRYPTONIGHT_GPU_ITER = 0xC000;
-
-constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
-
-constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; 
-
-constexpr size_t CRYPTONIGHT_TURTLE_MEMORY = 256 * 1024;
-constexpr uint32_t CRYPTONIGHT_TURTLE_MASK = 0x1FFF0;
-constexpr uint32_t CRYPTONIGHT_TURTLE_ITER = 0x10000;
-
-template<xmrstak_algo ALGO>
-inline constexpr size_t cn_select_memory() { return 0; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight>() { return CRYPTONIGHT_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_lite>() { return CRYPTONIGHT_LITE_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_monero>() { return CRYPTONIGHT_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_monero_v8>() { return CRYPTONIGHT_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_aeon>() { return CRYPTONIGHT_LITE_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_ipbc>() { return CRYPTONIGHT_LITE_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_stellite>() { return CRYPTONIGHT_MEMORY; }
 
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_masari>() { return CRYPTONIGHT_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_haven>() { return CRYPTONIGHT_HEAVY_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MEMORY; }
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; } 
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_gpu>() { return CRYPTONIGHT_MEMORY; } 
-
-template<>
-inline constexpr size_t cn_select_memory<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MEMORY; }
+	cryptonight_turtle = start_derived_algo_id,
+	cryptonight_v8_half = (start_derived_algo_id + 1),
+	cryptonight_v8_zelerius = (start_derived_algo_id + 2)
+	// please add the algorithm name to get_algo_name()
+};
 
-inline size_t cn_select_memory(xmrstak_algo algo)
+/** get name of the algorithm
+ *
+ * @param algo_id id of the mining algorithm
+ */
+inline std::string get_algo_name(xmrstak_algo_id algo_id)
 {
-	switch(algo)
-	{
-	case cryptonight_stellite:
-	case cryptonight_monero:
-	case cryptonight_monero_v8:
-	case cryptonight_masari:
-	case cryptonight:
-	case cryptonight_superfast: 
-	case cryptonight_gpu:
-		return CRYPTONIGHT_MEMORY;
-	case cryptonight_ipbc:
-	case cryptonight_aeon:
-	case cryptonight_lite:
-		return CRYPTONIGHT_LITE_MEMORY;
-	case cryptonight_bittube2:
-	case cryptonight_haven:
-	case cryptonight_heavy:
-		return CRYPTONIGHT_HEAVY_MEMORY;
-	case cryptonight_turtle:
-		return CRYPTONIGHT_TURTLE_MEMORY;
-	default:
-		return 0;
-	}
+	static std::array<std::string, 14> base_algo_names =
+	{{
+		"invalid_algo",
+		"cryptonight",
+		"cryptonight_lite",
+		"cryptonight_v7",
+		"cryptonight_heavy",
+		"cryptonight_lite_v7",
+		"cryptonight_lite_v7_xor",
+		"cryptonight_v7_stellite",
+		"cryptonight_masari",
+		"cryptonight_haven",
+		"cryptonight_bittube2",
+		"cryptonight_v8",
+		"cryptonight_superfast",
+		"cryptonight_gpu"
+	}};
+
+	static std::array<std::string, 3> derived_algo_names =
+	{{
+		"cryptonight_turtle",
+		"cryptonight_v8_half", // used by masari and stellite
+		"cryptonight_v8_zelerius"
+	}};
+
+
+	if(algo_id < start_derived_algo_id)
+		return base_algo_names[algo_id];
+	else
+		return derived_algo_names[algo_id - start_derived_algo_id];
 }
 
-template<xmrstak_algo ALGO>
-inline constexpr uint32_t cn_select_mask() { return 0; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight>() { return CRYPTONIGHT_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_lite>() { return CRYPTONIGHT_LITE_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_monero>() { return CRYPTONIGHT_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_monero_v8>() { return CRYPTONIGHT_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_aeon>() { return CRYPTONIGHT_LITE_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_ipbc>() { return CRYPTONIGHT_LITE_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_stellite>() { return CRYPTONIGHT_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_masari>() { return CRYPTONIGHT_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_haven>() { return CRYPTONIGHT_HEAVY_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_MASK; }
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; } 
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_gpu>() { return CRYPTONIGHT_GPU_MASK; } 
-
-template<>
-inline constexpr uint32_t cn_select_mask<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MASK; }
-
-inline size_t cn_select_mask(xmrstak_algo algo)
+struct xmrstak_algo
 {
-	switch(algo)
+	xmrstak_algo(xmrstak_algo_id name_id) : algo_name(name_id), base_algo(name_id)
+	{
+	}
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : algo_name(name_id), base_algo(algorithm)
+	{
+	}
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : algo_name(name_id), base_algo(algorithm), iter(iteration)
+	{
+	}
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory)
+	{
+	}
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory), mask(mem_mask)
 	{
-	case cryptonight_stellite:
-	case cryptonight_monero:
-	case cryptonight_monero_v8:
-	case cryptonight_masari:
-	case cryptonight:
-	case cryptonight_superfast: 
-		return CRYPTONIGHT_MASK;
-	case cryptonight_ipbc:
-	case cryptonight_aeon:
-	case cryptonight_lite:
-		return CRYPTONIGHT_LITE_MASK;
-	case cryptonight_bittube2:
-	case cryptonight_haven:
-	case cryptonight_heavy:
-		return CRYPTONIGHT_HEAVY_MASK;
-	case cryptonight_gpu:
-		return CRYPTONIGHT_GPU_MASK;
-	case cryptonight_turtle:
-		return CRYPTONIGHT_TURTLE_MASK;
-	default:
-		return 0;
 	}
-}
-
-template<xmrstak_algo ALGO>
-inline constexpr uint32_t cn_select_iter() { return 0; }
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight>() { return CRYPTONIGHT_ITER; }
+	/** check if the algorithm is equal to another algorithm
+	 *
+	 * the member algo_name is not compared because it is only an alias name
+	 */
+	bool operator==(const xmrstak_algo& other) const
+	{
+		return other.Id() == Id() && other.Mem() == Mem() && other.Iter() == Iter() && other.Mask() == Mask();
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_lite>() { return CRYPTONIGHT_LITE_ITER; }
+	bool operator==(const xmrstak_algo_id& id) const
+	{
+		return base_algo == id;
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_monero>() { return CRYPTONIGHT_ITER; }
+	operator xmrstak_algo_id() const
+	{
+		return base_algo;
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_monero_v8>() { return CRYPTONIGHT_ITER; }
+	xmrstak_algo_id Id() const
+	{
+		return base_algo;
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_ITER; }
+	size_t Mem() const
+	{
+		if(base_algo == invalid_algo)
+			return 0;
+		else
+			return mem;
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_aeon>() { return CRYPTONIGHT_LITE_ITER; }
+	uint32_t Iter() const
+	{
+		return iter;
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_ipbc>() { return CRYPTONIGHT_LITE_ITER; }
+	/** Name of the algorithm
+	 *
+	 * This name is only an alias for the natively implemented base algorithm.
+	 */
+	std::string Name() const
+	{
+		return get_algo_name(algo_name);
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_stellite>() { return CRYPTONIGHT_ITER; }
+	/** Name of the parent algorithm
+	 *
+	 * This is the underlying algorithm that the POW hash functions actually implement.
+	 */
+	std::string BaseName() const
+	{
+		return get_algo_name(base_algo);
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_masari>() { return CRYPTONIGHT_MASARI_ITER; }
+	uint32_t Mask() const
+	{
+		// default is a 16 byte aligned mask
+		if(mask == 0)
+			return ((mem - 1u) / 16) * 16;
+		else
+			return mask;
+	}
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_haven>() { return CRYPTONIGHT_HEAVY_ITER; }
+	xmrstak_algo_id algo_name = invalid_algo;
+	xmrstak_algo_id base_algo = invalid_algo;
+	uint32_t iter = 0u;
+	size_t mem = 0u;
+	uint32_t mask = 0u;
+};
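// Worked example for the default Mask() above: with mask == 0 it computes
// ((mem - 1u) / 16) * 16, which reproduces the old hard-coded mask constants
// from the scratchpad size alone (the helper name `default_mask` is
// hypothetical and only restates that expression).
#include <cstddef>
#include <cstdint>

constexpr uint32_t default_mask(size_t mem) { return ((mem - 1u) / 16) * 16; }

static_assert(default_mask(2 * 1024 * 1024) == 0x1FFFF0, "old CRYPTONIGHT_MASK");
static_assert(default_mask(1 * 1024 * 1024) == 0xFFFF0, "old CRYPTONIGHT_LITE_MASK");
static_assert(default_mask(4 * 1024 * 1024) == 0x3FFFF0, "old CRYPTONIGHT_HEAVY_MASK");
// cryptonight_gpu (0x1FFFC0) and cryptonight_turtle (0x1FFF0) do not follow
// this pattern, so they keep an explicit mask in the POW table below.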
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_bittube2>() { return CRYPTONIGHT_HEAVY_ITER; }
+// default cryptonight
+constexpr size_t CN_MEMORY = 2 * 1024 * 1024;
+constexpr uint32_t CN_ITER = 0x80000;
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; } 
+// cryptonight gpu
+constexpr uint32_t CN_GPU_MASK = 0x1FFFC0;
+constexpr uint32_t CN_GPU_ITER = 0xC000;
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_gpu>() { return CRYPTONIGHT_GPU_ITER; } 
+// cryptonight turtle (the mask does not use the full 256 KiB scratchpad)
+constexpr uint32_t CN_TURTLE_MASK = 0x1FFF0;
 
-template<>
-inline constexpr uint32_t cn_select_iter<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_ITER; }
+constexpr uint32_t CN_ZELERIUS_ITER = 0x6000;
 
-inline size_t cn_select_iter(xmrstak_algo algo)
+inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 {
-	switch(algo)
-	{
-	case cryptonight_stellite:
-	case cryptonight_monero:
-	case cryptonight_monero_v8:
-	case cryptonight:
-		return CRYPTONIGHT_ITER;
-	case cryptonight_ipbc:
-	case cryptonight_aeon:
-	case cryptonight_lite:
-		return CRYPTONIGHT_LITE_ITER;
-	case cryptonight_bittube2:
-	case cryptonight_haven:
-	case cryptonight_heavy:
-		return CRYPTONIGHT_HEAVY_ITER;
-	case cryptonight_masari:
-		return CRYPTONIGHT_MASARI_ITER;
-	case cryptonight_superfast:
-		return CRYPTONIGHT_SUPERFAST_ITER;
-	case cryptonight_gpu:
-		return CRYPTONIGHT_GPU_ITER;
-	case cryptonight_turtle:
-		return CRYPTONIGHT_TURTLE_ITER;
-	default:
-		return 0;
-	}
+	static std::array<xmrstak_algo, 14> pow = {{
+		{invalid_algo, invalid_algo},
+		{cryptonight, cryptonight, CN_ITER, CN_MEMORY},
+		{cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2},
+		{cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY},
+		{cryptonight_heavy, cryptonight_heavy, CN_ITER/2, CN_MEMORY*2},
+		{cryptonight_aeon, cryptonight_aeon, CN_ITER/2, CN_MEMORY/2},
+		{cryptonight_ipbc, cryptonight_ipbc, CN_ITER/2, CN_MEMORY/2}, // equal to cryptonight_aeon with a small tweak in the miner code
+		{cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, // equal to cryptonight_monero but with one tiny change
+		{cryptonight_masari, cryptonight_masari, CN_ITER/2, CN_MEMORY}, // equal to cryptonight_monero but with fewer iterations, used by masari
+		{cryptonight_haven, cryptonight_haven, CN_ITER/2, CN_MEMORY*2}, // equal to cryptonight_heavy with a small tweak
+		{cryptonight_bittube2, cryptonight_bittube2, CN_ITER/2, CN_MEMORY*2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+		{cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY},
+		{cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY},
+		{cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK}
+	}};
+
+	static std::array<xmrstak_algo, 3> derived_pow =
+	{{
+		{cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK},
+		{cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY},
+		{cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY}
+		// {cryptonight_derived}
+	}};
+
+	if(algo_id < start_derived_algo_id)
+		return pow[algo_id];
+	else
+		return derived_pow[algo_id - start_derived_algo_id];
 }
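// Illustrative usage of the new descriptor: POW() maps an id to a fully
// populated xmrstak_algo, and derived ids (>= start_derived_algo_id) resolve
// to a base implementation plus their own tunables. The main() below is
// hypothetical and only prints values from the tables above.
#include <cstdio>
#include "xmrstak/backend/cryptonight.hpp"

int main()
{
	const xmrstak_algo turtle = POW(cryptonight_turtle);

	std::printf("%s (runs as %s): mem=%zu iter=0x%x mask=0x%x\n",
		turtle.Name().c_str(),     // "cryptonight_turtle"
		turtle.BaseName().c_str(), // "cryptonight_v8"
		turtle.Mem(),              // 262144 (CN_MEMORY / 8)
		turtle.Iter(),             // 0x10000 (CN_ITER / 8)
		turtle.Mask());            // 0x1FFF0 (CN_TURTLE_MASK)
	return 0;
}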
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 6460628de75b17b625418e8244414cce9752a0ad..07ed4d31e9a118809bfd4361ab89eba09aee6907 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -307,7 +307,7 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp
index 389356842b325269150b2f6d399fe91d7cc1a2ce..3863c93e8721e099b7fcdcdd9b9c3a85902178a9 100644
--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -28,7 +28,7 @@ public:
 	static bool self_test();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
 	void start_mining();
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 8fda8d401323a185479bc5f2756cac3c5ca71af9..45ffef80668c5b1726025d5756dfa04312bcdb68 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -46,8 +46,8 @@ int cuda_get_devicecount( int* deviceCount);
 int cuda_get_deviceinfo(nvid_ctx *ctx);
 int cryptonight_extra_cpu_init(nvid_ctx *ctx);
 void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
-void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo);
-void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo);
+void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo);
+void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo);
 }
 
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce);
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 2acf1a387ed0f0333b5c75c22c2af1d0bd9ff66f..e151e8c02ec12779552752d1e7cf21a68a0131d1 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -128,8 +128,9 @@ __device__ __forceinline__ uint32_t rotate16( const uint32_t n )
 	return (n >> 16u) | (n << 16u);
 }
 
-template<size_t ITERATIONS, uint32_t MEMORY>
-__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
+__global__ void cryptonight_core_gpu_phase1(
+	const uint32_t ITERATIONS,  const size_t MEMORY,
+	int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
 	__shared__ uint32_t sharedMemory[1024];
 
@@ -267,11 +268,13 @@ struct u64 : public uint2
  * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory
  *                   else if `1` 256bit operations will be used
  */
-template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO, uint32_t MEM_MODE>
+template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 2 )
 #endif
-__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+__global__ void cryptonight_core_gpu_phase2_double(
+	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
+	int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
 		uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
 	__shared__ uint32_t sharedMemory[512];
@@ -311,7 +314,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	uint64_t bx1;
 	uint32_t sqrt_result;
 	uint64_t division_result;
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+	if(ALGO == cryptonight_monero_v8)
 	{
 		bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub];
 		bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub];
@@ -351,7 +354,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 		);
 
-		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+		if(ALGO == cryptonight_monero_v8)
 		{
 
 			const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ];
@@ -394,14 +397,14 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		else
 			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
 
-		if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_turtle)
+		if(ALGO != cryptonight_monero_v8)
 			bx0 = cx_aes;
 
 		uint64_t cx_mul;
 		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
 		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
 
-		if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && sub == 1)
+		if((ALGO == cryptonight_monero_v8) && sub == 1)
 		{
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			((uint32_t*)&division_result)[1] ^= sqrt_result;
@@ -425,7 +428,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
 			// sub 0 -> hi, sub 1 -> lo
 			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
-			if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+			if(ALGO == cryptonight_monero_v8)
 			{
 				const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
 				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
@@ -442,7 +445,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 			ax0 += res;
 		}
-		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+		if(ALGO == cryptonight_monero_v8)
 		{
 			bx1 = bx0;
 			bx0 = cx_aes;
@@ -465,7 +468,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	if ( bfactor > 0 )
 	{
 		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
-		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+		if(ALGO == cryptonight_monero_v8)
 		{
 			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
 			((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
@@ -482,11 +485,13 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	}
 }
 
-template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
+template<xmrstak_algo_id ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
 #endif
-__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+__global__ void cryptonight_core_gpu_phase2_quad(
+	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
+	int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
 		uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
 	__shared__ uint32_t sharedMemory[1024];
@@ -685,8 +690,10 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 	}
 }
 
-template<size_t ITERATIONS, uint32_t MEMORY, xmrstak_algo ALGO>
-__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
+template<xmrstak_algo_id ALGO>
+__global__ void cryptonight_core_gpu_phase3(
+	const uint32_t ITERATIONS,  const size_t MEMORY,
+	int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
 {
 	__shared__ uint32_t sharedMemory[1024];
 
@@ -737,9 +744,13 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
 }
 
-template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, uint32_t MEM_MODE>
-void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
+template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo)
 {
+	const uint32_t MASK = algo.Mask();
+	const uint32_t ITERATIONS = algo.Iter();
+	const size_t MEM = algo.Mem()/4;
+
 	dim3 grid( ctx->device_blocks );
 	dim3 block( ctx->device_threads );
 	dim3 block2( ctx->device_threads << 1 );
@@ -761,7 +772,10 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	for ( int i = 0; i < partcountOneThree; i++ )
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ITERATIONS,MEMORY><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>(
+			ITERATIONS,
+			MEM,
+			ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
 			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
@@ -773,19 +787,22 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	for ( int i = 0; i < partcount; i++ )
 	{
-		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+		if(ALGO == cryptonight_monero_v8)
 		{
 			// two threads per block
 			CUDA_CHECK_MSG_KERNEL(
 				ctx->device_id,
 				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO, MEM_MODE><<<
+				cryptonight_core_gpu_phase2_double<ALGO, MEM_MODE><<<
 					grid,
 					block2,
 					sizeof(uint64_t) * block2.x * 8 +
 						// shuffle memory for fermi gpus
 						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
 				>>>(
+					ITERATIONS,
+					MEM,
+					MASK,
 					ctx->device_blocks*ctx->device_threads,
 					ctx->device_bfactor,
 					i,
@@ -803,11 +820,14 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 			CUDA_CHECK_MSG_KERNEL(
 				ctx->device_id,
 				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-				cryptonight_core_gpu_phase2_quad<ITERATIONS,MEMORY,MASK,ALGO><<<
+				cryptonight_core_gpu_phase2_quad<ALGO><<<
 					grid,
 					block4,
 					block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
 				>>>(
+					ITERATIONS,
+					MEM,
+					MASK,
 					ctx->device_blocks*ctx->device_threads,
 					ctx->device_bfactor,
 					i,
@@ -834,20 +854,27 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	for ( int i = 0; i < roundsPhase3; i++ )
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY, ALGO><<<
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
 			grid,
 			block8,
 			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-		>>>( ctx->device_blocks*ctx->device_threads,
+		>>>(
+			ITERATIONS,
+			MEM,
+			ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
 			ctx->d_ctx_state, ctx->d_ctx_key2 ));
 	}
 }
 
-template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, uint32_t MEM_MODE>
-void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
+template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo)
 {
+	const uint32_t MASK = algo.Mask();
+	const uint32_t ITERATIONS = algo.Iter();
+	const size_t MEM = algo.Mem(); // scratchpad size in bytes
+
 	dim3 grid( ctx->device_blocks );
 	dim3 block( ctx->device_threads );
 	dim3 block2( ctx->device_threads << 1 );
@@ -858,7 +885,7 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
 
 	CUDA_CHECK_KERNEL(
 		ctx->device_id,
-		xmrstak::nvidia::cn_explode_gpu<MEMORY><<<intensity,32>>>((int*)ctx->d_ctx_state, (int*)ctx->d_long_state)
+		xmrstak::nvidia::cn_explode_gpu<<<intensity,32>>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state)
 	);
 
 	int partcount = 1 << ctx->device_bfactor;
@@ -867,9 +894,12 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
 		CUDA_CHECK_KERNEL(
 			ctx->device_id,
 			// 36 x 16byte x numThreads
-			xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu<ITERATIONS, MEMORY>
-				<<<ctx->device_blocks, ctx->device_threads * 16,  36 * 16 * ctx->device_threads>>>
+			xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu
+				<<<ctx->device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>> // 32 x 16 byte = sizeof(SharedMemChunk), one chunk per hash
 				(
+					ITERATIONS,
+					MEM,
+					MASK,
 					(int*)ctx->d_ctx_state,
 					(int*)ctx->d_long_state,
 					ctx->device_bfactor,
@@ -901,71 +931,71 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
 
 	for ( int i = 0; i < roundsPhase3; i++ )
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY/4, ALGO><<<
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
 			grid,
 			block8,
 			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-		>>>( ctx->device_blocks*ctx->device_threads,
+		>>>(
+			ITERATIONS,
+			MEM/4, // phase3 expects the scratchpad size in 32-bit words
+			ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
 			ctx->d_ctx_state, ctx->d_ctx_key2 ));
 	}
 }
 
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce)
 {
-	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
+	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo);
 
 	if(miner_algo == invalid_algo) return;
 
 	static const cuda_hash_fn func_table[] = {
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 1>,
-
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 1>,
+		cryptonight_core_gpu_hash<cryptonight, 0>,
+		cryptonight_core_gpu_hash<cryptonight, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 1>,
+		cryptonight_core_gpu_hash<cryptonight_lite, 0>,
+		cryptonight_core_gpu_hash<cryptonight_lite, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 1>,
+		cryptonight_core_gpu_hash<cryptonight_monero, 0>,
+		cryptonight_core_gpu_hash<cryptonight_monero, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 1>,
+		cryptonight_core_gpu_hash<cryptonight_heavy, 0>,
+		cryptonight_core_gpu_hash<cryptonight_heavy, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 1>,
+		cryptonight_core_gpu_hash<cryptonight_aeon, 0>,
+		cryptonight_core_gpu_hash<cryptonight_aeon, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 1>,
+		cryptonight_core_gpu_hash<cryptonight_ipbc, 0>,
+		cryptonight_core_gpu_hash<cryptonight_ipbc, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 1>,
+		cryptonight_core_gpu_hash<cryptonight_stellite, 0>,
+		cryptonight_core_gpu_hash<cryptonight_stellite, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 1>,
+		cryptonight_core_gpu_hash<cryptonight_masari, 0>,
+		cryptonight_core_gpu_hash<cryptonight_masari, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 1>,
+		cryptonight_core_gpu_hash<cryptonight_haven, 0>,
+		cryptonight_core_gpu_hash<cryptonight_haven, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 1>,
+		cryptonight_core_gpu_hash<cryptonight_bittube2, 0>,
+		cryptonight_core_gpu_hash<cryptonight_bittube2, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 1>,
+		cryptonight_core_gpu_hash<cryptonight_monero_v8, 0>,
+		cryptonight_core_gpu_hash<cryptonight_monero_v8, 1>,
 
-		cryptonight_core_gpu_hash_gpu<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK, CRYPTONIGHT_MEMORY, cryptonight_gpu, 0>,
-		cryptonight_core_gpu_hash_gpu<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK, CRYPTONIGHT_MEMORY, cryptonight_gpu, 1>,
+		cryptonight_core_gpu_hash<cryptonight_superfast, 0>,
+		cryptonight_core_gpu_hash<cryptonight_superfast, 1>,
 
-		cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 1>
+		cryptonight_core_gpu_hash_gpu<cryptonight_gpu, 0>,
+		cryptonight_core_gpu_hash_gpu<cryptonight_gpu, 1>
 	};
 
 	std::bitset<1> digit;
 	digit.set(0, ctx->memMode == 1);
 
 	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
-	selected_function(ctx, startNonce);
+	selected_function(ctx, startNonce, miner_algo);
 
 }
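The cuda_core.cu changes above move iterations, scratchpad size and mask out of the template parameter list into runtime arguments, so the dispatch table only needs one pair of entries (memory mode 0/1) per base algorithm id. Below is a minimal host-side sketch of that pattern; `Algo`, `Params` and `hash` are simplified stand-ins for `xmrstak_algo_id`, `xmrstak_algo` and the CUDA launch wrappers, not the miner's real types.

```cpp
// Minimal sketch of the runtime-parameter dispatch used above.
#include <cstdint>
#include <cstdio>

enum Algo : uint32_t { invalid = 0, cn = 1, cn_lite = 2 }; // ids start at 1

struct Params
{
	Algo id;
	uint32_t iter;
	size_t mem;   // scratchpad size in bytes
	uint32_t mask;
};

// Only the algorithm id and the memory mode stay compile-time template
// parameters; iterations, scratchpad size and mask arrive at runtime.
template<Algo ALGO, uint32_t MEM_MODE>
void hash(uint32_t nonce, const Params& p)
{
	std::printf("algo=%u mode=%u iter=%u mem=%zu mask=0x%x nonce=%u\n",
	            ALGO, MEM_MODE, p.iter, p.mem, p.mask, nonce);
}

using hash_fn = void(*)(uint32_t, const Params&);

// Two entries (memory mode 0/1) per algorithm, in id order.
static const hash_fn func_table[] = {
	hash<cn, 0>,      hash<cn, 1>,
	hash<cn_lite, 0>, hash<cn_lite, 1>,
};

int main()
{
	Params p{cn_lite, 0x40000, 1u << 20, 0xFFFF0};
	uint32_t memMode = 1;
	// same indexing as in cryptonight_core_cpu_hash: ((id - 1) << 1) | memMode
	hash_fn f = func_table[((p.id - 1u) << 1) | memMode];
	f(0, p);
}
```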
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
index 94750560c594a0833c8a51a716cb6f8572652300..fee7e13d14daac353893114f35a035b5479eed24 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -275,8 +275,7 @@ __forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot
 	);
 }
 
-template<uint32_t MASK>
-__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); }
+__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); }
 
 
 __forceinline__ __device__  __m128 fma_break(__m128 x)
@@ -412,27 +411,27 @@ __forceinline__ __device__ void sync()
 #endif
 }
 
-template<size_t ITERATIONS, uint32_t MEMORY>
-__global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
+struct SharedMemChunk
+{
+	__m128i out[16];
+	__m128 va[16];
+};
+
+__global__ void cryptonight_core_gpu_phase2_gpu(
+	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
+	int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
 {
-	static constexpr uint32_t MASK = ((MEMORY-1) >> 6) << 6;
 
 	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
 
-	extern __shared__ __m128i smemExtern_in[];
+	extern __shared__ SharedMemChunk smemExtern_in[];
 
 	const uint32_t chunk = threadIdx.x / 16;
 	const uint32_t numHashPerBlock = blockDim.x / 16;
 
 	int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk));
 
-	__m128i* smem = smemExtern_in + 4 * chunk;
-
-	__m128i* smemExtern = smemExtern_in + numHashPerBlock * 4;
-	__m128i* smemOut = smemExtern + 16 * chunk;
-
-	smemExtern = smemExtern + numHashPerBlock * 16;
-	__m128* smemVa = (__m128*)smemExtern + 16 * chunk;
+	SharedMemChunk* smem = smemExtern_in + chunk;
 
 	uint32_t tid = threadIdx.x % 16;
 
@@ -450,50 +449,53 @@ __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int
 		s = ((uint32_t*)spad)[idxHash * 50] >> 8;
 	}
 
-	const uint32_t b = tid / 4;
-	const uint32_t bb = tid % 4;
-	const uint32_t block = b * 16 + bb;
+	// tid / 4: selects one of the four 16-byte blocks read per iteration
+	const uint32_t tidd = tid / 4;
+	// tid % 4: selects the 32-bit word within that block
+	const uint32_t tidm = tid % 4;
+	const uint32_t block = tidd * 16 + tidm;
 
 	for(size_t i = 0; i < batchsize; i++)
 	{
 		sync();
-		((int*)smem)[tid] = ((int*)scratchpad_ptr<MASK>(s, b, lpad))[bb];
+		int tmp = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm];
+		((int*)smem->out)[tid] = tmp;
 		sync();
 
 		__m128 rc = vs;
 		single_comupte_wrap(
-			bb,
-			*(smem + look[tid][0]),
-			*(smem + look[tid][1]),
-			*(smem + look[tid][2]),
-			*(smem + look[tid][3]),
-			ccnt[tid], rc, smemVa[tid],
-			smemOut[tid]
+			tidm,
+			*(smem->out + look[tid][0]),
+			*(smem->out + look[tid][1]),
+			*(smem->out + look[tid][2]),
+			*(smem->out + look[tid][3]),
+			ccnt[tid], rc, smem->va[tid],
+			smem->out[tid]
 		);
 
 		sync();
 
-		int outXor = ((int*)smemOut)[block];
-		for(uint32_t dd = block + 4; dd < (b + 1) * 16; dd += 4)
-			outXor ^= ((int*)smemOut)[dd];
+		int outXor = ((int*)smem->out)[block];
+		for(uint32_t dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
+			outXor ^= ((int*)smem->out)[dd];
 
-		((int*)scratchpad_ptr<MASK>(s, b, lpad))[bb] = outXor ^ ((int*)smem)[tid];
-		((int*)smemOut)[tid] = outXor;
+		((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ tmp;
+		((int*)smem->out)[tid] = outXor;
 
-		float va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
-		float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12];
-		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
+		float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
+		float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12];
+		((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
 
 		sync();
 
-		__m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3];
-		va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
-		va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12];
-		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
+		__m128i out2 = smem->out[0] ^ smem->out[1] ^ smem->out[2] ^ smem->out[3];
+		va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
+		va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12];
+		((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
 
 		sync();
 
-		vs = smemVa[0];
+		vs = smem->va[0];
 		vs.abs(); // take abs(va) by masking the float sign bit
 		auto xx = _mm_mul_ps(vs, __m128(16777216.0f));
 		// vs range 0 - 64
@@ -539,8 +541,8 @@ __forceinline__ __device__ void generate_512(uint64_t idx, const uint64_t* in, u
 		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
 }
 
-template<size_t MEMORY>
-__global__ void cn_explode_gpu(int32_t *spad_in, int *lpad_in)
+
+__global__ void cn_explode_gpu(const size_t MEMORY, int32_t *spad_in, int *lpad_in)
 {
 	__shared__ uint64_t state[25];
 
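The cuda_cryptonight_gpu.hpp hunks above fold the three separately indexed shared-memory regions of phase2_gpu into one `SharedMemChunk` per hash (16 lanes each), which is why the corresponding launch in cuda_core.cu now requests 32 x 16 bytes per configured thread instead of 36 x 16. A small host-side sanity check of that size, using 16-byte stand-in types since `__m128`/`__m128i` only exist in the CUDA translation unit; `device_threads` is an illustrative value:

```cpp
#include <cstddef>
#include <cstdio>

struct alignas(16) m128  { float    f[4]; }; // stand-in for __m128
struct alignas(16) m128i { unsigned u[4]; }; // stand-in for __m128i

struct SharedMemChunk
{
	m128i out[16];
	m128  va[16];
};

int main()
{
	const size_t device_threads = 8; // hashes per block (16 lanes each)
	static_assert(sizeof(SharedMemChunk) == 32 * 16, "512 bytes per hash");
	// matches the launch parameter: 32 * 16 * ctx->device_threads
	std::printf("dynamic shared memory per block: %zu bytes\n",
	            sizeof(SharedMemChunk) * device_threads);
}
```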
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index a37ecc8a02409942527ecadd1eb0a7961b142fbd..e20373b7d82216c0d2d8534b24ba560a0efa46a4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -93,7 +93,7 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state )
 		(state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x];
 }
 
-template<xmrstak_algo ALGO>
+template<xmrstak_algo_id ALGO>
 __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
 {
 	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
@@ -127,7 +127,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
+	if(ALGO == cryptonight_monero_v8)
 	{
 		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
 		// bx1
@@ -162,7 +162,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	}
 }
 
-template<xmrstak_algo ALGO>
+template<xmrstak_algo_id ALGO>
 __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
 {
 	const int thread = blockDim.x * blockIdx.x + threadIdx.x;
@@ -292,7 +292,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	size_t hashMemSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+		hashMemSize = std::max(hashMemSize, algo.Mem());
 	}
 
 	size_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -310,8 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
-	else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() ||
-	        std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end() )
+	else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
 	{
 		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
 		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
@@ -335,7 +334,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	return 1;
 }
 
-extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo)
+extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo)
 {
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -368,11 +367,6 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
-	else if (miner_algo == cryptonight_turtle)
-	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_turtle> << <grid, block >> > (wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
-	}
 	else if(miner_algo == cryptonight_gpu)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
@@ -388,7 +382,7 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 	}
 }
 
-extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo)
+extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo)
 {
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -697,7 +691,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+			hashMemSize = std::max(hashMemSize, algo.Mem());
 		}
 
 #ifdef WIN32
@@ -745,8 +739,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		}
 
 		// check if cryptonight_monero_v8 is selected for the user pool
-		bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() ||
-		                          std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end());
+		bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());
 
 		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
 		if(useCryptonight_v8 && gpuArch >= 50)
@@ -768,9 +761,15 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			size_t threads = 8;
 			// 8 is chosen by checking the occupancy calculator
 			size_t blockOptimal = 8 * ctx->device_mpcount;
+
+			// the following values are calculated with CUDA10 and the occupancy calculator
+			if(gpuArch == 35 || gpuArch/10 == 5 || gpuArch/10 == 6)
+				blockOptimal = 7 *  ctx->device_mpcount;
+			if(gpuArch == 37)
+				blockOptimal = 14 *  ctx->device_mpcount;
 			if(gpuArch >= 70)
-				blockOptimal = 5 *  ctx->device_mpcount;
-			
+				blockOptimal = 6 *  ctx->device_mpcount;
+
 			if(blockOptimal * threads * hashMemSize < limitedMemory)
 			{
 				ctx->device_threads = threads;
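The auto-configuration hunk above replaces the single Volta special case with per-architecture block counts derived from the CUDA 10 occupancy calculator. A condensed sketch of that heuristic follows; the per-architecture factors are the ones from the diff, while the function name, sample device values and memory budget are illustrative assumptions:

```cpp
#include <cstddef>
#include <cstdio>

// blocks-per-SM factor by compute capability (gpuArch = major*10 + minor)
static size_t blocksPerSM(int gpuArch)
{
	if(gpuArch == 37)              return 14; // e.g. Tesla K80
	if(gpuArch == 35 ||
	   gpuArch / 10 == 5 ||
	   gpuArch / 10 == 6)          return 7;
	if(gpuArch >= 70)              return 6;  // Volta and newer
	return 8;                                 // default
}

int main()
{
	const int    gpuArch       = 61;         // hypothetical Pascal device
	const size_t mpCount       = 20;         // multiprocessors
	const size_t threads       = 8;          // hashes per block
	const size_t hashMemSize   = 2u << 20;   // 2 MiB scratchpad
	const size_t limitedMemory = 6ull << 30; // usable device memory budget

	size_t blockOptimal = blocksPerSM(gpuArch) * mpCount;
	if(blockOptimal * threads * hashMemSize < limitedMemory)
		std::printf("threads=%zu blocks=%zu (%zu MiB of scratchpads)\n",
		            threads, blockOptimal,
		            blockOptimal * threads * hashMemSize >> 20);
}
```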
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index 40fb9d94859e03910de2ec642119703dc56ada9b..7539706e0cb3619869a1357912b6bf471d38d0fa 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -801,7 +801,7 @@ int main(int argc, char *argv[])
 	printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n");
 	printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n");
 	printer::inst()->print_str("-------------------------------------------------------------------\n");
-	printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str());
+	printer::inst()->print_msg(L0, "Mining coin: %s", ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo().Name().c_str());
 
 	if(params::inst().benchmark_block_version >= 0)
 	{
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index a16910552e98358c75edb3b647ff3a4f1ee3772f..41e077889cb032264ac5d754838d7efe0babf332 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -87,32 +87,37 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
 xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
-	{ "aeon7",               {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     "mine.aeon-pool.com:5555" },
-	{ "bbscoin",             {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
-	{ "bittube",             {cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
-	{ "cryptonight",         {cryptonight_monero_v8, cryptonight, 255u},          {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},
-	{ "cryptonight_masari",  {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
-	{ "cryptonight_haven",   {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "cryptonight_heavy",   {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "cryptonight_lite",    {cryptonight_aeon, cryptonight_lite, 255u},          {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
-	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
-	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u},      {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
-	{ "cryptonight_superfast",   {cryptonight_heavy, cryptonight_superfast, 255u},{cryptonight_heavy, cryptonight_superfast, 0u},   nullptr },
-	{ "cryptonight_turtle",  {cryptonight_turtle, cryptonight_turtle, 0u},      {cryptonight_turtle, cryptonight_turtle, 0u},   nullptr },
-	{ "cryptonight_v7",      {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "cryptonight_v8",      {cryptonight_monero_v8, cryptonight_monero_v8, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "cryptonight_gpu",     {cryptonight_gpu, cryptonight_gpu, 255u},            {cryptonight_gpu, cryptonight_gpu, 0u}, nullptr },
-	{ "freehaven",           {cryptonight_heavy, cryptonight_superfast, 255u},    {cryptonight_heavy, cryptonight_superfast, 0u},   nullptr },
-	{ "graft",               {cryptonight_monero_v8, cryptonight_monero_v8, 0u},    {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "haven",               {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "lethean",             {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "monero",              {cryptonight_monero_v8, cryptonight_monero_v8, 0u},     {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" },
-	{ "qrl",             	 {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "ryo",                 {cryptonight_gpu, cryptonight_heavy, 6u},            {cryptonight_gpu, cryptonight_heavy, 6u},   nullptr },
-	{ "turtlecoin",          {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
-	{ "plenteum",			 {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }
+	{ "aeon7",                   {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)},      "mine.aeon-pool.com:5555" },
+	{ "bbscoin",                 {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)},      nullptr },
+	{ "bittube",                 {POW(cryptonight_bittube2)},  {POW(cryptonight_heavy)},     "mining.bit.tube:13333" },
+	{ "cryptonight",             {POW(cryptonight)},           {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_bittube2",    {POW(cryptonight_bittube2)},  {POW(cryptonight_heavy)},     nullptr },
+	{ "cryptonight_masari",      {POW(cryptonight_masari)},    {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_haven",       {POW(cryptonight_haven)},     {POW(cryptonight_heavy)},     nullptr },
+	{ "cryptonight_heavy",       {POW(cryptonight_heavy)},     {POW(cryptonight_heavy)},     nullptr },
+	{ "cryptonight_lite",        {POW(cryptonight_lite)},      {POW(cryptonight_aeon)},      nullptr },
+	{ "cryptonight_lite_v7",     {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)},      nullptr },
+	{ "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)},      {POW(cryptonight_aeon)},      nullptr },
+	{ "cryptonight_superfast",   {POW(cryptonight_superfast)}, {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_turtle",      {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
+	{ "cryptonight_v7",          {POW(cryptonight_monero)},    {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_v8",          {POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_v8_half",     {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_v7_stellite", {POW(cryptonight_stellite)},  {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_gpu",         {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)},       nullptr },
+	{ "freehaven",               {POW(cryptonight_superfast)}, {POW(cryptonight_monero_v8)}, nullptr },
+	{ "graft",                   {POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr },
+	{ "haven",                   {POW(cryptonight_haven)},     {POW(cryptonight_heavy)},     nullptr },
+	{ "lethean",                 {POW(cryptonight_monero)},    {POW(cryptonight_monero_v8)}, nullptr },
+	{ "masari",                  {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
+	{ "monero",                  {POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, "pool.usxmrpool.com:3333" },
+	{ "qrl",             	     {POW(cryptonight_monero)},    {POW(cryptonight_monero_v8)}, nullptr },
+	{ "ryo",                     {POW(cryptonight_gpu), 6u, POW(cryptonight_heavy)},  {POW(cryptonight_gpu), 6u, POW(cryptonight_heavy)}, nullptr },
+	{ "stellite",                {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
+	{ "turtlecoin",              {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr },
+	{ "plenteum",			     {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
+	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_monero_v8)}, nullptr }
 };
 
 constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
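With the reworked coinDescription constructor further down (algo, fork_version = 0, algo_root = invalid_algo), each row above reads as: mine `algo` once the chain has reached `fork_version`, and `algo_root` before that; single-element initializers simply mean there is no root/fork switch. A self-contained sketch of that reading; the `pick()` helper encodes the assumed selection rule and is illustrative, not the miner's actual API:

```cpp
#include <cstdint>
#include <cstdio>

enum algo_id { invalid_algo = 0, cryptonight_monero_v8, cryptonight_v8_zelerius };

struct coinDescription
{
	algo_id algo         = invalid_algo;
	uint8_t fork_version = 0;
	algo_id algo_root    = invalid_algo;

	coinDescription(algo_id a, uint8_t v = 0, algo_id r = invalid_algo)
		: algo(a), fork_version(v), algo_root(r) {}

	// assumed rule: mine algo_root until the chain reaches fork_version
	algo_id pick(uint8_t block_version) const
	{
		return (block_version < fork_version && algo_root != invalid_algo)
			? algo_root : algo;
	}
};

int main()
{
	// mirrors: { "zelerius", {POW(cryptonight_v8_zelerius), 7u, POW(cryptonight_monero_v8)}, ... }
	coinDescription zelerius{cryptonight_v8_zelerius, 7, cryptonight_monero_v8};
	std::printf("block v6 -> %d, block v7 -> %d\n", zelerius.pick(6), zelerius.pick(7));
}
```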
diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp
index 26688aeea394313479b63c04882c72ab32db1c5d..65dee143c726e4f3f821cfcd37af35f9ca3bb229 100644
--- a/xmrstak/misc/coinDescription.hpp
+++ b/xmrstak/misc/coinDescription.hpp
@@ -11,13 +11,17 @@ namespace xmrstak
 {
 	struct coinDescription
 	{
-		xmrstak_algo algo = xmrstak_algo::invalid_algo;
-		xmrstak_algo algo_root = xmrstak_algo::invalid_algo;
+		xmrstak_algo algo = {xmrstak_algo_id::invalid_algo};
 		uint8_t fork_version = 0u;
+		xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo};
 
 		coinDescription() = default;
 
-		coinDescription(const xmrstak_algo in_algo, xmrstak_algo in_algo_root, const uint8_t in_fork_version) :
+		coinDescription(
+			const xmrstak_algo in_algo,
+			const uint8_t in_fork_version = 0,
+			xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo
+		) :
 			algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version)
 		{}
 
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index c475c4129f141a9d11a1025bb88bf96872b3795a..0dd7db1370c06a93f2aa365495fce38f626c5deb 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -567,31 +567,24 @@ void executor::ex_main()
 			pools.emplace_front(0, "donate.xmr-stak.net:5511", "", "", "", 0.0, true, false, "", false);
 		break;
 	case cryptonight_monero_v8:
-	case cryptonight_monero:
-	case cryptonight_turtle:
 		if(dev_tls)
 			pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false);
 		else
 			pools.emplace_front(0, "donate.xmr-stak.net:5500", "", "", "", 0.0, true, false, "", false);
 		break;
-	case cryptonight_ipbc:
 	case cryptonight_aeon:
-	case cryptonight_lite:
 		if(dev_tls)
 			pools.emplace_front(0, "donate.xmr-stak.net:7777", "", "", "", 0.0, true, true, "", true);
 		else
 			pools.emplace_front(0, "donate.xmr-stak.net:4444", "", "", "", 0.0, true, false, "", true);
 		break;
-
-	case cryptonight:
+	case cryptonight_lite:
+	default:
 		if(dev_tls)
 			pools.emplace_front(0, "donate.xmr-stak.net:6666", "", "", "", 0.0, true, true, "", false);
 		else
 			pools.emplace_front(0, "donate.xmr-stak.net:3333", "", "", "", 0.0, true, false, "", false);
 		break;
-
-	default:
-		break;
 	}
 
 	ex_event ev;
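For reference, the reshuffled donation switch above now keys only on the remaining algorithm families and routes everything else to the generic endpoint. A compact restatement of that mapping as a standalone helper; the enum and function name are illustrative, while the host:port pairs are the ones visible in the hunk:

```cpp
#include <cstdio>
#include <string>

// illustrative subset of the algorithm ids handled by the switch above
enum algo_id { cryptonight, cryptonight_lite, cryptonight_aeon, cryptonight_monero_v8 };

static std::string donate_pool(algo_id dev_algo, bool dev_tls)
{
	switch(dev_algo)
	{
	case cryptonight_monero_v8:
		return dev_tls ? "donate.xmr-stak.net:8800" : "donate.xmr-stak.net:5500";
	case cryptonight_aeon:
		return dev_tls ? "donate.xmr-stak.net:7777" : "donate.xmr-stak.net:4444";
	default: // cryptonight, cryptonight_lite and everything else
		return dev_tls ? "donate.xmr-stak.net:6666" : "donate.xmr-stak.net:3333";
	}
}

int main()
{
	std::printf("%s\n", donate_pool(cryptonight_monero_v8, true).c_str());
}
```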
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index cbdf1d0c1271192ac79f48738660fbc81ff9f97f..d5b0d7fcc6d18b5b9f968b0c5f870069c534dcbc 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -655,13 +655,17 @@ bool jpsock::cmd_login()
 	return true;
 }
 
-bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo)
+bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, const xmrstak_algo& algo)
 {
 	char cmd_buffer[1024];
 	char sNonce[9];
 	char sResult[65];
 	/*Extensions*/
 	char sAlgo[64] = {0};
+	char sBaseAlgo[64] = {0};
+	char sIterations[32] = {0};
+	char sMemory[32] = {0};
+	char sMemAlignBytes[32] = {0};
 	char sBackend[64] = {0};
 	char sHashcount[128] = {0};
 
@@ -673,51 +677,12 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 
 	if(ext_algo)
 	{
-		const char* algo_name;
-		switch(algo)
-		{
-		case cryptonight:
-			algo_name = "cryptonight";
-			break;
-		case cryptonight_lite:
-			algo_name = "cryptonight_lite";
-			break;
-		case cryptonight_monero:
-			algo_name = "cryptonight_v7";
-			break;
-		case cryptonight_monero_v8:
-			algo_name = "cryptonight_v8";
-			break;
-		case cryptonight_aeon:
-			algo_name = "cryptonight_lite_v7";
-			break;
-		case cryptonight_stellite:
-			algo_name = "cryptonight_v7_stellite";
-			break;
-		case cryptonight_ipbc:
-			algo_name = "cryptonight_lite_v7_xor";
-			break;
-		case cryptonight_heavy:
-			algo_name = "cryptonight_heavy";
-			break;
-		case cryptonight_haven:
-			algo_name = "cryptonight_haven";
-			break;
-		case cryptonight_masari:
-			algo_name = "cryptonight_masari";
-			break;
-		case cryptonight_superfast:
-			algo_name = "cryptonight_superfast";
-			break;
-		case cryptonight_turtle:
-			algo_name = "cryptonight_turtle";
-			break;
-		default:
-			algo_name = "unknown";
-			break;
-		}
-
-		snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo_name);
+		snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo.Name().c_str());
+		// the base algorithm plus the three degrees of freedom (iterations, scratchpad size, mask) that define the real algorithm
+		snprintf(sBaseAlgo, sizeof(sBaseAlgo), ",\"base_algo\":\"%s\"", algo.BaseName().c_str());
+		snprintf(sIterations, sizeof(sIterations), ",\"iterations\":\"0x%08x\"", algo.Iter());
+		snprintf(sMemory, sizeof(sMemory), ",\"scratchpad\":\"0x%08x\"", (uint32_t)algo.Mem());
+		snprintf(sMemAlignBytes, sizeof(sMemAlignBytes), ",\"mask\":\"0x%08x\"", algo.Mask());
 	}
 
 	bin2hex((unsigned char*)&iNonce, 4, sNonce);
@@ -726,8 +691,8 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	bin2hex(bResult, 32, sResult);
 	sResult[64] = '\0';
 
-	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s},\"id\":1}\n",
-		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo);
+	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n",
+		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations, sMemory, sMemAlignBytes);
 
 	uint64_t messageId = 0;
 	opq_json_val oResult(nullptr);
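With the extension fields added above, a submit line now carries the full algorithm description instead of a name picked from a hard-coded list. The snippet below only illustrates the resulting JSON shape; the field names match the hunk, while the ids, the hash placeholder and the concrete algorithm values are made-up examples:

```cpp
#include <cstdio>

int main()
{
	char cmd_buffer[512];
	std::snprintf(cmd_buffer, sizeof(cmd_buffer),
		"{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\","
		"\"nonce\":\"%s\",\"result\":\"%s\""
		",\"algo\":\"%s\",\"base_algo\":\"%s\""
		",\"iterations\":\"0x%08x\",\"scratchpad\":\"0x%08x\",\"mask\":\"0x%08x\"},\"id\":1}\n",
		"1", "427", "a1b2c3d4", "<64 hex chars>",
		"cryptonight_v8_half", "cryptonight_v8",
		0x40000u, 0x200000u, 0x1ffff0u);
	std::fputs(cmd_buffer, stdout);
}
```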
diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp
index 96fec6b98bdf3468f390a5aaadbaf81524e02d39..a1112df7430458f73294b2eff0537d7fbbb81a5f 100644
--- a/xmrstak/net/jpsock.hpp
+++ b/xmrstak/net/jpsock.hpp
@@ -35,7 +35,7 @@ public:
 	void disconnect(bool quiet = false);
 
 	bool cmd_login();
-	bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo);
+	bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, const xmrstak_algo& algo);
 
 	static bool hex2bin(const char* in, unsigned int len, unsigned char* out);
 	static void bin2hex(const unsigned char* in, unsigned int len, char* out);
diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp
index 6a05eb9d5e3024751c08afd0928edc36b1c5774b..cd23a94c4d4ed2316bf31b0bc8c23c0447b29eca 100644
--- a/xmrstak/net/msgstruct.hpp
+++ b/xmrstak/net/msgstruct.hpp
@@ -33,10 +33,10 @@ struct job_result
 	char		sJobID[64];
 	uint32_t	iNonce;
 	uint32_t	iThreadId;
-	xmrstak_algo algorithm = invalid_algo;
+	xmrstak_algo algorithm = {invalid_algo};
 
 	job_result() {}
-	job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, xmrstak_algo algo) :
+	job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, const xmrstak_algo& algo) :
 		iNonce(iNonce), iThreadId(iThreadId), algorithm(algo)
 	{
 		memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID));
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 2019f2b86ec3549352808c846f28c79527613e1c..f8f1d7d6cfa2c53a4bd0c4e1a37965bf920ea096 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -45,9 +45,12 @@ POOLCONF],
  *    cryptonight_lite_v7_xor (algorithm used by ipbc)
  *    # 2MiB scratchpad memory
  *    cryptonight
+ *    cryptonight_gpu (for Ryo's 14th of Feb fork)
  *    cryptonight_superfast
  *    cryptonight_v7
  *    cryptonight_v8
+ *    cryptonight_v8_half (used by masari and stellite)
+ *    cryptonight_v8_zelerius
  *    # 4MiB scratchpad memory
  *    cryptonight_bittube2
  *    cryptonight_haven
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 84fd048bda68301e981a599a360c3385461d48ae..f30fde92b2733d87610dc0ef2a1c1a21be21270b 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.8.0"
+#define XMR_STAK_VERSION "2.8.1"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"