diff --git a/README.md b/README.md
index a7eee604141bb492ea7254994f31b8a90eac4f4e..c890da1a53efb08848e1e3798fa6bc0a0026a179 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [Stellite](https://stellite.cash/)
 - [TurtleCoin](https://turtlecoin.lol)
 - [Zelerius](https://zelerius.org/)
+- [X-CASH](https://x-network.io/)
 
 Ryo currency is a way for us to implement the ideas that we were unable to in
 Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
@@ -70,6 +71,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_v7
     - cryptonight_v7_stellite
     - cryptonight_v8
+    - cryptonight_v8_double (used by X-CASH)
     - cryptonight_v8_half (used by masari and stellite)
     - cryptonight_v8_reversewaltz (used by graft)
     - cryptonight_v8_zelerius
diff --git a/doc/usage.md b/doc/usage.md
index a371f0e67bdbee9bd4d7247b4e9dfebfff8cf062..82d26dcc50d8bb411f17bb4c829e77c9a8e9554c 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -51,7 +51,6 @@ The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD
 ```
 xmr-stak --noCPU
 ```
-**CUDA** is currently not supported. I am currently try to get some performance out it.
 
 ### NVIDIA via OpenCL
 
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp
index 13785d64bb70158a20525066844e2e7760c60144..7358e98570386de0353815df3eb19eab717455f8 100644
--- a/xmrstak/backend/amd/OclCryptonightR_gen.cpp
+++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp
@@ -134,6 +134,7 @@ static cl_program CryptonightR_build_program(
     const GpuContext* ctx,
     xmrstak_algo algo,
     uint64_t height,
+    uint32_t precompile_count,
     cl_kernel old_kernel,
     std::string source_code,
     std::string options)
@@ -151,7 +152,7 @@ static cl_program CryptonightR_build_program(
         for(size_t i = 0; i < CryptonightR_cache.size();)
         {
             const CacheEntry& entry = CryptonightR_cache[i];
-            if ((entry.algo == algo) && (entry.height + 2 < height))
+            if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
             {
                 printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
                 old_programs.push_back(entry.program);
@@ -252,10 +253,12 @@ static cl_program CryptonightR_build_program(
     return program;
 }
 
-cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, bool background, cl_kernel old_kernel)
+cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background, cl_kernel old_kernel)
 {
+	printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);
+
     if (background) {
-        background_exec([=](){ CryptonightR_get_program(ctx, algo, height, false, old_kernel); });
+        background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false, old_kernel); });
         return nullptr;
     }
 
@@ -347,7 +350,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
 
     }
 
-    return CryptonightR_build_program(ctx, algo, height, old_kernel, source, options);
+    return CryptonightR_build_program(ctx, algo, height, precompile_count, old_kernel, source, options);
 }
 
 } // namespace amd
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
index a69df9074be2715035948358d9f2c0cdfe61515e..5f97d1e5142fa2552146c22af257ffbe02f39ae8 100644
--- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp
+++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
@@ -20,7 +20,7 @@ namespace amd
 {
 
 cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,
-	uint64_t height, bool background = false, cl_kernel old_kernel = nullptr);
+	uint64_t height, uint32_t precompile_count, bool background = false, cl_kernel old_kernel = nullptr);
 
 } // namespace amd
 } // namespace xmrstak
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index ace1c34bb229818566c6037181e22bfb8123ddd0..9c9db2ee3d274fa1d170b043548e4f1b116b72de 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -199,7 +199,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret);
+	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
@@ -334,6 +334,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 */
 		options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100);
 
+		uint32_t isWindowsOs = 0;
+#ifdef _WIN32
+		isWindowsOs = 1;
+#endif
+		options += " -DIS_WINDOWS_OS=" + std::to_string(isWindowsOs);
+		
 		if(miner_algo == cryptonight_gpu)
 			options += " -cl-fp32-correctly-rounded-divide-sqrt";
 
@@ -889,15 +895,15 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
 	cl_int ret;
 
-	if(input_len > 84)
+	if(input_len > 124)
 		return ERR_STUPID_PARAMS;
 
 	input[input_len] = 0x01;
-	memset(input + input_len + 1, 0, 88 - input_len - 1);
+	memset(input + input_len + 1, 0, 128 - input_len - 1);
 
 	cl_uint numThreads = ctx->rawIntensity;
 
-	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
 		return ERR_OCL_API;
@@ -952,8 +958,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
     if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) {
 
+		uint32_t PRECOMPILATION_DEPTH = 4;
+
         // Get new kernel
-        cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height);
+        cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH);
 
         if (program != ctx->ProgramCryptonightR) {
             cl_int ret;
@@ -969,12 +977,10 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
             }
             ctx->ProgramCryptonightR = program;
 
-			uint32_t PRECOMPILATION_DEPTH = 4;
-
             // Precompile next program in background
-            xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, true, old_kernel);
+            xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, PRECOMPILATION_DEPTH, true, old_kernel);
             for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i)
-                xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, true, nullptr);
+                xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true, nullptr);
 
             printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
         }
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index b78f2bcf7d0749fabb0004cb35a33d3aeb4b6c2a..12478aefba767629706408ef4ecdc9558bfa0ec3 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -32,69 +32,6 @@ R"===(
 #define cryptonight_conceal 14
 #define cryptonight_v8_reversewaltz 17
 
-/* For Mesa clover support */
-#ifdef cl_clang_storage_class_specifiers
-#   pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
-#endif
-
-#ifdef cl_amd_media_ops
-#pragma OPENCL EXTENSION cl_amd_media_ops : enable
-#else
-/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
- * Build-in Function
- *     uintn  amd_bitalign (uintn src0, uintn src1, uintn src2)
- *   Description
- *     dst.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
- *     similar operation applied to other components of the vectors.
- *
- * The implemented function is modified because the last is in our case always a scalar.
- * We can ignore the bitwise AND operation.
- */
-inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
-{
-	uint2 result;
-	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
-	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
-	return result;
-}
-#endif
-
-#ifdef cl_amd_media_ops2
-#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
-#else
-/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
- *     Built-in Function:
- *     uintn amd_bfe (uintn src0, uintn src1, uintn src2)
- *   Description
- *     NOTE: operator >> below represent logical right shift
- *     offset = src1.s0 & 31;
- *     width = src2.s0 & 31;
- *     if width = 0
- *         dst.s0 = 0;
- *     else if (offset + width) < 32
- *         dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
- *     else
- *         dst.s0 = src0.s0 >> offset;
- *     similar operation applied to other components of the vectors
- */
-inline int amd_bfe(const uint src0, const uint offset, const uint width)
-{
-	/* casts are removed because we can implement everything as uint
-	 * int offset = src1;
-	 * int width = src2;
-	 * remove check for edge case, this function is always called with
-	 * `width==8`
-	 * @code
-	 *   if ( width == 0 )
-	 *      return 0;
-	 * @endcode
-	 */
-	if ( (offset + width) < 32u )
-		return (src0 << (32u - offset - width)) >> (32u - width);
-
-	return src0 >> offset;
-}
-#endif
 
 static const __constant ulong keccakf_rndc[24] =
 {
@@ -128,6 +65,8 @@ static const __constant uchar sbox[256] =
 	0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
 };
 
+//#include "opencl/wolf-aes.cl"
+XMRSTAK_INCLUDE_WOLF_AES
 
 void keccakf1600(ulong *s)
 {
@@ -355,8 +294,6 @@ inline uint getIdx()
 XMRSTAK_INCLUDE_FAST_INT_MATH_V2
 //#include "fast_div_heavy.cl"
 XMRSTAK_INCLUDE_FAST_DIV_HEAVY
-//#include "opencl/wolf-aes.cl"
-XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
 XMRSTAK_INCLUDE_WOLF_SKEIN
 //#include "opencl/jh.cl"
@@ -461,8 +398,6 @@ void CNKeccak(ulong *output, ulong *input)
 
 static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };
 
-#define BYTE(x, y)	(amd_bfe((x), (y) << 3U, 8U))
-
 #define SubWord(inw)		((sbox[BYTE(inw, 3)] << 24) | (sbox[BYTE(inw, 2)] << 16) | (sbox[BYTE(inw, 1)] << 8) | sbox[BYTE(inw, 0)])
 
 void AESExpandKey256(uint *keybuf)
@@ -539,6 +474,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 			State[8]  = input[8];
 			State[9]  = input[9];
 			State[10] = input[10];
+			State[11] = input[11];
+			State[12] = input[12];
+			State[13] = input[13];
+			State[14] = input[14];
+			State[15] = input[15];
 
 			((__local uint *)State)[9]  &= 0x00FFFFFFU;
 			((__local uint *)State)[9]  |= (((uint)get_global_id(0)) & 0xFF) << 24;
@@ -550,13 +490,13 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 			 */
 			((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
 
-			for (int i = 11; i < 25; ++i) {
-			    State[i] = 0x00UL;
-			}
-
 			// Last bit of padding
 			State[16] = 0x8000000000000000UL;
 
+			for (int i = 17; i < 25; ++i) {
+			    State[i] = 0x00UL;
+			}
+
 			keccakf1600_2(State);
 
 			#pragma unroll
@@ -1361,7 +1301,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 		states += 25 * BranchBuf[idx];
 
 		ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL };
-#if defined(__clang__) && !defined(__NV_CL_C_VERSION)
+#if defined(__clang__) && !defined(__NV_CL_C_VERSION) && (IS_WINDOWS_OS != 1)
 		// on ROCM we need volatile for AMD RX5xx cards to avoid invalid shares
 		volatile
 #endif
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
index b99b62d5cf30e94b88dea660f0062b9a7fdafe59..f1457c0dc8dbd33d506b5508789d94d9924192c1 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl
@@ -2,6 +2,70 @@ R"===(
 #ifndef WOLF_AES_CL
 #define WOLF_AES_CL
 
+/* For Mesa clover support */
+#ifdef cl_clang_storage_class_specifiers
+#   pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+#endif
+
+#ifdef cl_amd_media_ops
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#else
+/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
+ * Build-in Function
+ *     uintn  amd_bitalign (uintn src0, uintn src1, uintn src2)
+ *   Description
+ *     dst.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
+ *     similar operation applied to other components of the vectors.
+ *
+ * The implemented function is modified because the last argument (src2) is in our case always a scalar.
+ * We can ignore the bitwise AND operation.
+ */
+inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
+{
+	uint2 result;
+	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
+	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
+	return result;
+}
+#endif
+
+#ifdef cl_amd_media_ops2
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+#else
+/* taken from: https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops2.txt
+ *     Built-in Function:
+ *     uintn amd_bfe (uintn src0, uintn src1, uintn src2)
+ *   Description
+ *     NOTE: operator >> below represent logical right shift
+ *     offset = src1.s0 & 31;
+ *     width = src2.s0 & 31;
+ *     if width = 0
+ *         dst.s0 = 0;
+ *     else if (offset + width) < 32
+ *         dst.s0 = (src0.s0 << (32 - offset - width)) >> (32 - width);
+ *     else
+ *         dst.s0 = src0.s0 >> offset;
+ *     similar operation applied to other components of the vectors
+ */
+inline int amd_bfe(const uint src0, const uint offset, const uint width)
+{
+	/* casts are removed because we can implement everything as uint
+	 * int offset = src1;
+	 * int width = src2;
+	 * remove check for edge case, this function is always called with
+	 * `width==8`
+	 * @code
+	 *   if ( width == 0 )
+	 *      return 0;
+	 * @endcode
+	 */
+	if ( (offset + width) < 32u )
+		return (src0 << (32u - offset - width)) >> (32u - width);
+
+	return src0 >> offset;
+}
+#endif
+
 // AES table - the other three are generated on the fly
 
 static const __constant uint AES0_C[256] =
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index ea688e0534239547c7da41517201e06695851724..120fb6898bda32eaefa116ab4bae74ba2bb3c2b2 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -187,8 +187,8 @@ private:
 				memPerThread = std::min(memPerThread, memDoubleThread);
 			}
 
-			// 224byte extra memory is used per thread for meta data
-			size_t perThread = hashMemSize + 224u;
+			// 240 bytes of extra memory are used per thread for metadata
+			size_t perThread = hashMemSize + 240u;
 			size_t maxIntensity = memPerThread / perThread;
 			size_t possibleIntensity = std::min( maxThreads , maxIntensity );
 			// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 09e030e663c4524397c12840df96d1f178277afe..3be593175f6ee72539b06bf86077c23d22fc0379 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -273,7 +273,7 @@ void minethd::work_main()
 
 			for(size_t i = 0; i < results[0xFF]; i++)
 			{
-				uint8_t	bWorkBlob[112];
+				uint8_t	bWorkBlob[128];
 				uint8_t	bResult[32];
 
 				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp
index 5959b75cc7aa753a4028115ec41539fe4b328b69..c85682d4f3f0e1b5e2abcca7616bc1d0e0829b19 100644
--- a/xmrstak/backend/cpu/cpuType.cpp
+++ b/xmrstak/backend/cpu/cpuType.cpp
@@ -37,9 +37,9 @@ namespace cpu
 	{
 		int32_t mask = 1 << bit;
 		return (val & mask) != 0u;
-		
+
 	}
-	
+
 	Model getModel()
 	{
 		int32_t cpu_info[4];
@@ -53,7 +53,7 @@ namespace cpu
 		Model result;
 
 		cpuid(1, 0, cpu_info);
-		
+
 		result.family = get_masked(cpu_info[0], 12, 8);
 		result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
 		result.type_name = cpustr;
@@ -63,8 +63,8 @@ namespace cpu
 		result.sse2 = has_feature(cpu_info[3], 26);
 		// aes-ni
 		result.aes = has_feature(cpu_info[2], 25);
-		// avx
-		result.avx = has_feature(cpu_info[2], 28);	
+		// avx - bit 28 reports CPU support for AVX; bit 27 (OSXSAVE) reports that the OS has enabled XSAVE/XGETBV
+		result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27) ;
 
 		if(strcmp(cpustr, "AuthenticAMD") == 0)
 		{
diff --git a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
index a289ac559331c0b3948d1a07846797151a2b4a7a..2fc1a8baafaad969619aa525285ba7f45528b240 100644
--- a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
+++ b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
@@ -74,7 +74,7 @@ static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int
     }
 }
 
-void v4_compile_code(cryptonight_ctx* ctx, int code_size)
+void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size)
 {
 	printer::inst()->print_msg(LDEBUG, "CryptonightR update ASM code");
 	const int allocation_size = 65536;
@@ -89,12 +89,24 @@ void v4_compile_code(cryptonight_ctx* ctx, int code_size)
 	if(ctx->fun_data != nullptr)
 	{
 
-		add_code(p, CryptonightR_template_part1, CryptonightR_template_part2);
-		add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
-		add_code(p, CryptonightR_template_part2, CryptonightR_template_part3);
-		*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
-		add_code(p, CryptonightR_template_part3, CryptonightR_template_end);
-
+		if(N == 2)
+		{
+		    add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
+			add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
+			add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
+			add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
+			add_code(p, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
+			*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0));
+			add_code(p, CryptonightR_template_double_part4, CryptonightR_template_double_end);
+		}
+		else
+		{
+			add_code(p, CryptonightR_template_part1, CryptonightR_template_part2);
+			add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
+			add_code(p, CryptonightR_template_part2, CryptonightR_template_part3);
+			*(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
+			add_code(p, CryptonightR_template_part3, CryptonightR_template_end);
+		}
 
 		ctx->loop_fn = reinterpret_cast<cn_mainloop_fun>(ctx->fun_data);
 		protectExecutableMemory(ctx->fun_data, allocation_size);
diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h
index bd0c4967e0cd1271f1299c2fc2222325cd9ac15d..488805ec05516c07118f3bc4d652d0ca2eac66f1 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight.h
@@ -16,7 +16,7 @@ typedef void  (*cn_mainloop_fun)(cryptonight_ctx *ctx);
 typedef void  (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*);
 typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
-void v4_compile_code(cryptonight_ctx* ctx, int code_size);
+void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size);
 
 struct extra_ctx_r
 {
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 22fd0f481185bdd853a9f3d6fa7eb62b83325175..d7316b25e4ac2a783b590a7598f0b3aa16afcc49 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -744,7 +744,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 	__m128i bx1; \
 	__m128i division_result_xmm; \
 	__m128 conc_var; \
-	if(ALGO == cryptonight_conceal || ALGO == cryptonight_gpu) \
+	if(ALGO == cryptonight_conceal) \
 	{\
 		set_float_rounding_mode_nearest(); \
 		conc_var = _mm_setzero_ps(); \
@@ -1143,7 +1143,15 @@ struct Cryptonight_hash_asm<2, 0>
 			cn_explode_scratchpad<false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo);
 		}
 
-		reinterpret_cast<cn_double_mainloop_fun>(ctx[0]->loop_fn)(ctx[0], ctx[1]);
+		if(ALGO == cryptonight_r)
+		{
+			typedef void ABI_ATTRIBUTE (*cn_r_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*);
+			reinterpret_cast<cn_r_double_mainloop_fun>(ctx[0]->loop_fn)(ctx[0], ctx[1]);
+		}
+		else
+		{
+			reinterpret_cast<cn_double_mainloop_fun>(ctx[0]->loop_fn)(ctx[0], ctx[1]);
+		}
 
 		for(size_t i = 0; i < N; ++i)
 		{
@@ -1298,6 +1306,7 @@ struct Cryptonight_hash_gpu
 	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
+		set_float_rounding_mode_nearest();
 		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
 		cn_explode_scratchpad_gpu<PREFETCH, ALGO>(ctx[0]->hash_state, ctx[0]->long_state, algo);
 
@@ -1318,7 +1327,10 @@ struct Cryptonight_R_generator
 	template<xmrstak_algo_id ALGO>
 	static void cn_on_new_job(const xmrstak::miner_work& work, cryptonight_ctx** ctx)
 	{
-		if(ctx[0]->cn_r_ctx.height == work.iBlockHeight && ctx[0]->last_algo == POW(cryptonight_r))
+		if(ctx[0]->cn_r_ctx.height == work.iBlockHeight &&
+			ctx[0]->last_algo == POW(cryptonight_r) &&
+			reinterpret_cast<void*>(ctx[0]->hash_fn) == ctx[0]->fun_data
+		)
 			return;
 
 		ctx[0]->last_algo = POW(cryptonight_r);
@@ -1327,8 +1339,11 @@ struct Cryptonight_R_generator
 		int code_size = v4_random_math_init<ALGO>(ctx[0]->cn_r_ctx.code, work.iBlockHeight);
 		if(ctx[0]->asm_version != 0)
 		{
-			v4_compile_code(ctx[0], code_size);
-			ctx[0]->hash_fn = Cryptonight_hash_asm<N, 1u>::template hash<cryptonight_r>;
+			v4_compile_code(N, ctx[0], code_size);
+			if(N == 2)
+				ctx[0]->hash_fn = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_r>;
+			else
+				ctx[0]->hash_fn = Cryptonight_hash_asm<N, 1u>::template hash<cryptonight_r>;
 		}
 
 		for(size_t i=1; i < N; i++)
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 2b8b0e18d7d8df38dd808ac20e3e7145bb54fb35..e90b59500b09305c149c37ef790e56410672d7b7 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -473,11 +473,21 @@ bool minethd::self_test()
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0;
+			bResult = bResult && memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult &= memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0;
+			bResult = bResult && memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0;
+		}
+		else if(algo == POW(cryptonight_v8_double))
+		{
+			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
+			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
+			bResult = bResult && memcmp(out, "\x63\x43\x8e\xd\x5c\x18\xff\xca\xd5\xb5\xdf\xe0\x26\x8a\x5b\x3f\xe9\xbc\x1\xef\xe6\x3a\xd3\x4f\x2c\x57\x1c\xda\xb2\xc\x32\x31", 32) == 0;
+
+			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
+			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
+			bResult = bResult && memcmp(out, "\x63\x43\x8e\xd\x5c\x18\xff\xca\xd5\xb5\xdf\xe0\x26\x8a\x5b\x3f\xe9\xbc\x1\xef\xe6\x3a\xd3\x4f\x2c\x57\x1c\xda\xb2\xc\x32\x31", 32) == 0;
 		}
 		else
 			printer::inst()->print_msg(L0,
@@ -738,9 +748,17 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 		std::string selected_asm = asm_version_str;
 		if(selected_asm == "auto")
 				selected_asm = cpu::getAsmName(N);
-		printer::inst()->print_msg(L0, "enable cryptonight_r asm '%s' cpu's", selected_asm.c_str());
-		for(int h = 0; h < N; ++h)
-			ctx[h]->asm_version = selected_asm == "intel_avx" ? 1 : 2; // 1 == Intel; 2 == AMD
+		if(selected_asm == "off")
+		{
+			for(int h = 0; h < N; ++h)
+				ctx[h]->asm_version = 0;
+		}
+		else
+		{
+			printer::inst()->print_msg(L0, "enable cryptonight_r asm '%s' cpu's", selected_asm.c_str());
+			for(int h = 0; h < N; ++h)
+				ctx[h]->asm_version = selected_asm == "intel_avx" ? 1 : 2; // 1 == Intel; 2 == AMD
+		}
 	}
 
 	for(int h = 1; h < N; ++h)
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 4f5d88dea3ccb81dc121ab7a5edc8be740be6ccc..e58665922bb6b7da8ea987153dee3b3297fb4654 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -30,7 +30,8 @@ enum xmrstak_algo_id
 
 	cryptonight_turtle = start_derived_algo_id,
 	cryptonight_v8_half = (start_derived_algo_id + 1),
-	cryptonight_v8_zelerius = (start_derived_algo_id + 2)
+	cryptonight_v8_zelerius = (start_derived_algo_id + 2),
+	cryptonight_v8_double = (start_derived_algo_id + 3)
 	// please add the algorithm name to get_algo_name()
 };
 
@@ -62,11 +63,12 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id)
 		"cryptonight_v8_reversewaltz" // used by graft
 	}};
 
-	static std::array<std::string, 3> derived_algo_names =
+	static std::array<std::string, 4> derived_algo_names =
 	{{
 		"cryptonight_turtle",
 		"cryptonight_v8_half", // used by masari and stellite
-		"cryptonight_v8_zelerius"
+		"cryptonight_v8_zelerius",
+		"cryptonight_v8_double"
 	}};
 
 
@@ -181,6 +183,8 @@ constexpr uint32_t CN_ZELERIUS_ITER = 0x60000;
 
 constexpr uint32_t CN_WALTZ_ITER = 0x60000;
 
+constexpr uint32_t CN_DOUBLE_ITER = 0x100000;
+
 inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 {
 	static std::array<xmrstak_algo, 18> pow = {{
@@ -204,11 +208,12 @@ inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 		{cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}
 	}};
 
-	static std::array<xmrstak_algo, 3> derived_pow =
+	static std::array<xmrstak_algo, 4> derived_pow =
 	{{
 		{cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK},
 		{cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY},
-		{cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY}
+		{cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY},
+		{cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY}
 		// {cryptonight_derived}
 	}};
 
diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp
index c8174df328945e6241321e0f38cd4dcb5365e593..d0e5237f27b3a632e3a9d501492325d02714d0c6 100644
--- a/xmrstak/backend/miner_work.hpp
+++ b/xmrstak/backend/miner_work.hpp
@@ -15,7 +15,7 @@ namespace xmrstak
 	struct miner_work
 	{
 		char        sJobID[64];
-		uint8_t     bWorkBlob[112];
+		uint8_t     bWorkBlob[128];
 		uint32_t    iWorkSize;
 		uint64_t    iTarget;
 		bool        bNiceHash;
@@ -28,7 +28,7 @@ namespace xmrstak
 
 		miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
 			uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize),
-			iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight) 
+			iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight)
 		{
 			assert(iWorkSize <= sizeof(bWorkBlob));
 			memcpy(this->bWorkBlob, bWork, iWorkSize);
@@ -36,7 +36,7 @@ namespace xmrstak
 		}
 
 		miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget),
-			bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight) 
+			bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight)
 		{
 			assert(iWorkSize <= sizeof(bWorkBlob));
 			memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
index 87eb05540b83bde74ab295f8810b92194a03d131..f1bf7581995c95d45a030a690f1b8a82b3a924f6 100644
--- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
+++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
@@ -153,6 +153,7 @@ static void CryptonightR_build_program(
     std::string& lowered_name,
     const xmrstak_algo& algo,
     uint64_t height,
+    uint32_t precompile_count,
     int arch_major,
     int arch_minor,
     std::string source)
@@ -164,7 +165,7 @@ static void CryptonightR_build_program(
         for (size_t i = 0; i < CryptonightR_cache.size();)
         {
             const CacheEntry& entry = CryptonightR_cache[i];
-            if ((entry.algo == algo) && (entry.height + 2 < height))
+            if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
             {
                 printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
                 CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
@@ -273,10 +274,10 @@ static void CryptonightR_build_program(
 	CryptonightR_cache_mutex.UnLock();
 }
 
-void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, bool background)
+void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background)
 {
     if (background) {
-        background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, arch_major, arch_minor, false); });
+        background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); });
         return;
     }
 
@@ -329,7 +330,7 @@ void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name,
 		CryptonightR_cache_mutex.UnLock();
     }
 
-    CryptonightR_build_program(ptx, lowered_name, algo, height, arch_major, arch_minor, source_code);
+    CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code);
 }
 
 } // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
index e214647b97427c049aed29c3e4c6e6f66f0cdf66..c3d8827b064276157f1b596878381f206d68c463 100644
--- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
+++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
@@ -29,7 +29,7 @@ namespace nvidia
 {
 
 void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name,
-	const xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, bool background = false);
+	const xmrstak_algo algo, uint64_t height,  uint32_t precompile_count, int arch_major, int arch_minor, bool background = false);
 
 
 } // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index a50dd30cc880ab49d3a13b5c8325b0f45c9b8b07..80615d7a34262f2809302b15c29327e00f629b1b 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -285,7 +285,7 @@ void minethd::work_main()
 			for(size_t i = 0; i < foundCount; i++)
 			{
 
-				uint8_t	bWorkBlob[112];
+				uint8_t	bWorkBlob[128];
 				uint8_t	bResult[32];
 
 				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 3c62bd0902ced3917e86fe688b431d222e07e9ab..718cff0c765eeb619d23b5fc7024e0a71d6222c9 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -1033,9 +1033,11 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 			 if(ctx->module)
 				cuModuleUnload(ctx->module);
 
+			uint32_t PRECOMPILATION_DEPTH = 4;
+
 			std::vector<char> ptx;
 			std::string lowered_name;
-			xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height, ctx->device_arch[0], ctx->device_arch[1]);
+			xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height, PRECOMPILATION_DEPTH, ctx->device_arch[0], ctx->device_arch[1]);
 
 			CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->module, ptx.data(), 0, 0, 0));
 			CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kernel, ctx->module, lowered_name.c_str()));
@@ -1043,7 +1045,9 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 			ctx->kernel_height = chain_height;
 			ctx->cached_algo = miner_algo;
 
-			xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height + 1, ctx->device_arch[0], ctx->device_arch[1], true);
+			for (uint32_t i = 1; i <= PRECOMPILATION_DEPTH; ++i) // unsigned index matches the uint32_t bound (no signed/unsigned compare); requests background builds for the following heights
+				xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo,
+					chain_height + i, PRECOMPILATION_DEPTH, ctx->device_arch[0], ctx->device_arch[1], true);
 		}
 	}
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index e909e2fa34db023ee3eb96e5e01bfb277fc98dce..b6e41c61960aac53f04e5788921da7b1573fcc09 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -112,7 +112,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	uint32_t ctx_b[4];
 	uint32_t ctx_key1[40];
 	uint32_t ctx_key2[40];
-	uint32_t input[21];
+	uint32_t input[32];
 
 	memcpy( input, d_input, len );
 	//*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
@@ -349,7 +349,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size));
 	// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
 	CUDA_CHECK_MSG(
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
index c75c74964f233c89a54532117824aae52003859f..3f535631db0ceaceb30f6342d6a3ffa8dbf87425 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
@@ -103,54 +103,7 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s)
 {
 	uint64_t bc[5], tmpxor[5], tmp1, tmp2;
 
-	tmpxor[0] = s[0] ^ s[5];
-	tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL;
-	tmpxor[2] = s[2] ^ s[7];
-	tmpxor[3] = s[3] ^ s[8];
-	tmpxor[4] = s[4] ^ s[9];
-
-	bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
-	bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
-	bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
-	bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
-	bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);
-
-	tmp1 = s[1] ^ bc[0];
-
-	s[0] ^= bc[4];
-	s[1] = rotl64_2(s[6] ^ bc[0], 12);
-	s[6] = rotl64_1(s[9] ^ bc[3], 20);
-	s[9] = rotl64_2(bc[1], 29);
-	s[22] = rotl64_2(bc[3], 7);
-	s[14] = rotl64_1(bc[4], 18);
-	s[20] = rotl64_2(s[2] ^ bc[1], 30);
-	s[2] = rotl64_2(bc[1], 11);
-	s[12] = rotl64_1(bc[2], 25);
-	s[13] = rotl64_1(bc[3], 8);
-	s[19] = rotl64_2(bc[2], 24);
-	s[23] = rotl64_2(bc[4], 9);
-	s[15] = rotl64_1(s[4] ^ bc[3], 27);
-	s[4] = rotl64_1(bc[3], 14);
-	s[24] = rotl64_1(bc[0], 2);
-	s[21] = rotl64_2(s[8] ^ bc[2], 23);
-	s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13);
-	s[16] = rotl64_2(s[5] ^ bc[4], 4);
-	s[5] = rotl64_1(s[3] ^ bc[2], 28);
-	s[3] = rotl64_1(bc[2], 21);
-	s[18] = rotl64_1(bc[1], 15);
-	s[17] = rotl64_1(bc[0], 10);
-	s[11] = rotl64_1(s[7] ^ bc[1], 6);
-	s[7] = rotl64_1(bc[4], 3);
-	s[10] = rotl64_1(tmp1, 1);
-
-	tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
-	tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
-	tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
-	tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
-	tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
-	s[0] ^= 0x0000000000000001;
-
-	for(int i = 1; i < 24; ++i)
+	for(int i = 0; i < 24; ++i)
 	{
 		tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index 418726208d8b273fc9fed85ad4ed6506131ed3c7..d6822cd63c544e07975d22ecd28474a6e2be904b 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -850,8 +850,8 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 
 	printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version);
 
-	uint8_t work[112];
-	memset(work,0,112);
+	uint8_t work[128];
+	memset(work,0,128);
 	work[0] = static_cast<uint8_t>(block_version);
 
 	xmrstak::pool_data dat;
@@ -862,15 +862,14 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 	printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec);
 	std::this_thread::sleep_for(std::chrono::seconds(wait_sec));
 
-	/* AMD and NVIDIA is currently only supporting work sizes up to 84byte
-	 * \todo fix this issue
+	/* AMD and NVIDIA currently only support work sizes up to 128 bytes
 	 */
 	printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec);
-	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat);
+	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);
 	uint64_t iStartStamp = get_timestamp_ms();
 
 	std::this_thread::sleep_for(std::chrono::seconds(work_sec));
-	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat);
+	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);
 
 	double fTotalHps = 0.0;
 	for (uint32_t i = 0; i < pvThreads->size(); i++)
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 2b22a2fb97171a0ceb1df4627721590563108c6b..5e3384a63484713228c9096eb0580829678b0fd7 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -98,11 +98,12 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite",        {POW(cryptonight_lite)},      {POW(cryptonight_aeon)},      nullptr },
 	{ "cryptonight_lite_v7",     {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)},      nullptr },
 	{ "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)},      {POW(cryptonight_aeon)},      nullptr },
-	{ "cryptonight_r",           {POW(cryptonight_r)},         {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_r",           {POW(cryptonight_r)},         {POW(cryptonight_r)}, nullptr },
 	{ "cryptonight_superfast",   {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr },
 	{ "cryptonight_turtle",      {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
 	{ "cryptonight_v7",          {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8",          {POW(cryptonight_monero_v8)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_v8",          {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr },
+	{ "cryptonight_v8_double",   {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr },
 	{ "cryptonight_v8_half",     {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
 	{ "cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr },
 	{ "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_gpu)}, nullptr },
@@ -114,13 +115,14 @@ xmrstak::coin_selection coins[] = {
 	{ "haven",                   {POW(cryptonight_haven)},     {POW(cryptonight_gpu)}, nullptr },
 	{ "lethean",                 {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
 	{ "masari",                  {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "monero",                  {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, "pool.usxmrpool.com:3333" },
+	{ "monero",                  {POW(cryptonight_r)},         {POW(cryptonight_r)}, "pool.usxmrpool.com:3333" },
 	{ "qrl",             	     {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
 	{ "ryo",                     {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" },
 	{ "stellite",                {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
 	{ "turtlecoin",              {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr },
 	{ "plenteum",			     {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
-	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_gpu)}, nullptr }
+	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_gpu)}, nullptr },
+	{ "xcash",                   {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr }
 };
 
 constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp
index 813fc7d06ea6dfbf42fa99be0cca85a6a52aa2d5..33980bf425842fbe4d88da864f84c17396390a93 100644
--- a/xmrstak/net/msgstruct.hpp
+++ b/xmrstak/net/msgstruct.hpp
@@ -12,7 +12,7 @@
 struct pool_job
 {
 	char		sJobID[64];
-	uint8_t		bWorkBlob[112];
+	uint8_t		bWorkBlob[128];
 	uint64_t	iTarget;
 	uint32_t	iWorkLen;
 	uint32_t	iSavedNonce;
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index b1fd0e70b79fa3584893aab895fa9a38bc54490a..ea3a276aafe0a35aa678f6c40d19007e8a5a7efc 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -32,6 +32,7 @@ POOLCONF],
  *    ryo
  *    turtlecoin
  *    plenteum
+ *    xcash
  *
  * Native algorithms which do not depend on any block versions:
  *
@@ -47,6 +48,7 @@ POOLCONF],
  *    cryptonight_superfast
  *    cryptonight_v7
  *    cryptonight_v8
+ *    cryptonight_v8_double (used by xcash)
  *    cryptonight_v8_half (used by masari and stellite)
  *    cryptonight_v8_reversewaltz (used by graft)
  *    cryptonight_v8_zelerius
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 334a246bde3301d8b494431fd13c31bcd3442a01..412fc221335944e0a11c2c1414ca0b6642d7eae4 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.10.0"
+#define XMR_STAK_VERSION "2.10.1"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"