diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 30f97ac5102643ae2e335894c2964783725d1a1d..6e1c70b05002c440e7f7c17e57390d24215376db 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -969,7 +969,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	input[input_len] = 0x01;
 	memset(input + input_len + 1, 0, 88 - input_len - 1);
 
-	size_t numThreads = ctx->rawIntensity;
+	cl_uint numThreads = ctx->rawIntensity;
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
 	{
@@ -998,7 +998,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1021,7 +1021,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1081,7 +1081,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1160,7 +1160,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 
 	clFinish(ctx->CommandQueues);
 
-	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 };
+	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
@@ -1212,7 +1212,8 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 		if(BranchNonces[i])
 		{
 			// Threads
-			if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
+			cl_uint numThreads = BranchNonces[i];
+			if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
 				return(ERR_OCL_API);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index be5b21107e9c239423271b1dd9c8e23f9afbcaeb..81c0d5ff9e118f40ddd67c91530153dfb93f2195 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -16,7 +16,7 @@ R"===(
 
 /* For Mesa clover support */
 #ifdef cl_clang_storage_class_specifiers
-#	pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+#   pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
 #endif
 
 #ifdef cl_amd_media_ops
@@ -26,7 +26,7 @@ R"===(
  * Build-in Function
  *     uintn  amd_bitalign (uintn src0, uintn src1, uintn src2)
  *   Description
- *     dst.s0 =  (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31))
+ *     dst.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2.s0 & 31))
  *     similar operation applied to other components of the vectors.
  *
  * The implemented function is modified because the last is in our case always a scalar.
@@ -34,10 +34,10 @@ R"===(
  */
 inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
 {
-	uint2 result;
+    uint2 result;
 	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
 	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
-	return result;
+    return result;
 }
 #endif
 
@@ -61,20 +61,20 @@ inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
  */
 inline int amd_bfe(const uint src0, const uint offset, const uint width)
 {
-	/* casts are removed because we can implement everything as uint
-	 * int offset = src1;
-	 * int width = src2;
-	 * remove check for edge case, this function is always called with
-	 * `width==8`
-	 * @code
-	 *   if ( width == 0 )
-	 *      return 0;
-	 * @endcode
-	 */
-	if ( (offset + width) < 32u )
-		return (src0 << (32u - offset - width)) >> (32u - width);
-
-	return src0 >> offset;
+    /* casts are removed because we can implement everything as uint
+     * int offset = src1;
+     * int width = src2;
+     * remove check for edge case, this function is always called with
+     * `width==8`
+     * @code
+     *   if ( width == 0 )
+     *      return 0;
+     * @endcode
+     */
+    if ( (offset + width) < 32u )
+        return (src0 << (32u - offset - width)) >> (32u - width);
+
+    return src0 >> offset;
 }
 #endif
 
@@ -107,22 +107,22 @@ static const __constant ulong keccakf_rndc[24] =
 
 static const __constant uchar sbox[256] =
 {
-	0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
-	0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
-	0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
-	0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
-	0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
-	0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
-	0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
-	0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
-	0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
-	0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
-	0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
-	0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
-	0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
-	0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
-	0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
-	0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
+    0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+    0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+    0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+    0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+    0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+    0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+    0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+    0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+    0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+    0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+    0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+    0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+    0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+    0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+    0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+    0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
 };
 
 
@@ -130,7 +130,7 @@ void keccakf1600(ulong *s)
 {
     for(int i = 0; i < 24; ++i)
     {
-		ulong bc[5], tmp1, tmp2;
+        ulong bc[5], tmp1, tmp2;
         bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL);
         bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL);
         bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL);
@@ -191,7 +191,7 @@ void keccakf1600_1(ulong *st)
     int i, round;
     ulong t, bc[5];
 
-	#pragma unroll 1
+    #pragma unroll 1
     for(round = 0; round < 24; ++round)
     {
 
@@ -202,7 +202,7 @@ void keccakf1600_1(ulong *st)
         bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
         bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
 
-		#pragma unroll 1
+        #pragma unroll 1
         for (i = 0; i < 5; ++i) {
             t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL);
             st[i     ] ^= t;
@@ -221,23 +221,17 @@ void keccakf1600_1(ulong *st)
             t = bc[0];
         }
 
-        //ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1);
-        //tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1);
-        //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1);
-        //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1);
-        //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1);
-
         #pragma unroll 1
         for(int i = 0; i < 25; i += 5)
         {
-			ulong tmp[5];
+            ulong tmp[5];
 
-			#pragma unroll 1
-			for(int x = 0; x < 5; ++x)
-				tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]);
+            #pragma unroll 1
+            for(int x = 0; x < 5; ++x)
+                tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]);
 
-			#pragma unroll 1
-			for(int x = 0; x < 5; ++x) st[i + x] = tmp[x];
+            #pragma unroll 1
+            for(int x = 0; x < 5; ++x) st[i + x] = tmp[x];
         }
 
         //  Iota
@@ -246,69 +240,50 @@ void keccakf1600_1(ulong *st)
 }
 )==="
 R"===(
-void keccakf1600_2(ulong *st)
+
+void keccakf1600_2(__local ulong *st)
 {
     int i, round;
     ulong t, bc[5];
 
-	#pragma unroll 1
-    for(round = 0; round < 24; ++round)
+    #pragma unroll 1
+    for (round = 0; round < 24; ++round)
     {
+        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
+        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
+        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
+        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
+        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
 
-        // Theta
-        //bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
-        //bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
-        //bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
-        //bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
-        //bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-
-		/*
-		#pragma unroll
-        for (i = 0; i < 5; ++i) {
-            t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL);
-            st[i     ] ^= t;
-            st[i +  5] ^= t;
-            st[i + 10] ^= t;
-            st[i + 15] ^= t;
-            st[i + 20] ^= t;
-        }
-		*/
-
-		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
-		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
-		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
-		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
-		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
-
-		st[0] ^= bc[4];
-		st[5] ^= bc[4];
-		st[10] ^= bc[4];
-		st[15] ^= bc[4];
-		st[20] ^= bc[4];
-
-		st[1] ^= bc[0];
-		st[6] ^= bc[0];
-		st[11] ^= bc[0];
-		st[16] ^= bc[0];
-		st[21] ^= bc[0];
-
-		st[2] ^= bc[1];
-		st[7] ^= bc[1];
-		st[12] ^= bc[1];
-		st[17] ^= bc[1];
-		st[22] ^= bc[1];
-
-		st[3] ^= bc[2];
-		st[8] ^= bc[2];
-		st[13] ^= bc[2];
-		st[18] ^= bc[2];
-		st[23] ^= bc[2];
-
-		st[4] ^= bc[3];
-		st[9] ^= bc[3];
-		st[14] ^= bc[3];
-		st[19] ^= bc[3];
-		st[24] ^= bc[3];
+        st[0] ^= bc[4];
+        st[5] ^= bc[4];
+        st[10] ^= bc[4];
+        st[15] ^= bc[4];
+        st[20] ^= bc[4];
+
+        st[1] ^= bc[0];
+        st[6] ^= bc[0];
+        st[11] ^= bc[0];
+        st[16] ^= bc[0];
+        st[21] ^= bc[0];
+
+        st[2] ^= bc[1];
+        st[7] ^= bc[1];
+        st[12] ^= bc[1];
+        st[17] ^= bc[1];
+        st[22] ^= bc[1];
+
+        st[3] ^= bc[2];
+        st[8] ^= bc[2];
+        st[13] ^= bc[2];
+        st[18] ^= bc[2];
+        st[23] ^= bc[2];
+
+        st[4] ^= bc[3];
+        st[9] ^= bc[3];
+        st[14] ^= bc[3];
+        st[19] ^= bc[3];
+        st[24] ^= bc[3];
 
         // Rho Pi
         t = st[1];
@@ -319,54 +294,16 @@ void keccakf1600_2(ulong *st)
             t = bc[0];
         }
 
-
-
-		/*ulong tmp1 = st[1] ^ bc[0];
-
-        st[0] ^= bc[4];
-        st[1] = rotate(st[6] ^ bc[0], 44UL);
-        st[6] = rotate(st[9] ^ bc[3], 20UL);
-        st[9] = rotate(st[22] ^ bc[1], 61UL);
-        st[22] = rotate(st[14] ^ bc[3], 39UL);
-        st[14] = rotate(st[20] ^ bc[4], 18UL);
-        st[20] = rotate(st[2] ^ bc[1], 62UL);
-        st[2] = rotate(st[12] ^ bc[1], 43UL);
-        st[12] = rotate(st[13] ^ bc[2], 25UL);
-        st[13] = rotate(st[19] ^ bc[3], 8UL);
-        st[19] = rotate(st[23] ^ bc[2], 56UL);
-        st[23] = rotate(st[15] ^ bc[4], 41UL);
-        st[15] = rotate(st[4] ^ bc[3], 27UL);
-        st[4] = rotate(st[24] ^ bc[3], 14UL);
-        st[24] = rotate(st[21] ^ bc[0], 2UL);
-        st[21] = rotate(st[8] ^ bc[2], 55UL);
-        st[8] = rotate(st[16] ^ bc[0], 35UL);
-        st[16] = rotate(st[5] ^ bc[4], 36UL);
-        st[5] = rotate(st[3] ^ bc[2], 28UL);
-        st[3] = rotate(st[18] ^ bc[2], 21UL);
-        st[18] = rotate(st[17] ^ bc[1], 15UL);
-        st[17] = rotate(st[11] ^ bc[0], 10UL);
-        st[11] = rotate(st[7] ^ bc[1], 6UL);
-        st[7] = rotate(st[10] ^ bc[4], 3UL);
-        st[10] = rotate(tmp1, 1UL);
-		*/
-
-
-        //ulong tmp1 = st[0]; ulong tmp2 = st[1]; st[0] = bitselect(st[0] ^ st[2], st[0], st[1]); st[1] = bitselect(st[1] ^ st[3], st[1], st[2]); st[2] = bitselect(st[2] ^ st[4], st[2], st[3]); st[3] = bitselect(st[3] ^ tmp1, st[3], st[4]); st[4] = bitselect(st[4] ^ tmp2, st[4], tmp1);
-        //tmp1 = st[5]; tmp2 = st[6]; st[5] = bitselect(st[5] ^ st[7], st[5], st[6]); st[6] = bitselect(st[6] ^ st[8], st[6], st[7]); st[7] = bitselect(st[7] ^ st[9], st[7], st[8]); st[8] = bitselect(st[8] ^ tmp1, st[8], st[9]); st[9] = bitselect(st[9] ^ tmp2, st[9], tmp1);
-        //tmp1 = st[10]; tmp2 = st[11]; st[10] = bitselect(st[10] ^ st[12], st[10], st[11]); st[11] = bitselect(st[11] ^ st[13], st[11], st[12]); st[12] = bitselect(st[12] ^ st[14], st[12], st[13]); st[13] = bitselect(st[13] ^ tmp1, st[13], st[14]); st[14] = bitselect(st[14] ^ tmp2, st[14], tmp1);
-        //tmp1 = st[15]; tmp2 = st[16]; st[15] = bitselect(st[15] ^ st[17], st[15], st[16]); st[16] = bitselect(st[16] ^ st[18], st[16], st[17]); st[17] = bitselect(st[17] ^ st[19], st[17], st[18]); st[18] = bitselect(st[18] ^ tmp1, st[18], st[19]); st[19] = bitselect(st[19] ^ tmp2, st[19], tmp1);
-        //tmp1 = st[20]; tmp2 = st[21]; st[20] = bitselect(st[20] ^ st[22], st[20], st[21]); st[21] = bitselect(st[21] ^ st[23], st[21], st[22]); st[22] = bitselect(st[22] ^ st[24], st[22], st[23]); st[23] = bitselect(st[23] ^ tmp1, st[23], st[24]); st[24] = bitselect(st[24] ^ tmp2, st[24], tmp1);
-
         #pragma unroll
         for(int i = 0; i < 25; i += 5)
         {
-			ulong tmp1 = st[i], tmp2 = st[i + 1];
+            ulong tmp1 = st[i], tmp2 = st[i + 1];
 
-			st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
-			st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
-			st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
-			st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
-			st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
+            st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
+            st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
+            st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
+            st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
+            st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
         }
 
         //  Iota
@@ -379,24 +316,24 @@ R"===(
 
 void CNKeccak(ulong *output, ulong *input)
 {
-	ulong st[25];
+    ulong st[25];
 
-	// Copy 72 bytes
-	for(int i = 0; i < 9; ++i) st[i] = input[i];
+    // Copy 72 bytes
+    for(int i = 0; i < 9; ++i) st[i] = input[i];
 
-	// Last four and '1' bit for padding
-	//st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
+    // Last four and '1' bit for padding
+    //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U));
 
-	st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
+    st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL;
 
-	for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
+    for(int i = 10; i < 25; ++i) st[i] = 0x00UL;
 
-	// Last bit of padding
-	st[16] = 0x8000000000000000UL;
+    // Last bit of padding
+    st[16] = 0x8000000000000000UL;
 
-	keccakf1600_1(st);
+    keccakf1600_1(st);
 
-	for(int i = 0; i < 25; ++i) output[i] = st[i];
+    for(int i = 0; i < 25; ++i) output[i] = st[i];
 }
 
 static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 };
@@ -407,17 +344,17 @@ static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x
 
 void AESExpandKey256(uint *keybuf)
 {
-	//#pragma unroll 4
-	for(uint c = 8, i = 1; c < 40; ++c)
-	{
-		// For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
-		uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
-
-		// If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant,
-		// then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation
-		// is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done.
-		keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t);
-	}
+    //#pragma unroll 4
+    for(uint c = 8, i = 1; c < 40; ++c)
+    {
+        // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
+        uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
+
+        // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant,
+        // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation
+        // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done.
+        keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t);
+    }
 }
 
 )==="
@@ -433,137 +370,167 @@ R"===(
 #   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
 #endif
 
-inline ulong getIdx()
+inline uint getIdx()
 {
 #if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2)
-	return get_global_id(0) - get_global_offset(0);
+    return get_global_id(0) - get_global_offset(0);
 #endif
 }
 
 #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
 
-#define JOIN_DO(x,y) x##y
+#define JOIN_DO(x,y) x##y
 #define JOIN(x,y) JOIN_DO(x,y)
 
-__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
 {
-	ulong State[25];
-	uint ExpandedKey1[40];
-	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
-	uint4 text;
+    uint ExpandedKey1[40];
+    __local uint AES0[256], AES1[256], AES2[256], AES3[256];
+    uint4 text;
 
-	const ulong gIdx = getIdx();
+    const uint gIdx = getIdx();
 
-	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
+	for(int i = get_local_id(1) * 8 + get_local_id(0);
 		i < 256;
-		i += WORKSIZE * 8)
+		i += 8 * 8)
 	{
-		const uint tmp = AES0_C[i];
-		AES0[i] = tmp;
-		AES1[i] = rotate(tmp, 8U);
-		AES2[i] = rotate(tmp, 16U);
-		AES3[i] = rotate(tmp, 24U);
-	}
+        const uint tmp = AES0_C[i];
+        AES0[i] = tmp;
+        AES1[i] = rotate(tmp, 8U);
+        AES2[i] = rotate(tmp, 16U);
+        AES3[i] = rotate(tmp, 24U);
+    }
+
+    __local ulong State_buf[8 * 25];
 
-	barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
-	// do not use early return here
+    // do not use early return here
 	if(gIdx < Threads)
 #endif
-	{
-		states += 25 * gIdx;
+    {
+        states += 25 * gIdx;
 
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (MEMORY >> 4);
+        Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;
+        Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
-		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+        Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
 #endif
 
-		((ulong8 *)State)[0] = vload8(0, input);
-		State[8] = input[8];
-		State[9] = input[9];
-		State[10] = input[10];
-
-		((uint *)State)[9] &= 0x00FFFFFFU;
-		((uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24;
-		((uint *)State)[10] &= 0xFF000000U;
-		/* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
-		 * handle get_global_id and get_global_offset as signed long long int and add
-		 * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
-		 * (even if it is correct casted to unsigned on the host)
-		 */
-		((uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
-
-		for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
-
-		// Last bit of padding
-		State[16] = 0x8000000000000000UL;
-
-		keccakf1600_2(State);
-	}
-
-	mem_fence(CLK_GLOBAL_MEM_FENCE);
-#if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
-#endif
-	{
-		#pragma unroll
-		for(int i = 0; i < 25; ++i) states[i] = State[i];
+        if (get_local_id(1) == 0)
+        {
+            __local ulong* State = State_buf + get_local_id(0) * 25;
+
+            ((__local ulong8 *)State)[0] = vload8(0, input);
+            State[8]  = input[8];
+            State[9]  = input[9];
+            State[10] = input[10];
+
+            ((__local uint *)State)[9]  &= 0x00FFFFFFU;
+            ((__local uint *)State)[9]  |= (((uint)get_global_id(0)) & 0xFF) << 24;
+            ((__local uint *)State)[10] &= 0xFF000000U;
+            /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
+             * handle get_global_id and get_global_offset as signed long long int and add
+             * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
+             * (even if it is correct casted to unsigned on the host)
+             */
+            ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
+
+            for (int i = 11; i < 25; ++i) {
+                State[i] = 0x00UL;
+            }
+
+            // Last bit of padding
+            State[16] = 0x8000000000000000UL;
+
+            keccakf1600_2(State);
+
+            #pragma unroll
+            for (int i = 0; i < 25; ++i) {
+                states[i] = State[i];
+            }
+        }
+    }
 
-		text = vload4(get_local_id(1) + 4, (__global uint *)(states));
+    barrier(CLK_GLOBAL_MEM_FENCE);
 
-		#pragma unroll
-		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey1)[i] = states[i];
+#   if (COMP_MODE == 1)
+    // do not use early return here
+    if (gIdx < Threads)
+#   endif
+    {
+        text = vload4(get_local_id(1) + 4, (__global uint *)(states));
 
-		AESExpandKey256(ExpandedKey1);
-	}
+        #pragma unroll
+        for (int i = 0; i < 4; ++i) {
+            ((ulong *)ExpandedKey1)[i] = states[i];
+        }
 
-	mem_fence(CLK_LOCAL_MEM_FENCE);
+        AESExpandKey256(ExpandedKey1);
+    }
 
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+
 // cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 9 || ALGO == 10)
-	__local uint4 xin[8][WORKSIZE];
-
-	/* Also left over threads perform this loop.
-	 * The left over thread results will be ignored
-	 */
-	#pragma unroll 16
-	for(size_t i=0; i < 16; i++)
-	{
-		#pragma unroll 10
-		for(int j = 0; j < 10; ++j)
-			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
-		barrier(CLK_LOCAL_MEM_FENCE);
-		xin[get_local_id(1)][get_local_id(0)] = text;
-		barrier(CLK_LOCAL_MEM_FENCE);
-		text = mix_and_propagate(xin);
-	}
+    __local uint4 xin[8][8];
+    {
+
+
+        /* Also left over threads perform this loop.
+         * The left over thread results will be ignored
+         */
+        #pragma unroll 16
+        for (size_t i = 0; i < 16; i++) {
+            #pragma unroll 10
+            for (int j = 0; j < 10; ++j) {
+                uint4 t = ((uint4 *)ExpandedKey1)[j];
+                t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)];
+                t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)];
+                t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)];
+                t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)];
+                text = t;
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+            xin[get_local_id(1)][get_local_id(0)] = text;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            text = mix_and_propagate(xin);
+        }
+    }
 #endif
 
 #if(COMP_MODE==1)
-	// do not use early return here
+    // do not use early return here
 	if(gIdx < Threads)
 #endif
-	{
+    {
 
-		#pragma unroll 2
-		for(int i = 0; i < (MEMORY >> 7); ++i)
-		{
-			#pragma unroll
-			for(int j = 0; j < 10; ++j)
-				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
-
-			Scratchpad[IDX((i << 3) + get_local_id(1))] = text;
-		}
-	}
-	mem_fence(CLK_GLOBAL_MEM_FENCE);
+        #pragma unroll 2
+        for(int i = 0; i < (MEMORY >> 4); i += 8) {
+            #pragma unroll 10
+            for (int j = 0; j < 10; ++j) {
+                uint4 t = ((uint4 *)ExpandedKey1)[j];
+                t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)];
+                t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)];
+                t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)];
+                t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)];
+                text = t;
+            }
+
+            Scratchpad[IDX(i + get_local_id(1))] = text;
+        }
+    }
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
+)==="
+R"===(
+
 // cryptonight_monero_v8 && NVIDIA
 #if(ALGO==11 && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
@@ -573,7 +540,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 #endif
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
+__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, uint Threads
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 , __global ulong *input
@@ -595,61 +562,61 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	ulong b[2];
 	uint4 b_x[1];
 #endif
-	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
+    __local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
 // cryptonight_monero_v8
 #if(ALGO==11)
-	__local uint RCP[256];
+    __local uint RCP[256];
 	uint2 division_result;
 	uint sqrt_result;
 #endif
-	const ulong gIdx = getIdx();
+    const uint gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
 	{
-		const uint tmp = AES0_C[i];
-		AES0[i] = tmp;
-		AES1[i] = rotate(tmp, 8U);
-		AES2[i] = rotate(tmp, 16U);
-		AES3[i] = rotate(tmp, 24U);
+        const uint tmp = AES0_C[i];
+        AES0[i] = tmp;
+        AES1[i] = rotate(tmp, 8U);
+        AES2[i] = rotate(tmp, 16U);
+        AES3[i] = rotate(tmp, 24U);
 // cryptonight_monero_v8
 #if(ALGO==11)
 		RCP[i] = RCP_C[i];
 #endif
-	}
+    }
 
-	barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
     uint2 tweak1_2;
 #endif
 
 #if(COMP_MODE==1)
-	// do not use early return here
+    // do not use early return here
 	if(gIdx < Threads)
 #endif
-	{
-		states += 25 * gIdx;
+    {
+        states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (MEMORY >> 4);
+        Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;
+        Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
-		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+        Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
 #endif
 
-		a[0] = states[0] ^ states[4];
-		b[0] = states[2] ^ states[6];
-		a[1] = states[1] ^ states[5];
-		b[1] = states[3] ^ states[7];
+        a[0] = states[0] ^ states[4];
+        b[0] = states[2] ^ states[6];
+        a[1] = states[1] ^ states[5];
+        b[1] = states[3] ^ states[7];
 
 		b_x[0] = ((uint4 *)b)[0];
 
 // cryptonight_monero_v8
 #if(ALGO==11)
-		a[1] = states[1] ^ states[5];
-		b[2] = states[8] ^ states[10];
-		b[3] = states[9] ^ states[11];
+        a[1] = states[1] ^ states[5];
+        b[2] = states[8] ^ states[10];
+        b[3] = states[9] ^ states[11];
 		b_x[1] = ((uint4 *)b)[1];
 		division_result = as_uint2(states[12]);
 		sqrt_result = as_uint2(states[13]).s0;
@@ -662,20 +629,20 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		tweak1_2.s1 = (uint)get_global_id(0);
 		tweak1_2 ^= as_uint2(states[24]);
 #endif
-	}
-
-	mem_fence(CLK_LOCAL_MEM_FENCE);
+    }
+    
+    mem_fence(CLK_LOCAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
-	// do not use early return here
+    // do not use early return here
 	if(gIdx < Threads)
 #endif
-	{
+    {
 		ulong idx0 = a[0] & MASK;
 
 		#pragma unroll CN_UNROLL
-		for(int i = 0; i < ITERATIONS; ++i)
-		{
+    for(int i = 0; i < ITERATIONS; ++i)
+    {
 			ulong c[2];
 // cryptonight_monero_v8 && NVIDIA
 #if(ALGO==11 && defined(__NV_CL_C_VERSION))
@@ -693,14 +660,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 // cryptonight_monero_v8
 #if(ALGO==11)
-			{
+        {
 				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
 				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
 				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
 				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
-				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
-			}
+            SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+        }
 #endif
 
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
@@ -738,8 +705,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 // cryptonight_monero_v8
 #if(ALGO==11)
 			// Use division and square root results from the _previous_ iteration to hide the latency
-			tmp.s0 ^= division_result.s0;
-			tmp.s1 ^= division_result.s1 ^ sqrt_result;
+            tmp.s0 ^= division_result.s0;
+            tmp.s1 ^= division_result.s1 ^ sqrt_result;
  			// Most and least significant bits in the divisor are set to 1
 			// to make sure we don't divide by a small or even number,
 			// so there are no shortcuts for such cases
@@ -755,15 +722,15 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			result_mul.s1 = c[0] * as_ulong2(tmp).s0;
 // cryptonight_monero_v8
 #if(ALGO==11)
-			{
+        {
 				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
 				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
 				result_mul ^= chunk2;
 				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
 				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
-				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
-			}
+            SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+        }
 #endif
 			a[1] += result_mul.s1;
 			a[0] += result_mul.s0;
@@ -775,7 +742,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	if(ALGO == 6 || ALGO == 10)
 			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
-			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
+        SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
 #	else
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
@@ -787,8 +754,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 #endif
 
-			((uint4 *)a)[0] ^= tmp;
-
+        ((uint4 *)a)[0] ^= tmp;
+    
 // cryptonight_monero_v8
 #if (ALGO == 11)
 #	if defined(__NV_CL_C_VERSION)
@@ -804,7 +771,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if (ALGO == 4 || ALGO == 10)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-			long q = fast_div_heavy(n, d | 0x5);
+                long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = (d ^ q) & MASK;
 // cryptonight_haven
@@ -816,173 +783,178 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			idx0 = ((~d) ^ q) & MASK;
 #endif
 
-		}
-	}
-	mem_fence(CLK_GLOBAL_MEM_FENCE);
+    }
+    }
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 )==="
 R"===(
 
-__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
 {
-	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
-	uint ExpandedKey2[40];
-	ulong State[25];
-	uint4 text;
-
-	const ulong gIdx = getIdx();
-
-	for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
-		i < 256;
-		i += WORKSIZE * 8)
-	{
-		const uint tmp = AES0_C[i];
-		AES0[i] = tmp;
-		AES1[i] = rotate(tmp, 8U);
-		AES2[i] = rotate(tmp, 16U);
-		AES3[i] = rotate(tmp, 24U);
-	}
+    __local uint AES0[256], AES1[256], AES2[256], AES3[256];
+    uint ExpandedKey2[40];
+    uint4 text;
+
+    const uint gIdx = getIdx();
+
+    for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) {
+        const uint tmp = AES0_C[i];
+        AES0[i] = tmp;
+        AES1[i] = rotate(tmp, 8U);
+        AES2[i] = rotate(tmp, 16U);
+        AES3[i] = rotate(tmp, 24U);
+    }
 
-	barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
+#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
+    __local uint4 xin1[8][8];
+    __local uint4 xin2[8][8];
+#endif
+        
 #if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
+    // do not use early return here
+    if(gIdx < Threads)
 #endif
-	{
-		states += 25 * gIdx;
+    {
+        states += 25 * gIdx;
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (MEMORY >> 4);
+        Scratchpad += gIdx * (MEMORY >> 4);
 #elif(STRIDED_INDEX==1)
-		Scratchpad += gIdx;
+        Scratchpad += gIdx;
 #elif(STRIDED_INDEX==2)
-		Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+        Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE);
 #endif
 
-		#if defined(__Tahiti__) || defined(__Pitcairn__)
+        #if defined(__Tahiti__) || defined(__Pitcairn__)
+
+        for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
+        text = vload4(get_local_id(1) + 4, (__global uint *)states);
 
-		for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4];
-		text = vload4(get_local_id(1) + 4, (__global uint *)states);
+        #else
 
-		#else
+        text = vload4(get_local_id(1) + 4, (__global uint *)states);
+        ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
 
-		text = vload4(get_local_id(1) + 4, (__global uint *)states);
-		((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states);
+        #endif
 
-		#endif
+        AESExpandKey256(ExpandedKey2);
+    }
 
-		AESExpandKey256(ExpandedKey2);
-	}
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-	barrier(CLK_LOCAL_MEM_FENCE);
 // cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 9 || ALGO == 10)
-	__local uint4 xin[8][WORKSIZE];
+    __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
+    __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
+    __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
+    __local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)];
+    *xin2_store = (uint4)(0, 0, 0, 0);
 #endif
 
-#if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
+#if(COMP_MODE == 1)
+    // do not use early return here
+    if (gIdx < Threads)
 #endif
-	{
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
+    {
 #if (ALGO == 4 || ALGO == 9 || ALGO == 10)
-		#pragma unroll 2
-		for(int i = 0; i < (MEMORY >> 7); ++i)
-		{
-			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
-
-			#pragma unroll 10
-			for(int j = 0; j < 10; ++j)
-				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+        #pragma unroll 2
+        for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
+        {
+            text ^= Scratchpad[IDX(i1)];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            text ^= *xin2_load;
 
+            #pragma unroll 10
+            for(int j = 0; j < 10; ++j)
+                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
 
-			barrier(CLK_LOCAL_MEM_FENCE);
-			xin[get_local_id(1)][get_local_id(0)] = text;
-			barrier(CLK_LOCAL_MEM_FENCE);
-			text = mix_and_propagate(xin);
-		}
+            *xin1_store = text;
 
-		#pragma unroll 2
-		for(int i = 0; i < (MEMORY >> 7); ++i)
-		{
-			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+            text ^= Scratchpad[IDX(i1 + 8)];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            text ^= *xin1_load;
 
-			#pragma unroll 10
-			for(int j = 0; j < 10; ++j)
-				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+            #pragma unroll 10
+            for(int j = 0; j < 10; ++j)
+                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
 
+            *xin2_store = text;
+        }
 
-			barrier(CLK_LOCAL_MEM_FENCE);
-			xin[get_local_id(1)][get_local_id(0)] = text;
-			barrier(CLK_LOCAL_MEM_FENCE);
-			text = mix_and_propagate(xin);
-		}
+        barrier(CLK_LOCAL_MEM_FENCE);
+        text ^= *xin2_load;
 
 #else
-		#pragma unroll 2
-		for(int i = 0; i < (MEMORY >> 7); ++i)
-		{
-			text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+        #pragma unroll 2
+        for (int i = 0; i < (MEMORY >> 7); ++i) {
+            text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
 
-			#pragma unroll 10
-			for(int j = 0; j < 10; ++j)
-				text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-		}
+            #pragma unroll 10
+            for(int j = 0; j < 10; ++j)
+                text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+        }
 #endif
-	}
+    }
 
-// cryptonight_heavy or cryptonight_haven || cryptonight_bittube2
+// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 9 || ALGO == 10)
-	/* Also left over threads perform this loop.
-	 * The left over thread results will be ignored
-	 */
-	#pragma unroll 16
-	for(size_t i=0; i < 16; i++)
-	{
-		#pragma unroll 10
-		for(int j = 0; j < 10; ++j)
-			text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
-		barrier(CLK_LOCAL_MEM_FENCE);
-		xin[get_local_id(1)][get_local_id(0)] = text;
-		barrier(CLK_LOCAL_MEM_FENCE);
-		text = mix_and_propagate(xin);
-	}
+    /* Also left-over threads perform this loop.
+     * The left-over thread results will be ignored
+     */
+    #pragma unroll 16
+    for(size_t i = 0; i < 16; i++)
+    {
+        #pragma unroll 10
+        for (int j = 0; j < 10; ++j) {
+            text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+        *xin1_store = text;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        text ^= *xin1_load;
+    }
 #endif
 
+    __local ulong State_buf[8 * 25];
 #if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
+    // do not use early return here
+    if(gIdx < Threads)
 #endif
-	{
-		vstore2(as_ulong2(text), get_local_id(1) + 4, states);
-	}
+    {
+        vstore2(as_ulong2(text), get_local_id(1) + 4, states);
+    }
 
-	barrier(CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_GLOBAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
-	// do not use early return here
-	if(gIdx < Threads)
+    // do not use early return here
+    if(gIdx < Threads)
 #endif
-	{
-		if(!get_local_id(1))
-		{
-			for(int i = 0; i < 25; ++i) State[i] = states[i];
+    {
+        if(!get_local_id(1))
+        {
+            __local ulong* State = State_buf + get_local_id(0) * 25;
+
+            for(int i = 0; i < 25; ++i) State[i] = states[i];
 
-			keccakf1600_2(State);
+            keccakf1600_2(State);
 
-			for(int i = 0; i < 25; ++i) states[i] = State[i];
+            for(int i = 0; i < 25; ++i) states[i] = State[i];
 
-			ulong StateSwitch = State[0] & 3;
-			__global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
-			__global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
-			__global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
-			destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
-		}
-	}
-	mem_fence(CLK_GLOBAL_MEM_FENCE);
+            uint StateSwitch = State[0] & 3;
+            __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
+            __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
+            __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
+            destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
+        }
+    }
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 )==="
@@ -994,276 +966,276 @@ R"===(
 
 #define VSWAP4(x)	((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U))
 
-__kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
+__kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
 {
-	const ulong idx = get_global_id(0) - get_global_offset(0);
+    const ulong idx = get_global_id(0) - get_global_offset(0);
 
-	// do not use early return here
-	if(idx < Threads)
-	{
-		states += 25 * BranchBuf[idx];
+    // do not use early return here
+    if(idx < Threads)
+    {
+        states += 25 * BranchBuf[idx];
 
-		// skein
-		ulong8 h = vload8(0, SKEIN512_256_IV);
+        // skein
+        ulong8 h = vload8(0, SKEIN512_256_IV);
 
-		// Type field begins with final bit, first bit, then six bits of type; the last 96
-		// bits are input processed (including in the block to be processed with that tweak)
-		// The output transform is only one run of UBI, since we need only 256 bits of output
-		// The tweak for the output transform is Type = Output with the Final bit set
-		// T[0] for the output is 8, and I don't know why - should be message size...
-		ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
-		ulong8 p, m;
+        // Type field begins with final bit, first bit, then six bits of type; the last 96
+        // bits are input processed (including in the block to be processed with that tweak)
+        // The output transform is only one run of UBI, since we need only 256 bits of output
+        // The tweak for the output transform is Type = Output with the Final bit set
+        // T[0] for the output is 8, and I don't know why - should be message size...
+        ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL };
+        ulong8 p, m;
 
-		for(uint i = 0; i < 4; ++i)
-		{
-			t[0] += i < 3 ? 0x40UL : 0x08UL;
+        for(uint i = 0; i < 4; ++i)
+        {
+            t[0] += i < 3 ? 0x40UL : 0x08UL;
 
-			t[2] = t[0] ^ t[1];
+            t[2] = t[0] ^ t[1];
 
-			m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
-			const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
-			p = Skein512Block(m, h, h8, t);
+            m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
+            const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
+            p = Skein512Block(m, h, h8, t);
 
-			h = m ^ p;
+            h = m ^ p;
 
-			t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL;
-		}
+            t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL;
+        }
 
-		t[0] = 0x08UL;
-		t[1] = 0xFF00000000000000UL;
-		t[2] = t[0] ^ t[1];
+        t[0] = 0x08UL;
+        t[1] = 0xFF00000000000000UL;
+        t[2] = t[0] ^ t[1];
 
-		p = (ulong8)(0);
-		const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
+        p = (ulong8)(0);
+        const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY;
 
-		p = Skein512Block(p, h, h8, t);
+        p = Skein512Block(p, h, h8, t);
 
-		//vstore8(p, 0, output);
+        //vstore8(p, 0, output);
 
-		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-		// and expect an accurate result for target > 32-bit without implementing carries
+        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+        // and expect an accurate result for target > 32-bit without implementing carries
 		if(p.s3 <= Target)
 		{
-			ulong outIdx = atomic_inc(output + 0xFF);
+            ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-		}
-	}
-	mem_fence(CLK_GLOBAL_MEM_FENCE);
+            }
+        }
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
 #define SWAP8(x)	as_ulong(as_uchar8(x).s76543210)
 
 #define JHXOR \
-	h0h ^= input[0]; \
-	h0l ^= input[1]; \
-	h1h ^= input[2]; \
-	h1l ^= input[3]; \
-	h2h ^= input[4]; \
-	h2l ^= input[5]; \
-	h3h ^= input[6]; \
-	h3l ^= input[7]; \
+    h0h ^= input[0]; \
+    h0l ^= input[1]; \
+    h1h ^= input[2]; \
+    h1l ^= input[3]; \
+    h2h ^= input[4]; \
+    h2l ^= input[5]; \
+    h3h ^= input[6]; \
+    h3l ^= input[7]; \
 \
-	E8; \
+    E8; \
 \
-	h4h ^= input[0]; \
-	h4l ^= input[1]; \
-	h5h ^= input[2]; \
-	h5l ^= input[3]; \
-	h6h ^= input[4]; \
-	h6l ^= input[5]; \
-	h7h ^= input[6]; \
-	h7l ^= input[7]
-
-__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
+    h4h ^= input[0]; \
+    h4l ^= input[1]; \
+    h5h ^= input[2]; \
+    h5l ^= input[3]; \
+    h6h ^= input[4]; \
+    h6l ^= input[5]; \
+    h7h ^= input[6]; \
+    h7l ^= input[7]
+
+__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
 {
-	const uint idx = get_global_id(0) - get_global_offset(0);
+    const uint idx = get_global_id(0) - get_global_offset(0);
 
-	// do not use early return here
-	if(idx < Threads)
-	{
-		states += 25 * BranchBuf[idx];
+    // do not use early return here
+    if(idx < Threads)
+    {
+        states += 25 * BranchBuf[idx];
 
-		sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
-		sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
-		sph_u64 tmp;
+        sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL;
+        sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
+        sph_u64 tmp;
 
-		for(int i = 0; i < 3; ++i)
-		{
-			ulong input[8];
+        for(int i = 0; i < 3; ++i)
+        {
+            ulong input[8];
 
-			const int shifted = i << 3;
-			for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]);
-			JHXOR;
-		}
-		{
-			ulong input[8];
-			input[0] = (states[24]);
-			input[1] = 0x80UL;
-			#pragma unroll 6
-			for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
-			JHXOR;
-		}
-		{
-			ulong input[8];
-			for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
-			input[7] = 0x4006000000000000UL;
-			JHXOR;
-		}
-
-		//output[0] = h6h;
-		//output[1] = h6l;
-		//output[2] = h7h;
-		//output[3] = h7l;
-
-		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-		// and expect an accurate result for target > 32-bit without implementing carries
+            const int shifted = i << 3;
+            for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]);
+            JHXOR;
+        }
+        {
+            ulong input[8];
+            input[0] = (states[24]);
+            input[1] = 0x80UL;
+            #pragma unroll 6
+            for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+            JHXOR;
+        }
+        {
+            ulong input[8];
+            for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+            input[7] = 0x4006000000000000UL;
+            JHXOR;
+        }
+
+        //output[0] = h6h;
+        //output[1] = h6l;
+        //output[2] = h7h;
+        //output[3] = h7l;
+
+        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+        // and expect an accurate result for target > 32-bit without implementing carries
 		if(h7l <= Target)
 		{
-			ulong outIdx = atomic_inc(output + 0xFF);
+            ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-		}
-	}
-}
+            }
+        }
+    }
 
 #define SWAP4(x)	as_uint(as_uchar4(x).s3210)
 
-__kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
+__kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
 {
-	const uint idx = get_global_id(0) - get_global_offset(0);
+    const uint idx = get_global_id(0) - get_global_offset(0);
 
-	// do not use early return here
-	if(idx < Threads)
-	{
-		states += 25 * BranchBuf[idx];
+    // do not use early return here
+    if(idx < Threads)
+    {
+        states += 25 * BranchBuf[idx];
 
-		unsigned int m[16];
-		unsigned int v[16];
-		uint h[8];
+        unsigned int m[16];
+        unsigned int v[16];
+        uint h[8];
 
-		((uint8 *)h)[0] = vload8(0U, c_IV256);
+        ((uint8 *)h)[0] = vload8(0U, c_IV256);
 
-		#pragma unroll 4
-		for(uint i = 0, bitlen = 0; i < 4; ++i)
-		{
-			if(i < 3)
-			{
-				((uint16 *)m)[0] = vload16(i, (__global uint *)states);
-				for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]);
-				bitlen += 512;
-			}
-			else
-			{
-				m[0] = SWAP4(((__global uint *)states)[48]);
-				m[1] = SWAP4(((__global uint *)states)[49]);
-				m[2] = 0x80000000U;
-
-				for(int i = 3; i < 13; ++i) m[i] = 0x00U;
-
-				m[13] = 1U;
-				m[14] = 0U;
-				m[15] = 0x640;
-				bitlen += 64;
-			}
-
-			((uint16 *)v)[0].lo = ((uint8 *)h)[0];
-			((uint16 *)v)[0].hi = vload8(0U, c_u256);
-
-			//v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-			//v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U;
-
-			v[12] ^= bitlen;
-			v[13] ^= bitlen;
-
-			for(int r = 0; r < 14; r++)
-			{
-				GS(0, 4, 0x8, 0xC, 0x0);
-				GS(1, 5, 0x9, 0xD, 0x2);
-				GS(2, 6, 0xA, 0xE, 0x4);
-				GS(3, 7, 0xB, 0xF, 0x6);
-				GS(0, 5, 0xA, 0xF, 0x8);
-				GS(1, 6, 0xB, 0xC, 0xA);
-				GS(2, 7, 0x8, 0xD, 0xC);
-				GS(3, 4, 0x9, 0xE, 0xE);
-			}
-
-			((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
-		}
-
-		for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]);
-
-		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-		// and expect an accurate result for target > 32-bit without implementing carries
-		uint2 t = (uint2)(h[6],h[7]);
+        #pragma unroll 4
+        for(uint i = 0, bitlen = 0; i < 4; ++i)
+        {
+            if(i < 3)
+            {
+                ((uint16 *)m)[0] = vload16(i, (__global uint *)states);
+                for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]);
+                bitlen += 512;
+            }
+            else
+            {
+                m[0] = SWAP4(((__global uint *)states)[48]);
+                m[1] = SWAP4(((__global uint *)states)[49]);
+                m[2] = 0x80000000U;
+
+                for(int i = 3; i < 13; ++i) m[i] = 0x00U;
+
+                m[13] = 1U;
+                m[14] = 0U;
+                m[15] = 0x640;
+                bitlen += 64;
+            }
+
+            ((uint16 *)v)[0].lo = ((uint8 *)h)[0];
+            ((uint16 *)v)[0].hi = vload8(0U, c_u256);
+
+            //v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U;
+            //v[13] ^= (i < 3) ? (i + 1) << 9 : 1600U;
+
+            v[12] ^= bitlen;
+            v[13] ^= bitlen;
+
+            for(int r = 0; r < 14; r++)
+            {
+                GS(0, 4, 0x8, 0xC, 0x0);
+                GS(1, 5, 0x9, 0xD, 0x2);
+                GS(2, 6, 0xA, 0xE, 0x4);
+                GS(3, 7, 0xB, 0xF, 0x6);
+                GS(0, 5, 0xA, 0xF, 0x8);
+                GS(1, 6, 0xB, 0xC, 0xA);
+                GS(2, 7, 0x8, 0xD, 0xC);
+                GS(3, 4, 0x9, 0xE, 0xE);
+            }
+
+            ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1];
+        }
+
+        for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]);
+
+        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+        // and expect an accurate result for target > 32-bit without implementing carries
+        uint2 t = (uint2)(h[6],h[7]);
 		if( as_ulong(t) <= Target)
 		{
-			ulong outIdx = atomic_inc(output + 0xFF);
+            ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-		}
-	}
-}
+            }
+        }
+    }
 
-__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
+__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads)
 {
-	const uint idx = get_global_id(0) - get_global_offset(0);
+    const uint idx = get_global_id(0) - get_global_offset(0);
 
-	// do not use early return here
-	if(idx < Threads)
-	{
-		states += 25 * BranchBuf[idx];
+    // do not use early return here
+    if(idx < Threads)
+    {
+        states += 25 * BranchBuf[idx];
 
-		ulong State[8];
+        ulong State[8];
 
-		for(int i = 0; i < 7; ++i) State[i] = 0UL;
+        for(int i = 0; i < 7; ++i) State[i] = 0UL;
 
-		State[7] = 0x0001000000000000UL;
+        State[7] = 0x0001000000000000UL;
 
-		#pragma unroll 4
-		for(uint i = 0; i < 4; ++i)
-		{
-			volatile ulong H[8], M[8];
+        #pragma unroll 4
+        for(uint i = 0; i < 4; ++i)
+        {
+            volatile ulong H[8], M[8];
 
-			if(i < 3)
-			{
-				((ulong8 *)M)[0] = vload8(i, states);
-			}
-			else
-			{
-				M[0] = states[24];
-				M[1] = 0x80UL;
+            if(i < 3)
+            {
+                ((ulong8 *)M)[0] = vload8(i, states);
+            }
+            else
+            {
+                M[0] = states[24];
+                M[1] = 0x80UL;
 
-				for(int x = 2; x < 7; ++x) M[x] = 0UL;
+                for(int x = 2; x < 7; ++x) M[x] = 0UL;
 
-				M[7] = 0x0400000000000000UL;
-			}
+                M[7] = 0x0400000000000000UL;
+            }
 
-			for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x];
+            for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x];
 
-			PERM_SMALL_P(H);
-			PERM_SMALL_Q(M);
+            PERM_SMALL_P(H);
+            PERM_SMALL_Q(M);
 
-			for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x];
-		}
+            for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x];
+        }
 
-		ulong tmp[8];
+        ulong tmp[8];
 
-		for(int i = 0; i < 8; ++i) tmp[i] = State[i];
+        for(int i = 0; i < 8; ++i) tmp[i] = State[i];
 
-		PERM_SMALL_P(State);
+        PERM_SMALL_P(State);
 
-		for(int i = 0; i < 8; ++i) State[i] ^= tmp[i];
+        for(int i = 0; i < 8; ++i) State[i] ^= tmp[i];
 
-		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-		// and expect an accurate result for target > 32-bit without implementing carries
+        // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+        // and expect an accurate result for target > 32-bit without implementing carries
 		if(State[7] <= Target)
 		{
-			ulong outIdx = atomic_inc(output + 0xFF);
+            ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-		}
-	}
-}
+            }
+        }
+    }
 
-)==="
+)==="
\ No newline at end of file
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
index 1d078b893a5429d36d9d99909fc907a98508a810..21268fd78a9885cd03cfaadea3971210beed7bbd 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
@@ -2,49 +2,24 @@ R"===(
 #ifndef FAST_DIV_HEAVY_CL
 #define FAST_DIV_HEAVY_CL
 
-inline ulong get_reciprocal_heavy(uint a)
+inline long fast_div_heavy(long _a, int _b)
 {
-	const uint shift = clz(a);
-	a <<= shift;
-
-	const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
-	const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);
-
-	const float r = native_recip(a_hi);
-
-	const uint tmp0 = as_uint(r);
-	const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
-	const float r_scaled = as_float(tmp1);
-
-	const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
-
-	const float r_scaled_hi = as_float(tmp1 & ~4095U);
-	const float h_hi = as_float(as_uint(h) & ~4095U);
+	long a = abs(_a);
+	int b = abs(_b);
 
-	const float r_scaled_lo = r_scaled - r_scaled_hi;
-	const float h_lo = h - h_hi;
+	float rcp = native_recip(convert_float_rte(b));
+	float rcp2 = as_float(as_uint(rcp) + (32U << 23));
 
-	const float x1 = h_hi * r_scaled_hi;
-	const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;
+	ulong q1 = convert_ulong_rte(convert_float_rte(as_int2(a).s1) * rcp2);
+	a -= q1 * as_uint(b);
 
-	const long h1 = convert_long_rte(x1);
-	const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));
-
-	const ulong result = tmp0 & 0xFFFFFF;
-	return (result << (shift + 9)) - ((h1 + h2) >> 2);
-}
-
-inline long fast_div_heavy(long _a, int _b)
-{
-	const ulong a = abs(_a);
-	const uint b = abs(_b);
-	ulong q = mul_hi(a, get_reciprocal_heavy(b));
+	long q2 = convert_long_rte(convert_float_rtn(a) * rcp);
+	int a2 = as_int2(a).s0 - as_int2(q2).s0 * b;
 
-	const long tmp = a - q * b;
-	const int overshoot = (tmp < 0) ? 1 : 0;
-	const int undershoot = (tmp >= b) ? 1 : 0;
-	q += undershoot - overshoot;
+	int q3 = convert_int_rte(convert_float_rte(a2) * rcp);
+	q3 += (a2 - q3 * b) >> 31;
 
+	const long q = q1 + q2 + q3;
 	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
 }
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index 1ef1dead4851bccc174b1b3d3750d4209d80fca8..2c1b13865c78d78e972c3430e6ab62213aa994aa 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -125,3 +125,4 @@ inline uint fast_sqrt_v2(const ulong n1)
 #endif
 
 )==="
+        
\ No newline at end of file
diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp
index 92bb015063eb46b3b1bdc494611a69e5d8ca9e60..0eea9fdd7739a4f50cde75b9e1633369f1d15d77 100644
--- a/xmrstak/backend/backendConnector.cpp
+++ b/xmrstak/backend/backendConnector.cpp
@@ -60,11 +60,29 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 
 	std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>;
 
+#ifndef CONF_NO_OPENCL
+	if(params::inst().useAMD)
+	{
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
+		plugin amdplugin;
+		amdplugin.load(backendName, "xmrstak_opencl_backend");
+		std::vector<iBackend*>* amdThreads = amdplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
+		size_t numWorkers = 0u;
+		if(amdThreads != nullptr)
+		{
+			pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
+			numWorkers = amdThreads->size();
+			delete amdThreads;
+		}
+		if(numWorkers == 0)
+			printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str());
+	}
+#endif
+
 #ifndef CONF_NO_CUDA
 	if(params::inst().useNVIDIA)
 	{
 		plugin nvidiaplugin;
-		std::vector<iBackend*>* nvidiaThreads;
 		std::vector<std::string> libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"};
 		size_t numWorkers = 0u;
 
@@ -96,25 +114,6 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 	}
 #endif
 
-#ifndef CONF_NO_OPENCL
-	if(params::inst().useAMD)
-	{
-		const std::string backendName = xmrstak::params::inst().openCLVendor;
-		plugin amdplugin;
-		amdplugin.load(backendName, "xmrstak_opencl_backend");
-		std::vector<iBackend*>* amdThreads = amdplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-		size_t numWorkers = 0u;
-		if(amdThreads != nullptr)
-		{
-			pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
-			numWorkers = amdThreads->size();
-			delete amdThreads;
-		}
-		if(numWorkers == 0)
-			printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str());
-	}
-#endif
-
 #ifndef CONF_NO_CPU
 	if(params::inst().useCPU)
 	{