diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..25ba43d61fca0ed7033cc3ceb959a29aecae099a
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,14 @@
+IndentWidth: 4
+TabWidth: 4
+ColumnLimit: 0
+BreakBeforeBraces: Allman
+AllowShortIfStatementsOnASingleLine: false
+IndentCaseLabels: false
+SpaceBeforeParens: Never
+UseTab: Always
+AlignAfterOpenBracket: DontAlign
+PointerBindsToType: true
+BreakConstructorInitializers: AfterColon
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
deleted file mode 100644
index 8451f3289c3ec5d91bd0d863557a790a6ed9df6e..0000000000000000000000000000000000000000
--- a/.github/ISSUE_TEMPLATE.md
+++ /dev/null
@@ -1,30 +0,0 @@
-Please provide as much as possible information to reproduce the issue.
-
-# Basic information
-  - Type of the CPU.
-  - Type of the GPU (if you try to miner with the GPU).
-
-# Compile issues
-  - Which OS do you use?
-  ```
-  add **all** commands you used and the **full** compile output here
-  ```
-  ```
-  run `cmake -LA .` in the build folder and add the output here
-  ```
-
-# Issue with the execution
-  - Do you compiled the miner by our own?
-  ```
-  run `./xmr-stak --version-long` and add the output here
-  ```
-
-# AMD OpenCl issue
-
-  ```
-  run `clinfo` and add the output here
-  ```
-
-# Stability issue
-  - Is the CPU or GPU overclocked?
-  - Is the Main memory of the CPU or GPU undervolted?
diff --git a/.github/ISSUE_TEMPLATE/compile_bug_report.md b/.github/ISSUE_TEMPLATE/compile_bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..899ad941f41dca17477eb7184c6efdfbd9c229b7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/compile_bug_report.md
@@ -0,0 +1,35 @@
+---
+name: Compile bug report
+about: You have an issue compiling xmr-stak.
+
+---
+
+`...` is the placeholder for your answers. Please answer each question!
+
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**Which operating system do you use?**
+
+```
+...
+```
+
+**To Reproduce**
+```
+# Please post all commands and the output.
+...
+```
+
+**Additional information.**
+
+```
+# run `cmake -LA .` in the build folder and add the output here
+...
+```
+
+**Feel free to add more information.**
+```
+...
+```
diff --git a/.github/ISSUE_TEMPLATE/execution_bug_report.md b/.github/ISSUE_TEMPLATE/execution_bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..44ac89bf1eda9725440ff346881f8e978c53beef
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/execution_bug_report.md
@@ -0,0 +1,7 @@
+---
+name: Execution bug report
+about: You have an issue executing xmr-stak.
+
+---
+
+**Most execution issues are caused by driver problems. Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to ask for help instead of opening an issue here.**
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..90f5e4f3d37d003e0abeb2c0f5c9c4446f5d97b7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,7 @@
+---
+name: Feature request
+about: Suggest an idea for xmr-stak.
+
+---
+
+**Please explain the feature as well as possible.**
diff --git a/.github/ISSUE_TEMPLATE/tuning_help.md b/.github/ISSUE_TEMPLATE/tuning_help.md
new file mode 100644
index 0000000000000000000000000000000000000000..40dedef05d740c874dc9596312985150de6283a6
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/tuning_help.md
@@ -0,0 +1,7 @@
+---
+name: Need help with optimization.
+about: You need help to optimize your setup.
+
+---
+
+**Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to discuss optimizations.**
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 004b5555c94d80fd9ec089b6d3201ea20bb389b7..41e993eee79beb6fccf0cea03a299c1f5545ceb8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,10 +84,11 @@ if(CUDA_ENABLE)
             /usr
             /usr/local/cuda
         PATH_SUFFIXES
-            lib64	
+            lib64
             lib/x64
             lib/Win32
-            lib64/stubs)
+            lib64/stubs
+            lib)
 
         #nvrtc
         find_library(CUDA_NVRTC_LIB 
@@ -104,7 +105,8 @@ if(CUDA_ENABLE)
         PATH_SUFFIXES
             lib64
             lib/x64
-            lib/Win32)
+            lib/Win32
+            lib)
 
         list(APPEND BACKEND_TYPES "nvidia")
         option(XMR-STAK_LARGEGRID "Support large CUDA block count > 128" ON)
@@ -322,7 +324,7 @@ endif()
 ################################################################################
 
 if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-	set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
+    set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
 endif()
 
 ################################################################################
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp
index 7358e98570386de0353815df3eb19eab717455f8..2a60c46d94c8e5491171ebf2e20d54431b2bca32 100644
--- a/xmrstak/backend/amd/OclCryptonightR_gen.cpp
+++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp
@@ -1,19 +1,18 @@
-#include <string>
-#include <sstream>
-#include <mutex>
 #include <cstring>
+#include <mutex>
+#include <sstream>
+#include <string>
 #include <thread>
 
-
 #include "xmrstak/backend/amd/OclCryptonightR_gen.hpp"
 #include "xmrstak/backend/cpu/crypto/variant4_random_math.h"
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/cpputil/read_write_lock.h"
+#include "xmrstak/misc/console.hpp"
 
 #include <chrono>
-#include <thread>
 #include <iostream>
-
+#include <regex>
+#include <thread>
 
 namespace xmrstak
 {
@@ -22,16 +21,16 @@ namespace amd
 
 static std::string get_code(const V4_Instruction* code, int code_size)
 {
-    std::stringstream s;
+	std::stringstream s;
 
-	for (int i = 0; i < code_size; ++i)
+	for(int i = 0; i < code_size; ++i)
 	{
 		const V4_Instruction inst = code[i];
 
 		const uint32_t a = inst.dst_index;
 		const uint32_t b = inst.src_index;
 
-		switch (inst.opcode)
+		switch(inst.opcode)
 		{
 		case MUL:
 			s << 'r' << a << "*=r" << b << ';';
@@ -58,37 +57,39 @@ static std::string get_code(const V4_Instruction* code, int code_size)
 		s << '\n';
 	}
 
-    return s.str();
+	return s.str();
 }
 
 struct CacheEntry
 {
-    CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) :
-        algo(algo),
-        height(height),
-        deviceIdx(deviceIdx),
-        program(program)
-    {}
-
-    xmrstak_algo algo;
-    uint64_t height;
-    size_t deviceIdx;
-    cl_program program;
+	CacheEntry(xmrstak_algo algo, uint64_t height_offset, size_t deviceIdx, cl_program program) :
+		algo(algo),
+		height_offset(height_offset),
+		deviceIdx(deviceIdx),
+		program(program)
+	{
+	}
+
+	xmrstak_algo algo;
+	uint64_t height_offset;
+	size_t deviceIdx;
+	cl_program program;
 };
 
 struct BackgroundTaskBase
 {
-    virtual ~BackgroundTaskBase() {}
-    virtual void exec() = 0;
+	virtual ~BackgroundTaskBase() {}
+	virtual void exec() = 0;
 };
 
-template<typename T>
+template <typename T>
 struct BackgroundTask : public BackgroundTaskBase
 {
-    BackgroundTask(T&& func) : m_func(std::move(func)) {}
-    void exec() override { m_func(); }
+	BackgroundTask(T&& func) :
+		m_func(std::move(func)) {}
+	void exec() override { m_func(); }
 
-    T m_func;
+	T m_func;
 };
 
 static ::cpputil::RWLock CryptonightR_cache_mutex;
@@ -99,99 +100,113 @@ static std::mutex background_tasks_mutex;
 static std::vector<BackgroundTaskBase*> background_tasks;
 static std::thread* background_thread = nullptr;
 
+static cl_program search_program( // look up a cached program for (algo, height_offset, device); returns nullptr on a miss
+	const GpuContext* ctx, // only ctx->deviceIdx is read here
+	xmrstak_algo algo,
+	uint64_t height_offset, // compared against CacheEntry::height_offset
+	bool lock_cache = true) // pass false when the caller already holds CryptonightR_cache_mutex
+{
+	if(lock_cache) // take the read lock ourselves unless the caller opted out
+		CryptonightR_cache_mutex.ReadLock();
+
+	// Linear scan: an entry matches only if algo, height_offset and device index all agree
+	for(const CacheEntry& entry : CryptonightR_cache)
+	{
+		if((entry.algo == algo) && (entry.height_offset == height_offset) && (entry.deviceIdx == ctx->deviceIdx))
+		{
+			printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu found in cache", height_offset);
+			auto result = entry.program; // copy the handle before the lock is released
+			if(lock_cache)
+				CryptonightR_cache_mutex.UnLock();
+			return result;
+		}
+	}
+	if(lock_cache) // miss: release the lock taken above
+		CryptonightR_cache_mutex.UnLock();
+
+	return nullptr; // no matching entry in the cache
+}
+
 static void background_thread_proc()
 {
-    std::vector<BackgroundTaskBase*> tasks;
-    for (;;) {
-        tasks.clear();
-        {
-            std::lock_guard<std::mutex> g(background_tasks_mutex);
-            background_tasks.swap(tasks);
-        }
-
-        for (BackgroundTaskBase* task : tasks) {
-            task->exec();
-            delete task;
-        }
+	std::vector<BackgroundTaskBase*> tasks;
+	for(;;)
+	{
+		tasks.clear();
+		{
+			std::lock_guard<std::mutex> g(background_tasks_mutex);
+			background_tasks.swap(tasks);
+		}
+
+		for(BackgroundTaskBase* task : tasks)
+		{
+			task->exec();
+			delete task;
+		}
 
 		std::this_thread::sleep_for(std::chrono::milliseconds(500));
-    }
+	}
 }
 
-template<typename T>
+template <typename T>
 static void background_exec(T&& func)
 {
-    BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
+	BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
 
-    std::lock_guard<std::mutex> g(background_tasks_mutex);
-    background_tasks.push_back(task);
-    if (!background_thread) {
-        background_thread = new std::thread(background_thread_proc);
-    }
+	std::lock_guard<std::mutex> g(background_tasks_mutex);
+	background_tasks.push_back(task);
+	if(!background_thread)
+	{
+		background_thread = new std::thread(background_thread_proc);
+	}
 }
 
 static cl_program CryptonightR_build_program(
-    const GpuContext* ctx,
-    xmrstak_algo algo,
-    uint64_t height,
-    uint32_t precompile_count,
-    cl_kernel old_kernel,
-    std::string source_code,
-    std::string options)
+	const GpuContext* ctx,
+	xmrstak_algo algo,
+	uint64_t height_offset,
+	uint64_t height_chunk_size,
+	uint32_t precompile_count,
+	std::string source_code,
+	std::string options)
 {
-    if(old_kernel)
-        clReleaseKernel(old_kernel);
-
-
-    std::vector<cl_program> old_programs;
-    old_programs.reserve(32);
-    {
+	std::vector<cl_program> old_programs;
+	old_programs.reserve(32);
+	{
 		CryptonightR_cache_mutex.WriteLock();
 
-        // Remove old programs from cache
-        for(size_t i = 0; i < CryptonightR_cache.size();)
-        {
-            const CacheEntry& entry = CryptonightR_cache[i];
-            if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
-                old_programs.push_back(entry.program);
-                CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
-                CryptonightR_cache.pop_back();
-            }
-            else
-            {
-                ++i;
-            }
-        }
+		// Remove old programs from cache
+		for(size_t i = 0; i < CryptonightR_cache.size();)
+		{
+			const CacheEntry& entry = CryptonightR_cache[i];
+			if((entry.algo == algo) && (entry.height_offset + (2 + precompile_count) * height_chunk_size < height_offset))
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu released (old program)", entry.height_offset);
+				old_programs.push_back(entry.program);
+				CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
+				CryptonightR_cache.pop_back();
+			}
+			else
+			{
+				++i;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
-
-    for(cl_program p : old_programs) {
-        clReleaseProgram(p);
-    }
+	}
 
-    std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
+	for(cl_program p : old_programs)
+	{
+		clReleaseProgram(p);
+	}
 
-    cl_program program = nullptr;
-    {
-		CryptonightR_cache_mutex.ReadLock();
+	std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
 
-        // Check if the cache already has this program (some other thread might have added it first)
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx))
-            {
-                program = entry.program;
-                break;
-            }
-        }
-		CryptonightR_cache_mutex.UnLock();
-    }
+	cl_program program = search_program(ctx, algo, height_offset);
 
-    if (program) {
-        return program;
-    }
+	if(program)
+	{
+		return program;
+	}
 
 	cl_int ret;
 	const char* source = source_code.c_str();
@@ -199,7 +214,7 @@ static cl_program CryptonightR_build_program(
 	program = clCreateProgramWithSource(ctx->opencl_ctx, 1, (const char**)&source, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L0,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
+		printer::inst()->print_msg(L0, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
 		return program;
 	}
 
@@ -207,11 +222,11 @@ static cl_program CryptonightR_build_program(
 	if(ret != CL_SUCCESS)
 	{
 		size_t len;
-		printer::inst()->print_msg(L0,"Error %s when calling clBuildProgram.", err_to_str(ret));
+		printer::inst()->print_msg(L0, "Error %s when calling clBuildProgram.", err_to_str(ret));
 
 		if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+			printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
 			return program;
 		}
 
@@ -221,12 +236,12 @@ static cl_program CryptonightR_build_program(
 		if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
 		{
 			free(BuildLog);
-			printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+			printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
 			return program;
 		}
 
 		printer::inst()->print_str("Build log:\n");
-		std::cerr<<BuildLog<<std::endl;
+		std::cerr << BuildLog << std::endl;
 
 		free(BuildLog);
 		return program;
@@ -237,61 +252,89 @@ static cl_program CryptonightR_build_program(
 	{
 		if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+			printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
 			return program;
 		}
 		std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-	}
-	while(status == CL_BUILD_IN_PROGRESS);
+	} while(status == CL_BUILD_IN_PROGRESS);
 
+	CryptonightR_cache_mutex.WriteLock();
+	auto cached_program = search_program(ctx, algo, height_offset, false);
 
-    printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height);
+	if(cached_program)
+	{
+		printer::inst()->print_msg(LDEBUG, "CryptonightR: release already existing program %llu", height_offset);
+		clReleaseProgram(program);
+		program = cached_program;
+	}
+	else
+	{
+		CryptonightR_cache.emplace_back(algo, height_offset, ctx->deviceIdx, program);
+		printer::inst()->print_msg(LDEBUG, "CryptonightR: cache compiled program for height_offset %llu", height_offset);
+	}
 
-	CryptonightR_cache_mutex.WriteLock();
-	CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program);
 	CryptonightR_cache_mutex.UnLock();
-    return program;
+	return program;
 }
 
-cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background, cl_kernel old_kernel)
+cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background)
 {
-	printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height);
-
-    if (background) {
-        background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false, old_kernel); });
-        return nullptr;
-    }
-
-    const char* source_code_template =
-        #include "amd_gpu/opencl/wolf-aes.cl"
-        #include "amd_gpu/opencl/cryptonight_r.cl"
-    ;
-    const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
-    const char* offset = strstr(source_code_template, include_name);
-    if (!offset)
-    {
-        printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo);
-        return nullptr;
-    }
-
-    V4_Instruction code[256];
-    int code_size;
-    switch (algo.Id())
-    {
-    case cryptonight_r_wow:
-        code_size = v4_random_math_init<cryptonight_r_wow>(code, height);
-        break;
-    case cryptonight_r:
-        code_size = v4_random_math_init<cryptonight_r>(code, height);
-        break;
-    default:
-        printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo);
-        return nullptr;
-    }
-
-    std::string source_code(source_code_template, offset);
-    source_code.append(get_code(code, code_size));
-    source_code.append(offset + sizeof(include_name) - 1);
+	if(background)
+	{
+		background_exec([=]() { CryptonightR_get_program(ctx, algo, height_offset, height_chunk_size, precompile_count, false); });
+		return nullptr;
+	}
+
+	auto program = search_program(ctx, algo, height_offset);
+
+	if(program != nullptr)
+		return program;
+
+	printer::inst()->print_msg(LDEBUG, "CryptonightR: create code for block %llu to %llu", height_offset, height_offset + height_chunk_size);
+
+	const char* source_code_definitions =
+#include "amd_gpu/opencl/cryptonight_r_def.rtcl"
+#include "amd_gpu/opencl/wolf-aes.cl"
+		;
+
+	const char* source_code_template =
+#include "amd_gpu/opencl/cryptonight_r.rtcl"
+		;
+	const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
+	const char* offset = strstr(source_code_template, include_name);
+	if(!offset)
+	{
+		printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.rtcl");
+		return nullptr;
+	}
+
+	std::string source_code(source_code_definitions);
+
+	for(uint64_t c = 0; c < height_chunk_size; ++c)
+	{
+		V4_Instruction code[256];
+		int code_size;
+		switch(algo.Id())
+		{
+		case cryptonight_r_wow:
+			code_size = v4_random_math_init<cryptonight_r_wow>(code, height_offset + c);
+			break;
+		case cryptonight_r:
+			code_size = v4_random_math_init<cryptonight_r>(code, height_offset + c);
+			break;
+		default:
+			printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", static_cast<int>(algo.Id()));
+			return nullptr;
+		}
+
+		std::string kernel_code(source_code_template, offset);
+		kernel_code.append(get_code(code, code_size));
+		kernel_code.append(offset + sizeof(include_name) - 1);
+
+		std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height_offset + c);
+
+		source_code += std::regex_replace(kernel_code, std::regex("cn1_cryptonight_r"), kernel_name);
+	}
 
 	// scratchpad size for the selected mining algorithm
 	size_t hashMemSize = algo.Mem();
@@ -329,28 +372,12 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
 	if(algo == cryptonight_gpu)
 		options += " -cl-fp32-correctly-rounded-divide-sqrt";
 
+	program = search_program(ctx, algo, height_offset);
 
-    const char* source = source_code.c_str();
-
-    {
-		CryptonightR_cache_mutex.ReadLock();
-
-        // Check if the cache has this program
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height);
-				auto result = entry.program;
-				CryptonightR_cache_mutex.UnLock();
-                return result;
-            }
-        }
-		CryptonightR_cache_mutex.UnLock();
-
-    }
+	if(program != nullptr)
+		return program;
 
-    return CryptonightR_build_program(ctx, algo, height, precompile_count, old_kernel, source, options);
+	return CryptonightR_build_program(ctx, algo, height_offset, height_chunk_size, precompile_count, source_code, options);
 }
 
 } // namespace amd
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
index 5f97d1e5142fa2552146c22af257ffbe02f39ae8..f8772b1f52e47df6ab2041bedd682e99e51d784e 100644
--- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp
+++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
@@ -3,8 +3,8 @@
 #include "xmrstak/backend/cryptonight.hpp"
 
 #include <stdint.h>
-#include <vector>
 #include <string>
+#include <vector>
 
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
@@ -20,7 +20,7 @@ namespace amd
 {
 
 cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo,
-	uint64_t height, uint32_t precompile_count, bool background = false, cl_kernel old_kernel = nullptr);
+	uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background = false);
 
 } // namespace amd
 } // namespace xmrstak
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 9c9db2ee3d274fa1d170b043548e4f1b116b72de..77857612e96581e1a638e79af62aecec2710167f 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -13,58 +13,43 @@
   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
   */
 
+#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
-#include "xmrstak/picosha2/picosha2.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 #include "xmrstak/params.hpp"
+#include "xmrstak/picosha2/picosha2.hpp"
 #include "xmrstak/version.hpp"
-#include "xmrstak/net/msgstruct.hpp"
-#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp"
 
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <math.h>
+#include <regex>
 #include <stdio.h>
 #include <string.h>
-#include <math.h>
-#include <iostream>
 #include <vector>
-#include <algorithm>
-#include <regex>
-#include <cassert>
-#include <algorithm>
 
 #include <fstream>
+#include <iostream>
 #include <sstream>
-#include <vector>
 #include <string>
-#include <iostream>
 #include <thread>
+#include <vector>
 
 #if defined _MSC_VER
 #include <direct.h>
 #elif defined __GNUC__
-#include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #endif
 
-
 #ifdef _WIN32
 #include <windows.h>
-#include <Shlobj.h>
 
 static inline void create_directory(std::string dirname)
 {
-    _mkdir(dirname.data());
-}
-
-static inline std::string get_home()
-{
-	char path[MAX_PATH + 1];
-	// get folder "appdata\local"
-	if (SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE))
-	{
-		return path;
-	}
-	else
-		return ".";
+	_mkdir(dirname.data());
 }
 
 static inline void port_sleep(size_t sec)
@@ -72,24 +57,14 @@ static inline void port_sleep(size_t sec)
 	Sleep(sec * 1000);
 }
 #else
-#include <unistd.h>
 #include <pwd.h>
+#include <unistd.h>
 
 static inline void create_directory(std::string dirname)
 {
 	mkdir(dirname.data(), 0744);
 }
 
-static inline std::string get_home()
-{
-	const char *home = ".";
-
-	if ((home = getenv("HOME")) == nullptr)
-		home = getpwuid(getuid())->pw_dir;
-
-	return home;
-}
-
 static inline void port_sleep(size_t sec)
 {
 	sleep(sec);
@@ -123,7 +98,7 @@ char* LoadTextFile(const char* filename)
 	flen = ftell(kernel);
 	fseek(kernel, 0, SEEK_SET);
 
-	out = (char*)malloc(flen+1);
+	out = (char*)malloc(flen + 1);
 	size_t r = fread(out, flen, 1, kernel);
 	fclose(kernel);
 
@@ -144,7 +119,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaximumWorkSize, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -163,16 +138,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 */
 		MaximumWorkSize /= 8;
 	}
-	printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
+	printer::inst()->print_msg(L1, "Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
 
 	if(ctx->workSize > MaximumWorkSize)
 	{
 		ctx->workSize = MaximumWorkSize;
-		printer::inst()->print_msg(L1,"Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
+		printer::inst()->print_msg(L1, "Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
 	}
 
 	const std::string backendName = xmrstak::params::inst().openCLVendor;
-	if( (ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0)
+	if((ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0)
 	{
 		size_t reduced_intensity = (ctx->rawIntensity / ctx->workSize) * ctx->workSize;
 		ctx->rawIntensity = reduced_intensity;
@@ -180,29 +155,29 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 
 #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
-	const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 };
+	const cl_queue_properties CommandQueueProperties[] = {0, 0, 0};
 	ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret);
 #else
-	const cl_command_queue_properties CommandQueueProperties = { 0 };
+	const cl_command_queue_properties CommandQueueProperties = {0};
 	ctx->CommandQueues = clCreateCommandQueue(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret);
 #endif
 
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx);
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx);
 		return ERR_OCL_API;
 	}
 
 	ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -216,14 +191,14 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -231,7 +206,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[2] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -239,7 +214,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[3] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -247,7 +222,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[4] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -255,7 +230,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->ExtraBuffers[5] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -263,21 +238,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	ctx->OutputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * 0x100, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	std::vector<char> devNameVec(1024);
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret),ctx->deviceIdx );
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret), ctx->deviceIdx);
 		return ERR_OCL_API;
 	}
 
 	std::vector<char> openCLDriverVer(1024);
 	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx );
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret), ctx->deviceIdx);
 		return ERR_OCL_API;
 	}
 
@@ -339,7 +314,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		isWindowsOs = 1;
 #endif
 		options += " -DIS_WINDOWS_OS=" + std::to_string(isWindowsOs);
-		
+
 		if(miner_algo == cryptonight_gpu)
 			options += " -cl-fp32-correctly-rounded-divide-sqrt";
 
@@ -358,16 +333,18 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		std::string hash_hex_str;
 		picosha2::hash256_hex_string(src_str, hash_hex_str);
 
-		std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin";
+		const std::string cache_dir = xmrstak::params::inst().rootAMDCacheDir;
+
+		std::string cache_file = cache_dir + hash_hex_str + ".openclbin";
 		std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary);
 		if(xmrstak::params::inst().AMDCache == false || !clBinFile.good())
 		{
 			if(xmrstak::params::inst().AMDCache)
-				printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
+				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code %s not found. Compiling ...", ctx->deviceIdx, cache_file.c_str());
 			ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
+				printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
 				return ERR_OCL_API;
 			}
 
@@ -375,11 +352,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			if(ret != CL_SUCCESS)
 			{
 				size_t len;
-				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+				printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram.", err_to_str(ret));
 
 				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
 				{
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 
@@ -389,28 +366,27 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
 				{
 					free(BuildLog);
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 
 				printer::inst()->print_str("Build log:\n");
-				std::cerr<<BuildLog<<std::endl;
+				std::cerr << BuildLog << std::endl;
 
 				free(BuildLog);
 				return ERR_OCL_API;
 			}
 
 			cl_uint num_devices;
-			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
-
+			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
 
 			std::vector<cl_device_id> devices_ids(num_devices);
-			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
+			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id) * devices_ids.size(), devices_ids.data(), NULL);
 			int dev_id = 0;
 			/* Search for the gpu within the program context.
 			 * The id can be different to  ctx->DeviceID.
 			 */
-			for(auto & ocl_device : devices_ids)
+			for(auto& ocl_device : devices_ids)
 			{
 				if(ocl_device == ctx->DeviceID)
 					break;
@@ -422,17 +398,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			{
 				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
 				{
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 				port_sleep(1);
-			}
-			while(status == CL_BUILD_IN_PROGRESS);
+			} while(status == CL_BUILD_IN_PROGRESS);
 
 			if(xmrstak::params::inst().AMDCache)
 			{
 				std::vector<size_t> binary_sizes(num_devices);
-				clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
+				clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
 
 				std::vector<char*> all_programs(num_devices);
 				std::vector<std::vector<char>> program_storage;
@@ -440,7 +415,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				int p_id = 0;
 				size_t mem_size = 0;
 				// create memory  structure to query all OpenCL program binaries
-				for(auto & p : all_programs)
+				for(auto& p : all_programs)
 				{
 					program_storage.emplace_back(std::vector<char>(binary_sizes[p_id]));
 					all_programs[p_id] = program_storage[p_id].data();
@@ -448,9 +423,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 					p_id++;
 				}
 
-				if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+				if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(), NULL)) != CL_SUCCESS)
 				{
-					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
+					printer::inst()->print_msg(L1, "Error %s when calling clGetProgramInfo.", err_to_str(ret));
 					return ERR_OCL_API;
 				}
 
@@ -458,12 +433,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary);
 				file_stream.write(all_programs[dev_id], binary_sizes[dev_id]);
 				file_stream.close();
-				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
+				printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s", ctx->deviceIdx, cache_file.c_str());
 			}
 		}
 		else
 		{
-			printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str());
+			printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s", ctx->deviceIdx, cache_file.c_str());
 			std::ostringstream ss;
 			ss << clBinFile.rdbuf();
 			std::string s = ss.str();
@@ -474,22 +449,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			cl_int clStatus;
 			ctx->Program[miner_algo] = clCreateProgramWithBinary(
 				opencl_ctx, 1, &ctx->DeviceID, &bin_size,
-				(const unsigned char **)&data_ptr, &clStatus, &ret
-			);
+				(const unsigned char**)&data_ptr, &clStatus, &ret);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+				printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
 				return ERR_OCL_API;
 			}
 			ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+				printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
 				return ERR_OCL_API;
 			}
 		}
 
-		std::vector<std::string> KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" };
+		std::vector<std::string> KernelNames = {"cn2", "Blake", "Groestl", "JH", "Skein"};
 		if(miner_algo == cryptonight_gpu)
 		{
 			KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu");
@@ -515,7 +489,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret);
 			if(ret != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
+				printer::inst()->print_msg(L1, "Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
 				return ERR_OCL_API;
 			}
 		}
@@ -529,30 +503,28 @@ const cl_platform_info attributeTypes[5] = {
 	CL_PLATFORM_VENDOR,
 	CL_PLATFORM_VERSION,
 	CL_PLATFORM_PROFILE,
-	CL_PLATFORM_EXTENSIONS
-};
+	CL_PLATFORM_EXTENSIONS};
 
 const char* const attributeNames[] = {
 	"CL_PLATFORM_NAME",
 	"CL_PLATFORM_VENDOR",
 	"CL_PLATFORM_VERSION",
 	"CL_PLATFORM_PROFILE",
-	"CL_PLATFORM_EXTENSIONS"
-};
+	"CL_PLATFORM_EXTENSIONS"};
 
-#define NELEMS(x)  (sizeof(x) / sizeof((x)[0]))
+#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
 
 uint32_t getNumPlatforms()
 {
 	cl_uint num_platforms = 0;
-	cl_platform_id * platforms = NULL;
+	cl_platform_id* platforms = NULL;
 	cl_int clStatus;
 
 	// Get platform and device information
 	clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
 	if(clStatus != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus));
 		return 0u;
 	}
 
@@ -575,29 +547,29 @@ std::vector<GpuContext> getAMDDevices(int index)
 	platforms.resize(numPlatforms);
 	if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
 		return ctxVec;
 	}
 
-	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
+	if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus));
 		return ctxVec;
 	}
 
 	device_list.resize(num_devices);
-	if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
+	if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus));
 		return ctxVec;
 	}
 
-	for (size_t k = 0; k < num_devices; k++)
+	for(size_t k = 0; k < num_devices; k++)
 	{
 		std::vector<char> devVendorVec(1024);
 		if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
+			printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
 			continue;
 		}
 
@@ -617,19 +589,19 @@ std::vector<GpuContext> getAMDDevices(int index)
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
@@ -639,14 +611,14 @@ std::vector<GpuContext> getAMDDevices(int index)
 
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
 			std::vector<char> openCLDriverVer(1024);
 			if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
+				printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k);
 				continue;
 			}
 
@@ -657,7 +629,7 @@ std::vector<GpuContext> getAMDDevices(int index)
 			ctx.name = std::string(devNameVec.data());
 			ctx.DeviceID = device_list[k];
 			ctx.interleave = 40;
-			printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
+			printer::inst()->print_msg(L0, "Found OpenCL GPU %s.", ctx.name.c_str());
 			ctxVec.push_back(ctx);
 		}
 	}
@@ -672,13 +644,13 @@ int getAMDPlatformIdx()
 
 	if(numPlatforms == 0)
 	{
-		printer::inst()->print_msg(L0,"WARNING: No OpenCL platform found.");
+		printer::inst()->print_msg(L0, "WARNING: No OpenCL platform found.");
 		return -1;
 	}
-	cl_platform_id * platforms = NULL;
+	cl_platform_id* platforms = NULL;
 	cl_int clStatus;
 
-	platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms);
+	platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms);
 	clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
 
 	int platformIndex = -1;
@@ -687,7 +659,8 @@ int getAMDPlatformIdx()
 
 	if(clStatus == CL_SUCCESS)
 	{
-		for (int i = 0; i < numPlatforms; i++) {
+		for(int i = 0; i < numPlatforms; i++)
+		{
 			size_t infoSize;
 			clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &infoSize);
 			std::vector<char> platformNameVec(infoSize);
@@ -696,13 +669,13 @@ int getAMDPlatformIdx()
 			std::string platformName(platformNameVec.data());
 
 			bool isAMDOpenCL = platformName.find("Advanced Micro Devices") != std::string::npos ||
-				platformName.find("Apple") != std::string::npos ||
-				platformName.find("Mesa") != std::string::npos;
+							   platformName.find("Apple") != std::string::npos ||
+							   platformName.find("Mesa") != std::string::npos;
 			bool isNVIDIADevice = platformName.find("NVIDIA Corporation") != std::string::npos || platformName.find("NVIDIA") != std::string::npos;
 			std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor;
 			if((isAMDOpenCL && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA"))
 			{
-				printer::inst()->print_msg(L0,"Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i , platformName.c_str());
+				printer::inst()->print_msg(L0, "Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i, platformName.c_str());
 				if(platformName.find("Mesa") != std::string::npos)
 					mesaPlatform = i;
 				else
@@ -716,12 +689,12 @@ int getAMDPlatformIdx()
 		// fall back to Mesa OpenCL
 		if(platformIndex == -1 && mesaPlatform != -1)
 		{
-			printer::inst()->print_msg(L0,"No AMD platform found select Mesa as OpenCL platform");
+			printer::inst()->print_msg(L0, "No AMD platform found select Mesa as OpenCL platform");
 			platformIndex = mesaPlatform;
 		}
 	}
 	else
-		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
+		printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
 
 	free(platforms);
 	return platformIndex;
@@ -737,15 +710,14 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	if((ret = clGetPlatformIDs(0, NULL, &entries)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-
 	// The number of platforms naturally is the index of the last platform plus one.
 	if(entries <= platform_idx)
 	{
-		printer::inst()->print_msg(L1,"Selected OpenCL platform index %d doesn't exist.", platform_idx);
+		printer::inst()->print_msg(L1, "Selected OpenCL platform index %d doesn't exist.", platform_idx);
 		return ERR_STUPID_PARAMS;
 	}
 
@@ -757,7 +729,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 #endif
 	if((ret = clGetPlatformIDs(entries, PlatformIDList, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -768,12 +740,12 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	std::string platformName(platformNameVec.data());
 	if(xmrstak::params::inst().openCLVendor == "AMD" && platformName.find("Advanced Micro Devices") == std::string::npos)
 	{
-		printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str());
+		printer::inst()->print_msg(L1, "WARNING: using non AMD device: %s", platformName.c_str());
 	}
 
 	if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &entries)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -782,7 +754,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	{
 		if(ctx[i].deviceIdx >= entries)
 		{
-			printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx);
+			printer::inst()->print_msg(L1, "Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx);
 			return ERR_STUPID_PARAMS;
 		}
 	}
@@ -794,7 +766,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 #endif
 	if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, entries, DeviceIDList, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -811,41 +783,41 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	cl_context opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList.data(), NULL, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clCreateContext.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	const char *fastIntMathV2CL =
-			#include "./opencl/fast_int_math_v2.cl"
-	;
-    const char *fastDivHeavyCL =
-        #include "./opencl/fast_div_heavy.cl"
-    ;
-	const char *cryptonightCL =
-			#include "./opencl/cryptonight.cl"
-	;
-	const char *blake256CL =
-			#include "./opencl/blake256.cl"
-	;
-	const char *groestl256CL =
-			#include "./opencl/groestl256.cl"
-	;
-	const char *jhCL =
-			#include "./opencl/jh.cl"
-	;
-	const char *wolfAesCL =
-			#include "./opencl/wolf-aes.cl"
-	;
-	const char *wolfSkeinCL =
-			#include "./opencl/wolf-skein.cl"
-	;
-	const char *cryptonight_gpu =
-			#include "./opencl/cryptonight_gpu.cl"
-	;
+	const char* fastIntMathV2CL =
+#include "./opencl/fast_int_math_v2.cl"
+		;
+	const char* fastDivHeavyCL =
+#include "./opencl/fast_div_heavy.cl"
+		;
+	const char* cryptonightCL =
+#include "./opencl/cryptonight.cl"
+		;
+	const char* blake256CL =
+#include "./opencl/blake256.cl"
+		;
+	const char* groestl256CL =
+#include "./opencl/groestl256.cl"
+		;
+	const char* jhCL =
+#include "./opencl/jh.cl"
+		;
+	const char* wolfAesCL =
+#include "./opencl/wolf-aes.cl"
+		;
+	const char* wolfSkeinCL =
+#include "./opencl/wolf-skein.cl"
+		;
+	const char* cryptonight_gpu =
+#include "./opencl/cryptonight_gpu.cl"
+		;
 
 	std::string source_code(cryptonightCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
-    source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
+	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
@@ -854,13 +826,14 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_CN_GPU"), cryptonight_gpu);
 
 	// create a directory  for the OpenCL compile cache
-	create_directory(get_home() + "/.openclcache");
+	const std::string cache_dir = xmrstak::params::inst().rootAMDCacheDir;
+	create_directory(cache_dir);
 
 	std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);
 
 	for(int i = 0; i < num_gpus; ++i)
 	{
-		printer::inst()->print_msg(LDEBUG,"OpenCL Init device %d", ctx[i].deviceIdx);
+		printer::inst()->print_msg(LDEBUG, "OpenCL Init device %d", ctx[i].deviceIdx);
 		const size_t devIdx = ctx[i].deviceIdx;
 		if(interleaveData.size() <= devIdx)
 		{
@@ -870,12 +843,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 		{
 			interleaveData[devIdx].reset(new InterleaveData{});
 			interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms();
-
 		}
-		ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU;
+		ctx[i].idWorkerOnDevice = interleaveData[devIdx]->numThreadsOnGPU;
 		++interleaveData[devIdx]->numThreadsOnGPU;
 		ctx[i].interleaveData = interleaveData[devIdx];
-		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
+		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave) / 100.0;
 		ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
 		ctx[i].opencl_ctx = opencl_ctx;
 
@@ -891,7 +863,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo, uint64_t height)
 {
 
-	auto & Kernels = ctx->Kernels[miner_algo.Id()];
+	auto& Kernels = ctx->Kernels[miner_algo.Id()];
 
 	cl_int ret;
 
@@ -905,35 +877,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Scratchpads
 	if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
 	if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
 	if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
-		return(ERR_OCL_API);
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
+		return (ERR_OCL_API);
 	}
 
 	if(miner_algo == cryptonight_gpu)
@@ -942,79 +914,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		// Scratchpads
 		if((ret = clSetKernelArg(Kernels[7], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// States
 		if((ret = clSetKernelArg(Kernels[7], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 	}
 
-    // CN1 Kernel
+	// CN1 Kernel
 
-    if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) {
+	if((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow))
+	{
 
-		uint32_t PRECOMPILATION_DEPTH = 4;
+		uint32_t PRECOMPILATION_DEPTH = 1;
+		constexpr uint64_t height_chunk_size = 25;
+		uint64_t height_offset = (height / height_chunk_size) * height_chunk_size;
 
-        // Get new kernel
-        cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH);
+		// Get new kernel
+		cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset, height_chunk_size, PRECOMPILATION_DEPTH);
 
-        if (program != ctx->ProgramCryptonightR) {
-            cl_int ret;
-            cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret);
+		if(program != ctx->ProgramCryptonightR || ctx->last_block_height != height)
+		{
+			cl_int ret;
+			std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height);
+			cl_kernel kernel = clCreateKernel(program, kernel_name.c_str(), &ret);
 
-            cl_kernel old_kernel = nullptr;
-            if (ret != CL_SUCCESS) {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret));
-            }
-            else {
-                old_kernel = Kernels[1];
-                Kernels[1] = kernel;
-            }
-            ctx->ProgramCryptonightR = program;
+			if(ret != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret));
+			}
+			else
+			{
+				cl_kernel old_kernel = Kernels[1];
+				if(old_kernel)
+					clReleaseKernel(old_kernel);
+				Kernels[1] = kernel;
+			}
+			ctx->ProgramCryptonightR = program;
+			ctx->last_block_height = height;
+			printer::inst()->print_msg(LDEBUG, "Set height %llu", height);
 
-            // Precompile next program in background
-            xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, PRECOMPILATION_DEPTH, true, old_kernel);
-            for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i)
-                xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true, nullptr);
+			// Precompile next program in background
+			for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
+				xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset + i * height_chunk_size, height_chunk_size, PRECOMPILATION_DEPTH, true);
 
-            printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
-        }
+			printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx);
+		}
 		else
 		{
 			printer::inst()->print_msg(LDEBUG, "Thread #%zu found CryptonightR", ctx->deviceIdx);
 		}
-    }
+	}
 
 	// Scratchpads
 	if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
 	if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
 	if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
-		return(ERR_OCL_API);
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
+		return (ERR_OCL_API);
 	}
 
 	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2)
 	{
 		// Input
-		if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret));
 			return ERR_OCL_API;
@@ -1025,14 +1006,14 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	// Scratchpads
 	if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
 	if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
@@ -1041,59 +1022,59 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		// Output
 		if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2);
 			return ERR_OCL_API;
 		}
 
 		// Target
 		if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3);
 			return ERR_OCL_API;
 		}
 
 		// Threads
 		if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
-			return(ERR_OCL_API);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
+			return (ERR_OCL_API);
 		}
 	}
 	else
-		{
+	{
 		// Branch 0
 		if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Branch 1
 		if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Branch 2
 		if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Branch 3
 		if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
 		// Threads
 		if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
-			return(ERR_OCL_API);
+			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
+			return (ERR_OCL_API);
 		}
 
 		for(int i = 0; i < 4; ++i)
@@ -1101,35 +1082,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 			// States
 			if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
 				return ERR_OCL_API;
 			}
 
 			// Nonce buffer
 			if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
 				return ERR_OCL_API;
 			}
 
 			// Output
 			if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
 				return ERR_OCL_API;
 			}
 
 			// Target
 			if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 				return ERR_OCL_API;
 			}
 
 			if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
-				return(ERR_OCL_API);
+				printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
+				return (ERR_OCL_API);
 			}
 		}
 	}
@@ -1153,7 +1134,7 @@ uint64_t updateTimings(GpuContext* ctx, const uint64_t t)
 		if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0)
 			ctx->interleaveData->avgKernelRuntime = runtime;
 		else
-			ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias;
+			ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime)*averagingBias;
 	}
 	return runtime;
 }
@@ -1182,7 +1163,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 
 		if((dt > 0) && (dt < optimalTimeOffset))
 		{
-			delay = static_cast<int64_t>((optimalTimeOffset  - dt));
+			delay = static_cast<int64_t>((optimalTimeOffset - dt));
 
 			if(enableAutoAdjustment)
 			{
@@ -1201,8 +1182,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
-			// avoid that the auto adjustment is disable interleaving
+			// avoid that the auto adjustment disables interleaving
 			ctx->interleaveData->adjustThreshold = std::max(
 				ctx->interleaveData->adjustThreshold,
-				0.001
-			);
+				0.001);
 		}
 		delay = std::max(int64_t(0), delay);
 
@@ -1213,13 +1193,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 		{
 			// do not notify the user anymore if we reach a good delay
 			if(delay > maxDelay)
-				printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
+				printer::inst()->print_msg(L1, "OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf",
-					ctx->deviceIdx,
-					ctx->idWorkerOnDevice,
+					static_cast<uint32_t>(ctx->deviceIdx),
+					static_cast<uint32_t>(ctx->idWorkerOnDevice),
 					static_cast<uint32_t>(delay),
 					avgRuntime,
-					ctx->interleaveData->adjustThreshold * 100.
-				);
+					ctx->interleaveData->adjustThreshold * 100.);
 
 			std::this_thread::sleep_for(std::chrono::milliseconds(delay));
 		}
@@ -1230,12 +1209,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo)
 {
-	const auto & Kernels = ctx->Kernels[miner_algo.Id()];
+	const auto& Kernels = ctx->Kernels[miner_algo.Id()];
 
 	cl_int ret;
 	cl_uint zero = 0;
 	size_t BranchNonces[4];
-	memset(BranchNonces,0,sizeof(size_t)*4);
+	memset(BranchNonces, 0, sizeof(size_t) * 4);
 
 	size_t g_intensity = ctx->rawIntensity;
 	size_t w_size = ctx->workSize;
@@ -1246,28 +1225,28 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 		// round up to next multiple of w_size
 		g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
 		// number of global threads must be a multiple of the work group size (w_size)
-		assert(g_thd%w_size == 0);
+		assert(g_thd % w_size == 0);
 	}
 
 	for(int i = 2; i < 6; ++i)
 	{
 		if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2);
 			return ERR_OCL_API;
 		}
 	}
 
 	if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
+	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = {g_thd, 8}, lthreads[2] = {8, 8};
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
 		return ERR_OCL_API;
 	}
 
@@ -1279,7 +1258,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 		size_t intens = g_intensity * thd;
 		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[7], 1, 0, &intens, &thd, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7);
 			return ERR_OCL_API;
 		}
 
@@ -1288,7 +1267,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 
 		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 			return ERR_OCL_API;
 		}
 	}
@@ -1296,14 +1275,14 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 	{
 		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
+			printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 			return ERR_OCL_API;
 		}
 	}
 
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
 		return ERR_OCL_API;
 	}
 
@@ -1314,7 +1293,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 			size_t tmpNonce = ctx->Nonce;
 			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 			{
-				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
+				printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
 				return ERR_OCL_API;
 			}
 		}
@@ -1323,11 +1302,11 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner
 	// this call is blocking therefore the access to the results without cl_finish is fine
 	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
+		printer::inst()->print_msg(L1, "Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	auto & numHashValues = HashOutput[0xFF];
+	auto& numHashValues = HashOutput[0xFF];
 	// avoid out of memory read, we have only storage for 0xFF results
 	if(numHashValues > 0xFF)
 		numHashValues = 0xFF;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index ae2b506dbeeb673ac68b21c2db8adb20cfde7b61..1ba300c7a55dc636509e17f385051c9934ac1a24 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/console.hpp"
 
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
@@ -9,13 +9,13 @@
 #include <CL/cl.h>
 #endif
 
+#include <array>
+#include <map>
+#include <memory>
+#include <mutex>
 #include <stdint.h>
 #include <string>
 #include <vector>
-#include <mutex>
-#include <memory>
-#include <map>
-#include <array>
 
 #define ERR_SUCCESS (0)
 #define ERR_OCL_API (2)
@@ -23,13 +23,13 @@
 
 struct InterleaveData
 {
-    std::mutex mutex;
+	std::mutex mutex;
 
-    double adjustThreshold = 0.4;
-    double startAdjustThreshold = 0.4;
-    double avgKernelRuntime = 0.0;
-    uint64_t lastRunTimeStamp = 0;
-    uint32_t numThreadsOnGPU = 0;
+	double adjustThreshold = 0.4;
+	double startAdjustThreshold = 0.4;
+	double avgKernelRuntime = 0.0;
+	uint64_t lastRunTimeStamp = 0;
+	uint32_t numThreadsOnGPU = 0;
 };
 
 struct GpuContext
@@ -54,8 +54,9 @@ struct GpuContext
 	cl_mem ExtraBuffers[6];
 	cl_context opencl_ctx = nullptr;
 	std::map<xmrstak_algo_id, cl_program> Program;
-	std::map<xmrstak_algo_id, std::array<cl_kernel,8>> Kernels;
+	std::map<xmrstak_algo_id, std::array<cl_kernel, 8>> Kernels;
 	cl_program ProgramCryptonightR = nullptr;
+	uint64_t last_block_height = 0u;
 	size_t freeMem;
 	size_t maxMemPerAlloc;
 	int computeUnits;
@@ -66,148 +67,147 @@ struct GpuContext
 	uint64_t lastDelay = 0;
 
 	uint32_t Nonce;
-
 };
 
 namespace
 {
-	const char* err_to_str(cl_int ret)
+const char* err_to_str(cl_int ret)
+{
+	switch(ret)
 	{
-		switch(ret)
-		{
-		case CL_SUCCESS:
-			return "CL_SUCCESS";
-		case CL_DEVICE_NOT_FOUND:
-			return "CL_DEVICE_NOT_FOUND";
-		case CL_DEVICE_NOT_AVAILABLE:
-			return "CL_DEVICE_NOT_AVAILABLE";
-		case CL_COMPILER_NOT_AVAILABLE:
-			return "CL_COMPILER_NOT_AVAILABLE";
-		case CL_MEM_OBJECT_ALLOCATION_FAILURE:
-			return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
-		case CL_OUT_OF_RESOURCES:
-			return "CL_OUT_OF_RESOURCES";
-		case CL_OUT_OF_HOST_MEMORY:
-			return "CL_OUT_OF_HOST_MEMORY";
-		case CL_PROFILING_INFO_NOT_AVAILABLE:
-			return "CL_PROFILING_INFO_NOT_AVAILABLE";
-		case CL_MEM_COPY_OVERLAP:
-			return "CL_MEM_COPY_OVERLAP";
-		case CL_IMAGE_FORMAT_MISMATCH:
-			return "CL_IMAGE_FORMAT_MISMATCH";
-		case CL_IMAGE_FORMAT_NOT_SUPPORTED:
-			return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
-		case CL_BUILD_PROGRAM_FAILURE:
-			return "CL_BUILD_PROGRAM_FAILURE";
-		case CL_MAP_FAILURE:
-			return "CL_MAP_FAILURE";
-		case CL_MISALIGNED_SUB_BUFFER_OFFSET:
-			return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
-		case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
-			return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
-	#ifdef CL_VERSION_1_2
-		case CL_COMPILE_PROGRAM_FAILURE:
-			return "CL_COMPILE_PROGRAM_FAILURE";
-		case CL_LINKER_NOT_AVAILABLE:
-			return "CL_LINKER_NOT_AVAILABLE";
-		case CL_LINK_PROGRAM_FAILURE:
-			return "CL_LINK_PROGRAM_FAILURE";
-		case CL_DEVICE_PARTITION_FAILED:
-			return "CL_DEVICE_PARTITION_FAILED";
-		case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
-			return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
-	#endif
-		case CL_INVALID_VALUE:
-			return "CL_INVALID_VALUE";
-		case CL_INVALID_DEVICE_TYPE:
-			return "CL_INVALID_DEVICE_TYPE";
-		case CL_INVALID_PLATFORM:
-			return "CL_INVALID_PLATFORM";
-		case CL_INVALID_DEVICE:
-			return "CL_INVALID_DEVICE";
-		case CL_INVALID_CONTEXT:
-			return "CL_INVALID_CONTEXT";
-		case CL_INVALID_QUEUE_PROPERTIES:
-			return "CL_INVALID_QUEUE_PROPERTIES";
-		case CL_INVALID_COMMAND_QUEUE:
-			return "CL_INVALID_COMMAND_QUEUE";
-		case CL_INVALID_HOST_PTR:
-			return "CL_INVALID_HOST_PTR";
-		case CL_INVALID_MEM_OBJECT:
-			return "CL_INVALID_MEM_OBJECT";
-		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-			return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
-		case CL_INVALID_IMAGE_SIZE:
-			return "CL_INVALID_IMAGE_SIZE";
-		case CL_INVALID_SAMPLER:
-			return "CL_INVALID_SAMPLER";
-		case CL_INVALID_BINARY:
-			return "CL_INVALID_BINARY";
-		case CL_INVALID_BUILD_OPTIONS:
-			return "CL_INVALID_BUILD_OPTIONS";
-		case CL_INVALID_PROGRAM:
-			return "CL_INVALID_PROGRAM";
-		case CL_INVALID_PROGRAM_EXECUTABLE:
-			return "CL_INVALID_PROGRAM_EXECUTABLE";
-		case CL_INVALID_KERNEL_NAME:
-			return "CL_INVALID_KERNEL_NAME";
-		case CL_INVALID_KERNEL_DEFINITION:
-			return "CL_INVALID_KERNEL_DEFINITION";
-		case CL_INVALID_KERNEL:
-			return "CL_INVALID_KERNEL";
-		case CL_INVALID_ARG_INDEX:
-			return "CL_INVALID_ARG_INDEX";
-		case CL_INVALID_ARG_VALUE:
-			return "CL_INVALID_ARG_VALUE";
-		case CL_INVALID_ARG_SIZE:
-			return "CL_INVALID_ARG_SIZE";
-		case CL_INVALID_KERNEL_ARGS:
-			return "CL_INVALID_KERNEL_ARGS";
-		case CL_INVALID_WORK_DIMENSION:
-			return "CL_INVALID_WORK_DIMENSION";
-		case CL_INVALID_WORK_GROUP_SIZE:
-			return "CL_INVALID_WORK_GROUP_SIZE";
-		case CL_INVALID_WORK_ITEM_SIZE:
-			return "CL_INVALID_WORK_ITEM_SIZE";
-		case CL_INVALID_GLOBAL_OFFSET:
-			return "CL_INVALID_GLOBAL_OFFSET";
-		case CL_INVALID_EVENT_WAIT_LIST:
-			return "CL_INVALID_EVENT_WAIT_LIST";
-		case CL_INVALID_EVENT:
-			return "CL_INVALID_EVENT";
-		case CL_INVALID_OPERATION:
-			return "CL_INVALID_OPERATION";
-		case CL_INVALID_GL_OBJECT:
-			return "CL_INVALID_GL_OBJECT";
-		case CL_INVALID_BUFFER_SIZE:
-			return "CL_INVALID_BUFFER_SIZE";
-		case CL_INVALID_MIP_LEVEL:
-			return "CL_INVALID_MIP_LEVEL";
-		case CL_INVALID_GLOBAL_WORK_SIZE:
-			return "CL_INVALID_GLOBAL_WORK_SIZE";
-		case CL_INVALID_PROPERTY:
-			return "CL_INVALID_PROPERTY";
-	#ifdef CL_VERSION_1_2
-		case CL_INVALID_IMAGE_DESCRIPTOR:
-			return "CL_INVALID_IMAGE_DESCRIPTOR";
-		case CL_INVALID_COMPILER_OPTIONS:
-			return "CL_INVALID_COMPILER_OPTIONS";
-		case CL_INVALID_LINKER_OPTIONS:
-			return "CL_INVALID_LINKER_OPTIONS";
-		case CL_INVALID_DEVICE_PARTITION_COUNT:
-			return "CL_INVALID_DEVICE_PARTITION_COUNT";
-	#endif
-	#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
-		case CL_INVALID_PIPE_SIZE:
-			return "CL_INVALID_PIPE_SIZE";
-		case CL_INVALID_DEVICE_QUEUE:
-			return "CL_INVALID_DEVICE_QUEUE";
-	#endif
-		default:
-			return "UNKNOWN_ERROR";
-		}
+	case CL_SUCCESS:
+		return "CL_SUCCESS";
+	case CL_DEVICE_NOT_FOUND:
+		return "CL_DEVICE_NOT_FOUND";
+	case CL_DEVICE_NOT_AVAILABLE:
+		return "CL_DEVICE_NOT_AVAILABLE";
+	case CL_COMPILER_NOT_AVAILABLE:
+		return "CL_COMPILER_NOT_AVAILABLE";
+	case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+		return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+	case CL_OUT_OF_RESOURCES:
+		return "CL_OUT_OF_RESOURCES";
+	case CL_OUT_OF_HOST_MEMORY:
+		return "CL_OUT_OF_HOST_MEMORY";
+	case CL_PROFILING_INFO_NOT_AVAILABLE:
+		return "CL_PROFILING_INFO_NOT_AVAILABLE";
+	case CL_MEM_COPY_OVERLAP:
+		return "CL_MEM_COPY_OVERLAP";
+	case CL_IMAGE_FORMAT_MISMATCH:
+		return "CL_IMAGE_FORMAT_MISMATCH";
+	case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+		return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+	case CL_BUILD_PROGRAM_FAILURE:
+		return "CL_BUILD_PROGRAM_FAILURE";
+	case CL_MAP_FAILURE:
+		return "CL_MAP_FAILURE";
+	case CL_MISALIGNED_SUB_BUFFER_OFFSET:
+		return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+	case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
+		return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+#ifdef CL_VERSION_1_2
+	case CL_COMPILE_PROGRAM_FAILURE:
+		return "CL_COMPILE_PROGRAM_FAILURE";
+	case CL_LINKER_NOT_AVAILABLE:
+		return "CL_LINKER_NOT_AVAILABLE";
+	case CL_LINK_PROGRAM_FAILURE:
+		return "CL_LINK_PROGRAM_FAILURE";
+	case CL_DEVICE_PARTITION_FAILED:
+		return "CL_DEVICE_PARTITION_FAILED";
+	case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
+		return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+#endif
+	case CL_INVALID_VALUE:
+		return "CL_INVALID_VALUE";
+	case CL_INVALID_DEVICE_TYPE:
+		return "CL_INVALID_DEVICE_TYPE";
+	case CL_INVALID_PLATFORM:
+		return "CL_INVALID_PLATFORM";
+	case CL_INVALID_DEVICE:
+		return "CL_INVALID_DEVICE";
+	case CL_INVALID_CONTEXT:
+		return "CL_INVALID_CONTEXT";
+	case CL_INVALID_QUEUE_PROPERTIES:
+		return "CL_INVALID_QUEUE_PROPERTIES";
+	case CL_INVALID_COMMAND_QUEUE:
+		return "CL_INVALID_COMMAND_QUEUE";
+	case CL_INVALID_HOST_PTR:
+		return "CL_INVALID_HOST_PTR";
+	case CL_INVALID_MEM_OBJECT:
+		return "CL_INVALID_MEM_OBJECT";
+	case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+		return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+	case CL_INVALID_IMAGE_SIZE:
+		return "CL_INVALID_IMAGE_SIZE";
+	case CL_INVALID_SAMPLER:
+		return "CL_INVALID_SAMPLER";
+	case CL_INVALID_BINARY:
+		return "CL_INVALID_BINARY";
+	case CL_INVALID_BUILD_OPTIONS:
+		return "CL_INVALID_BUILD_OPTIONS";
+	case CL_INVALID_PROGRAM:
+		return "CL_INVALID_PROGRAM";
+	case CL_INVALID_PROGRAM_EXECUTABLE:
+		return "CL_INVALID_PROGRAM_EXECUTABLE";
+	case CL_INVALID_KERNEL_NAME:
+		return "CL_INVALID_KERNEL_NAME";
+	case CL_INVALID_KERNEL_DEFINITION:
+		return "CL_INVALID_KERNEL_DEFINITION";
+	case CL_INVALID_KERNEL:
+		return "CL_INVALID_KERNEL";
+	case CL_INVALID_ARG_INDEX:
+		return "CL_INVALID_ARG_INDEX";
+	case CL_INVALID_ARG_VALUE:
+		return "CL_INVALID_ARG_VALUE";
+	case CL_INVALID_ARG_SIZE:
+		return "CL_INVALID_ARG_SIZE";
+	case CL_INVALID_KERNEL_ARGS:
+		return "CL_INVALID_KERNEL_ARGS";
+	case CL_INVALID_WORK_DIMENSION:
+		return "CL_INVALID_WORK_DIMENSION";
+	case CL_INVALID_WORK_GROUP_SIZE:
+		return "CL_INVALID_WORK_GROUP_SIZE";
+	case CL_INVALID_WORK_ITEM_SIZE:
+		return "CL_INVALID_WORK_ITEM_SIZE";
+	case CL_INVALID_GLOBAL_OFFSET:
+		return "CL_INVALID_GLOBAL_OFFSET";
+	case CL_INVALID_EVENT_WAIT_LIST:
+		return "CL_INVALID_EVENT_WAIT_LIST";
+	case CL_INVALID_EVENT:
+		return "CL_INVALID_EVENT";
+	case CL_INVALID_OPERATION:
+		return "CL_INVALID_OPERATION";
+	case CL_INVALID_GL_OBJECT:
+		return "CL_INVALID_GL_OBJECT";
+	case CL_INVALID_BUFFER_SIZE:
+		return "CL_INVALID_BUFFER_SIZE";
+	case CL_INVALID_MIP_LEVEL:
+		return "CL_INVALID_MIP_LEVEL";
+	case CL_INVALID_GLOBAL_WORK_SIZE:
+		return "CL_INVALID_GLOBAL_WORK_SIZE";
+	case CL_INVALID_PROPERTY:
+		return "CL_INVALID_PROPERTY";
+#ifdef CL_VERSION_1_2
+	case CL_INVALID_IMAGE_DESCRIPTOR:
+		return "CL_INVALID_IMAGE_DESCRIPTOR";
+	case CL_INVALID_COMPILER_OPTIONS:
+		return "CL_INVALID_COMPILER_OPTIONS";
+	case CL_INVALID_LINKER_OPTIONS:
+		return "CL_INVALID_LINKER_OPTIONS";
+	case CL_INVALID_DEVICE_PARTITION_COUNT:
+		return "CL_INVALID_DEVICE_PARTITION_COUNT";
+#endif
+#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
+	case CL_INVALID_PIPE_SIZE:
+		return "CL_INVALID_PIPE_SIZE";
+	case CL_INVALID_DEVICE_QUEUE:
+		return "CL_INVALID_DEVICE_QUEUE";
+#endif
+	default:
+		return "UNKNOWN_ERROR";
 	}
 }
+} // namespace
 
 uint32_t getNumPlatforms();
 int getAMDPlatformIdx();
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 12478aefba767629706408ef4ecdc9558bfa0ec3..471e46a53051ee0e6d643a8bb3775215e027a1c9 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -198,7 +198,7 @@ inline void keccakf1600_1(ulong st[25])
 	}
 }
 )==="
-R"===(
+	R"===(
 
 void keccakf1600_2(__local ulong *st)
 {
@@ -372,7 +372,7 @@ inline int4 _mm_alignr_epi8(int4 a, const uint rot)
 #endif
 
 )==="
-R"===(
+	R"===(
 
 void CNKeccak(ulong *output, ulong *input)
 {
@@ -416,7 +416,7 @@ void AESExpandKey256(uint *keybuf)
 }
 
 )==="
-R"===(
+	R"===(
 
 #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
 
@@ -577,7 +577,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 }
 
 )==="
-R"===(
+	R"===(
 
 // __NV_CL_C_VERSION checks if NVIDIA opencl is used
 #if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
@@ -867,7 +867,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 }
 
 )==="
-R"===(
+	R"===(
 
 __attribute__((reqd_work_group_size(8, 8, 1)))
 __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states,
@@ -1051,7 +1051,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 }
 
 )==="
-R"===(
+	R"===(
 
 #define VSWAP8(x)	(((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \
 		  | (((x) >>  8) & 0x00000000FF000000UL) | (((x) <<  8) & 0x000000FF00000000UL) \
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
index e87819760b7b62ab4fa97e62669803ad298a0407..bb37581f2d5442f177614480b88140d38238b129 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -84,7 +84,7 @@ inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4
 }
 
 )==="
-R"===(
+	R"===(
 
 static const __constant uint look[16][4] = {
 	{0, 1, 2, 3},
@@ -220,7 +220,7 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u
 }
 
 )==="
-R"===(
+	R"===(
 
 static const __constant uint skip[3] = {
 	20,22,22
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl
similarity index 88%
rename from xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl
rename to xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl
index 9edb774adcbc2a31bb25f3d6e581960d6b71a00f..cdb5aef3edc1f59e2f6604b3a9694e8d27bded96 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl
@@ -1,4 +1,5 @@
 R"===(
+
 /*
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -15,29 +16,15 @@ R"===(
  *
  */
 
-#define cryptonight_r_wow 15
-#define cryptonight_r 16
-
-#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
-
-#if(STRIDED_INDEX==0)
-#   define IDX(x)	(x)
-#elif(STRIDED_INDEX==1)
-#	define IDX(x)   (mul24(((uint)(x)), Threads))
-#elif(STRIDED_INDEX==2)
-#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
-#elif(STRIDED_INDEX==3)
-#	define IDX(x)   ((x) * WORKSIZE)
-#endif
-
+#ifndef SCRATCHPAD_CHUNK
 // __NV_CL_C_VERSION checks if NVIDIA opencl is used
-#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
-#	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
-#	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
-#else
-#	define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)])
+#	if((ALGO == cryptonight_r_wow || ALGO == cryptonight_r) && defined(__NV_CL_C_VERSION))
+#		define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
+#		define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
+#	else
+#		define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)])
+#	endif
 #endif
-
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads)
 {
@@ -162,7 +149,9 @@ __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *stat
 #endif
 #define ROT_BITS 32
 
-	XMRSTAK_INCLUDE_RANDOM_MATH
+XMRSTAK_INCLUDE_RANDOM_MATH
+
+#undef ROT_BITS
 
 #if (ALGO == cryptonight_r)
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl
new file mode 100644
index 0000000000000000000000000000000000000000..2c318fcbf4cb4a5ebfc8410b2413fed6437350f0
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl
@@ -0,0 +1,33 @@
+R"===(
+/*
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#define cryptonight_r_wow 15
+#define cryptonight_r 16
+
+#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT)
+
+#if(STRIDED_INDEX==0)
+#   define IDX(x)	(x)
+#elif(STRIDED_INDEX==1)
+#	define IDX(x)   (mul24(((uint)(x)), Threads))
+#elif(STRIDED_INDEX==2)
+#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
+#elif(STRIDED_INDEX==3)
+#	define IDX(x)   ((x) * WORKSIZE)
+#endif
+
+)==="
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
index 22603853f486df9b13f934a70be150b19dae8985..02ce53e03adbb23d4476954173c5afb72ebcee96 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl
@@ -125,7 +125,7 @@ static const __constant ulong T0_G[] =
 };
 
 )==="
-R"===(
+	R"===(
 
 static const __constant ulong T4_G[] =
 {
@@ -292,4 +292,3 @@ static const __constant ulong T4_G[] =
 		} while (0)
 
 )==="
-
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 120fb6898bda32eaefa116ab4bae74ba2bb3c2b2..dcabb301835af4c433c0f297578cbd8f725956ef 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -5,18 +5,18 @@
 #include "autoAdjust.hpp"
 #include "jconf.hpp"
 
-#include "xmrstak/misc/console.hpp"
-#include "xmrstak/misc/configEditor.hpp"
-#include "xmrstak/params.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
+#include "xmrstak/params.hpp"
 
-#include <vector>
+#include <algorithm>
 #include <cstdio>
+#include <iostream>
 #include <sstream>
 #include <string>
-#include <iostream>
-#include  <algorithm>
+#include <vector>
 
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
@@ -24,7 +24,6 @@
 #include <CL/cl.h>
 #endif
 
-
 namespace xmrstak
 {
 namespace amd
@@ -32,11 +31,9 @@ namespace amd
 
 class autoAdjust
 {
-public:
-
+  public:
 	autoAdjust()
 	{
-
 	}
 
 	/** print the adjusted values if needed
@@ -50,18 +47,17 @@ public:
 
 		if(platformIndex == -1)
 		{
-			printer::inst()->print_msg(L0,"WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver.");
+			printer::inst()->print_msg(L0, "WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver.");
 			return false;
 		}
 
 		devVec = getAMDDevices(platformIndex);
 
-
 		int deviceCount = devVec.size();
 
 		if(deviceCount == 0)
 		{
-			printer::inst()->print_msg(L0,"WARNING: No AMD device found.");
+			printer::inst()->print_msg(L0, "WARNING: No AMD device found.");
 			return false;
 		}
 
@@ -69,17 +65,16 @@ public:
 		return true;
 	}
 
-private:
-
+  private:
 	void generateThreadConfig(const int platformIndex)
 	{
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
+		const char* tpl =
+#include "./config.tpl"
+			;
 
 		configEditor configTpl{};
-		configTpl.set( std::string(tpl) );
+		configTpl.set(std::string(tpl));
 
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
@@ -107,8 +102,7 @@ private:
 				// UNKNOWN
 				ctx.name.compare("gfx900") == 0 ||
 				ctx.name.compare("gfx903") == 0 ||
-				ctx.name.compare("gfx905") == 0
-			)
+				ctx.name.compare("gfx905") == 0)
 			{
 				/* Increase the number of threads for AMD VEGA gpus.
 				 * Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089
@@ -119,11 +113,8 @@ private:
 
 			// NVIDIA optimizations
 			if(
-				ctx.isNVIDIA && (
-					ctx.name.find("P100") != std::string::npos ||
-				    ctx.name.find("V100") != std::string::npos
-				)
-			)
+				ctx.isNVIDIA && (ctx.name.find("P100") != std::string::npos ||
+									ctx.name.find("V100") != std::string::npos))
 			{
 				// do not limit the number of threads
 				maxThreads = 40000u;
@@ -190,7 +181,7 @@ private:
 			// 240byte extra memory is used per thread for meta data
 			size_t perThread = hashMemSize + 240u;
 			size_t maxIntensity = memPerThread / perThread;
-			size_t possibleIntensity = std::min( maxThreads , maxIntensity );
+			size_t possibleIntensity = std::min(maxThreads, maxIntensity);
 			// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
 			size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
 			// in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units
@@ -198,25 +189,25 @@ private:
 				intensity = (possibleIntensity / 8) * 8;
 
 			//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
-			if (intensity == 0)
+			if(intensity == 0)
 			{
 				printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT.");
 				intensity = possibleIntensity;
-
 			}
-			if (intensity != 0)
+			if(intensity != 0)
 			{
 				for(uint32_t thd = 0; thd < numThreads; ++thd)
 				{
 					conf += "  // gpu: " + ctx.name + std::string("  compute units: ") + std::to_string(ctx.computeUnits) + "\n";
 					conf += "  // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
-						std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" +  std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
+							std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
 					// set 8 threads per block (this is a good value for the most gpus)
 					conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-						"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-						"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
-						"    \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
-						"  },\n";
+							"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+							"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+																													   "    \"unroll\" : " +
+							std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
+							"  },\n";
 				}
 			}
 			else
@@ -225,8 +216,8 @@ private:
 			}
 		}
 
-		configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex));
-		configTpl.replace("GPUCONFIG",conf);
+		configTpl.replace("PLATFORMINDEX", std::to_string(platformIndex));
+		configTpl.replace("GPUCONFIG", conf);
 		configTpl.write(params::inst().configFileAMD);
 
 		const std::string backendName = xmrstak::params::inst().openCLVendor;
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index d3dc00d0170fe4c442f47d4ca3a5f39fa791f8bc..c5a63c56fd0de772822c3f038ad9958e90c9f85c 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -21,10 +21,9 @@
   *
   */
 
-
 #include "jconf.hpp"
-#include "xmrstak/misc/jext.hpp"
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/jext.hpp"
 
 #ifdef _WIN32
 #define strcasecmp _stricmp
@@ -37,7 +36,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-
 namespace xmrstak
 {
 namespace amd
@@ -48,9 +46,14 @@ using namespace rapidjson;
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum { aGpuThreadsConf, iPlatformIdx };
+enum configEnum
+{
+	aGpuThreadsConf,
+	iPlatformIdx
+};
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -59,24 +62,25 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aGpuThreadsConf, "gpu_threads_conf", kNullType },
-	{ iPlatformIdx, "platform_index", kNumberType }
-};
-
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
+	{aGpuThreadsConf, "gpu_threads_conf", kNullType},
+	{iPlatformIdx, "platform_index", kNumberType}};
 
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
-enum optionalConfigEnum { iAutoTune };
+enum optionalConfigEnum
+{
+	iAutoTune
+};
 
-struct optionalConfigVal {
+struct optionalConfigVal
+{
 	optionalConfigEnum iName;
 	const char* sName;
 	Type iType;
 };
 
 optionalConfigVal oOptionalConfigValues[] = {
-	{ iAutoTune, "auto_tune", kNumberType }
-};
+	{iAutoTune, "auto_tune", kNumberType}};
 
 inline bool checkType(Type have, Type want)
 {
@@ -109,7 +113,7 @@ jconf::jconf()
 	prv = new opaque_private();
 }
 
-bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
+bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg)
 {
 	if(id >= prv->configValues[aGpuThreadsConf]->Size())
 		return false;
@@ -176,7 +180,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
-	if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 )
+	if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18)
 	{
 		printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
 		return false;
@@ -215,7 +219,7 @@ size_t jconf::GetPlatformIdx()
 size_t jconf::GetAutoTune()
 {
 	const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName);
-	if( value != nullptr && value->IsUint64())
+	if(value != nullptr && value->IsUint64())
 	{
 		return value->GetUint64();
 	}
@@ -233,22 +237,22 @@ size_t jconf::GetThreadCount()
 
 bool jconf::parse_config(const char* sFilename)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -262,7 +266,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -284,7 +288,7 @@ bool jconf::parse_config(const char* sFilename)
 	buffer[flen] = '}';
 	buffer[flen + 1] = '\0';
 
-	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	prv->jsonDoc.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(prv->jsonDoc.HasParseError())
@@ -294,7 +298,6 @@ bool jconf::parse_config(const char* sFilename)
 		return false;
 	}
 
-
 	if(!prv->jsonDoc.IsObject())
 	{ //This should never happen as we created the root ourselves
 		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
@@ -326,7 +329,7 @@ bool jconf::parse_config(const char* sFilename)
 
 	size_t n_thd = prv->configValues[aGpuThreadsConf]->Size();
 	thd_cfg c;
-	for(size_t i=0; i < n_thd; i++)
+	for(size_t i = 0; i < n_thd; i++)
 	{
 		if(!GetThreadConfig(i, c))
 		{
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 51a0c79ac025afb046e74caa4e0ea0754db64bf0..6f50c3059f60dca84b5ba8005b0f959132c401eb 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -12,16 +12,18 @@ namespace amd
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
-		if (oInst == nullptr) oInst = new jconf;
+		if(oInst == nullptr)
+			oInst = new jconf;
 		return oInst;
 	};
 
 	bool parse_config(const char* sFilename = params::inst().configFileAMD.c_str());
 
-	struct thd_cfg {
+	struct thd_cfg
+	{
 		size_t index;
 		size_t intensity;
 		size_t w_size;
@@ -34,18 +36,17 @@ public:
 	};
 
 	size_t GetThreadCount();
-	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	bool GetThreadConfig(size_t id, thd_cfg& cfg);
 
 	size_t GetAutoTune();
 	size_t GetPlatformIdx();
 
-private:
+  private:
 	jconf();
 	static jconf* oInst;
 
 	struct opaque_private;
 	opaque_private* prv;
-
 };
 
 } // namespace amd
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 3be593175f6ee72539b06bf86077c23d22fc0379..0a181154c381ccb1fbe1dbc3d91f66888c9f4a94 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -22,23 +22,23 @@
   */
 
 #include "minethd.hpp"
-#include "autoAdjust.hpp"
 #include "amd_gpu/gpu.hpp"
+#include "autoAdjust.hpp"
 
-#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
-#include "xmrstak/misc/configEditor.hpp"
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
+#include "xmrstak/backend/cpu/hwlocMemory.hpp"
 #include "xmrstak/backend/cpu/minethd.hpp"
 #include "xmrstak/jconf.hpp"
-#include "xmrstak/misc/executor.hpp"
+#include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/environment.hpp"
+#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/backend/cpu/hwlocMemory.hpp"
 
 #include <assert.h>
-#include <cmath>
 #include <chrono>
+#include <cmath>
 #include <thread>
 #include <vector>
 
@@ -72,15 +72,16 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
 }
 
-extern "C"  {
+extern "C"
+{
 #ifdef WIN32
-__declspec(dllexport)
+	__declspec(dllexport)
 #endif
-std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
-{
-	environment::inst(&env);
-	return amd::minethd::thread_starter(threadOffset, pWork);
-}
+		std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
+	{
+		environment::inst(&env);
+		return amd::minethd::thread_starter(threadOffset, pWork);
+	}
 } // extern "C"
 
 bool minethd::init_gpus()
@@ -137,7 +138,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	pvThreads->reserve(n);
 
 	jconf::thd_cfg cfg;
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		jconf::inst()->GetThreadConfig(i, cfg);
 
@@ -161,7 +162,6 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	return pvThreads;
 }
 
-
 void minethd::work_main()
 {
 	if(affinity >= 0) //-1 means no affinity
@@ -172,7 +172,6 @@ void minethd::work_main()
 	lck.release();
 	std::this_thread::yield();
 
-	uint64_t iCount = 0;
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 
@@ -204,16 +203,16 @@ void minethd::work_main()
 	double bestHashrate = 0.0;
 	uint32_t bestIntensity = pGpuCtx->maxRawIntensity;
 
-	while (bQuit == 0)
+	while(bQuit == 0)
 	{
-		if (oWork.bStall)
+		if(oWork.bStall)
 		{
 			/* We are stalled here because the executor didn't find a job for us yet,
 			 * either because of network latency, or a socket problem. Since we are
 			 * raison d'etre of this software it us sensible to just wait until we have something
 			 */
 
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			globalStates::inst().consume_work(oWork, iJobNo);
@@ -267,14 +266,14 @@ void minethd::work_main()
 			uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave);
 
 			cl_uint results[0x100];
-			memset(results,0,sizeof(cl_uint)*(0x100));
+			memset(results, 0, sizeof(cl_uint) * (0x100));
 
 			XMRRunJob(pGpuCtx, results, miner_algo);
 
 			for(size_t i = 0; i < results[0xFF]; i++)
 			{
-				uint8_t	bWorkBlob[128];
-				uint8_t	bResult[32];
+				uint8_t bWorkBlob[128];
+				uint8_t bResult[32];
 
 				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
 				memset(bResult, 0, sizeof(job_result::bResult));
@@ -282,16 +281,13 @@ void minethd::work_main()
 				*(uint32_t*)(bWorkBlob + 39) = results[i];
 
 				cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
-				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
+				if((*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
 					executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId));
 			}
 
-			iCount += pGpuCtx->rawIntensity;
-			uint64_t iStamp = get_timestamp_ms();
-			iHashCount.store(iCount, std::memory_order_relaxed);
-			iTimestamp.store(iStamp, std::memory_order_relaxed);
+			updateStats(pGpuCtx->rawIntensity, oWork.iPoolId);
 
 			accRuntime += updateTimings(pGpuCtx, t0);
 
@@ -317,20 +313,18 @@ void minethd::work_main()
 						// lock intensity to the best values
 						autoTune = 0;
 						pGpuCtx->rawIntensity = bestIntensity;
-						printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u",
+						printer::inst()->print_msg(L1, "OpenCL %u|%u: lock intensity at %u",
 							pGpuCtx->deviceIdx,
 							pGpuCtx->idWorkerOnDevice,
-							bestIntensity
-						);
+							bestIntensity);
 					}
 					else
 					{
-						printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u",
+						printer::inst()->print_msg(L1, "OpenCL %u|%u: auto-tune validate intensity %u|%u",
 							pGpuCtx->deviceIdx,
 							pGpuCtx->idWorkerOnDevice,
 							pGpuCtx->rawIntensity,
-							bestIntensity
-						);
+							bestIntensity);
 					}
 					// update gpu with new intensity
 					XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height);
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 402d63cd693e55b0882751b8c38921094b7aa3e6..579abb1b56a94276eda280907150bba65de1126c 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -3,27 +3,26 @@
 #include "amd_gpu/gpu.hpp"
 #include "jconf.hpp"
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
-#include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/misc/environment.hpp"
 
-#include <thread>
 #include <atomic>
 #include <future>
+#include <thread>
 
 namespace xmrstak
 {
 namespace amd
 {
 
-class minethd  : public iBackend
+class minethd : public iBackend
 {
-public:
-
+  public:
 	static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool init_gpus();
 
-private:
+  private:
 	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 	minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg);
diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp
index 0eea9fdd7739a4f50cde75b9e1633369f1d15d77..808fdca37d56ec24eec0dfacf64ec91116b8a390 100644
--- a/xmrstak/backend/backendConnector.cpp
+++ b/xmrstak/backend/backendConnector.cpp
@@ -21,31 +21,30 @@
   *
   */
 
-#include "iBackend.hpp"
 #include "backendConnector.hpp"
-#include "miner_work.hpp"
 #include "globalStates.hpp"
+#include "iBackend.hpp"
+#include "miner_work.hpp"
 #include "plugin.hpp"
-#include "xmrstak/misc/environment.hpp"
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/environment.hpp"
 #include "xmrstak/params.hpp"
 
 #include "cpu/minethd.hpp"
 #ifndef CONF_NO_CUDA
-#	include "nvidia/minethd.hpp"
+#include "nvidia/minethd.hpp"
 #endif
 #ifndef CONF_NO_OPENCL
-#	include "amd/minethd.hpp"
+#include "amd/minethd.hpp"
 #endif
 
-#include <cstdlib>
 #include <assert.h>
-#include <cmath>
+#include <bitset>
 #include <chrono>
+#include <cmath>
+#include <cstdlib>
 #include <cstring>
 #include <thread>
-#include <bitset>
-
 
 namespace xmrstak
 {
@@ -86,7 +85,7 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 		std::vector<std::string> libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"};
 		size_t numWorkers = 0u;
 
-		for( const auto & name : libNames)
+		for(const auto& name : libNames)
 		{
 			printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str());
 			nvidiaplugin.load("NVIDIA", name);
diff --git a/xmrstak/backend/backendConnector.hpp b/xmrstak/backend/backendConnector.hpp
index 66d873e4853da9fd203c927d47848be5ca95874a..1f2cb8ff6bdd140605bdf3a1f0630c0411b3f59d 100644
--- a/xmrstak/backend/backendConnector.hpp
+++ b/xmrstak/backend/backendConnector.hpp
@@ -3,19 +3,18 @@
 #include "iBackend.hpp"
 #include "miner_work.hpp"
 
-#include <thread>
-#include <vector>
 #include <atomic>
 #include <mutex>
-
+#include <thread>
+#include <vector>
 
 namespace xmrstak
 {
 
-	struct BackendConnector
-	{
-		static std::vector<iBackend*>* thread_starter(miner_work& pWork);
-		static bool self_test();
-	};
+struct BackendConnector
+{
+	static std::vector<iBackend*>* thread_starter(miner_work& pWork);
+	static bool self_test();
+};
 
 } // namespace xmrstak
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index ba0e6984f0bc2b538b727f8c13990228933a8154..98c145004ddf847da7f458b75c8f26285e088318 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -2,12 +2,12 @@
 
 #include "jconf.hpp"
 
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/backend/cryptonight.hpp"
-#include "xmrstak/backend/cpu/cpuType.hpp"
 #include <string>
 
 #ifdef _WIN32
@@ -16,7 +16,6 @@
 #include <unistd.h>
 #endif // _WIN32
 
-
 namespace xmrstak
 {
 namespace cpu
@@ -24,8 +23,7 @@ namespace cpu
 
 class autoAdjust
 {
-public:
-
+  public:
 	bool printConfig()
 	{
 		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
@@ -42,10 +40,10 @@ public:
 		configEditor configTpl{};
 
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
-		configTpl.set( std::string(tpl) );
+		const char* tpl =
+#include "./config.tpl"
+			;
+		configTpl.set(std::string(tpl));
 
 		std::string conf;
 
@@ -75,14 +73,14 @@ public:
 				linux_layout ? "Linux" : "Windows");
 
 			uint32_t aff_id = 0;
-			for(uint32_t i=0; i < corecnt; i++)
+			for(uint32_t i = 0; i < corecnt; i++)
 			{
 				bool double_mode;
 
 				if(L3KB_size <= 0)
 					break;
 
-				double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt-i);
+				double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt - i);
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
@@ -110,14 +108,14 @@ public:
 		if(useCryptonight_gpu)
 			conf += "*/\n";
 
-		configTpl.replace("CPUCONFIG",conf);
+		configTpl.replace("CPUCONFIG", conf);
 		configTpl.write(params::inst().configFileCPU);
 		printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str());
 
 		return true;
 	}
 
-private:
+  private:
 	bool detectL3Size()
 	{
 		int32_t cpu_info[4];
@@ -125,8 +123,8 @@ private:
 
 		::jconf::cpuid(0, 0, cpu_info);
 		memcpy(cpustr, &cpu_info[1], 4);
-		memcpy(cpustr+4, &cpu_info[3], 4);
-		memcpy(cpustr+8, &cpu_info[2], 4);
+		memcpy(cpustr + 4, &cpu_info[3], 4);
+		memcpy(cpustr + 8, &cpu_info[2], 4);
 
 		if(strcmp(cpustr, "GenuineIntel") == 0)
 		{
@@ -139,7 +137,8 @@ private:
 			}
 
 			L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) *
-				(get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024;
+							(get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) /
+						1024;
 
 			return true;
 		}
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index f09b1ebc046f4a50f6123be4107e4bb49ae963c9..d1765155ac87d58312f897545e77d8de037e0c15 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -1,9 +1,9 @@
 #pragma once
 
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/backend/cryptonight.hpp"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -16,7 +16,6 @@
 #include <hwloc.h>
 #include <stdio.h>
 
-
 namespace xmrstak
 {
 namespace cpu
@@ -24,8 +23,7 @@ namespace cpu
 
 class autoAdjust
 {
-public:
-
+  public:
 	autoAdjust()
 	{
 		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
@@ -48,10 +46,10 @@ public:
 		configEditor configTpl{};
 
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
-		configTpl.set( std::string(tpl) );
+		const char* tpl =
+#include "./config.tpl"
+			;
+		configTpl.set(std::string(tpl));
 
 		// if cryptonight_gpu is used we will disable cpu mining but provide a inactive config
 		bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
@@ -69,7 +67,7 @@ public:
 			results.reserve(16);
 
 			findChildrenCaches(hwloc_get_root_obj(topology),
-				[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); } );
+				[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); });
 
 			if(tlcs.size() == 0)
 				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
@@ -97,7 +95,7 @@ public:
 		if(useCryptonight_gpu)
 			conf += "*/\n";
 
-		configTpl.replace("CPUCONFIG",conf);
+		configTpl.replace("CPUCONFIG", conf);
 		configTpl.write(params::inst().configFileCPU);
 		printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str());
 		/* Destroy topology object. */
@@ -106,16 +104,16 @@ public:
 		return true;
 	}
 
-private:
+  private:
 	size_t hashMemSize = 0;
 	size_t halfHashMemSize = 0;
 
 	std::vector<uint32_t> results;
 
-	template<typename func>
+	template <typename func>
 	inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
 	{
-		for(size_t i=0; i < obj->arity; i++)
+		for(size_t i = 0; i < obj->arity; i++)
 		{
 			if(obj->children[i]->type == type)
 				lambda(obj->children[i]);
@@ -133,10 +131,10 @@ private:
 #endif // HWLOC_API_VERSION
 	}
 
-	template<typename func>
+	template <typename func>
 	inline void findChildrenCaches(hwloc_obj_t obj, func lambda)
 	{
-		for(size_t i=0; i < obj->arity; i++)
+		for(size_t i = 0; i < obj->arity; i++)
 		{
 			if(isCacheObject(obj->children[i]))
 				lambda(obj->children[i]);
@@ -159,7 +157,7 @@ private:
 			throw(std::runtime_error("Cache object hasn't got attributes."));
 
 		size_t PUs = 0;
-		findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; } );
+		findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; });
 
 		//Strange case, but we will handle it silently, surely there must be one PU somewhere?
 		if(PUs == 0)
@@ -172,7 +170,7 @@ private:
 				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
 
 			//Try our luck with lower level caches
-			for(size_t i=0; i < obj->arity; i++)
+			for(size_t i = 0; i < obj->arity; i++)
 				processTopLevelCache(obj->children[i]);
 			return;
 		}
@@ -180,7 +178,7 @@ private:
 		size_t cacheSize = obj->attr->cache.size;
 		if(isCacheExclusive(obj))
 		{
-			for(size_t i=0; i < obj->arity; i++)
+			for(size_t i = 0; i < obj->arity; i++)
 			{
 				hwloc_obj_t l2obj = obj->children[i];
 				//If L2 is exclusive and greater or equal to 2MB add room for one more hash
@@ -191,7 +189,7 @@ private:
 
 		std::vector<hwloc_obj_t> cores;
 		cores.reserve(16);
-		findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); } );
+		findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); });
 
 		size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize;
 
diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp
index c85682d4f3f0e1b5e2abcca7616bc1d0e0829b19..5e2519c3b92864fbf0956480212783686bfdec76 100644
--- a/xmrstak/backend/cpu/cpuType.cpp
+++ b/xmrstak/backend/cpu/cpuType.cpp
@@ -1,9 +1,9 @@
 
 #include "xmrstak/backend/cpu/cpuType.hpp"
 
+#include <cstdio>
 #include <cstring>
 #include <inttypes.h>
-#include <cstdio>
 
 #ifdef _WIN32
 #define strcasecmp _stricmp
@@ -16,64 +16,63 @@ namespace xmrstak
 {
 namespace cpu
 {
-	void cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
-	{
-		std::memset(val, 0, sizeof(int32_t)*4);
-
-	#ifdef _WIN32
-		__cpuidex(val, eax, ecx);
-	#else
-		__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
-	#endif
-	}
-
-	int32_t get_masked(int32_t val, int32_t h, int32_t l)
-	{
-		val &= (0x7FFFFFFF >> (31-(h-l))) << l;
-		return val >> l;
-	}
+void cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
+{
+	std::memset(val, 0, sizeof(int32_t) * 4);
 
-	bool has_feature(int32_t val, int32_t bit)
-	{
-		int32_t mask = 1 << bit;
-		return (val & mask) != 0u;
+#ifdef _WIN32
+	__cpuidex(val, eax, ecx);
+#else
+	__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
+#endif
+}
 
-	}
+int32_t get_masked(int32_t val, int32_t h, int32_t l)
+{
+	val &= (0x7FFFFFFF >> (31 - (h - l))) << l;
+	return val >> l;
+}
 
-	Model getModel()
-	{
-		int32_t cpu_info[4];
-		char cpustr[13] = {0};
+bool has_feature(int32_t val, int32_t bit)
+{
+	int32_t mask = 1 << bit;
+	return (val & mask) != 0u;
+}
 
-		cpuid(0, 0, cpu_info);
-		std::memcpy(cpustr, &cpu_info[1], 4);
-		std::memcpy(cpustr+4, &cpu_info[3], 4);
-		std::memcpy(cpustr+8, &cpu_info[2], 4);
+Model getModel()
+{
+	int32_t cpu_info[4];
+	char cpustr[13] = {0};
 
-		Model result;
+	cpuid(0, 0, cpu_info);
+	std::memcpy(cpustr, &cpu_info[1], 4);
+	std::memcpy(cpustr + 4, &cpu_info[3], 4);
+	std::memcpy(cpustr + 8, &cpu_info[2], 4);
 
-		cpuid(1, 0, cpu_info);
+	Model result;
 
-		result.family = get_masked(cpu_info[0], 12, 8);
-		result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
-		result.type_name = cpustr;
+	cpuid(1, 0, cpu_info);
 
-		// feature bits https://en.wikipedia.org/wiki/CPUID
-		// sse2
-		result.sse2 = has_feature(cpu_info[3], 26);
-		// aes-ni
-		result.aes = has_feature(cpu_info[2], 25);
-		// avx - 27 is the check if the OS overwrote cpu features
-		result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27) ;
+	result.family = get_masked(cpu_info[0], 12, 8);
+	result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
+	result.type_name = cpustr;
 
-		if(strcmp(cpustr, "AuthenticAMD") == 0)
-		{
-			if(result.family == 0xF)
-				result.family += get_masked(cpu_info[0], 28, 20);
-		}
+	// feature bits https://en.wikipedia.org/wiki/CPUID
+	// sse2
+	result.sse2 = has_feature(cpu_info[3], 26);
+	// aes-ni
+	result.aes = has_feature(cpu_info[2], 25);
+	// avx - 27 is the check if the OS overwrote cpu features
+	result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27);
 
-		return result;
+	if(strcmp(cpustr, "AuthenticAMD") == 0)
+	{
+		if(result.family == 0xF)
+			result.family += get_masked(cpu_info[0], 28, 20);
 	}
 
+	return result;
+}
+
 } // namespace cpu
 } // namespace xmrstak
diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp
index 7f6bfaf51b663f5ff0e22134463d51c9e2dea70c..2bafa41058e6a914d05d70ae1ed091f2ec65358f 100644
--- a/xmrstak/backend/cpu/cpuType.hpp
+++ b/xmrstak/backend/cpu/cpuType.hpp
@@ -1,32 +1,30 @@
 #pragma once
 
-#include <string>
 #include <cstdint>
-
+#include <string>
 
 namespace xmrstak
 {
 namespace cpu
 {
-	struct Model
-	{
-		uint32_t family = 0u;
-		uint32_t model = 0u;
-		bool aes = false;
-		bool sse2 = false;
-		bool avx = false;
-		std::string type_name = "unknown";
-	};
+struct Model
+{
+	uint32_t family = 0u;
+	uint32_t model = 0u;
+	bool aes = false;
+	bool sse2 = false;
+	bool avx = false;
+	std::string type_name = "unknown";
+};
 
-	Model getModel();
+Model getModel();
 
-	/** Mask bits between h and l and return the value
+/** Mask bits between h and l and return the value
 	 *
 	 * This enables us to put in values exactly like in the manual
 	 * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22)
 	 */
-	int32_t get_masked(int32_t val, int32_t h, int32_t l);
+int32_t get_masked(int32_t val, int32_t h, int32_t l);
 
-	
 } // namespace cpu
 } // namespace xmrstak
diff --git a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
index 2fc1a8baafaad969619aa525285ba7f45528b240..5d55987ac72db3a681424b258d2cd348fb4d234a 100644
--- a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
+++ b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp
@@ -1,77 +1,87 @@
 #include <cstring>
 
-typedef void(*void_func)();
+typedef void (*void_func)();
 
-#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h"
-#include "cryptonight_aesni.h"
 #include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h"
 #include "xmrstak/misc/console.hpp"
 
-static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)())
+static inline void add_code(uint8_t*& p, void (*p1)(), void (*p2)())
 {
-    const ptrdiff_t size = reinterpret_cast<const uint8_t*>(p2) - reinterpret_cast<const uint8_t*>(p1);
-    if (size > 0) {
-        memcpy(p, reinterpret_cast<void*>(p1), size);
-        p += size;
-    }
+	const ptrdiff_t size = reinterpret_cast<const uint8_t*>(p2) - reinterpret_cast<const uint8_t*>(p1);
+	if(size > 0)
+	{
+		memcpy(p, reinterpret_cast<void*>(p1), size);
+		p += size;
+	}
 }
 
-static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm)
+static inline void add_random_math(uint8_t*& p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm)
 {
-    uint32_t prev_rot_src = (uint32_t)(-1);
-
-    for (int i = 0;; ++i) {
-        const V4_Instruction inst = code[i];
-        if (inst.opcode == RET) {
-            break;
-        }
-
-        uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
-        uint8_t dst_index = inst.dst_index;
-        uint8_t src_index = inst.src_index;
-
-        const uint32_t a = inst.dst_index;
-        const uint32_t b = inst.src_index;
-        const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
-
-        switch (inst.opcode) {
-        case ROR:
-        case ROL:
-            if (b != prev_rot_src) {
-                prev_rot_src = b;
-                add_code(p, instructions_mov[c], instructions_mov[c + 1]);
-            }
-            break;
-        }
-
-        if (a == prev_rot_src) {
-            prev_rot_src = (uint32_t)(-1);
-        }
-
-        void_func begin = instructions[c];
+	uint32_t prev_rot_src = (uint32_t)(-1);
+
+	for(int i = 0;; ++i)
+	{
+		const V4_Instruction inst = code[i];
+		if(inst.opcode == RET)
+		{
+			break;
+		}
+
+		uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
+		uint8_t dst_index = inst.dst_index;
+		uint8_t src_index = inst.src_index;
+
+		const uint32_t a = inst.dst_index;
+		const uint32_t b = inst.src_index;
+		const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
+
+		switch(inst.opcode)
+		{
+		case ROR:
+		case ROL:
+			if(b != prev_rot_src)
+			{
+				prev_rot_src = b;
+				add_code(p, instructions_mov[c], instructions_mov[c + 1]);
+			}
+			break;
+		}
+
+		if(a == prev_rot_src)
+		{
+			prev_rot_src = (uint32_t)(-1);
+		}
+
+		void_func begin = instructions[c];
 
 		// AMD == 2
-        if ((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit)) {
-            // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
-            // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
-            uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
-
-            if (*prefix == 0x49) {
-                *(p++) = 0x41;
-            }
-
-            begin = reinterpret_cast<void_func>(prefix + 1);
-        }
-
-        add_code(p, begin, instructions[c + 1]);
-
-        if (inst.opcode == ADD) {
-            *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
-            if (is_64_bit) {
-                prev_rot_src = (uint32_t)(-1);
-            }
-        }
-    }
+		if((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit))
+		{
+			// AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
+			// Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
+			uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
+
+			if(*prefix == 0x49)
+			{
+				*(p++) = 0x41;
+			}
+
+			begin = reinterpret_cast<void_func>(prefix + 1);
+		}
+
+		add_code(p, begin, instructions[c + 1]);
+
+		if(inst.opcode == ADD)
+		{
+			*(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
+			if(is_64_bit)
+			{
+				prev_rot_src = (uint32_t)(-1);
+			}
+		}
+	}
 }
 
 void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size)
@@ -84,14 +94,14 @@ void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size)
 	else
 		unprotectExecutableMemory(ctx->fun_data, allocation_size);
 
-    uint8_t* p0 = ctx->fun_data;
-    uint8_t* p = p0;
+	uint8_t* p0 = ctx->fun_data;
+	uint8_t* p = p0;
 	if(ctx->fun_data != nullptr)
 	{
 
 		if(N == 2)
 		{
-		    add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
+			add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
 			add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
 			add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
 			add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version);
diff --git a/xmrstak/backend/cpu/crypto/c_blake256.c b/xmrstak/backend/cpu/crypto/c_blake256.c
index e5fadfe74bf491d6469f32d7b948e908454899e8..93d9cadbb58e6bda4fce9ec28e27fac7d0af404f 100644
--- a/xmrstak/backend/cpu/crypto/c_blake256.c
+++ b/xmrstak/backend/cpu/crypto/c_blake256.c
@@ -8,66 +8,67 @@
  * HMAC is specified by RFC 2104.
  */
 
-#include <string.h>
-#include <stdio.h>
-#include <stdint.h>
 #include "c_blake256.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
 
-#define U8TO32(p) \
-	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) |    \
-	 ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
-#define U32TO8(p, v) \
-	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
-	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
+#define U8TO32(p)                                              \
+	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
+		((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3])))
+#define U32TO8(p, v)               \
+	(p)[0] = (uint8_t)((v) >> 24); \
+	(p)[1] = (uint8_t)((v) >> 16); \
+	(p)[2] = (uint8_t)((v) >> 8);  \
+	(p)[3] = (uint8_t)((v));
 
 const uint8_t sigma[][16] = {
-	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
-	{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
-	{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
-	{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8},
-	{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13},
-	{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9},
-	{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11},
-	{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10},
-	{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5},
-	{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0},
-	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15},
-	{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3},
-	{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4},
-	{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8}
-};
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}};
 
 const uint32_t cst[16] = {
 	0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
 	0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
 	0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
-	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
-};
+	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917};
 
 static const uint8_t padding[] = {
-	0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-};
-
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
-void blake256_compress(state *S, const uint8_t *block) {
+void blake256_compress(state* S, const uint8_t* block)
+{
 	uint32_t v[16], m[16], i;
 
-#define ROT(x,n) (((x)<<(32-n))|((x)>>(n)))
-#define G(a,b,c,d,e)                                      \
-	v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e+1]]) + v[b]; \
-	v[d] = ROT(v[d] ^ v[a],16);                           \
-	v[c] += v[d];                                         \
-	v[b] = ROT(v[b] ^ v[c],12);                           \
-	v[a] += (m[sigma[i][e+1]] ^ cst[sigma[i][e]])+v[b];   \
-	v[d] = ROT(v[d] ^ v[a], 8);                           \
-	v[c] += v[d];                                         \
+#define ROT(x, n) (((x) << (32 - n)) | ((x) >> (n)))
+#define G(a, b, c, d, e)                                    \
+	v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e + 1]]) + v[b]; \
+	v[d] = ROT(v[d] ^ v[a], 16);                            \
+	v[c] += v[d];                                           \
+	v[b] = ROT(v[b] ^ v[c], 12);                            \
+	v[a] += (m[sigma[i][e + 1]] ^ cst[sigma[i][e]]) + v[b]; \
+	v[d] = ROT(v[d] ^ v[a], 8);                             \
+	v[c] += v[d];                                           \
 	v[b] = ROT(v[b] ^ v[c], 7);
 
-	for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
-	for (i = 0; i < 8;  ++i) v[i] = S->h[i];
-	v[ 8] = S->s[0] ^ 0x243F6A88;
-	v[ 9] = S->s[1] ^ 0x85A308D3;
+	for(i = 0; i < 16; ++i)
+		m[i] = U8TO32(block + i * 4);
+	for(i = 0; i < 8; ++i)
+		v[i] = S->h[i];
+	v[8] = S->s[0] ^ 0x243F6A88;
+	v[9] = S->s[1] ^ 0x85A308D3;
 	v[10] = S->s[2] ^ 0x13198A2E;
 	v[11] = S->s[3] ^ 0x03707344;
 	v[12] = 0xA4093822;
@@ -75,29 +76,34 @@ void blake256_compress(state *S, const uint8_t *block) {
 	v[14] = 0x082EFA98;
 	v[15] = 0xEC4E6C89;
 
-	if (S->nullt == 0) {
+	if(S->nullt == 0)
+	{
 		v[12] ^= S->t[0];
 		v[13] ^= S->t[0];
 		v[14] ^= S->t[1];
 		v[15] ^= S->t[1];
 	}
 
-	for (i = 0; i < 14; ++i) {
-		G(0, 4,  8, 12,  0);
-		G(1, 5,  9, 13,  2);
-		G(2, 6, 10, 14,  4);
-		G(3, 7, 11, 15,  6);
-		G(3, 4,  9, 14, 14);
-		G(2, 7,  8, 13, 12);
-		G(0, 5, 10, 15,  8);
+	for(i = 0; i < 14; ++i)
+	{
+		G(0, 4, 8, 12, 0);
+		G(1, 5, 9, 13, 2);
+		G(2, 6, 10, 14, 4);
+		G(3, 7, 11, 15, 6);
+		G(3, 4, 9, 14, 14);
+		G(2, 7, 8, 13, 12);
+		G(0, 5, 10, 15, 8);
 		G(1, 6, 11, 12, 10);
 	}
 
-	for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
-	for (i = 0; i < 8;  ++i) S->h[i] ^= S->s[i % 4];
+	for(i = 0; i < 16; ++i)
+		S->h[i % 8] ^= v[i];
+	for(i = 0; i < 8; ++i)
+		S->h[i] ^= S->s[i % 4];
 }
 
-void blake256_init(state *S) {
+void blake256_init(state* S)
+{
 	S->h[0] = 0x6A09E667;
 	S->h[1] = 0xBB67AE85;
 	S->h[2] = 0x3C6EF372;
@@ -110,7 +116,8 @@ void blake256_init(state *S) {
 	S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0;
 }
 
-void blake224_init(state *S) {
+void blake224_init(state* S)
+{
 	S->h[0] = 0xC1059ED8;
 	S->h[1] = 0x367CD507;
 	S->h[2] = 0x3070DD17;
@@ -124,57 +131,75 @@ void blake224_init(state *S) {
 }
 
 // datalen = number of bits
-void blake256_update(state *S, const uint8_t *data, uint32_t datalen) {
+void blake256_update(state* S, const uint8_t* data, uint32_t datalen)
+{
 	int left = S->buflen >> 3;
 	int fill = 64 - left;
 
-	if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) {
-		memcpy((void *) (S->buf + left), (void *) data, fill);
+	if(left && (((datalen >> 3) & 0x3F) >= (unsigned)fill))
+	{
+		memcpy((void*)(S->buf + left), (void*)data, fill);
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0)
+			S->t[1]++;
 		blake256_compress(S, S->buf);
 		data += fill;
 		datalen -= (fill << 3);
 		left = 0;
 	}
 
-	while (datalen >= 512) {
+	while(datalen >= 512)
+	{
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0)
+			S->t[1]++;
 		blake256_compress(S, data);
 		data += 64;
 		datalen -= 512;
 	}
 
-	if (datalen > 0) {
-		memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
+	if(datalen > 0)
+	{
+		memcpy((void*)(S->buf + left), (void*)data, datalen >> 3);
 		S->buflen = (left << 3) + datalen;
-	} else {
+	}
+	else
+	{
 		S->buflen = 0;
 	}
 }
 
 // datalen = number of bits
-void blake224_update(state *S, const uint8_t *data, uint32_t datalen) {
+void blake224_update(state* S, const uint8_t* data, uint32_t datalen)
+{
 	blake256_update(S, data, datalen);
 }
 
-void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
+void blake256_final_h(state* S, uint8_t* digest, uint8_t pa, uint8_t pb)
+{
 	uint8_t msglen[8];
 	uint32_t lo = S->t[0] + S->buflen, hi = S->t[1];
-	if (lo < (unsigned) S->buflen) hi++;
+	if(lo < (unsigned)S->buflen)
+		hi++;
 	U32TO8(msglen + 0, hi);
 	U32TO8(msglen + 4, lo);
 
-	if (S->buflen == 440) { /* one padding byte */
+	if(S->buflen == 440)
+	{ /* one padding byte */
 		S->t[0] -= 8;
 		blake256_update(S, &pa, 8);
-	} else {
-		if (S->buflen < 440) { /* enough space to fill the block  */
-			if (S->buflen == 0) S->nullt = 1;
+	}
+	else
+	{
+		if(S->buflen < 440)
+		{ /* enough space to fill the block  */
+			if(S->buflen == 0)
+				S->nullt = 1;
 			S->t[0] -= 440 - S->buflen;
 			blake256_update(S, padding, 440 - S->buflen);
-		} else { /* need 2 compressions */
+		}
+		else
+		{ /* need 2 compressions */
 			S->t[0] -= 512 - S->buflen;
 			blake256_update(S, padding, 512 - S->buflen);
 			S->t[0] -= 440;
@@ -187,9 +212,9 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
 	S->t[0] -= 64;
 	blake256_update(S, msglen, 64);
 
-	U32TO8(digest +  0, S->h[0]);
-	U32TO8(digest +  4, S->h[1]);
-	U32TO8(digest +  8, S->h[2]);
+	U32TO8(digest + 0, S->h[0]);
+	U32TO8(digest + 4, S->h[1]);
+	U32TO8(digest + 8, S->h[2]);
 	U32TO8(digest + 12, S->h[3]);
 	U32TO8(digest + 16, S->h[4]);
 	U32TO8(digest + 20, S->h[5]);
@@ -197,16 +222,19 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) {
 	U32TO8(digest + 28, S->h[7]);
 }
 
-void blake256_final(state *S, uint8_t *digest) {
+void blake256_final(state* S, uint8_t* digest)
+{
 	blake256_final_h(S, digest, 0x81, 0x01);
 }
 
-void blake224_final(state *S, uint8_t *digest) {
+void blake224_final(state* S, uint8_t* digest)
+{
 	blake256_final_h(S, digest, 0x80, 0x00);
 }
 
 // inlen = number of bytes
-void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
+void blake256_hash(uint8_t* out, const uint8_t* in, uint32_t inlen)
+{
 	state S;
 	blake256_init(&S);
 	blake256_update(&S, in, inlen * 8);
@@ -214,7 +242,8 @@ void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
 }
 
 // inlen = number of bytes
-void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
+void blake224_hash(uint8_t* out, const uint8_t* in, uint32_t inlen)
+{
 	state S;
 	blake224_init(&S);
 	blake224_update(&S, in, inlen * 8);
@@ -222,13 +251,15 @@ void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) {
 }
 
 // keylen = number of bytes
-void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
-	const uint8_t *key = _key;
+void hmac_blake256_init(hmac_state* S, const uint8_t* _key, uint64_t keylen)
+{
+	const uint8_t* key = _key;
 	uint8_t keyhash[32];
 	uint8_t pad[64];
 	uint64_t i;
 
-	if (keylen > 64) {
+	if(keylen > 64)
+	{
 		blake256_hash(keyhash, key, keylen);
 		key = keyhash;
 		keylen = 32;
@@ -236,14 +267,16 @@ void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 
 	blake256_init(&S->inner);
 	memset(pad, 0x36, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake256_update(&S->inner, pad, 512);
 
 	blake256_init(&S->outer);
 	memset(pad, 0x5c, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake256_update(&S->outer, pad, 512);
@@ -252,13 +285,15 @@ void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 }
 
 // keylen = number of bytes
-void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
-	const uint8_t *key = _key;
+void hmac_blake224_init(hmac_state* S, const uint8_t* _key, uint64_t keylen)
+{
+	const uint8_t* key = _key;
 	uint8_t keyhash[32];
 	uint8_t pad[64];
 	uint64_t i;
 
-	if (keylen > 64) {
+	if(keylen > 64)
+	{
 		blake256_hash(keyhash, key, keylen);
 		key = keyhash;
 		keylen = 28;
@@ -266,14 +301,16 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 
 	blake224_init(&S->inner);
 	memset(pad, 0x36, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake224_update(&S->inner, pad, 512);
 
 	blake224_init(&S->outer);
 	memset(pad, 0x5c, 64);
-	for (i = 0; i < keylen; ++i) {
+	for(i = 0; i < keylen; ++i)
+	{
 		pad[i] ^= key[i];
 	}
 	blake224_update(&S->outer, pad, 512);
@@ -282,18 +319,21 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) {
 }
 
 // datalen = number of bits
-void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint32_t datalen) {
-  // update the inner state
-  blake256_update(&S->inner, data, datalen);
+void hmac_blake256_update(hmac_state* S, const uint8_t* data, uint32_t datalen)
+{
+	// update the inner state
+	blake256_update(&S->inner, data, datalen);
 }
 
 // datalen = number of bits
-void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint32_t datalen) {
-  // update the inner state
-  blake224_update(&S->inner, data, datalen);
+void hmac_blake224_update(hmac_state* S, const uint8_t* data, uint32_t datalen)
+{
+	// update the inner state
+	blake224_update(&S->inner, data, datalen);
 }
 
-void hmac_blake256_final(hmac_state *S, uint8_t *digest) {
+void hmac_blake256_final(hmac_state* S, uint8_t* digest)
+{
 	uint8_t ihash[32];
 	blake256_final(&S->inner, ihash);
 	blake256_update(&S->outer, ihash, 256);
@@ -301,7 +341,8 @@ void hmac_blake256_final(hmac_state *S, uint8_t *digest) {
 	memset(ihash, 0, 32);
 }
 
-void hmac_blake224_final(hmac_state *S, uint8_t *digest) {
+void hmac_blake224_final(hmac_state* S, uint8_t* digest)
+{
 	uint8_t ihash[32];
 	blake224_final(&S->inner, ihash);
 	blake224_update(&S->outer, ihash, 224);
@@ -310,7 +351,8 @@ void hmac_blake224_final(hmac_state *S, uint8_t *digest) {
 }
 
 // keylen = number of bytes; inlen = number of bytes
-void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) {
+void hmac_blake256_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen)
+{
 	hmac_state S;
 	hmac_blake256_init(&S, key, keylen);
 	hmac_blake256_update(&S, in, inlen * 8);
@@ -318,7 +360,8 @@ void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const
 }
 
 // keylen = number of bytes; inlen = number of bytes
-void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) {
+void hmac_blake224_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen)
+{
 	hmac_state S;
 	hmac_blake224_init(&S, key, keylen);
 	hmac_blake224_update(&S, in, inlen * 8);
diff --git a/xmrstak/backend/cpu/crypto/c_blake256.h b/xmrstak/backend/cpu/crypto/c_blake256.h
index 06c7917af8c159c85d425a7dd7b3afc87f9ec9d7..9f63f88f4ebb831e7671d3ab41899df12e4e1998 100644
--- a/xmrstak/backend/cpu/crypto/c_blake256.h
+++ b/xmrstak/backend/cpu/crypto/c_blake256.h
@@ -3,41 +3,43 @@
 
 #include <stdint.h>
 
-typedef struct {
-  uint32_t h[8], s[4], t[2];
-  int buflen, nullt;
-  uint8_t buf[64];
+typedef struct
+{
+	uint32_t h[8], s[4], t[2];
+	int buflen, nullt;
+	uint8_t buf[64];
 } state;
 
-typedef struct {
-  state inner;
-  state outer;
+typedef struct
+{
+	state inner;
+	state outer;
 } hmac_state;
 
-void blake256_init(state *);
-void blake224_init(state *);
+void blake256_init(state*);
+void blake224_init(state*);
 
-void blake256_update(state *, const uint8_t *, uint32_t);
-void blake224_update(state *, const uint8_t *, uint32_t);
+void blake256_update(state*, const uint8_t*, uint32_t);
+void blake224_update(state*, const uint8_t*, uint32_t);
 
-void blake256_final(state *, uint8_t *);
-void blake224_final(state *, uint8_t *);
+void blake256_final(state*, uint8_t*);
+void blake224_final(state*, uint8_t*);
 
-void blake256_hash(uint8_t *, const uint8_t *, uint32_t);
-void blake224_hash(uint8_t *, const uint8_t *, uint32_t);
+void blake256_hash(uint8_t*, const uint8_t*, uint32_t);
+void blake224_hash(uint8_t*, const uint8_t*, uint32_t);
 
 /* HMAC functions: */
 
-void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t);
-void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t);
+void hmac_blake256_init(hmac_state*, const uint8_t*, uint64_t);
+void hmac_blake224_init(hmac_state*, const uint8_t*, uint64_t);
 
-void hmac_blake256_update(hmac_state *, const uint8_t *, uint32_t);
-void hmac_blake224_update(hmac_state *, const uint8_t *, uint32_t);
+void hmac_blake256_update(hmac_state*, const uint8_t*, uint32_t);
+void hmac_blake224_update(hmac_state*, const uint8_t*, uint32_t);
 
-void hmac_blake256_final(hmac_state *, uint8_t *);
-void hmac_blake224_final(hmac_state *, uint8_t *);
+void hmac_blake256_final(hmac_state*, uint8_t*);
+void hmac_blake224_final(hmac_state*, uint8_t*);
 
-void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t);
-void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t);
+void hmac_blake256_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t);
+void hmac_blake224_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t);
 
 #endif /* _BLAKE256_H_ */
diff --git a/xmrstak/backend/cpu/crypto/c_groestl.c b/xmrstak/backend/cpu/crypto/c_groestl.c
index 5b3523e7953795c65d673e5be4ce25b95f457826..bae9a9f1188f9b3da5b772bb5e2b00a33e6eef44 100644
--- a/xmrstak/backend/cpu/crypto/c_groestl.c
+++ b/xmrstak/backend/cpu/crypto/c_groestl.c
@@ -14,178 +14,185 @@
 #define P_TYPE 0
 #define Q_TYPE 1
 
-const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}};
-
-const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6};
-
-
-#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
-															v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
-															v1 = temp_var;}
-
-
-#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t)				\
-   tu = T[2*(uint32_t)x[4*c0+0]];			    \
-   tl = T[2*(uint32_t)x[4*c0+0]+1];		    \
-   tv1 = T[2*(uint32_t)x[4*c1+1]];			\
-   tv2 = T[2*(uint32_t)x[4*c1+1]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
-   tu ^= tv1;						\
-   tl ^= tv2;						\
-   tv1 = T[2*(uint32_t)x[4*c2+2]];			\
-   tv2 = T[2*(uint32_t)x[4*c2+2]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
-   tu ^= tv1;						\
-   tl ^= tv2;   					\
-   tv1 = T[2*(uint32_t)x[4*c3+3]];			\
-   tv2 = T[2*(uint32_t)x[4*c3+3]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
-   tu ^= tv1;						\
-   tl ^= tv2;						\
-   tl ^= T[2*(uint32_t)x[4*c4+0]];			\
-   tu ^= T[2*(uint32_t)x[4*c4+0]+1];			\
-   tv1 = T[2*(uint32_t)x[4*c5+1]];			\
-   tv2 = T[2*(uint32_t)x[4*c5+1]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,1,t)	\
-   tl ^= tv1;						\
-   tu ^= tv2;						\
-   tv1 = T[2*(uint32_t)x[4*c6+2]];			\
-   tv2 = T[2*(uint32_t)x[4*c6+2]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,2,t)	\
-   tl ^= tv1;						\
-   tu ^= tv2;   					\
-   tv1 = T[2*(uint32_t)x[4*c7+3]];			\
-   tv2 = T[2*(uint32_t)x[4*c7+3]+1];			\
-   ROTATE_COLUMN_DOWN(tv1,tv2,3,t)	\
-   tl ^= tv1;						\
-   tu ^= tv2;						\
-   y[i] = tu;						\
-   y[i+1] = tl;
+const uint8_t shift_Values[2][8] = {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 3, 5, 7, 0, 2, 4, 6}};
 
+const uint8_t indices_cyclic[15] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6};
+
+#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var)                        \
+	{                                                                             \
+		temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \
+		v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes)));       \
+		v1 = temp_var;                                                            \
+	}
+
+#define COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \
+	tu = T[2 * (uint32_t)x[4 * c0 + 0]];                                     \
+	tl = T[2 * (uint32_t)x[4 * c0 + 0] + 1];                                 \
+	tv1 = T[2 * (uint32_t)x[4 * c1 + 1]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c1 + 1] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tu ^= tv1;                                                               \
+	tl ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c2 + 2]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c2 + 2] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tu ^= tv1;                                                               \
+	tl ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c3 + 3]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c3 + 3] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tu ^= tv1;                                                               \
+	tl ^= tv2;                                                               \
+	tl ^= T[2 * (uint32_t)x[4 * c4 + 0]];                                    \
+	tu ^= T[2 * (uint32_t)x[4 * c4 + 0] + 1];                                \
+	tv1 = T[2 * (uint32_t)x[4 * c5 + 1]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c5 + 1] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tl ^= tv1;                                                               \
+	tu ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c6 + 2]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c6 + 2] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tl ^= tv1;                                                               \
+	tu ^= tv2;                                                               \
+	tv1 = T[2 * (uint32_t)x[4 * c7 + 3]];                                    \
+	tv2 = T[2 * (uint32_t)x[4 * c7 + 3] + 1];                                \
+	ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tl ^= tv1;                                                               \
+	tu ^= tv2;                                                               \
+	y[i] = tu;                                                               \
+	y[i + 1] = tl;
 
 /* compute one round of P (short variants) */
-static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) {
-  uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
-  uint32_t* x32 = (uint32_t*)x;
-  x32[ 0] ^= 0x00000000^r;
-  x32[ 2] ^= 0x00000010^r;
-  x32[ 4] ^= 0x00000020^r;
-  x32[ 6] ^= 0x00000030^r;
-  x32[ 8] ^= 0x00000040^r;
-  x32[10] ^= 0x00000050^r;
-  x32[12] ^= 0x00000060^r;
-  x32[14] ^= 0x00000070^r;
-  COLUMN(x,y, 0,  0,  2,  4,  6,  9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 2,  2,  4,  6,  8, 11, 13, 15,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 4,  4,  6,  8, 10, 13, 15,  1,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 6,  6,  8, 10, 12, 15,  1,  3,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 8,  8, 10, 12, 14,  1,  3,  5,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,10, 10, 12, 14,  0,  3,  5,  7,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,12, 12, 14,  0,  2,  5,  7,  9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,14, 14,  0,  2,  4,  7,  9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+static void RND512P(uint8_t* x, uint32_t* y, uint32_t r)
+{
+	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
+	uint32_t* x32 = (uint32_t*)x;
+	x32[0] ^= 0x00000000 ^ r;
+	x32[2] ^= 0x00000010 ^ r;
+	x32[4] ^= 0x00000020 ^ r;
+	x32[6] ^= 0x00000030 ^ r;
+	x32[8] ^= 0x00000040 ^ r;
+	x32[10] ^= 0x00000050 ^ r;
+	x32[12] ^= 0x00000060 ^ r;
+	x32[14] ^= 0x00000070 ^ r;
+	COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
 /* compute one round of Q (short variants) */
-static void RND512Q(uint8_t *x, uint32_t *y, uint32_t r) {
-  uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
-  uint32_t* x32 = (uint32_t*)x;
-  x32[ 0] = ~x32[ 0];
-  x32[ 1] ^= 0xffffffff^r;
-  x32[ 2] = ~x32[ 2];
-  x32[ 3] ^= 0xefffffff^r;
-  x32[ 4] = ~x32[ 4];
-  x32[ 5] ^= 0xdfffffff^r;
-  x32[ 6] = ~x32[ 6];
-  x32[ 7] ^= 0xcfffffff^r;
-  x32[ 8] = ~x32[ 8];
-  x32[ 9] ^= 0xbfffffff^r;
-  x32[10] = ~x32[10];
-  x32[11] ^= 0xafffffff^r;
-  x32[12] = ~x32[12];
-  x32[13] ^= 0x9fffffff^r;
-  x32[14] = ~x32[14];
-  x32[15] ^= 0x8fffffff^r;
-  COLUMN(x,y, 0,  2,  6, 10, 14,  1,  5,  9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 2,  4,  8, 12,  0,  3,  7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 4,  6, 10, 14,  2,  5,  9, 13,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 6,  8, 12,  0,  4,  7, 11, 15,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y, 8, 10, 14,  2,  6,  9, 13,  1,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,10, 12,  0,  4,  8, 11, 15,  3,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,12, 14,  2,  6, 10, 13,  1,  5,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-  COLUMN(x,y,14,  0,  4,  8, 12, 15,  3,  7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+static void RND512Q(uint8_t* x, uint32_t* y, uint32_t r)
+{
+	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
+	uint32_t* x32 = (uint32_t*)x;
+	x32[0] = ~x32[0];
+	x32[1] ^= 0xffffffff ^ r;
+	x32[2] = ~x32[2];
+	x32[3] ^= 0xefffffff ^ r;
+	x32[4] = ~x32[4];
+	x32[5] ^= 0xdfffffff ^ r;
+	x32[6] = ~x32[6];
+	x32[7] ^= 0xcfffffff ^ r;
+	x32[8] = ~x32[8];
+	x32[9] ^= 0xbfffffff ^ r;
+	x32[10] = ~x32[10];
+	x32[11] ^= 0xafffffff ^ r;
+	x32[12] = ~x32[12];
+	x32[13] ^= 0x9fffffff ^ r;
+	x32[14] = ~x32[14];
+	x32[15] ^= 0x8fffffff ^ r;
+	COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
 /* compute compression function (short variants) */
-static void F512(uint32_t *h, const uint32_t *m) {
-  int i;
-  uint32_t Ptmp[2*COLS512];
-  uint32_t Qtmp[2*COLS512];
-  uint32_t y[2*COLS512];
-  uint32_t z[2*COLS512];
-
-  for (i = 0; i < 2*COLS512; i++) {
-	z[i] = m[i];
-	Ptmp[i] = h[i]^m[i];
-  }
-
-  /* compute Q(m) */
-  RND512Q((uint8_t*)z, y, 0x00000000);
-  RND512Q((uint8_t*)y, z, 0x01000000);
-  RND512Q((uint8_t*)z, y, 0x02000000);
-  RND512Q((uint8_t*)y, z, 0x03000000);
-  RND512Q((uint8_t*)z, y, 0x04000000);
-  RND512Q((uint8_t*)y, z, 0x05000000);
-  RND512Q((uint8_t*)z, y, 0x06000000);
-  RND512Q((uint8_t*)y, z, 0x07000000);
-  RND512Q((uint8_t*)z, y, 0x08000000);
-  RND512Q((uint8_t*)y, Qtmp, 0x09000000);
-
-  /* compute P(h+m) */
-  RND512P((uint8_t*)Ptmp, y, 0x00000000);
-  RND512P((uint8_t*)y, z, 0x00000001);
-  RND512P((uint8_t*)z, y, 0x00000002);
-  RND512P((uint8_t*)y, z, 0x00000003);
-  RND512P((uint8_t*)z, y, 0x00000004);
-  RND512P((uint8_t*)y, z, 0x00000005);
-  RND512P((uint8_t*)z, y, 0x00000006);
-  RND512P((uint8_t*)y, z, 0x00000007);
-  RND512P((uint8_t*)z, y, 0x00000008);
-  RND512P((uint8_t*)y, Ptmp, 0x00000009);
-
-  /* compute P(h+m) + Q(m) + h */
-  for (i = 0; i < 2*COLS512; i++) {
-	h[i] ^= Ptmp[i]^Qtmp[i];
-  }
-}
+static void F512(uint32_t* h, const uint32_t* m)
+{
+	int i;
+	uint32_t Ptmp[2 * COLS512];
+	uint32_t Qtmp[2 * COLS512];
+	uint32_t y[2 * COLS512];
+	uint32_t z[2 * COLS512];
+
+	for(i = 0; i < 2 * COLS512; i++)
+	{
+		z[i] = m[i];
+		Ptmp[i] = h[i] ^ m[i];
+	}
 
+	/* compute Q(m) */
+	RND512Q((uint8_t*)z, y, 0x00000000);
+	RND512Q((uint8_t*)y, z, 0x01000000);
+	RND512Q((uint8_t*)z, y, 0x02000000);
+	RND512Q((uint8_t*)y, z, 0x03000000);
+	RND512Q((uint8_t*)z, y, 0x04000000);
+	RND512Q((uint8_t*)y, z, 0x05000000);
+	RND512Q((uint8_t*)z, y, 0x06000000);
+	RND512Q((uint8_t*)y, z, 0x07000000);
+	RND512Q((uint8_t*)z, y, 0x08000000);
+	RND512Q((uint8_t*)y, Qtmp, 0x09000000);
+
+	/* compute P(h+m) */
+	RND512P((uint8_t*)Ptmp, y, 0x00000000);
+	RND512P((uint8_t*)y, z, 0x00000001);
+	RND512P((uint8_t*)z, y, 0x00000002);
+	RND512P((uint8_t*)y, z, 0x00000003);
+	RND512P((uint8_t*)z, y, 0x00000004);
+	RND512P((uint8_t*)y, z, 0x00000005);
+	RND512P((uint8_t*)z, y, 0x00000006);
+	RND512P((uint8_t*)y, z, 0x00000007);
+	RND512P((uint8_t*)z, y, 0x00000008);
+	RND512P((uint8_t*)y, Ptmp, 0x00000009);
+
+	/* compute P(h+m) + Q(m) + h */
+	for(i = 0; i < 2 * COLS512; i++)
+	{
+		h[i] ^= Ptmp[i] ^ Qtmp[i];
+	}
+}
 
 /* digest up to msglen bytes of input (full blocks only) */
-static void Transform(groestlHashState *ctx,
-	       const uint8_t *input,
-	       int msglen) {
+static void Transform(groestlHashState* ctx,
+	const uint8_t* input,
+	int msglen)
+{
 
-  /* digest message, one block at a time */
-  for (; msglen >= SIZE512;
-	   msglen -= SIZE512, input += SIZE512) {
-	F512(ctx->chaining,(uint32_t*)input);
+	/* digest message, one block at a time */
+	for(; msglen >= SIZE512;
+		msglen -= SIZE512, input += SIZE512)
+	{
+		F512(ctx->chaining, (uint32_t*)input);
 
-	/* increment block counter */
-	ctx->block_counter1++;
-	if (ctx->block_counter1 == 0) ctx->block_counter2++;
-  }
+		/* increment block counter */
+		ctx->block_counter1++;
+		if(ctx->block_counter1 == 0)
+			ctx->block_counter2++;
+	}
 }
 
 /* given state h, do h <- P(h)+h */
-static void OutputTransformation(groestlHashState *ctx) {
-  int j;
-  uint32_t temp[2*COLS512];
-  uint32_t y[2*COLS512];
-  uint32_t z[2*COLS512];
-
-
-
-	for (j = 0; j < 2*COLS512; j++) {
-	  temp[j] = ctx->chaining[j];
+static void OutputTransformation(groestlHashState* ctx)
+{
+	int j;
+	uint32_t temp[2 * COLS512];
+	uint32_t y[2 * COLS512];
+	uint32_t z[2 * COLS512];
+
+	for(j = 0; j < 2 * COLS512; j++)
+	{
+		temp[j] = ctx->chaining[j];
 	}
 	RND512P((uint8_t*)temp, y, 0x00000000);
 	RND512P((uint8_t*)y, z, 0x00000001);
@@ -197,75 +204,84 @@ static void OutputTransformation(groestlHashState *ctx) {
 	RND512P((uint8_t*)y, z, 0x00000007);
 	RND512P((uint8_t*)z, y, 0x00000008);
 	RND512P((uint8_t*)y, temp, 0x00000009);
-	for (j = 0; j < 2*COLS512; j++) {
-	  ctx->chaining[j] ^= temp[j];
+	for(j = 0; j < 2 * COLS512; j++)
+	{
+		ctx->chaining[j] ^= temp[j];
 	}
 }
 
 /* initialise context */
-static void Init(groestlHashState* ctx) {
-  int i = 0;
-  /* allocate memory for state and data buffer */
-
-  for(;i<(SIZE512/sizeof(uint32_t));i++)
-  {
-	ctx->chaining[i] = 0;
-  }
-
-  /* set initial value */
-  ctx->chaining[2*COLS512-1] = u32BIG((uint32_t)HASH_BIT_LEN);
-
-  /* set other variables */
-  ctx->buf_ptr = 0;
-  ctx->block_counter1 = 0;
-  ctx->block_counter2 = 0;
-  ctx->bits_in_last_byte = 0;
+static void Init(groestlHashState* ctx)
+{
+	int i = 0;
+	/* allocate memory for state and data buffer */
+
+	for(; i < (SIZE512 / sizeof(uint32_t)); i++)
+	{
+		ctx->chaining[i] = 0;
+	}
+
+	/* set initial value */
+	ctx->chaining[2 * COLS512 - 1] = u32BIG((uint32_t)HASH_BIT_LEN);
+
+	/* set other variables */
+	ctx->buf_ptr = 0;
+	ctx->block_counter1 = 0;
+	ctx->block_counter2 = 0;
+	ctx->bits_in_last_byte = 0;
 }
 
 /* update state with databitlen bits of input */
 static void Update(groestlHashState* ctx,
-		  const BitSequence* input,
-		  DataLength databitlen) {
-  int index = 0;
-  int msglen = (int)(databitlen/8);
-  int rem = (int)(databitlen%8);
+	const BitSequence* input,
+	DataLength databitlen)
+{
+	int index = 0;
+	int msglen = (int)(databitlen / 8);
+	int rem = (int)(databitlen % 8);
 
-  /* if the buffer contains data that has not yet been digested, first
+	/* if the buffer contains data that has not yet been digested, first
 	 add data to buffer until full */
-  if (ctx->buf_ptr) {
-	while (ctx->buf_ptr < SIZE512 && index < msglen) {
-	  ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
-	}
-	if (ctx->buf_ptr < SIZE512) {
-	  /* buffer still not full, return */
-	  if (rem) {
-	ctx->bits_in_last_byte = rem;
-	ctx->buffer[(int)ctx->buf_ptr++] = input[index];
-	  }
-	  return;
+	if(ctx->buf_ptr)
+	{
+		while(ctx->buf_ptr < SIZE512 && index < msglen)
+		{
+			ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+		}
+		if(ctx->buf_ptr < SIZE512)
+		{
+			/* buffer still not full, return */
+			if(rem)
+			{
+				ctx->bits_in_last_byte = rem;
+				ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+			}
+			return;
+		}
+
+		/* digest buffer */
+		ctx->buf_ptr = 0;
+		Transform(ctx, ctx->buffer, SIZE512);
 	}
 
-	/* digest buffer */
-	ctx->buf_ptr = 0;
-	Transform(ctx, ctx->buffer, SIZE512);
-  }
+	/* digest bulk of message */
+	Transform(ctx, input + index, msglen - index);
+	index += ((msglen - index) / SIZE512) * SIZE512;
 
-  /* digest bulk of message */
-  Transform(ctx, input+index, msglen-index);
-  index += ((msglen-index)/SIZE512)*SIZE512;
-
-  /* store remaining data in buffer */
-  while (index < msglen) {
-	ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
-  }
+	/* store remaining data in buffer */
+	while(index < msglen)
+	{
+		ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
+	}
 
-  /* if non-integral number of bytes have been supplied, store
+	/* if non-integral number of bytes have been supplied, store
 	 remaining bits in last byte, together with information about
 	 number of bits */
-  if (rem) {
-	ctx->bits_in_last_byte = rem;
-	ctx->buffer[(int)ctx->buf_ptr++] = input[index];
-  }
+	if(rem)
+	{
+		ctx->bits_in_last_byte = rem;
+		ctx->buffer[(int)ctx->buf_ptr++] = input[index];
+	}
 }
 
 #define BILB ctx->bits_in_last_byte
@@ -273,80 +289,92 @@ static void Update(groestlHashState* ctx,
 /* finalise: process remaining data (including padding), perform
    output transformation, and write hash result to 'output' */
 static void Final(groestlHashState* ctx,
-		 BitSequence* output) {
-  int i, j = 0, hashbytelen = HASH_BIT_LEN/8;
-  uint8_t *s = (BitSequence*)ctx->chaining;
-
-  /* pad with '1'-bit and first few '0'-bits */
-  if (BILB) {
-	ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
-	ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
-	BILB = 0;
-  }
-  else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
-
-  /* pad with '0'-bits */
-  if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
-	/* padding requires two blocks */
-	while (ctx->buf_ptr < SIZE512) {
-	  ctx->buffer[(int)ctx->buf_ptr++] = 0;
+	BitSequence* output)
+{
+	int i, j = 0, hashbytelen = HASH_BIT_LEN / 8;
+	uint8_t* s = (BitSequence*)ctx->chaining;
+
+	/* pad with '1'-bit and first few '0'-bits */
+	if(BILB)
+	{
+		ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << BILB) - 1) << (8 - BILB);
+		ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - BILB);
+		BILB = 0;
+	}
+	else
+		ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
+
+	/* pad with '0'-bits */
+	if(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN)
+	{
+		/* padding requires two blocks */
+		while(ctx->buf_ptr < SIZE512)
+		{
+			ctx->buffer[(int)ctx->buf_ptr++] = 0;
+		}
+		/* digest first padding block */
+		Transform(ctx, ctx->buffer, SIZE512);
+		ctx->buf_ptr = 0;
 	}
-	/* digest first padding block */
+	while(ctx->buf_ptr < SIZE512 - LENGTHFIELDLEN)
+	{
+		ctx->buffer[(int)ctx->buf_ptr++] = 0;
+	}
+
+	/* length padding */
+	ctx->block_counter1++;
+	if(ctx->block_counter1 == 0)
+		ctx->block_counter2++;
+	ctx->buf_ptr = SIZE512;
+
+	while(ctx->buf_ptr > SIZE512 - (int)sizeof(uint32_t))
+	{
+		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
+		ctx->block_counter1 >>= 8;
+	}
+	while(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN)
+	{
+		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
+		ctx->block_counter2 >>= 8;
+	}
+	/* digest final padding block */
 	Transform(ctx, ctx->buffer, SIZE512);
-	ctx->buf_ptr = 0;
-  }
-  while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) {
-	ctx->buffer[(int)ctx->buf_ptr++] = 0;
-  }
-
-  /* length padding */
-  ctx->block_counter1++;
-  if (ctx->block_counter1 == 0) ctx->block_counter2++;
-  ctx->buf_ptr = SIZE512;
-
-  while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) {
-	ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
-	ctx->block_counter1 >>= 8;
-  }
-  while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) {
-	ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
-	ctx->block_counter2 >>= 8;
-  }
-  /* digest final padding block */
-  Transform(ctx, ctx->buffer, SIZE512);
-  /* perform output transformation */
-  OutputTransformation(ctx);
-
-  /* store hash result in output */
-  for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) {
-	output[j] = s[i];
-  }
-
-  /* zeroise relevant variables and deallocate memory */
-  for (i = 0; i < COLS512; i++) {
-	ctx->chaining[i] = 0;
-  }
-  for (i = 0; i < SIZE512; i++) {
-	ctx->buffer[i] = 0;
-  }
+	/* perform output transformation */
+	OutputTransformation(ctx);
+
+	/* store hash result in output */
+	for(i = SIZE512 - hashbytelen; i < SIZE512; i++, j++)
+	{
+		output[j] = s[i];
+	}
+
+	/* zeroise relevant variables and deallocate memory */
+	for(i = 0; i < COLS512; i++)
+	{
+		ctx->chaining[i] = 0;
+	}
+	for(i = 0; i < SIZE512; i++)
+	{
+		ctx->buffer[i] = 0;
+	}
 }
 
 /* hash bit sequence */
 void groestl(const BitSequence* data,
-		DataLength databitlen,
-		BitSequence* hashval) {
+	DataLength databitlen,
+	BitSequence* hashval)
+{
 
-  groestlHashState context;
+	groestlHashState context;
 
-  /* initialise */
+	/* initialise */
 	Init(&context);
 
+	/* process message */
+	Update(&context, data, databitlen);
 
-  /* process message */
-  Update(&context, data, databitlen);
-
-  /* finalise */
-  Final(&context, hashval);
+	/* finalise */
+	Final(&context, hashval);
 }
 /*
 static int crypto_hash(unsigned char *out,
diff --git a/xmrstak/backend/cpu/crypto/c_groestl.h b/xmrstak/backend/cpu/crypto/c_groestl.h
index 47044b462ea5ae60e2b6cf0ed78fae51395e8dcb..5322a2e2ea82e3c251a18ab9ecca3b9d7c00c0ce 100644
--- a/xmrstak/backend/cpu/crypto/c_groestl.h
+++ b/xmrstak/backend/cpu/crypto/c_groestl.h
@@ -1,10 +1,10 @@
 #ifndef __hash_h
 #define __hash_h
 /*
-#include "crypto_uint8.h"
+#include "crypto_hash.h"
 #include "crypto_uint32.h"
 #include "crypto_uint64.h"
-#include "crypto_hash.h"
+#include "crypto_uint8.h"
 
 typedef crypto_uint8 uint8_t;
 typedef crypto_uint32 uint32_t;
@@ -19,29 +19,28 @@ typedef crypto_uint64 uint64_t;
 #define LENGTHFIELDLEN ROWS
 #define COLS512 8
 
-#define SIZE512 (ROWS*COLS512)
+#define SIZE512 (ROWS * COLS512)
 
 #define ROUNDS512 10
 #define HASH_BIT_LEN 256
 
-#define ROTL32(v, n) ((((v)<<(n))|((v)>>(32-(n))))&li_32(ffffffff))
-
+#define ROTL32(v, n) ((((v) << (n)) | ((v) >> (32 - (n)))) & li_32(ffffffff))
 
 #define li_32(h) 0x##h##u
-#define EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
-#define u32BIG(a)				\
-  ((ROTL32(a,8) & li_32(00FF00FF)) |		\
-   (ROTL32(a,24) & li_32(FF00FF00)))
-
+#define EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n)))
+#define u32BIG(a)                       \
+	((ROTL32(a, 8) & li_32(00FF00FF)) | \
+		(ROTL32(a, 24) & li_32(FF00FF00)))
 
 /* NIST API begin */
-typedef struct {
-  uint32_t chaining[SIZE512/sizeof(uint32_t)];            /* actual state */
-  uint32_t block_counter1,
-  block_counter2;         /* message block counter(s) */
-  BitSequence buffer[SIZE512];      /* data buffer */
-  int buf_ptr;              /* data buffer pointer */
-  int bits_in_last_byte;    /* no. of message bits in last byte of
+typedef struct
+{
+	uint32_t chaining[SIZE512 / sizeof(uint32_t)]; /* actual state */
+	uint32_t block_counter1,
+		block_counter2;			 /* message block counter(s) */
+	BitSequence buffer[SIZE512]; /* data buffer */
+	int buf_ptr;				 /* data buffer pointer */
+	int bits_in_last_byte;		 /* no. of message bits in last byte of
                                data buffer */
 } groestlHashState;
 
diff --git a/xmrstak/backend/cpu/crypto/c_jh.c b/xmrstak/backend/cpu/crypto/c_jh.c
index 0256a0fa22d539564fb78458657ca0439e0372d2..e50886dee28a2a910f63b6921e96af5ee3a3540b 100644
--- a/xmrstak/backend/cpu/crypto/c_jh.c
+++ b/xmrstak/backend/cpu/crypto/c_jh.c
@@ -23,345 +23,400 @@ typedef uint64_t uint64;
 
 /*define data alignment for different C compilers*/
 #if defined(__GNUC__)
-	  #define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
+#define DATA_ALIGN16(x) x __attribute__((aligned(16)))
 #else
-	  #define DATA_ALIGN16(x) __declspec(align(16)) x
+#define DATA_ALIGN16(x) __declspec(align(16)) x
 #endif
 
-
-typedef struct {
-	int hashbitlen;	   	              /*the message digest size*/
-	unsigned long long databitlen;    /*the message size in bits*/
-	unsigned long long datasize_in_buffer;      /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/
-	DATA_ALIGN16(uint64 x[8][2]);     /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/
-	unsigned char buffer[64];         /*the 512-bit message block to be hashed;*/
+typedef struct
+{
+	int hashbitlen;						   /*the message digest size*/
+	unsigned long long databitlen;		   /*the message size in bits*/
+	unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/
+	DATA_ALIGN16(uint64 x[8][2]);		   /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/
+	unsigned char buffer[64];			   /*the 512-bit message block to be hashed;*/
 } hashState;
 
-
 /*The initial hash value H(0)*/
-const unsigned char JH224_H0[128]={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e};
-const unsigned char JH256_H0[128]={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69};
-const unsigned char JH384_H0[128]={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f};
-const unsigned char JH512_H0[128]={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
+const unsigned char JH224_H0[128] = {0x2d, 0xfe, 0xdd, 0x62, 0xf9, 0x9a, 0x98, 0xac, 0xae, 0x7c, 0xac, 0xd6, 0x19, 0xd6, 0x34, 0xe7, 0xa4, 0x83, 0x10, 0x5, 0xbc, 0x30, 0x12, 0x16, 0xb8, 0x60, 0x38, 0xc6, 0xc9, 0x66, 0x14, 0x94, 0x66, 0xd9, 0x89, 0x9f, 0x25, 0x80, 0x70, 0x6f, 0xce, 0x9e, 0xa3, 0x1b, 0x1d, 0x9b, 0x1a, 0xdc, 0x11, 0xe8, 0x32, 0x5f, 0x7b, 0x36, 0x6e, 0x10, 0xf9, 0x94, 0x85, 0x7f, 0x2, 0xfa, 0x6, 0xc1, 0x1b, 0x4f, 0x1b, 0x5c, 0xd8, 0xc8, 0x40, 0xb3, 0x97, 0xf6, 0xa1, 0x7f, 0x6e, 0x73, 0x80, 0x99, 0xdc, 0xdf, 0x93, 0xa5, 0xad, 0xea, 0xa3, 0xd3, 0xa4, 0x31, 0xe8, 0xde, 0xc9, 0x53, 0x9a, 0x68, 0x22, 0xb4, 0xa9, 0x8a, 0xec, 0x86, 0xa1, 0xe4, 0xd5, 0x74, 0xac, 0x95, 0x9c, 0xe5, 0x6c, 0xf0, 0x15, 0x96, 0xd, 0xea, 0xb5, 0xab, 0x2b, 0xbf, 0x96, 0x11, 0xdc, 0xf0, 0xdd, 0x64, 0xea, 0x6e};
+const unsigned char JH256_H0[128] = {0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69};
+const unsigned char JH384_H0[128] = {0x48, 0x1e, 0x3b, 0xc6, 0xd8, 0x13, 0x39, 0x8a, 0x6d, 0x3b, 0x5e, 0x89, 0x4a, 0xde, 0x87, 0x9b, 0x63, 0xfa, 0xea, 0x68, 0xd4, 0x80, 0xad, 0x2e, 0x33, 0x2c, 0xcb, 0x21, 0x48, 0xf, 0x82, 0x67, 0x98, 0xae, 0xc8, 0x4d, 0x90, 0x82, 0xb9, 0x28, 0xd4, 0x55, 0xea, 0x30, 0x41, 0x11, 0x42, 0x49, 0x36, 0xf5, 0x55, 0xb2, 0x92, 0x48, 0x47, 0xec, 0xc7, 0x25, 0xa, 0x93, 0xba, 0xf4, 0x3c, 0xe1, 0x56, 0x9b, 0x7f, 0x8a, 0x27, 0xdb, 0x45, 0x4c, 0x9e, 0xfc, 0xbd, 0x49, 0x63, 0x97, 0xaf, 0xe, 0x58, 0x9f, 0xc2, 0x7d, 0x26, 0xaa, 0x80, 0xcd, 0x80, 0xc0, 0x8b, 0x8c, 0x9d, 0xeb, 0x2e, 0xda, 0x8a, 0x79, 0x81, 0xe8, 0xf8, 0xd5, 0x37, 0x3a, 0xf4, 0x39, 0x67, 0xad, 0xdd, 0xd1, 0x7a, 0x71, 0xa9, 0xb4, 0xd3, 0xbd, 0xa4, 0x75, 0xd3, 0x94, 0x97, 0x6c, 0x3f, 0xba, 0x98, 0x42, 0x73, 0x7f};
+const unsigned char JH512_H0[128] = {0x6f, 0xd1, 0x4b, 0x96, 0x3e, 0x0, 0xaa, 0x17, 0x63, 0x6a, 0x2e, 0x5, 0x7a, 0x15, 0xd5, 0x43, 0x8a, 0x22, 0x5e, 0x8d, 0xc, 0x97, 0xef, 0xb, 0xe9, 0x34, 0x12, 0x59, 0xf2, 0xb3, 0xc3, 0x61, 0x89, 0x1d, 0xa0, 0xc1, 0x53, 0x6f, 0x80, 0x1e, 0x2a, 0xa9, 0x5, 0x6b, 0xea, 0x2b, 0x6d, 0x80, 0x58, 0x8e, 0xcc, 0xdb, 0x20, 0x75, 0xba, 0xa6, 0xa9, 0xf, 0x3a, 0x76, 0xba, 0xf8, 0x3b, 0xf7, 0x1, 0x69, 0xe6, 0x5, 0x41, 0xe3, 0x4a, 0x69, 0x46, 0xb5, 0x8a, 0x8e, 0x2e, 0x6f, 0xe6, 0x5a, 0x10, 0x47, 0xa7, 0xd0, 0xc1, 0x84, 0x3c, 0x24, 0x3b, 0x6e, 0x71, 0xb1, 0x2d, 0x5a, 0xc1, 0x99, 0xcf, 0x57, 0xf6, 0xec, 0x9d, 0xb1, 0xf8, 0x56, 0xa7, 0x6, 0x88, 0x7c, 0x57, 0x16, 0xb1, 0x56, 0xe3, 0xc2, 0xfc, 0xdf, 0xe6, 0x85, 0x17, 0xfb, 0x54, 0x5a, 0x46, 0x78, 0xcc, 0x8c, 0xdd, 0x4b};
 
 /*42 round constants, each round constant is 32-byte (256-bit)*/
-const unsigned char E8_bitslice_roundconstant[42][32]={
-{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
-{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
-{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
-{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
-{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
-{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
-{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
-{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
-{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
-{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
-{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
-{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
-{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
-{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
-{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
-{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
-{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
-{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
-{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
-{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
-{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
-{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
-{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
-{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
-{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
-{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
-{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
-{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
-{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
-{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
-{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
-{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
-{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
-{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
-{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
-{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
-{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
-{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
-{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
-{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
-{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
-{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
-
-
-static void E8(hashState *state);  /*The bijective function E8, in bitslice form*/
-static void F8(hashState *state);  /*The compression function F8 */
+const unsigned char E8_bitslice_roundconstant[42][32] = {
+	{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
+	{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
+	{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
+	{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
+	{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
+	{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
+	{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
+	{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
+	{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
+	{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
+	{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
+	{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
+	{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
+	{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
+	{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
+	{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
+	{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
+	{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
+	{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
+	{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
+	{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
+	{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
+	{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
+	{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
+	{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
+	{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
+	{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
+	{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
+	{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
+	{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
+	{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
+	{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
+	{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
+	{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
+	{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
+	{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
+	{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
+	{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
+	{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
+	{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
+	{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
+	{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}};
+
+static void E8(hashState* state); /*The bijective function E8, in bitslice form*/
+static void F8(hashState* state); /*The compression function F8 */
 
 /*The API functions*/
-static HashReturn Init(hashState *state, int hashbitlen);
-static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
-static HashReturn Final(hashState *state, BitSequence *hashval);
-HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval);
+static HashReturn Init(hashState* state, int hashbitlen);
+static HashReturn Update(hashState* state, const BitSequence* data, DataLength databitlen);
+static HashReturn Final(hashState* state, BitSequence* hashval);
+HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval);
 
 /*swapping bit 2i with bit 2i+1 of 64-bit x*/
-#define SWAP1(x)   (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1));
+#define SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1));
 /*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 64-bit x*/
-#define SWAP2(x)   (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2));
+#define SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2));
 /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 64-bit x*/
-#define SWAP4(x)   (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4));
+#define SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4));
 /*swapping bits 16i||16i+1||......||16i+7  with bits 16i+8||16i+9||......||16i+15 of 64-bit x*/
-#define SWAP8(x)   (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8));
+#define SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8));
 /*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 64-bit x*/
-#define SWAP16(x)  (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16));
+#define SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | (((x)&0xffff0000ffff0000ULL) >> 16));
 /*swapping bits 64i||64i+1||......||64i+31 with bits 64i+32||64i+33||......||64i+63 of 64-bit x*/
-#define SWAP32(x)  (x) = (((x) << 32) | ((x) >> 32));
+#define SWAP32(x) (x) = (((x) << 32) | ((x) >> 32));
 
 /*The MDS transform*/
-#define L(m0,m1,m2,m3,m4,m5,m6,m7) \
-	  (m4) ^= (m1);                \
-	  (m5) ^= (m2);                \
-	  (m6) ^= (m0) ^ (m3);         \
-	  (m7) ^= (m0);                \
-	  (m0) ^= (m5);                \
-	  (m1) ^= (m6);                \
-	  (m2) ^= (m4) ^ (m7);         \
-	  (m3) ^= (m4);
+#define L(m0, m1, m2, m3, m4, m5, m6, m7) \
+	(m4) ^= (m1);                         \
+	(m5) ^= (m2);                         \
+	(m6) ^= (m0) ^ (m3);                  \
+	(m7) ^= (m0);                         \
+	(m0) ^= (m5);                         \
+	(m1) ^= (m6);                         \
+	(m2) ^= (m4) ^ (m7);                  \
+	(m3) ^= (m4);
 
 /*Two Sboxes are computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/
 /*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power*/
-#define SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1)   \
-	  m3  = ~(m3);                  \
-	  m7  = ~(m7);                  \
-	  m0 ^= ((~(m2)) & (cc0));      \
-	  m4 ^= ((~(m6)) & (cc1));      \
-	  temp0 = (cc0) ^ ((m0) & (m1));\
-	  temp1 = (cc1) ^ ((m4) & (m5));\
-	  m0 ^= ((m2) & (m3));          \
-	  m4 ^= ((m6) & (m7));          \
-	  m3 ^= ((~(m1)) & (m2));       \
-	  m7 ^= ((~(m5)) & (m6));       \
-	  m1 ^= ((m0) & (m2));          \
-	  m5 ^= ((m4) & (m6));          \
-	  m2 ^= ((m0) & (~(m3)));       \
-	  m6 ^= ((m4) & (~(m7)));       \
-	  m0 ^= ((m1) | (m3));          \
-	  m4 ^= ((m5) | (m7));          \
-	  m3 ^= ((m1) & (m2));          \
-	  m7 ^= ((m5) & (m6));          \
-	  m1 ^= (temp0 & (m0));         \
-	  m5 ^= (temp1 & (m4));         \
-	  m2 ^= temp0;                  \
-	  m6 ^= temp1;
+#define SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \
+	m3 = ~(m3);                                      \
+	m7 = ~(m7);                                      \
+	m0 ^= ((~(m2)) & (cc0));                         \
+	m4 ^= ((~(m6)) & (cc1));                         \
+	temp0 = (cc0) ^ ((m0) & (m1));                   \
+	temp1 = (cc1) ^ ((m4) & (m5));                   \
+	m0 ^= ((m2) & (m3));                             \
+	m4 ^= ((m6) & (m7));                             \
+	m3 ^= ((~(m1)) & (m2));                          \
+	m7 ^= ((~(m5)) & (m6));                          \
+	m1 ^= ((m0) & (m2));                             \
+	m5 ^= ((m4) & (m6));                             \
+	m2 ^= ((m0) & (~(m3)));                          \
+	m6 ^= ((m4) & (~(m7)));                          \
+	m0 ^= ((m1) | (m3));                             \
+	m4 ^= ((m5) | (m7));                             \
+	m3 ^= ((m1) & (m2));                             \
+	m7 ^= ((m5) & (m6));                             \
+	m1 ^= (temp0 & (m0));                            \
+	m5 ^= (temp1 & (m4));                            \
+	m2 ^= temp0;                                     \
+	m6 ^= temp1;
 
 /*The bijective function E8, in bitslice form*/
-static void E8(hashState *state)
+static void E8(hashState* state)
 {
-	  uint64 i,roundnumber,temp0,temp1;
-
-	  for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) {
-			/*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP1(state->x[1][i]); SWAP1(state->x[3][i]); SWAP1(state->x[5][i]); SWAP1(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP2(state->x[1][i]); SWAP2(state->x[3][i]); SWAP2(state->x[5][i]); SWAP2(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP4(state->x[1][i]); SWAP4(state->x[3][i]); SWAP4(state->x[5][i]); SWAP4(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP8(state->x[1][i]); SWAP8(state->x[3][i]); SWAP8(state->x[5][i]); SWAP8(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP16(state->x[1][i]); SWAP16(state->x[3][i]); SWAP16(state->x[5][i]); SWAP16(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-				  SWAP32(state->x[1][i]); SWAP32(state->x[3][i]); SWAP32(state->x[5][i]); SWAP32(state->x[7][i]);
-			}
-
-			/*round 7*roundnumber+6: Sbox and MDS layers*/
-			for (i = 0; i < 2; i++) {
-				  SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i+2] );
-				  L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			}
-			/*round 7*roundnumber+6: swapping layer*/
-			for (i = 1; i < 8; i = i+2) {
-				  temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0;
-			}
-	  }
-
+	uint64 i, roundnumber, temp0, temp1;
+
+	for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7)
+	{
+		/*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP1(state->x[1][i]);
+			SWAP1(state->x[3][i]);
+			SWAP1(state->x[5][i]);
+			SWAP1(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP2(state->x[1][i]);
+			SWAP2(state->x[3][i]);
+			SWAP2(state->x[5][i]);
+			SWAP2(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP4(state->x[1][i]);
+			SWAP4(state->x[3][i]);
+			SWAP4(state->x[5][i]);
+			SWAP4(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP8(state->x[1][i]);
+			SWAP8(state->x[3][i]);
+			SWAP8(state->x[5][i]);
+			SWAP8(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP16(state->x[1][i]);
+			SWAP16(state->x[3][i]);
+			SWAP16(state->x[5][i]);
+			SWAP16(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			SWAP32(state->x[1][i]);
+			SWAP32(state->x[3][i]);
+			SWAP32(state->x[5][i]);
+			SWAP32(state->x[7][i]);
+		}
+
+		/*round 7*roundnumber+6: Sbox and MDS layers*/
+		for(i = 0; i < 2; i++)
+		{
+			SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i + 2]);
+			L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+		}
+		/*round 7*roundnumber+6: swapping layer*/
+		for(i = 1; i < 8; i = i + 2)
+		{
+			temp0 = state->x[i][0];
+			state->x[i][0] = state->x[i][1];
+			state->x[i][1] = temp0;
+		}
+	}
 }
 
 /*The compression function F8 */
-static void F8(hashState *state)
+static void F8(hashState* state)
 {
-	  uint64  i;
+	uint64 i;
 
-	  /*xor the 512-bit message with the fist half of the 1024-bit hash state*/
-	  for (i = 0; i < 8; i++)  state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i];
+	/*xor the 512-bit message with the first half of the 1024-bit hash state*/
+	for(i = 0; i < 8; i++)
+		state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i];
 
-	  /*the bijective function E8 */
-	  E8(state);
+	/*the bijective function E8 */
+	E8(state);
 
-	  /*xor the 512-bit message with the second half of the 1024-bit hash state*/
-	  for (i = 0; i < 8; i++)  state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64*)state->buffer)[i];
+	/*xor the 512-bit message with the second half of the 1024-bit hash state*/
+	for(i = 0; i < 8; i++)
+		state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64*)state->buffer)[i];
 }
 
 /*before hashing a message, initialize the hash state as H0 */
-static HashReturn Init(hashState *state, int hashbitlen)
+static HashReturn Init(hashState* state, int hashbitlen)
 {
-	  state->databitlen = 0;
-	  state->datasize_in_buffer = 0;
-
-	  /*initialize the initial hash value of JH*/
-	  state->hashbitlen = hashbitlen;
-
-	  /*load the initial hash value into state*/
-	  switch (hashbitlen)
-	  {
-			case 224: memcpy(state->x,JH224_H0,128); break;
-			case 256: memcpy(state->x,JH256_H0,128); break;
-			case 384: memcpy(state->x,JH384_H0,128); break;
-			case 512: memcpy(state->x,JH512_H0,128); break;
-	  }
-
-	  return(SUCCESS);
+	state->databitlen = 0;
+	state->datasize_in_buffer = 0;
+
+	/*initialize the initial hash value of JH*/
+	state->hashbitlen = hashbitlen;
+
+	/*load the initial hash value into state*/
+	switch(hashbitlen)
+	{
+	case 224:
+		memcpy(state->x, JH224_H0, 128);
+		break;
+	case 256:
+		memcpy(state->x, JH256_H0, 128);
+		break;
+	case 384:
+		memcpy(state->x, JH384_H0, 128);
+		break;
+	case 512:
+		memcpy(state->x, JH512_H0, 128);
+		break;
+	}
+
+	return (SUCCESS);
 }
 
-
 /*hash each 512-bit message block, except the last partial block*/
-static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
+static HashReturn Update(hashState* state, const BitSequence* data, DataLength databitlen)
 {
-	  DataLength index; /*the starting address of the data to be compressed*/
-
-	  state->databitlen += databitlen;
-	  index = 0;
-
-	  /*if there is remaining data in the buffer, fill it to a full message block first*/
-	  /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/
-
-	  /*There is data in the buffer, but the incoming data is insufficient for a full block*/
-	  if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  ) {
-			if ( (databitlen & 7) == 0 ) {
-				 memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ;
-		    }
-			else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ;
-			state->datasize_in_buffer += databitlen;
-			databitlen = 0;
-	  }
-
-	  /*There is data in the buffer, and the incoming data is sufficient for a full block*/
-	  if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512)  ) {
-	        memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
-	        index = 64-(state->datasize_in_buffer >> 3);
-	        databitlen = databitlen - (512 - state->datasize_in_buffer);
-	        F8(state);
-	        state->datasize_in_buffer = 0;
-	  }
-
-	  /*hash the remaining full message blocks*/
-	  for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) {
-			memcpy(state->buffer, data+index, 64);
-			F8(state);
-	  }
-
-	  /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/
-	  if ( databitlen > 0) {
-			if ((databitlen & 7) == 0)
-				  memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
-			else
-				  memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
-			state->datasize_in_buffer = databitlen;
-	  }
-
-	  return(SUCCESS);
+	DataLength index; /*the starting address of the data to be compressed*/
+
+	state->databitlen += databitlen;
+	index = 0;
+
+	/*if there is remaining data in the buffer, fill it to a full message block first*/
+	/*we assume that the size of the data in the buffer is a multiple of 8 bits if it is not at the end of a message*/
+
+	/*There is data in the buffer, but the incoming data is insufficient for a full block*/
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512))
+	{
+		if((databitlen & 7) == 0)
+		{
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3));
+		}
+		else
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3) + 1);
+		state->datasize_in_buffer += databitlen;
+		databitlen = 0;
+	}
+
+	/*There is data in the buffer, and the incoming data is sufficient for a full block*/
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512))
+	{
+		memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3));
+		index = 64 - (state->datasize_in_buffer >> 3);
+		databitlen = databitlen - (512 - state->datasize_in_buffer);
+		F8(state);
+		state->datasize_in_buffer = 0;
+	}
+
+	/*hash the remaining full message blocks*/
+	for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512)
+	{
+		memcpy(state->buffer, data + index, 64);
+		F8(state);
+	}
+
+	/*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/
+	if(databitlen > 0)
+	{
+		if((databitlen & 7) == 0)
+			memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3);
+		else
+			memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1);
+		state->datasize_in_buffer = databitlen;
+	}
+
+	return (SUCCESS);
 }
 
 /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
-static HashReturn Final(hashState *state, BitSequence *hashval)
+static HashReturn Final(hashState* state, BitSequence* hashval)
 {
-	  unsigned int i;
-
-	  if ( (state->databitlen & 0x1ff) == 0 ) {
-			/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
-			memset(state->buffer, 0, 64);
-			state->buffer[0]  = 0x80;
-			state->buffer[63] = state->databitlen & 0xff;
-			state->buffer[62] = (state->databitlen >> 8)  & 0xff;
-			state->buffer[61] = (state->databitlen >> 16) & 0xff;
-			state->buffer[60] = (state->databitlen >> 24) & 0xff;
-			state->buffer[59] = (state->databitlen >> 32) & 0xff;
-			state->buffer[58] = (state->databitlen >> 40) & 0xff;
-			state->buffer[57] = (state->databitlen >> 48) & 0xff;
-			state->buffer[56] = (state->databitlen >> 56) & 0xff;
-			F8(state);
-	  }
-	  else {
-		    /*set the rest of the bytes in the buffer to 0*/
-			if ( (state->datasize_in_buffer & 7) == 0)
-				  for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)  state->buffer[i] = 0;
-			else
-				  for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++)  state->buffer[i] = 0;
-
-			/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
-			state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
-
-			F8(state);
-			memset(state->buffer, 0, 64);
-			state->buffer[63] = state->databitlen & 0xff;
-			state->buffer[62] = (state->databitlen >> 8) & 0xff;
-			state->buffer[61] = (state->databitlen >> 16) & 0xff;
-			state->buffer[60] = (state->databitlen >> 24) & 0xff;
-			state->buffer[59] = (state->databitlen >> 32) & 0xff;
-			state->buffer[58] = (state->databitlen >> 40) & 0xff;
-			state->buffer[57] = (state->databitlen >> 48) & 0xff;
-			state->buffer[56] = (state->databitlen >> 56) & 0xff;
-			F8(state);
-	  }
-
-	  /*truncating the final hash value to generate the message digest*/
-	  switch(state->hashbitlen) {
-			case 224: memcpy(hashval,(unsigned char*)state->x+64+36,28);  break;
-			case 256: memcpy(hashval,(unsigned char*)state->x+64+32,32);  break;
-			case 384: memcpy(hashval,(unsigned char*)state->x+64+16,48);  break;
-			case 512: memcpy(hashval,(unsigned char*)state->x+64,64);     break;
-	  }
-
-	  return(SUCCESS);
+	unsigned int i;
+
+	if((state->databitlen & 0x1ff) == 0)
+	{
+		/*pad the message when databitlen is a multiple of 512 bits, then process the padded block*/
+		memset(state->buffer, 0, 64);
+		state->buffer[0] = 0x80;
+		state->buffer[63] = state->databitlen & 0xff;
+		state->buffer[62] = (state->databitlen >> 8) & 0xff;
+		state->buffer[61] = (state->databitlen >> 16) & 0xff;
+		state->buffer[60] = (state->databitlen >> 24) & 0xff;
+		state->buffer[59] = (state->databitlen >> 32) & 0xff;
+		state->buffer[58] = (state->databitlen >> 40) & 0xff;
+		state->buffer[57] = (state->databitlen >> 48) & 0xff;
+		state->buffer[56] = (state->databitlen >> 56) & 0xff;
+		F8(state);
+	}
+	else
+	{
+		/*set the rest of the bytes in the buffer to 0*/
+		if((state->datasize_in_buffer & 7) == 0)
+			for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)
+				state->buffer[i] = 0;
+		else
+			for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++)
+				state->buffer[i] = 0;
+
+		/*pad and process the partial block when databitlen is not a multiple of 512 bits, then hash the padded blocks*/
+		state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7));
+
+		F8(state);
+		memset(state->buffer, 0, 64);
+		state->buffer[63] = state->databitlen & 0xff;
+		state->buffer[62] = (state->databitlen >> 8) & 0xff;
+		state->buffer[61] = (state->databitlen >> 16) & 0xff;
+		state->buffer[60] = (state->databitlen >> 24) & 0xff;
+		state->buffer[59] = (state->databitlen >> 32) & 0xff;
+		state->buffer[58] = (state->databitlen >> 40) & 0xff;
+		state->buffer[57] = (state->databitlen >> 48) & 0xff;
+		state->buffer[56] = (state->databitlen >> 56) & 0xff;
+		F8(state);
+	}
+
+	/*truncating the final hash value to generate the message digest*/
+	switch(state->hashbitlen)
+	{
+	case 224:
+		memcpy(hashval, (unsigned char*)state->x + 64 + 36, 28);
+		break;
+	case 256:
+		memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32);
+		break;
+	case 384:
+		memcpy(hashval, (unsigned char*)state->x + 64 + 16, 48);
+		break;
+	case 512:
+		memcpy(hashval, (unsigned char*)state->x + 64, 64);
+		break;
+	}
+
+	return (SUCCESS);
 }
 
 /* hash a message,
    three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen)
    one output:   message digest (hashval)
 */
-HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval)
+HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval)
 {
-	  hashState state;
-
-	  if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) {
-			Init(&state, hashbitlen);
-			Update(&state, data, databitlen);
-			Final(&state, hashval);
-			return SUCCESS;
-	  }
-	  else
-			return(BAD_HASHLEN);
+	hashState state;
+
+	if(hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512)
+	{
+		Init(&state, hashbitlen);
+		Update(&state, data, databitlen);
+		Final(&state, hashval);
+		return SUCCESS;
+	}
+	else
+		return (BAD_HASHLEN);
 }
diff --git a/xmrstak/backend/cpu/crypto/c_jh.h b/xmrstak/backend/cpu/crypto/c_jh.h
index d10d40fe573ad48ae3d6af4737af983562dc4611..34d30e6b4fd956a917feba5a60d3405751b17c1d 100644
--- a/xmrstak/backend/cpu/crypto/c_jh.h
+++ b/xmrstak/backend/cpu/crypto/c_jh.h
@@ -16,4 +16,4 @@
 
 #include "hash.h"
 
-HashReturn jh_hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval);
diff --git a/xmrstak/backend/cpu/crypto/c_keccak.c b/xmrstak/backend/cpu/crypto/c_keccak.c
index 63c16147df9efc952c8a169c13f8059c662dd36d..0af6b02ef7dc8f2e56af653a089ddc38aba0657d 100644
--- a/xmrstak/backend/cpu/crypto/c_keccak.c
+++ b/xmrstak/backend/cpu/crypto/c_keccak.c
@@ -2,8 +2,8 @@
 // 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
 // A baseline Keccak (3rd round) implementation.
 
-#include <stdint.h>
 #include <memory.h>
+#include <stdint.h>
 
 #define HASH_DATA_AREA 136
 #define KECCAK_ROUNDS 24
@@ -13,16 +13,15 @@
 #endif
 
 const uint64_t keccakf_rndc[24] =
-{
-	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
-	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
-	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
-	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
-	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
-	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
-};
+	{
+		0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+		0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+		0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+		0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+		0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+		0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+		0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+		0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
 
 // update the state with given number of rounds
 
@@ -31,7 +30,8 @@ void keccakf(uint64_t st[25], int rounds)
 	int i, j, round;
 	uint64_t t, bc[5];
 
-	for (round = 0; round < rounds; ++round) {
+	for(round = 0; round < rounds; ++round)
+	{
 
 		// Theta
 		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
@@ -40,10 +40,11 @@ void keccakf(uint64_t st[25], int rounds)
 		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
 		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
 
-		for (i = 0; i < 5; ++i) {
+		for(i = 0; i < 5; ++i)
+		{
 			t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
-			st[i     ] ^= t;
-			st[i +  5] ^= t;
+			st[i] ^= t;
+			st[i + 5] ^= t;
 			st[i + 10] ^= t;
 			st[i + 15] ^= t;
 			st[i + 20] ^= t;
@@ -51,81 +52,81 @@ void keccakf(uint64_t st[25], int rounds)
 
 		// Rho Pi
 		t = st[1];
-		st[ 1] = ROTL64(st[ 6], 44);
-		st[ 6] = ROTL64(st[ 9], 20);
-		st[ 9] = ROTL64(st[22], 61);
+		st[1] = ROTL64(st[6], 44);
+		st[6] = ROTL64(st[9], 20);
+		st[9] = ROTL64(st[22], 61);
 		st[22] = ROTL64(st[14], 39);
 		st[14] = ROTL64(st[20], 18);
-		st[20] = ROTL64(st[ 2], 62);
-		st[ 2] = ROTL64(st[12], 43);
+		st[20] = ROTL64(st[2], 62);
+		st[2] = ROTL64(st[12], 43);
 		st[12] = ROTL64(st[13], 25);
-		st[13] = ROTL64(st[19],  8);
+		st[13] = ROTL64(st[19], 8);
 		st[19] = ROTL64(st[23], 56);
 		st[23] = ROTL64(st[15], 41);
-		st[15] = ROTL64(st[ 4], 27);
-		st[ 4] = ROTL64(st[24], 14);
-		st[24] = ROTL64(st[21],  2);
-		st[21] = ROTL64(st[ 8], 55);
-		st[ 8] = ROTL64(st[16], 45);
-		st[16] = ROTL64(st[ 5], 36);
-		st[ 5] = ROTL64(st[ 3], 28);
-		st[ 3] = ROTL64(st[18], 21);
+		st[15] = ROTL64(st[4], 27);
+		st[4] = ROTL64(st[24], 14);
+		st[24] = ROTL64(st[21], 2);
+		st[21] = ROTL64(st[8], 55);
+		st[8] = ROTL64(st[16], 45);
+		st[16] = ROTL64(st[5], 36);
+		st[5] = ROTL64(st[3], 28);
+		st[3] = ROTL64(st[18], 21);
 		st[18] = ROTL64(st[17], 15);
 		st[17] = ROTL64(st[11], 10);
-		st[11] = ROTL64(st[ 7],  6);
-		st[ 7] = ROTL64(st[10],  3);
+		st[11] = ROTL64(st[7], 6);
+		st[7] = ROTL64(st[10], 3);
 		st[10] = ROTL64(t, 1);
 
 		//  Chi
 		// unrolled loop, where only last iteration is different
 		j = 0;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 5;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 10;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 15;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 
-		st[j    ] ^= (~st[j + 1]) & st[j + 2];
+		st[j] ^= (~st[j + 1]) & st[j + 2];
 		st[j + 1] ^= (~st[j + 2]) & st[j + 3];
 		st[j + 2] ^= (~st[j + 3]) & st[j + 4];
 		st[j + 3] ^= (~st[j + 4]) & bc[0];
 		st[j + 4] ^= (~bc[0]) & bc[1];
 
 		j = 20;
-		bc[0] = st[j    ];
+		bc[0] = st[j];
 		bc[1] = st[j + 1];
 		bc[2] = st[j + 2];
 		bc[3] = st[j + 3];
 		bc[4] = st[j + 4];
 
-		st[j    ] ^= (~bc[1]) & bc[2];
+		st[j] ^= (~bc[1]) & bc[2];
 		st[j + 1] ^= (~bc[2]) & bc[3];
 		st[j + 2] ^= (~bc[3]) & bc[4];
 		st[j + 3] ^= (~bc[4]) & bc[0];
@@ -139,7 +140,7 @@ void keccakf(uint64_t st[25], int rounds)
 // compute a keccak hash (md) of given byte length from "in"
 typedef uint64_t state_t[25];
 
-void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
+void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen)
 {
 	state_t st;
 	uint8_t temp[144];
@@ -150,9 +151,10 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
 
 	memset(st, 0, sizeof(st));
 
-	for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) {
-		for (i = 0; i < rsizw; i++)
-			st[i] ^= ((uint64_t *) in)[i];
+	for(; inlen >= rsiz; inlen -= rsiz, in += rsiz)
+	{
+		for(i = 0; i < rsizw; i++)
+			st[i] ^= ((uint64_t*)in)[i];
 		keccakf(st, KECCAK_ROUNDS);
 	}
 
@@ -162,15 +164,15 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen)
 	memset(temp + inlen, 0, rsiz - inlen);
 	temp[rsiz - 1] |= 0x80;
 
-	for (i = 0; i < rsizw; i++)
-		st[i] ^= ((uint64_t *) temp)[i];
+	for(i = 0; i < rsizw; i++)
+		st[i] ^= ((uint64_t*)temp)[i];
 
 	keccakf(st, KECCAK_ROUNDS);
 
 	memcpy(md, st, mdlen);
 }
 
-void keccak1600(const uint8_t *in, int inlen, uint8_t *md)
+void keccak1600(const uint8_t* in, int inlen, uint8_t* md)
 {
 	keccak(in, inlen, md, sizeof(state_t));
 }
diff --git a/xmrstak/backend/cpu/crypto/c_keccak.h b/xmrstak/backend/cpu/crypto/c_keccak.h
index 4f7f85729a55003ebcc78a5249b751b5c540a64f..b7a26065e3000087b364d205c9d4ec25e2d2a652 100644
--- a/xmrstak/backend/cpu/crypto/c_keccak.h
+++ b/xmrstak/backend/cpu/crypto/c_keccak.h
@@ -16,11 +16,11 @@
 #endif
 
 // compute a keccak hash (md) of given byte length from "in"
-int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
+int keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen);
 
 // update the state
 void keccakf(uint64_t st[25], int norounds);
 
-void keccak1600(const uint8_t *in, int inlen, uint8_t *md);
+void keccak1600(const uint8_t* in, int inlen, uint8_t* md);
 
 #endif
diff --git a/xmrstak/backend/cpu/crypto/c_skein.c b/xmrstak/backend/cpu/crypto/c_skein.c
index e2d54425f282b2b671e04413b0de2f5656698471..4b8cbb3884f5f66ef889127527b51cbdcd5ef447 100644
--- a/xmrstak/backend/cpu/crypto/c_skein.c
+++ b/xmrstak/backend/cpu/crypto/c_skein.c
@@ -8,11 +8,11 @@
 **
 ************************************************************************/
 
-#define  SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
+#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */
 
-#include <stddef.h>                          /* get size_t definition */
-#include <string.h>      /* get the memcpy/memset functions */
-#include "c_skein.h"       /* get the Skein API definitions   */
+#include "c_skein.h" /* get the Skein API definitions   */
+#include <stddef.h>  /* get size_t definition */
+#include <string.h>  /* get the memcpy/memset functions */
 
 #define DISABLE_UNUSED 0
 
@@ -24,72 +24,72 @@
 #define SKEIN_512_NIST_MAX_HASHBITS (512)
 #endif
 
-#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */
 
-#define  SKEIN_256_STATE_WORDS ( 4)
-#define  SKEIN_512_STATE_WORDS ( 8)
-#define  SKEIN1024_STATE_WORDS (16)
-#define  SKEIN_MAX_STATE_WORDS (16)
+#define SKEIN_256_STATE_WORDS (4)
+#define SKEIN_512_STATE_WORDS (8)
+#define SKEIN1024_STATE_WORDS (16)
+#define SKEIN_MAX_STATE_WORDS (16)
 
-#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
-#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
-#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS)
 
-#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_RND_SPECIAL       (1000u)
-#define SKEIN_RND_KEY_INITIAL   (SKEIN_RND_SPECIAL+0u)
-#define SKEIN_RND_KEY_INJECT    (SKEIN_RND_SPECIAL+1u)
-#define SKEIN_RND_FEED_FWD      (SKEIN_RND_SPECIAL+2u)
+#define SKEIN_RND_SPECIAL (1000u)
+#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL + 0u)
+#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL + 1u)
+#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL + 2u)
 
 typedef struct
 {
-  size_t  hashBitLen;                      /* size of hash result, in bits */
-  size_t  bCnt;                            /* current byte count in buffer b[] */
-  u64b_t  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
+	size_t hashBitLen;				/* size of hash result, in bits */
+	size_t bCnt;					/* current byte count in buffer b[] */
+	u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */
 } Skein_Ctxt_Hdr_t;
 
-typedef struct                               /*  256-bit Skein hash context structure */
+typedef struct /*  256-bit Skein hash context structure */
 {
-  Skein_Ctxt_Hdr_t h;                      /* common header context variables */
-  u64b_t  X[SKEIN_256_STATE_WORDS];        /* chaining variables */
-  u08b_t  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+	Skein_Ctxt_Hdr_t h;				 /* common header context variables */
+	u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */
+	u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
 } Skein_256_Ctxt_t;
 
-typedef struct                               /*  512-bit Skein hash context structure */
+typedef struct /*  512-bit Skein hash context structure */
 {
-  Skein_Ctxt_Hdr_t h;                      /* common header context variables */
-  u64b_t  X[SKEIN_512_STATE_WORDS];        /* chaining variables */
-  u08b_t  b[SKEIN_512_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+	Skein_Ctxt_Hdr_t h;				 /* common header context variables */
+	u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */
+	u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
 } Skein_512_Ctxt_t;
 
-typedef struct                               /* 1024-bit Skein hash context structure */
+typedef struct /* 1024-bit Skein hash context structure */
 {
-  Skein_Ctxt_Hdr_t h;                      /* common header context variables */
-  u64b_t  X[SKEIN1024_STATE_WORDS];        /* chaining variables */
-  u08b_t  b[SKEIN1024_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
+	Skein_Ctxt_Hdr_t h;				 /* common header context variables */
+	u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */
+	u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
 } Skein1024_Ctxt_t;
 
 /*   Skein APIs for (incremental) "straight hashing" */
 #if SKEIN_256_NIST_MAX_HASH_BITS
-static int  Skein_256_Init  (Skein_256_Ctxt_t *ctx, size_t hashBitLen);
+static int Skein_256_Init(Skein_256_Ctxt_t* ctx, size_t hashBitLen);
 #endif
-static int  Skein_512_Init  (Skein_512_Ctxt_t *ctx, size_t hashBitLen);
-static int  Skein1024_Init  (Skein1024_Ctxt_t *ctx, size_t hashBitLen);
+static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen);
+static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen);
 
-static int  Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
-static int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
-static int  Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt);
+static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt);
+static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt);
 
-static int  Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
-static int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
-static int  Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
+static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal);
+static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal);
+static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal);
 
 /*
 **   Skein APIs for "extended" initialization: MAC keys, tree hashing.
@@ -126,7 +126,7 @@ static int  Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 #define SKEIN_TREE_HASH (1)
 #endif
 #if 0
-#if  SKEIN_TREE_HASH
+#if SKEIN_TREE_HASH
 static int  Skein_256_Output   (Skein_256_Ctxt_t *ctx, u08b_t * hashVal);
 static int  Skein_512_Output   (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
 static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
@@ -142,128 +142,146 @@ static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 ******************************************************************/
 
 /* tweak word T[1]: bit field starting positions */
-#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word  */
 
-#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
-#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
-#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
-#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
-#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree       */
+#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119)  /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126)	/* bit  126     : first block flag         */
+#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127)	/* bit  127     : final block flag         */
 
 /* tweak word T[1]: flag bit definition(s) */
-#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
-#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
-#define SKEIN_T1_FLAG_BIT_PAD   (((u64b_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
+#define SKEIN_T1_FLAG_FIRST (((u64b_t)1) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL (((u64b_t)1) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t)1) << SKEIN_T1_POS_BIT_PAD)
 
 /* tweak word T[1]: tree level bit field mask */
-#define SKEIN_T1_TREE_LVL_MASK  (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
-#define SKEIN_T1_TREE_LEVEL(n)  (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t)(n)) << SKEIN_T1_POS_TREE_LVL)
 
 /* tweak word T[1]: block type field */
-#define SKEIN_BLK_TYPE_KEY      ( 0)                    /* key, for MAC and KDF */
-#define SKEIN_BLK_TYPE_CFG      ( 4)                    /* configuration block */
-#define SKEIN_BLK_TYPE_PERS     ( 8)                    /* personalization string */
-#define SKEIN_BLK_TYPE_PK       (12)                    /* public key (for digital signature hashing) */
-#define SKEIN_BLK_TYPE_KDF      (16)                    /* key identifier for KDF */
-#define SKEIN_BLK_TYPE_NONCE    (20)                    /* nonce for PRNG */
-#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
-#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
-#define SKEIN_BLK_TYPE_MASK     (63)                    /* bit field mask */
-
-#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
-#define SKEIN_T1_BLK_TYPE_KEY   SKEIN_T1_BLK_TYPE(KEY)  /* key, for MAC and KDF */
-#define SKEIN_T1_BLK_TYPE_CFG   SKEIN_T1_BLK_TYPE(CFG)  /* configuration block */
-#define SKEIN_T1_BLK_TYPE_PERS  SKEIN_T1_BLK_TYPE(PERS) /* personalization string */
-#define SKEIN_T1_BLK_TYPE_PK    SKEIN_T1_BLK_TYPE(PK)   /* public key (for digital signature hashing) */
-#define SKEIN_T1_BLK_TYPE_KDF   SKEIN_T1_BLK_TYPE(KDF)  /* key identifier for KDF */
-#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */
-#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
-#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
-#define SKEIN_T1_BLK_TYPE_MASK  SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */
-
-#define SKEIN_T1_BLK_TYPE_CFG_FINAL       (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
-#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
-
-#define SKEIN_VERSION           (1)
-
-#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
-#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
-#endif
-
-#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))
-#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
-#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
-
-#define SKEIN_CFG_STR_LEN       (4*8)
+#define SKEIN_BLK_TYPE_KEY (0)	/* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG (4)	/* configuration block */
+#define SKEIN_BLK_TYPE_PERS (8)   /* personalization string */
+#define SKEIN_BLK_TYPE_PK (12)	/* public key (for digital signature hashing) */
+#define SKEIN_BLK_TYPE_KDF (16)   /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG (48)   /* message processing */
+#define SKEIN_BLK_TYPE_OUT (63)   /* output stage */
+#define SKEIN_BLK_TYPE_MASK (63)  /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T) (((u64b_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY)	 /* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG)	 /* configuration block */
+#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS)   /* personalization string */
+#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK)		 /* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF)	 /* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE) /* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG)	 /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT)	 /* output stage */
+#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK)   /* field bit mask */
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_VERSION (1)
+
+#ifndef SKEIN_ID_STRING_LE				/* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/
+#endif
+
+#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((u64b_t)(hi32)) << 32))
+#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE)
+#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
+
+#define SKEIN_CFG_STR_LEN (4 * 8)
 
 /* bit field definitions in config block treeInfo word */
-#define SKEIN_CFG_TREE_LEAF_SIZE_POS  ( 0)
-#define SKEIN_CFG_TREE_NODE_SIZE_POS  ( 8)
-#define SKEIN_CFG_TREE_MAX_LEVEL_POS  (16)
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS (8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16)
 
-#define SKEIN_CFG_TREE_LEAF_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
-#define SKEIN_CFG_TREE_NODE_SIZE_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
-#define SKEIN_CFG_TREE_MAX_LEVEL_MSK  (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
 
-#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl)                   \
-  ( (((u64b_t)(leaf  )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
-  (((u64b_t)(node  )) << SKEIN_CFG_TREE_NODE_SIZE_POS) |    \
-  (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) )
+#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl)              \
+	((((u64b_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |    \
+		(((u64b_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \
+		(((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS))
 
-#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0) /* use as treeInfo in InitExt() call for sequential processing */
 
 /*
 **   Skein macros for getting/setting tweak words, etc.
 **   These are useful for partial input bytes, hash tree init/update, etc.
 **/
-#define Skein_Get_Tweak(ctxPtr,TWK_NUM)         ((ctxPtr)->h.T[TWK_NUM])
-#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \
+	{                                          \
+		(ctxPtr)->h.T[TWK_NUM] = (tVal);       \
+	}
 
-#define Skein_Get_T0(ctxPtr)    Skein_Get_Tweak(ctxPtr,0)
-#define Skein_Get_T1(ctxPtr)    Skein_Get_Tweak(ctxPtr,1)
-#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
-#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0)
+#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1)
+#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0)
+#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1)
 
 /* set both tweak words at once */
-#define Skein_Set_T0_T1(ctxPtr,T0,T1)           \
-{                                           \
-  Skein_Set_T0(ctxPtr,(T0));                  \
-  Skein_Set_T1(ctxPtr,(T1));                  \
-}
+#define Skein_Set_T0_T1(ctxPtr, T0, T1) \
+	{                                   \
+		Skein_Set_T0(ctxPtr, (T0));     \
+		Skein_Set_T1(ctxPtr, (T1));     \
+	}
 
-#define Skein_Set_Type(ctxPtr,BLK_TYPE)         \
-  Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+#define Skein_Set_Type(ctxPtr, BLK_TYPE) \
+	Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE)
 
 /* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */
-#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
-{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+#define Skein_Start_New_Type(ctxPtr, BLK_TYPE)                                          \
+	{                                                                                   \
+		Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \
+		(ctxPtr)->h.bCnt = 0;                                                           \
+	}
 
-#define Skein_Clear_First_Flag(hdr)      { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;       }
-#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+#define Skein_Clear_First_Flag(hdr)         \
+	{                                       \
+		(hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \
+	}
+#define Skein_Set_Bit_Pad_Flag(hdr)          \
+	{                                        \
+		(hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \
+	}
 
-#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);}
+#define Skein_Set_Tree_Level(hdr, height)          \
+	{                                              \
+		(hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \
+	}
 
 /*****************************************************************
 ** "Internal" Skein definitions for debugging and error checking
 ******************************************************************/
-#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr)
-#define Skein_Show_Round(bits,ctx,r,X)
-#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr)
-#define Skein_Show_Final(bits,ctx,cnt,outPtr)
-#define Skein_Show_Key(bits,ctx,key,keyBytes)
-
-
-#ifndef SKEIN_ERR_CHECK        /* run-time checks (e.g., bad params, uninitialized context)? */
-#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */
+#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr)
+#define Skein_Show_Round(bits, ctx, r, X)
+#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr)
+#define Skein_Show_Final(bits, ctx, cnt, outPtr)
+#define Skein_Show_Key(bits, ctx, key, keyBytes)
+
+#ifndef SKEIN_ERR_CHECK			 /* run-time checks (e.g., bad params, uninitialized context)? */
+#define Skein_Assert(x, retCode) /* default: ignore all Asserts, for performance */
 #define Skein_assert(x)
-#elif   defined(SKEIN_ASSERT)
+#elif defined(SKEIN_ASSERT)
 #include <assert.h>
-#define Skein_Assert(x,retCode) assert(x)
-#define Skein_assert(x)         assert(x)
+#define Skein_Assert(x, retCode) assert(x)
+#define Skein_assert(x) assert(x)
 #else
 #include <assert.h>
-#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
-#define Skein_assert(x)         assert(x)                     /* internal error */
+#define Skein_Assert(x, retCode) \
+	{                            \
+		if(!(x))                 \
+			return retCode;      \
+	}							  /* caller error: makes the enclosing function return retCode */
+#define Skein_assert(x) assert(x) /* internal error */
 #endif
 
 /*****************************************************************
@@ -271,48 +289,135 @@ static int  Skein1024_Output   (Skein1024_Ctxt_t *ctx, u08b_t * hashVal);
 ******************************************************************/
 enum
 {
-  /* Skein_256 round rotation constants */
-  R_256_0_0=14, R_256_0_1=16,
-  R_256_1_0=52, R_256_1_1=57,
-  R_256_2_0=23, R_256_2_1=40,
-  R_256_3_0= 5, R_256_3_1=37,
-  R_256_4_0=25, R_256_4_1=33,
-  R_256_5_0=46, R_256_5_1=12,
-  R_256_6_0=58, R_256_6_1=22,
-  R_256_7_0=32, R_256_7_1=32,
-
-  /* Skein_512 round rotation constants */
-  R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
-  R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
-  R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
-  R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
-  R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
-  R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
-  R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
-  R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
-
-  /* Skein1024 round rotation constants */
-  R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
-  R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
-  R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
-  R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
-  R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
-  R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
-  R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
-  R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
+	/* Skein_256 round rotation constants */
+	R_256_0_0 = 14,
+	R_256_0_1 = 16,
+	R_256_1_0 = 52,
+	R_256_1_1 = 57,
+	R_256_2_0 = 23,
+	R_256_2_1 = 40,
+	R_256_3_0 = 5,
+	R_256_3_1 = 37,
+	R_256_4_0 = 25,
+	R_256_4_1 = 33,
+	R_256_5_0 = 46,
+	R_256_5_1 = 12,
+	R_256_6_0 = 58,
+	R_256_6_1 = 22,
+	R_256_7_0 = 32,
+	R_256_7_1 = 32,
+
+	/* Skein_512 round rotation constants */
+	R_512_0_0 = 46,
+	R_512_0_1 = 36,
+	R_512_0_2 = 19,
+	R_512_0_3 = 37,
+	R_512_1_0 = 33,
+	R_512_1_1 = 27,
+	R_512_1_2 = 14,
+	R_512_1_3 = 42,
+	R_512_2_0 = 17,
+	R_512_2_1 = 49,
+	R_512_2_2 = 36,
+	R_512_2_3 = 39,
+	R_512_3_0 = 44,
+	R_512_3_1 = 9,
+	R_512_3_2 = 54,
+	R_512_3_3 = 56,
+	R_512_4_0 = 39,
+	R_512_4_1 = 30,
+	R_512_4_2 = 34,
+	R_512_4_3 = 24,
+	R_512_5_0 = 13,
+	R_512_5_1 = 50,
+	R_512_5_2 = 10,
+	R_512_5_3 = 17,
+	R_512_6_0 = 25,
+	R_512_6_1 = 29,
+	R_512_6_2 = 39,
+	R_512_6_3 = 43,
+	R_512_7_0 = 8,
+	R_512_7_1 = 35,
+	R_512_7_2 = 56,
+	R_512_7_3 = 22,
+
+	/* Skein1024 round rotation constants */
+	R1024_0_0 = 24,
+	R1024_0_1 = 13,
+	R1024_0_2 = 8,
+	R1024_0_3 = 47,
+	R1024_0_4 = 8,
+	R1024_0_5 = 17,
+	R1024_0_6 = 22,
+	R1024_0_7 = 37,
+	R1024_1_0 = 38,
+	R1024_1_1 = 19,
+	R1024_1_2 = 10,
+	R1024_1_3 = 55,
+	R1024_1_4 = 49,
+	R1024_1_5 = 18,
+	R1024_1_6 = 23,
+	R1024_1_7 = 52,
+	R1024_2_0 = 33,
+	R1024_2_1 = 4,
+	R1024_2_2 = 51,
+	R1024_2_3 = 13,
+	R1024_2_4 = 34,
+	R1024_2_5 = 41,
+	R1024_2_6 = 59,
+	R1024_2_7 = 17,
+	R1024_3_0 = 5,
+	R1024_3_1 = 20,
+	R1024_3_2 = 48,
+	R1024_3_3 = 41,
+	R1024_3_4 = 47,
+	R1024_3_5 = 28,
+	R1024_3_6 = 16,
+	R1024_3_7 = 25,
+	R1024_4_0 = 41,
+	R1024_4_1 = 9,
+	R1024_4_2 = 37,
+	R1024_4_3 = 31,
+	R1024_4_4 = 12,
+	R1024_4_5 = 47,
+	R1024_4_6 = 44,
+	R1024_4_7 = 30,
+	R1024_5_0 = 16,
+	R1024_5_1 = 34,
+	R1024_5_2 = 56,
+	R1024_5_3 = 51,
+	R1024_5_4 = 4,
+	R1024_5_5 = 53,
+	R1024_5_6 = 42,
+	R1024_5_7 = 41,
+	R1024_6_0 = 31,
+	R1024_6_1 = 44,
+	R1024_6_2 = 47,
+	R1024_6_3 = 46,
+	R1024_6_4 = 19,
+	R1024_6_5 = 42,
+	R1024_6_6 = 44,
+	R1024_6_7 = 25,
+	R1024_7_0 = 9,
+	R1024_7_1 = 48,
+	R1024_7_2 = 35,
+	R1024_7_3 = 52,
+	R1024_7_4 = 23,
+	R1024_7_5 = 31,
+	R1024_7_6 = 37,
+	R1024_7_7 = 20
 };
 
 #ifndef SKEIN_ROUNDS
-#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
+#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */
 #define SKEIN_512_ROUNDS_TOTAL (72)
 #define SKEIN1024_ROUNDS_TOTAL (80)
-#else                                        /* allow command-line define in range 8*(5..14)   */
-#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5))
-#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5))
-#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS    ) + 5) % 10) + 5))
+#else /* allow command-line define in range 8*(5..14)   */
+#define SKEIN_256_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 100) + 5) % 10) + 5))
+#define SKEIN_512_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 10) + 5) % 10) + 5))
+#define SKEIN1024_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS) + 5) % 10) + 5))
 #endif
 
-
 /*
 ***************** Pre-computed Skein IVs *******************
 **
@@ -332,239 +437,233 @@ enum
 /* blkSize =  256 bits. hashSize =  128 bits */
 const u64b_t SKEIN_256_IV_128[] =
 	{
-	MK_64(0xE1111906,0x964D7260),
-	MK_64(0x883DAAA7,0x7C8D811C),
-	MK_64(0x10080DF4,0x91960F7A),
-	MK_64(0xCCF7DDE5,0xB45BC1C2)
-	};
+		MK_64(0xE1111906, 0x964D7260),
+		MK_64(0x883DAAA7, 0x7C8D811C),
+		MK_64(0x10080DF4, 0x91960F7A),
+		MK_64(0xCCF7DDE5, 0xB45BC1C2)};
 
 /* blkSize =  256 bits. hashSize =  160 bits */
 const u64b_t SKEIN_256_IV_160[] =
 	{
-	MK_64(0x14202314,0x72825E98),
-	MK_64(0x2AC4E9A2,0x5A77E590),
-	MK_64(0xD47A5856,0x8838D63E),
-	MK_64(0x2DD2E496,0x8586AB7D)
-	};
+		MK_64(0x14202314, 0x72825E98),
+		MK_64(0x2AC4E9A2, 0x5A77E590),
+		MK_64(0xD47A5856, 0x8838D63E),
+		MK_64(0x2DD2E496, 0x8586AB7D)};
 
 /* blkSize =  256 bits. hashSize =  224 bits */
 const u64b_t SKEIN_256_IV_224[] =
 	{
-	MK_64(0xC6098A8C,0x9AE5EA0B),
-	MK_64(0x876D5686,0x08C5191C),
-	MK_64(0x99CB88D7,0xD7F53884),
-	MK_64(0x384BDDB1,0xAEDDB5DE)
-	};
+		MK_64(0xC6098A8C, 0x9AE5EA0B),
+		MK_64(0x876D5686, 0x08C5191C),
+		MK_64(0x99CB88D7, 0xD7F53884),
+		MK_64(0x384BDDB1, 0xAEDDB5DE)};
 
 /* blkSize =  256 bits. hashSize =  256 bits */
 const u64b_t SKEIN_256_IV_256[] =
 	{
-	MK_64(0xFC9DA860,0xD048B449),
-	MK_64(0x2FCA6647,0x9FA7D833),
-	MK_64(0xB33BC389,0x6656840F),
-	MK_64(0x6A54E920,0xFDE8DA69)
-	};
+		MK_64(0xFC9DA860, 0xD048B449),
+		MK_64(0x2FCA6647, 0x9FA7D833),
+		MK_64(0xB33BC389, 0x6656840F),
+		MK_64(0x6A54E920, 0xFDE8DA69)};
 
 /* blkSize =  512 bits. hashSize =  128 bits */
 const u64b_t SKEIN_512_IV_128[] =
 	{
-	MK_64(0xA8BC7BF3,0x6FBF9F52),
-	MK_64(0x1E9872CE,0xBD1AF0AA),
-	MK_64(0x309B1790,0xB32190D3),
-	MK_64(0xBCFBB854,0x3F94805C),
-	MK_64(0x0DA61BCD,0x6E31B11B),
-	MK_64(0x1A18EBEA,0xD46A32E3),
-	MK_64(0xA2CC5B18,0xCE84AA82),
-	MK_64(0x6982AB28,0x9D46982D)
-	};
+		MK_64(0xA8BC7BF3, 0x6FBF9F52),
+		MK_64(0x1E9872CE, 0xBD1AF0AA),
+		MK_64(0x309B1790, 0xB32190D3),
+		MK_64(0xBCFBB854, 0x3F94805C),
+		MK_64(0x0DA61BCD, 0x6E31B11B),
+		MK_64(0x1A18EBEA, 0xD46A32E3),
+		MK_64(0xA2CC5B18, 0xCE84AA82),
+		MK_64(0x6982AB28, 0x9D46982D)};
 
 /* blkSize =  512 bits. hashSize =  160 bits */
 const u64b_t SKEIN_512_IV_160[] =
 	{
-	MK_64(0x28B81A2A,0xE013BD91),
-	MK_64(0xC2F11668,0xB5BDF78F),
-	MK_64(0x1760D8F3,0xF6A56F12),
-	MK_64(0x4FB74758,0x8239904F),
-	MK_64(0x21EDE07F,0x7EAF5056),
-	MK_64(0xD908922E,0x63ED70B8),
-	MK_64(0xB8EC76FF,0xECCB52FA),
-	MK_64(0x01A47BB8,0xA3F27A6E)
-	};
+		MK_64(0x28B81A2A, 0xE013BD91),
+		MK_64(0xC2F11668, 0xB5BDF78F),
+		MK_64(0x1760D8F3, 0xF6A56F12),
+		MK_64(0x4FB74758, 0x8239904F),
+		MK_64(0x21EDE07F, 0x7EAF5056),
+		MK_64(0xD908922E, 0x63ED70B8),
+		MK_64(0xB8EC76FF, 0xECCB52FA),
+		MK_64(0x01A47BB8, 0xA3F27A6E)};
 
 /* blkSize =  512 bits. hashSize =  224 bits */
 const u64b_t SKEIN_512_IV_224[] =
 	{
-	MK_64(0xCCD06162,0x48677224),
-	MK_64(0xCBA65CF3,0xA92339EF),
-	MK_64(0x8CCD69D6,0x52FF4B64),
-	MK_64(0x398AED7B,0x3AB890B4),
-	MK_64(0x0F59D1B1,0x457D2BD0),
-	MK_64(0x6776FE65,0x75D4EB3D),
-	MK_64(0x99FBC70E,0x997413E9),
-	MK_64(0x9E2CFCCF,0xE1C41EF7)
-	};
+		MK_64(0xCCD06162, 0x48677224),
+		MK_64(0xCBA65CF3, 0xA92339EF),
+		MK_64(0x8CCD69D6, 0x52FF4B64),
+		MK_64(0x398AED7B, 0x3AB890B4),
+		MK_64(0x0F59D1B1, 0x457D2BD0),
+		MK_64(0x6776FE65, 0x75D4EB3D),
+		MK_64(0x99FBC70E, 0x997413E9),
+		MK_64(0x9E2CFCCF, 0xE1C41EF7)};
 
 /* blkSize =  512 bits. hashSize =  256 bits */
 const u64b_t SKEIN_512_IV_256[] =
 	{
-	MK_64(0xCCD044A1,0x2FDB3E13),
-	MK_64(0xE8359030,0x1A79A9EB),
-	MK_64(0x55AEA061,0x4F816E6F),
-	MK_64(0x2A2767A4,0xAE9B94DB),
-	MK_64(0xEC06025E,0x74DD7683),
-	MK_64(0xE7A436CD,0xC4746251),
-	MK_64(0xC36FBAF9,0x393AD185),
-	MK_64(0x3EEDBA18,0x33EDFC13)
-	};
+		MK_64(0xCCD044A1, 0x2FDB3E13),
+		MK_64(0xE8359030, 0x1A79A9EB),
+		MK_64(0x55AEA061, 0x4F816E6F),
+		MK_64(0x2A2767A4, 0xAE9B94DB),
+		MK_64(0xEC06025E, 0x74DD7683),
+		MK_64(0xE7A436CD, 0xC4746251),
+		MK_64(0xC36FBAF9, 0x393AD185),
+		MK_64(0x3EEDBA18, 0x33EDFC13)};
 
 /* blkSize =  512 bits. hashSize =  384 bits */
 const u64b_t SKEIN_512_IV_384[] =
 	{
-	MK_64(0xA3F6C6BF,0x3A75EF5F),
-	MK_64(0xB0FEF9CC,0xFD84FAA4),
-	MK_64(0x9D77DD66,0x3D770CFE),
-	MK_64(0xD798CBF3,0xB468FDDA),
-	MK_64(0x1BC4A666,0x8A0E4465),
-	MK_64(0x7ED7D434,0xE5807407),
-	MK_64(0x548FC1AC,0xD4EC44D6),
-	MK_64(0x266E1754,0x6AA18FF8)
-	};
+		MK_64(0xA3F6C6BF, 0x3A75EF5F),
+		MK_64(0xB0FEF9CC, 0xFD84FAA4),
+		MK_64(0x9D77DD66, 0x3D770CFE),
+		MK_64(0xD798CBF3, 0xB468FDDA),
+		MK_64(0x1BC4A666, 0x8A0E4465),
+		MK_64(0x7ED7D434, 0xE5807407),
+		MK_64(0x548FC1AC, 0xD4EC44D6),
+		MK_64(0x266E1754, 0x6AA18FF8)};
 
 /* blkSize =  512 bits. hashSize =  512 bits */
 const u64b_t SKEIN_512_IV_512[] =
 	{
-	MK_64(0x4903ADFF,0x749C51CE),
-	MK_64(0x0D95DE39,0x9746DF03),
-	MK_64(0x8FD19341,0x27C79BCE),
-	MK_64(0x9A255629,0xFF352CB1),
-	MK_64(0x5DB62599,0xDF6CA7B0),
-	MK_64(0xEABE394C,0xA9D5C3F4),
-	MK_64(0x991112C7,0x1A75B523),
-	MK_64(0xAE18A40B,0x660FCC33)
-	};
+		MK_64(0x4903ADFF, 0x749C51CE),
+		MK_64(0x0D95DE39, 0x9746DF03),
+		MK_64(0x8FD19341, 0x27C79BCE),
+		MK_64(0x9A255629, 0xFF352CB1),
+		MK_64(0x5DB62599, 0xDF6CA7B0),
+		MK_64(0xEABE394C, 0xA9D5C3F4),
+		MK_64(0x991112C7, 0x1A75B523),
+		MK_64(0xAE18A40B, 0x660FCC33)};
 
 /* blkSize = 1024 bits. hashSize =  384 bits */
 const u64b_t SKEIN1024_IV_384[] =
 	{
-	MK_64(0x5102B6B8,0xC1894A35),
-	MK_64(0xFEEBC9E3,0xFE8AF11A),
-	MK_64(0x0C807F06,0xE32BED71),
-	MK_64(0x60C13A52,0xB41A91F6),
-	MK_64(0x9716D35D,0xD4917C38),
-	MK_64(0xE780DF12,0x6FD31D3A),
-	MK_64(0x797846B6,0xC898303A),
-	MK_64(0xB172C2A8,0xB3572A3B),
-	MK_64(0xC9BC8203,0xA6104A6C),
-	MK_64(0x65909338,0xD75624F4),
-	MK_64(0x94BCC568,0x4B3F81A0),
-	MK_64(0x3EBBF51E,0x10ECFD46),
-	MK_64(0x2DF50F0B,0xEEB08542),
-	MK_64(0x3B5A6530,0x0DBC6516),
-	MK_64(0x484B9CD2,0x167BBCE1),
-	MK_64(0x2D136947,0xD4CBAFEA)
-	};
+		MK_64(0x5102B6B8, 0xC1894A35),
+		MK_64(0xFEEBC9E3, 0xFE8AF11A),
+		MK_64(0x0C807F06, 0xE32BED71),
+		MK_64(0x60C13A52, 0xB41A91F6),
+		MK_64(0x9716D35D, 0xD4917C38),
+		MK_64(0xE780DF12, 0x6FD31D3A),
+		MK_64(0x797846B6, 0xC898303A),
+		MK_64(0xB172C2A8, 0xB3572A3B),
+		MK_64(0xC9BC8203, 0xA6104A6C),
+		MK_64(0x65909338, 0xD75624F4),
+		MK_64(0x94BCC568, 0x4B3F81A0),
+		MK_64(0x3EBBF51E, 0x10ECFD46),
+		MK_64(0x2DF50F0B, 0xEEB08542),
+		MK_64(0x3B5A6530, 0x0DBC6516),
+		MK_64(0x484B9CD2, 0x167BBCE1),
+		MK_64(0x2D136947, 0xD4CBAFEA)};
 
 /* blkSize = 1024 bits. hashSize =  512 bits */
 const u64b_t SKEIN1024_IV_512[] =
 	{
-	MK_64(0xCAEC0E5D,0x7C1B1B18),
-	MK_64(0xA01B0E04,0x5F03E802),
-	MK_64(0x33840451,0xED912885),
-	MK_64(0x374AFB04,0xEAEC2E1C),
-	MK_64(0xDF25A0E2,0x813581F7),
-	MK_64(0xE4004093,0x8B12F9D2),
-	MK_64(0xA662D539,0xC2ED39B6),
-	MK_64(0xFA8B85CF,0x45D8C75A),
-	MK_64(0x8316ED8E,0x29EDE796),
-	MK_64(0x053289C0,0x2E9F91B8),
-	MK_64(0xC3F8EF1D,0x6D518B73),
-	MK_64(0xBDCEC3C4,0xD5EF332E),
-	MK_64(0x549A7E52,0x22974487),
-	MK_64(0x67070872,0x5B749816),
-	MK_64(0xB9CD28FB,0xF0581BD1),
-	MK_64(0x0E2940B8,0x15804974)
-	};
+		MK_64(0xCAEC0E5D, 0x7C1B1B18),
+		MK_64(0xA01B0E04, 0x5F03E802),
+		MK_64(0x33840451, 0xED912885),
+		MK_64(0x374AFB04, 0xEAEC2E1C),
+		MK_64(0xDF25A0E2, 0x813581F7),
+		MK_64(0xE4004093, 0x8B12F9D2),
+		MK_64(0xA662D539, 0xC2ED39B6),
+		MK_64(0xFA8B85CF, 0x45D8C75A),
+		MK_64(0x8316ED8E, 0x29EDE796),
+		MK_64(0x053289C0, 0x2E9F91B8),
+		MK_64(0xC3F8EF1D, 0x6D518B73),
+		MK_64(0xBDCEC3C4, 0xD5EF332E),
+		MK_64(0x549A7E52, 0x22974487),
+		MK_64(0x67070872, 0x5B749816),
+		MK_64(0xB9CD28FB, 0xF0581BD1),
+		MK_64(0x0E2940B8, 0x15804974)};
 
 /* blkSize = 1024 bits. hashSize = 1024 bits */
 const u64b_t SKEIN1024_IV_1024[] =
 	{
-	MK_64(0xD593DA07,0x41E72355),
-	MK_64(0x15B5E511,0xAC73E00C),
-	MK_64(0x5180E5AE,0xBAF2C4F0),
-	MK_64(0x03BD41D3,0xFCBCAFAF),
-	MK_64(0x1CAEC6FD,0x1983A898),
-	MK_64(0x6E510B8B,0xCDD0589F),
-	MK_64(0x77E2BDFD,0xC6394ADA),
-	MK_64(0xC11E1DB5,0x24DCB0A3),
-	MK_64(0xD6D14AF9,0xC6329AB5),
-	MK_64(0x6A9B0BFC,0x6EB67E0D),
-	MK_64(0x9243C60D,0xCCFF1332),
-	MK_64(0x1A1F1DDE,0x743F02D4),
-	MK_64(0x0996753C,0x10ED0BB8),
-	MK_64(0x6572DD22,0xF2B4969A),
-	MK_64(0x61FD3062,0xD00A579A),
-	MK_64(0x1DE0536E,0x8682E539)
-	};
-
+		MK_64(0xD593DA07, 0x41E72355),
+		MK_64(0x15B5E511, 0xAC73E00C),
+		MK_64(0x5180E5AE, 0xBAF2C4F0),
+		MK_64(0x03BD41D3, 0xFCBCAFAF),
+		MK_64(0x1CAEC6FD, 0x1983A898),
+		MK_64(0x6E510B8B, 0xCDD0589F),
+		MK_64(0x77E2BDFD, 0xC6394ADA),
+		MK_64(0xC11E1DB5, 0x24DCB0A3),
+		MK_64(0xD6D14AF9, 0xC6329AB5),
+		MK_64(0x6A9B0BFC, 0x6EB67E0D),
+		MK_64(0x9243C60D, 0xCCFF1332),
+		MK_64(0x1A1F1DDE, 0x743F02D4),
+		MK_64(0x0996753C, 0x10ED0BB8),
+		MK_64(0x6572DD22, 0xF2B4969A),
+		MK_64(0x61FD3062, 0xD00A579A),
+		MK_64(0x1DE0536E, 0x8682E539)};
 
 #ifndef SKEIN_USE_ASM
-#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
+#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
 #endif
 
 #ifndef SKEIN_LOOP
-#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
+#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
 #endif
 
-#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
-#define KW_TWK_BASE     (0)
-#define KW_KEY_BASE     (3)
-#define ks              (kw + KW_KEY_BASE)
-#define ts              (kw + KW_TWK_BASE)
+#define BLK_BITS (WCNT * 64) /* some useful definitions for code here */
+#define KW_TWK_BASE (0)
+#define KW_KEY_BASE (3)
+#define ks (kw + KW_KEY_BASE)
+#define ts (kw + KW_TWK_BASE)
 
 #ifdef SKEIN_DEBUG
-#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
+#define DebugSaveTweak(ctx)  \
+	{                        \
+		ctx->h.T[0] = ts[0]; \
+		ctx->h.T[1] = ts[1]; \
+	}
 #else
 #define DebugSaveTweak(ctx)
 #endif
 
 /*****************************  Skein_256 ******************************/
 #if !(SKEIN_USE_ASM & 256)
-static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
-	{ /* do it in C */
+static void Skein_256_Process_Block(Skein_256_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd)
+{ /* do it in C */
 	enum
-		{
+	{
 		WCNT = SKEIN_256_STATE_WORDS
-		};
-#undef  RCNT
-#define RCNT  (SKEIN_256_ROUNDS_TOTAL/8)
+	};
+#undef RCNT
+#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)
 
-#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
-#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
 #else
 #define SKEIN_UNROLL_256 (0)
 #endif
 
 #if SKEIN_UNROLL_256
-#if (RCNT % SKEIN_UNROLL_256)
+#if(RCNT % SKEIN_UNROLL_256)
 #error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count */
 #endif
-	size_t  r;
-	u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+	size_t r;
+	u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/
 #else
-	u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+	u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
 #endif
-	u64b_t  X0,X1,X2,X3;                        /* local copy of context vars, for speed */
-	u64b_t  w [WCNT];                           /* local copy of input block */
+	u64b_t X0, X1, X2, X3; /* local copy of context vars, for speed */
+	u64b_t w[WCNT];		   /* local copy of input block */
 #ifdef SKEIN_DEBUG
-	const u64b_t *Xptr[4];                      /* use for debugging (help compiler put Xn in registers) */
-	Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
+	const u64b_t* Xptr[4]; /* use for debugging (help compiler put Xn in registers) */
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
 #endif
-	Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+	Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
-	do  {
+	do
+	{
 		/* this implementation only supports 2**64 input bytes (no carry out here) */
-		ts[0] += byteCntAdd;                    /* update processed length */
+		ts[0] += byteCntAdd; /* update processed length */
 
 		/* precompute the key schedule for this block */
 		ks[0] = ctx->X[0];
@@ -575,114 +674,118 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s
 
 		ts[2] = ts[0] ^ ts[1];
 
-		Skein_Get64_LSB_First(w,blkPtr,WCNT);   /* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 		DebugSaveTweak(ctx);
-		Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 
-		X0 = w[0] + ks[0];                      /* do the first full key injection */
+		X0 = w[0] + ks[0]; /* do the first full key injection */
 		X1 = w[1] + ks[1] + ts[0];
 		X2 = w[2] + ks[2] + ts[1];
 		X3 = w[3] + ks[3];
 
-		Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);    /* show starting state values */
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* show starting state values */
 
 		blkPtr += SKEIN_256_BLOCK_BYTES;
 
 		/* run the rounds */
 
-#define Round256(p0,p1,p2,p3,ROT,rNum)                              \
-	X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
-	X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+#define Round256(p0, p1, p2, p3, ROT, rNum) \
+	X##p0 += X##p1;                         \
+	X##p1 = RotL_64(X##p1, ROT##_0);        \
+	X##p1 ^= X##p0;                         \
+	X##p2 += X##p3;                         \
+	X##p3 = RotL_64(X##p3, ROT##_1);        \
+	X##p3 ^= X##p2;
 
 #if SKEIN_UNROLL_256 == 0
-#define R256(p0,p1,p2,p3,ROT,rNum)           /* fully unrolled */   \
-	Round256(p0,p1,p2,p3,ROT,rNum)                                  \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
-
-#define I256(R)                                                     \
-	X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
-	X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
-	X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
-	X3   += ks[((R)+4) % 5] +     (R)+1;                            \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-#else                                       /* looping version */
-#define R256(p0,p1,p2,p3,ROT,rNum)                                  \
-	Round256(p0,p1,p2,p3,ROT,rNum)                                  \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
-
-#define I256(R)                                                     \
-	X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
-	X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
-	X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
-	X3   += ks[r+(R)+3] +    r+(R)   ;                              \
-	ks[r + (R)+4    ]   = ks[r+(R)-1];     /* rotate key schedule */\
-	ts[r + (R)+2    ]   = ts[r+(R)-1];                              \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-
-	for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256)  /* loop thru it */
+#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
+	Round256(p0, p1, p2, p3, ROT, rNum)                      \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I256(R)                                                  \
+	X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
+	X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];                 \
+	X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];                 \
+	X3 += ks[((R) + 4) % 5] + (R) + 1;                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R256(p0, p1, p2, p3, ROT, rNum) \
+	Round256(p0, p1, p2, p3, ROT, rNum) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I256(R)                                                \
+	X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+	X1 += ks[r + (R) + 1] + ts[r + (R) + 0];                   \
+	X2 += ks[r + (R) + 2] + ts[r + (R) + 1];                   \
+	X3 += ks[r + (R) + 3] + r + (R);                           \
+	ks[r + (R) + 4] = ks[r + (R)-1]; /* rotate key schedule */ \
+	ts[r + (R) + 2] = ts[r + (R)-1];                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) /* loop thru it */
 #endif
 		{
-#define R256_8_rounds(R)                  \
-		R256(0,1,2,3,R_256_0,8*(R) + 1);  \
-		R256(0,3,2,1,R_256_1,8*(R) + 2);  \
-		R256(0,1,2,3,R_256_2,8*(R) + 3);  \
-		R256(0,3,2,1,R_256_3,8*(R) + 4);  \
-		I256(2*(R));                      \
-		R256(0,1,2,3,R_256_4,8*(R) + 5);  \
-		R256(0,3,2,1,R_256_5,8*(R) + 6);  \
-		R256(0,1,2,3,R_256_6,8*(R) + 7);  \
-		R256(0,3,2,1,R_256_7,8*(R) + 8);  \
-		I256(2*(R)+1);
-
-		R256_8_rounds( 0);
-
-#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
-
-  #if   R256_Unroll_R( 1)
-		R256_8_rounds( 1);
-  #endif
-  #if   R256_Unroll_R( 2)
-		R256_8_rounds( 2);
-  #endif
-  #if   R256_Unroll_R( 3)
-		R256_8_rounds( 3);
-  #endif
-  #if   R256_Unroll_R( 4)
-		R256_8_rounds( 4);
-  #endif
-  #if   R256_Unroll_R( 5)
-		R256_8_rounds( 5);
-  #endif
-  #if   R256_Unroll_R( 6)
-		R256_8_rounds( 6);
-  #endif
-  #if   R256_Unroll_R( 7)
-		R256_8_rounds( 7);
-  #endif
-  #if   R256_Unroll_R( 8)
-		R256_8_rounds( 8);
-  #endif
-  #if   R256_Unroll_R( 9)
-		R256_8_rounds( 9);
-  #endif
-  #if   R256_Unroll_R(10)
-		R256_8_rounds(10);
-  #endif
-  #if   R256_Unroll_R(11)
-		R256_8_rounds(11);
-  #endif
-  #if   R256_Unroll_R(12)
-		R256_8_rounds(12);
-  #endif
-  #if   R256_Unroll_R(13)
-		R256_8_rounds(13);
-  #endif
-  #if   R256_Unroll_R(14)
-		R256_8_rounds(14);
-  #endif
-  #if  (SKEIN_UNROLL_256 > 14)
-#error  "need more unrolling in Skein_256_Process_Block"
-  #endif
+#define R256_8_rounds(R)                    \
+	R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
+	R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
+	R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
+	R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
+	I256(2 * (R));                          \
+	R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
+	R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
+	R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
+	R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
+	I256(2 * (R) + 1);
+
+			R256_8_rounds(0);
+
+#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
+
+#if R256_Unroll_R(1)
+			R256_8_rounds(1);
+#endif
+#if R256_Unroll_R(2)
+			R256_8_rounds(2);
+#endif
+#if R256_Unroll_R(3)
+			R256_8_rounds(3);
+#endif
+#if R256_Unroll_R(4)
+			R256_8_rounds(4);
+#endif
+#if R256_Unroll_R(5)
+			R256_8_rounds(5);
+#endif
+#if R256_Unroll_R(6)
+			R256_8_rounds(6);
+#endif
+#if R256_Unroll_R(7)
+			R256_8_rounds(7);
+#endif
+#if R256_Unroll_R(8)
+			R256_8_rounds(8);
+#endif
+#if R256_Unroll_R(9)
+			R256_8_rounds(9);
+#endif
+#if R256_Unroll_R(10)
+			R256_8_rounds(10);
+#endif
+#if R256_Unroll_R(11)
+			R256_8_rounds(11);
+#endif
+#if R256_Unroll_R(12)
+			R256_8_rounds(12);
+#endif
+#if R256_Unroll_R(13)
+			R256_8_rounds(13);
+#endif
+#if R256_Unroll_R(14)
+			R256_8_rounds(14);
+#endif
+#if(SKEIN_UNROLL_256 > 14)
+#error "need more unrolling in Skein_256_Process_Block"
+#endif
 		}
 		/* do the final "feedforward" xor, update context chaining vars */
 		ctx->X[0] = X0 ^ w[0];
@@ -690,68 +793,74 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ctx->X[2] = X2 ^ w[2];
 		ctx->X[3] = X3 ^ w[3];
 
-		Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
-		}
-	while (--blkCnt);
+	} while(--blkCnt);
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
-	}
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_256_Process_Block_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_256_Process_Block_CodeSize) -
-		   ((u08b_t *) Skein_256_Process_Block);
-	}
+{
+	return ((u08b_t*)Skein_256_Process_Block_CodeSize) -
+		   ((u08b_t*)Skein_256_Process_Block);
+}
 static uint_t Skein_256_Unroll_Cnt(void)
-	{
+{
 	return SKEIN_UNROLL_256;
-	}
+}
 #endif
 #endif
 
 /*****************************  Skein_512 ******************************/
 #if !(SKEIN_USE_ASM & 512)
-static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
-	{ /* do it in C */
+static void Skein_512_Process_Block(Skein_512_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd)
+{ /* do it in C */
 	enum
-		{
+	{
 		WCNT = SKEIN_512_STATE_WORDS
-		};
-#undef  RCNT
-#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)
+	};
+#undef RCNT
+#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8)
 
-#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
-#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
 #else
 #define SKEIN_UNROLL_512 (0)
 #endif
 
 #if SKEIN_UNROLL_512
-#if (RCNT % SKEIN_UNROLL_512)
+#if(RCNT % SKEIN_UNROLL_512)
 #error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count */
 #endif
-	size_t  r;
-	u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+	size_t r;
+	u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/
 #else
-	u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+	u64b_t kw[WCNT + 4];									/* key schedule words : chaining vars + tweak */
 #endif
-	u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
-	u64b_t  w [WCNT];                           /* local copy of input block */
+	u64b_t X0, X1, X2, X3, X4, X5, X6, X7; /* local copy of vars, for speed */
+	u64b_t w[WCNT];						   /* local copy of input block */
 #ifdef SKEIN_DEBUG
-	const u64b_t *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
-	Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
-	Xptr[4] = &X4;  Xptr[5] = &X5;  Xptr[6] = &X6;  Xptr[7] = &X7;
+	const u64b_t* Xptr[8]; /* use for debugging (help compiler put Xn in registers) */
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
+	Xptr[4] = &X4;
+	Xptr[5] = &X5;
+	Xptr[6] = &X6;
+	Xptr[7] = &X7;
 #endif
 
-	Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+	Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
-	do  {
+	do
+	{
 		/* this implementation only supports 2**64 input bytes (no carry out here) */
-		ts[0] += byteCntAdd;                    /* update processed length */
+		ts[0] += byteCntAdd; /* update processed length */
 
 		/* precompute the key schedule for this block */
 		ks[0] = ctx->X[0];
@@ -767,126 +876,134 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
 
 		ts[2] = ts[0] ^ ts[1];
 
-		Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 		DebugSaveTweak(ctx);
-		Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 
-		X0   = w[0] + ks[0];                    /* do the first full key injection */
-		X1   = w[1] + ks[1];
-		X2   = w[2] + ks[2];
-		X3   = w[3] + ks[3];
-		X4   = w[4] + ks[4];
-		X5   = w[5] + ks[5] + ts[0];
-		X6   = w[6] + ks[6] + ts[1];
-		X7   = w[7] + ks[7];
+		X0 = w[0] + ks[0]; /* do the first full key injection */
+		X1 = w[1] + ks[1];
+		X2 = w[2] + ks[2];
+		X3 = w[3] + ks[3];
+		X4 = w[4] + ks[4];
+		X5 = w[5] + ks[5] + ts[0];
+		X6 = w[6] + ks[6] + ts[1];
+		X7 = w[7] + ks[7];
 
 		blkPtr += SKEIN_512_BLOCK_BYTES;
 
-		Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr);
 		/* run the rounds */
-#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
-	X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
-	X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
-	X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
-	X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+	X##p0 += X##p1;                                         \
+	X##p1 = RotL_64(X##p1, ROT##_0);                        \
+	X##p1 ^= X##p0;                                         \
+	X##p2 += X##p3;                                         \
+	X##p3 = RotL_64(X##p3, ROT##_1);                        \
+	X##p3 ^= X##p2;                                         \
+	X##p4 += X##p5;                                         \
+	X##p5 = RotL_64(X##p5, ROT##_2);                        \
+	X##p5 ^= X##p4;                                         \
+	X##p6 += X##p7;                                         \
+	X##p7 = RotL_64(X##p7, ROT##_3);                        \
+	X##p7 ^= X##p6;
 
 #if SKEIN_UNROLL_512 == 0
-#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
-	Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);
-
-#define I512(R)                                                     \
-	X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
-	X1   += ks[((R)+2) % 9];                                        \
-	X2   += ks[((R)+3) % 9];                                        \
-	X3   += ks[((R)+4) % 9];                                        \
-	X4   += ks[((R)+5) % 9];                                        \
-	X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
-	X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
-	X7   += ks[((R)+8) % 9] +     (R)+1;                            \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-#else                                       /* looping version */
-#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
-	Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);
-
-#define I512(R)                                                     \
-	X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
-	X1   += ks[r+(R)+1];                                            \
-	X2   += ks[r+(R)+2];                                            \
-	X3   += ks[r+(R)+3];                                            \
-	X4   += ks[r+(R)+4];                                            \
-	X5   += ks[r+(R)+5] + ts[r+(R)+0];                              \
-	X6   += ks[r+(R)+6] + ts[r+(R)+1];                              \
-	X7   += ks[r+(R)+7] +    r+(R)   ;                              \
-	ks[r +       (R)+8] = ks[r+(R)-1];  /* rotate key schedule */   \
-	ts[r +       (R)+2] = ts[r+(R)-1];                              \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-
-	for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512)   /* loop thru it */
-#endif                         /* end of looped code definitions */
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I512(R)                                                  \
+	X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */ \
+	X1 += ks[((R) + 2) % 9];                                     \
+	X2 += ks[((R) + 3) % 9];                                     \
+	X3 += ks[((R) + 4) % 9];                                     \
+	X4 += ks[((R) + 5) % 9];                                     \
+	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];                 \
+	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];                 \
+	X7 += ks[((R) + 8) % 9] + (R) + 1;                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I512(R)                                                \
+	X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+	X1 += ks[r + (R) + 1];                                     \
+	X2 += ks[r + (R) + 2];                                     \
+	X3 += ks[r + (R) + 3];                                     \
+	X4 += ks[r + (R) + 4];                                     \
+	X5 += ks[r + (R) + 5] + ts[r + (R) + 0];                   \
+	X6 += ks[r + (R) + 6] + ts[r + (R) + 1];                   \
+	X7 += ks[r + (R) + 7] + r + (R);                           \
+	ks[r + (R) + 8] = ks[r + (R)-1]; /* rotate key schedule */ \
+	ts[r + (R) + 2] = ts[r + (R)-1];                           \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) /* loop thru it */
+#endif /* end of looped code definitions */
 		{
-#define R512_8_rounds(R)  /* do 8 full rounds */  \
-		R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
-		R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
-		R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
-		R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
-		I512(2*(R));                              \
-		R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
-		R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
-		R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
-		R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
-		I512(2*(R)+1);        /* and key injection */
-
-		R512_8_rounds( 0);
-
-#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
-
-  #if   R512_Unroll_R( 1)
-		R512_8_rounds( 1);
-  #endif
-  #if   R512_Unroll_R( 2)
-		R512_8_rounds( 2);
-  #endif
-  #if   R512_Unroll_R( 3)
-		R512_8_rounds( 3);
-  #endif
-  #if   R512_Unroll_R( 4)
-		R512_8_rounds( 4);
-  #endif
-  #if   R512_Unroll_R( 5)
-		R512_8_rounds( 5);
-  #endif
-  #if   R512_Unroll_R( 6)
-		R512_8_rounds( 6);
-  #endif
-  #if   R512_Unroll_R( 7)
-		R512_8_rounds( 7);
-  #endif
-  #if   R512_Unroll_R( 8)
-		R512_8_rounds( 8);
-  #endif
-  #if   R512_Unroll_R( 9)
-		R512_8_rounds( 9);
-  #endif
-  #if   R512_Unroll_R(10)
-		R512_8_rounds(10);
-  #endif
-  #if   R512_Unroll_R(11)
-		R512_8_rounds(11);
-  #endif
-  #if   R512_Unroll_R(12)
-		R512_8_rounds(12);
-  #endif
-  #if   R512_Unroll_R(13)
-		R512_8_rounds(13);
-  #endif
-  #if   R512_Unroll_R(14)
-		R512_8_rounds(14);
-  #endif
-  #if  (SKEIN_UNROLL_512 > 14)
-#error  "need more unrolling in Skein_512_Process_Block"
-  #endif
+#define R512_8_rounds(R) /* do 8 full rounds */         \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
+	I512(2 * (R));                                      \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
+	I512(2 * (R) + 1); /* and key injection */
+
+			R512_8_rounds(0);
+
+#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
+
+#if R512_Unroll_R(1)
+			R512_8_rounds(1);
+#endif
+#if R512_Unroll_R(2)
+			R512_8_rounds(2);
+#endif
+#if R512_Unroll_R(3)
+			R512_8_rounds(3);
+#endif
+#if R512_Unroll_R(4)
+			R512_8_rounds(4);
+#endif
+#if R512_Unroll_R(5)
+			R512_8_rounds(5);
+#endif
+#if R512_Unroll_R(6)
+			R512_8_rounds(6);
+#endif
+#if R512_Unroll_R(7)
+			R512_8_rounds(7);
+#endif
+#if R512_Unroll_R(8)
+			R512_8_rounds(8);
+#endif
+#if R512_Unroll_R(9)
+			R512_8_rounds(9);
+#endif
+#if R512_Unroll_R(10)
+			R512_8_rounds(10);
+#endif
+#if R512_Unroll_R(11)
+			R512_8_rounds(11);
+#endif
+#if R512_Unroll_R(12)
+			R512_8_rounds(12);
+#endif
+#if R512_Unroll_R(13)
+			R512_8_rounds(13);
+#endif
+#if R512_Unroll_R(14)
+			R512_8_rounds(14);
+#endif
+#if(SKEIN_UNROLL_512 > 14)
+#error "need more unrolling in Skein_512_Process_Block"
+#endif
 		}
 
 		/* do the final "feedforward" xor, update context chaining vars */
@@ -898,256 +1015,284 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ctx->X[5] = X5 ^ w[5];
 		ctx->X[6] = X6 ^ w[6];
 		ctx->X[7] = X7 ^ w[7];
-		Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
-		}
-	while (--blkCnt);
+	} while(--blkCnt);
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
-	}
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_512_Process_Block_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_512_Process_Block_CodeSize) -
-		   ((u08b_t *) Skein_512_Process_Block);
-	}
+{
+	return ((u08b_t*)Skein_512_Process_Block_CodeSize) -
+		   ((u08b_t*)Skein_512_Process_Block);
+}
 static uint_t Skein_512_Unroll_Cnt(void)
-	{
+{
 	return SKEIN_UNROLL_512;
-	}
+}
 #endif
 #endif
 
 /*****************************  Skein1024 ******************************/
 #if !(SKEIN_USE_ASM & 1024)
-static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
-	{ /* do it in C, always looping (unrolled is bigger AND slower!) */
+static void Skein1024_Process_Block(Skein1024_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd)
+{ /* do it in C, always looping (unrolled is bigger AND slower!) */
 	enum
-		{
+	{
 		WCNT = SKEIN1024_STATE_WORDS
-		};
-#undef  RCNT
-#define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
+	};
+#undef RCNT
+#define RCNT (SKEIN1024_ROUNDS_TOTAL / 8)
 
-#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
-#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_1024 ((SKEIN_LOOP) % 10)
 #else
 #define SKEIN_UNROLL_1024 (0)
 #endif
 
-#if (SKEIN_UNROLL_1024 != 0)
-#if (RCNT % SKEIN_UNROLL_1024)
+#if(SKEIN_UNROLL_1024 != 0)
+#if(RCNT % SKEIN_UNROLL_1024)
 #error "Invalid SKEIN_UNROLL_1024"              /* sanity check on unroll count */
 #endif
-	size_t  r;
-	u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
+	size_t r;
+	u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/
 #else
-	u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+	u64b_t kw[WCNT + 4];									/* key schedule words : chaining vars + tweak */
 #endif
 
-	u64b_t  X00,X01,X02,X03,X04,X05,X06,X07,    /* local copy of vars, for speed */
-			X08,X09,X10,X11,X12,X13,X14,X15;
-	u64b_t  w [WCNT];                           /* local copy of input block */
+	u64b_t X00, X01, X02, X03, X04, X05, X06, X07, /* local copy of vars, for speed */
+		X08, X09, X10, X11, X12, X13, X14, X15;
+	u64b_t w[WCNT]; /* local copy of input block */
 #ifdef SKEIN_DEBUG
-	const u64b_t *Xptr[16];                     /* use for debugging (help compiler put Xn in registers) */
-	Xptr[ 0] = &X00;  Xptr[ 1] = &X01;  Xptr[ 2] = &X02;  Xptr[ 3] = &X03;
-	Xptr[ 4] = &X04;  Xptr[ 5] = &X05;  Xptr[ 6] = &X06;  Xptr[ 7] = &X07;
-	Xptr[ 8] = &X08;  Xptr[ 9] = &X09;  Xptr[10] = &X10;  Xptr[11] = &X11;
-	Xptr[12] = &X12;  Xptr[13] = &X13;  Xptr[14] = &X14;  Xptr[15] = &X15;
+	const u64b_t* Xptr[16]; /* use for debugging (help compiler put Xn in registers) */
+	Xptr[0] = &X00;
+	Xptr[1] = &X01;
+	Xptr[2] = &X02;
+	Xptr[3] = &X03;
+	Xptr[4] = &X04;
+	Xptr[5] = &X05;
+	Xptr[6] = &X06;
+	Xptr[7] = &X07;
+	Xptr[8] = &X08;
+	Xptr[9] = &X09;
+	Xptr[10] = &X10;
+	Xptr[11] = &X11;
+	Xptr[12] = &X12;
+	Xptr[13] = &X13;
+	Xptr[14] = &X14;
+	Xptr[15] = &X15;
 #endif
 
-	Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+	Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
-	do  {
+	do
+	{
 		/* this implementation only supports 2**64 input bytes (no carry out here) */
-		ts[0] += byteCntAdd;                    /* update processed length */
+		ts[0] += byteCntAdd; /* update processed length */
 
 		/* precompute the key schedule for this block */
-		ks[ 0] = ctx->X[ 0];
-		ks[ 1] = ctx->X[ 1];
-		ks[ 2] = ctx->X[ 2];
-		ks[ 3] = ctx->X[ 3];
-		ks[ 4] = ctx->X[ 4];
-		ks[ 5] = ctx->X[ 5];
-		ks[ 6] = ctx->X[ 6];
-		ks[ 7] = ctx->X[ 7];
-		ks[ 8] = ctx->X[ 8];
-		ks[ 9] = ctx->X[ 9];
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ctx->X[4];
+		ks[5] = ctx->X[5];
+		ks[6] = ctx->X[6];
+		ks[7] = ctx->X[7];
+		ks[8] = ctx->X[8];
+		ks[9] = ctx->X[9];
 		ks[10] = ctx->X[10];
 		ks[11] = ctx->X[11];
 		ks[12] = ctx->X[12];
 		ks[13] = ctx->X[13];
 		ks[14] = ctx->X[14];
 		ks[15] = ctx->X[15];
-		ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^
-				 ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^
-				 ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^
+		ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+				 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
+				 ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
 				 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
 
-		ts[2]  = ts[0] ^ ts[1];
+		ts[2] = ts[0] ^ ts[1];
 
-		Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 		DebugSaveTweak(ctx);
-		Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);
-
-		X00    = w[ 0] + ks[ 0];                 /* do the first full key injection */
-		X01    = w[ 1] + ks[ 1];
-		X02    = w[ 2] + ks[ 2];
-		X03    = w[ 3] + ks[ 3];
-		X04    = w[ 4] + ks[ 4];
-		X05    = w[ 5] + ks[ 5];
-		X06    = w[ 6] + ks[ 6];
-		X07    = w[ 7] + ks[ 7];
-		X08    = w[ 8] + ks[ 8];
-		X09    = w[ 9] + ks[ 9];
-		X10    = w[10] + ks[10];
-		X11    = w[11] + ks[11];
-		X12    = w[12] + ks[12];
-		X13    = w[13] + ks[13] + ts[0];
-		X14    = w[14] + ks[14] + ts[1];
-		X15    = w[15] + ks[15];
-
-		Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
-
-#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \
-	X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0;   \
-	X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2;   \
-	X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4;   \
-	X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6;   \
-	X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8;   \
-	X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA;   \
-	X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC;   \
-	X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE;   \
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X00 = w[0] + ks[0]; /* do the first full key injection */
+		X01 = w[1] + ks[1];
+		X02 = w[2] + ks[2];
+		X03 = w[3] + ks[3];
+		X04 = w[4] + ks[4];
+		X05 = w[5] + ks[5];
+		X06 = w[6] + ks[6];
+		X07 = w[7] + ks[7];
+		X08 = w[8] + ks[8];
+		X09 = w[9] + ks[9];
+		X10 = w[10] + ks[10];
+		X11 = w[11] + ks[11];
+		X12 = w[12] + ks[12];
+		X13 = w[13] + ks[13] + ts[0];
+		X14 = w[14] + ks[14] + ts[1];
+		X15 = w[15] + ks[15];
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr);
+
+#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rNum) \
+	X##p0 += X##p1;                                                                          \
+	X##p1 = RotL_64(X##p1, ROT##_0);                                                         \
+	X##p1 ^= X##p0;                                                                          \
+	X##p2 += X##p3;                                                                          \
+	X##p3 = RotL_64(X##p3, ROT##_1);                                                         \
+	X##p3 ^= X##p2;                                                                          \
+	X##p4 += X##p5;                                                                          \
+	X##p5 = RotL_64(X##p5, ROT##_2);                                                         \
+	X##p5 ^= X##p4;                                                                          \
+	X##p6 += X##p7;                                                                          \
+	X##p7 = RotL_64(X##p7, ROT##_3);                                                         \
+	X##p7 ^= X##p6;                                                                          \
+	X##p8 += X##p9;                                                                          \
+	X##p9 = RotL_64(X##p9, ROT##_4);                                                         \
+	X##p9 ^= X##p8;                                                                          \
+	X##pA += X##pB;                                                                          \
+	X##pB = RotL_64(X##pB, ROT##_5);                                                         \
+	X##pB ^= X##pA;                                                                          \
+	X##pC += X##pD;                                                                          \
+	X##pD = RotL_64(X##pD, ROT##_6);                                                         \
+	X##pD ^= X##pC;                                                                          \
+	X##pE += X##pF;                                                                          \
+	X##pF = RotL_64(X##pF, ROT##_7);                                                         \
+	X##pF ^= X##pE;
 
 #if SKEIN_UNROLL_1024 == 0
-#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr);
-
-#define I1024(R)                                                      \
-	X00   += ks[((R)+ 1) % 17]; /* inject the key schedule value */   \
-	X01   += ks[((R)+ 2) % 17];                                       \
-	X02   += ks[((R)+ 3) % 17];                                       \
-	X03   += ks[((R)+ 4) % 17];                                       \
-	X04   += ks[((R)+ 5) % 17];                                       \
-	X05   += ks[((R)+ 6) % 17];                                       \
-	X06   += ks[((R)+ 7) % 17];                                       \
-	X07   += ks[((R)+ 8) % 17];                                       \
-	X08   += ks[((R)+ 9) % 17];                                       \
-	X09   += ks[((R)+10) % 17];                                       \
-	X10   += ks[((R)+11) % 17];                                       \
-	X11   += ks[((R)+12) % 17];                                       \
-	X12   += ks[((R)+13) % 17];                                       \
-	X13   += ks[((R)+14) % 17] + ts[((R)+1) % 3];                     \
-	X14   += ks[((R)+15) % 17] + ts[((R)+2) % 3];                     \
-	X15   += ks[((R)+16) % 17] +     (R)+1;                           \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-#else                                       /* looping version */
-#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr);
-
-#define I1024(R)                                                      \
-	X00   += ks[r+(R)+ 0];    /* inject the key schedule value */     \
-	X01   += ks[r+(R)+ 1];                                            \
-	X02   += ks[r+(R)+ 2];                                            \
-	X03   += ks[r+(R)+ 3];                                            \
-	X04   += ks[r+(R)+ 4];                                            \
-	X05   += ks[r+(R)+ 5];                                            \
-	X06   += ks[r+(R)+ 6];                                            \
-	X07   += ks[r+(R)+ 7];                                            \
-	X08   += ks[r+(R)+ 8];                                            \
-	X09   += ks[r+(R)+ 9];                                            \
-	X10   += ks[r+(R)+10];                                            \
-	X11   += ks[r+(R)+11];                                            \
-	X12   += ks[r+(R)+12];                                            \
-	X13   += ks[r+(R)+13] + ts[r+(R)+0];                              \
-	X14   += ks[r+(R)+14] + ts[r+(R)+1];                              \
-	X15   += ks[r+(R)+15] +    r+(R)   ;                              \
-	ks[r  +       (R)+16] = ks[r+(R)-1];  /* rotate key schedule */   \
-	ts[r  +       (R)+ 2] = ts[r+(R)-1];                              \
-	Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
-
-	for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024)    /* loop thru it */
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
+
+#define I1024(R)                                                   \
+	X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */ \
+	X01 += ks[((R) + 2) % 17];                                     \
+	X02 += ks[((R) + 3) % 17];                                     \
+	X03 += ks[((R) + 4) % 17];                                     \
+	X04 += ks[((R) + 5) % 17];                                     \
+	X05 += ks[((R) + 6) % 17];                                     \
+	X06 += ks[((R) + 7) % 17];                                     \
+	X07 += ks[((R) + 8) % 17];                                     \
+	X08 += ks[((R) + 9) % 17];                                     \
+	X09 += ks[((R) + 10) % 17];                                    \
+	X10 += ks[((R) + 11) % 17];                                    \
+	X11 += ks[((R) + 12) % 17];                                    \
+	X12 += ks[((R) + 13) % 17];                                    \
+	X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];                \
+	X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];                \
+	X15 += ks[((R) + 16) % 17] + (R) + 1;                          \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
+
+#define I1024(R)                                                \
+	X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+	X01 += ks[r + (R) + 1];                                     \
+	X02 += ks[r + (R) + 2];                                     \
+	X03 += ks[r + (R) + 3];                                     \
+	X04 += ks[r + (R) + 4];                                     \
+	X05 += ks[r + (R) + 5];                                     \
+	X06 += ks[r + (R) + 6];                                     \
+	X07 += ks[r + (R) + 7];                                     \
+	X08 += ks[r + (R) + 8];                                     \
+	X09 += ks[r + (R) + 9];                                     \
+	X10 += ks[r + (R) + 10];                                    \
+	X11 += ks[r + (R) + 11];                                    \
+	X12 += ks[r + (R) + 12];                                    \
+	X13 += ks[r + (R) + 13] + ts[r + (R) + 0];                  \
+	X14 += ks[r + (R) + 14] + ts[r + (R) + 1];                  \
+	X15 += ks[r + (R) + 15] + r + (R);                          \
+	ks[r + (R) + 16] = ks[r + (R)-1]; /* rotate key schedule */ \
+	ts[r + (R) + 2] = ts[r + (R)-1];                            \
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		for(r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) /* loop thru it */
 #endif
 		{
-#define R1024_8_rounds(R)    /* do 8 full rounds */                               \
-		R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \
-		R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \
-		R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \
-		R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \
-		I1024(2*(R));                                                             \
-		R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \
-		R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \
-		R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \
-		R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \
-		I1024(2*(R)+1);
-
-		R1024_8_rounds( 0);
-
-#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
-
-  #if   R1024_Unroll_R( 1)
-		R1024_8_rounds( 1);
-  #endif
-  #if   R1024_Unroll_R( 2)
-		R1024_8_rounds( 2);
-  #endif
-  #if   R1024_Unroll_R( 3)
-		R1024_8_rounds( 3);
-  #endif
-  #if   R1024_Unroll_R( 4)
-		R1024_8_rounds( 4);
-  #endif
-  #if   R1024_Unroll_R( 5)
-		R1024_8_rounds( 5);
-  #endif
-  #if   R1024_Unroll_R( 6)
-		R1024_8_rounds( 6);
-  #endif
-  #if   R1024_Unroll_R( 7)
-		R1024_8_rounds( 7);
-  #endif
-  #if   R1024_Unroll_R( 8)
-		R1024_8_rounds( 8);
-  #endif
-  #if   R1024_Unroll_R( 9)
-		R1024_8_rounds( 9);
-  #endif
-  #if   R1024_Unroll_R(10)
-		R1024_8_rounds(10);
-  #endif
-  #if   R1024_Unroll_R(11)
-		R1024_8_rounds(11);
-  #endif
-  #if   R1024_Unroll_R(12)
-		R1024_8_rounds(12);
-  #endif
-  #if   R1024_Unroll_R(13)
-		R1024_8_rounds(13);
-  #endif
-  #if   R1024_Unroll_R(14)
-		R1024_8_rounds(14);
-  #endif
-  #if  (SKEIN_UNROLL_1024 > 14)
-#error  "need more unrolling in Skein_1024_Process_Block"
-  #endif
+#define R1024_8_rounds(R) /* do 8 full rounds */                                                 \
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_0, 8 * (R) + 1); \
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_1, 8 * (R) + 2); \
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_2, 8 * (R) + 3); \
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_3, 8 * (R) + 4); \
+	I1024(2 * (R));                                                                              \
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_4, 8 * (R) + 5); \
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_5, 8 * (R) + 6); \
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_6, 8 * (R) + 7); \
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_7, 8 * (R) + 8); \
+	I1024(2 * (R) + 1);
+
+			R1024_8_rounds(0);
+
+#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
+
+#if R1024_Unroll_R(1)
+			R1024_8_rounds(1);
+#endif
+#if R1024_Unroll_R(2)
+			R1024_8_rounds(2);
+#endif
+#if R1024_Unroll_R(3)
+			R1024_8_rounds(3);
+#endif
+#if R1024_Unroll_R(4)
+			R1024_8_rounds(4);
+#endif
+#if R1024_Unroll_R(5)
+			R1024_8_rounds(5);
+#endif
+#if R1024_Unroll_R(6)
+			R1024_8_rounds(6);
+#endif
+#if R1024_Unroll_R(7)
+			R1024_8_rounds(7);
+#endif
+#if R1024_Unroll_R(8)
+			R1024_8_rounds(8);
+#endif
+#if R1024_Unroll_R(9)
+			R1024_8_rounds(9);
+#endif
+#if R1024_Unroll_R(10)
+			R1024_8_rounds(10);
+#endif
+#if R1024_Unroll_R(11)
+			R1024_8_rounds(11);
+#endif
+#if R1024_Unroll_R(12)
+			R1024_8_rounds(12);
+#endif
+#if R1024_Unroll_R(13)
+			R1024_8_rounds(13);
+#endif
+#if R1024_Unroll_R(14)
+			R1024_8_rounds(14);
+#endif
+#if(SKEIN_UNROLL_1024 > 14)
+#error "need more unrolling in Skein_1024_Process_Block"
+#endif
 		}
 		/* do the final "feedforward" xor, update context chaining vars */
 
-		ctx->X[ 0] = X00 ^ w[ 0];
-		ctx->X[ 1] = X01 ^ w[ 1];
-		ctx->X[ 2] = X02 ^ w[ 2];
-		ctx->X[ 3] = X03 ^ w[ 3];
-		ctx->X[ 4] = X04 ^ w[ 4];
-		ctx->X[ 5] = X05 ^ w[ 5];
-		ctx->X[ 6] = X06 ^ w[ 6];
-		ctx->X[ 7] = X07 ^ w[ 7];
-		ctx->X[ 8] = X08 ^ w[ 8];
-		ctx->X[ 9] = X09 ^ w[ 9];
+		ctx->X[0] = X00 ^ w[0];
+		ctx->X[1] = X01 ^ w[1];
+		ctx->X[2] = X02 ^ w[2];
+		ctx->X[3] = X03 ^ w[3];
+		ctx->X[4] = X04 ^ w[4];
+		ctx->X[5] = X05 ^ w[5];
+		ctx->X[6] = X06 ^ w[6];
+		ctx->X[7] = X07 ^ w[7];
+		ctx->X[8] = X08 ^ w[8];
+		ctx->X[9] = X09 ^ w[9];
 		ctx->X[10] = X10 ^ w[10];
 		ctx->X[11] = X11 ^ w[11];
 		ctx->X[12] = X12 ^ w[12];
@@ -1155,30 +1300,28 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s
 		ctx->X[14] = X14 ^ w[14];
 		ctx->X[15] = X15 ^ w[15];
 
-		Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 		blkPtr += SKEIN1024_BLOCK_BYTES;
-		}
-	while (--blkCnt);
+	} while(--blkCnt);
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
-	}
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein1024_Process_Block_CodeSize(void)
-	{
-	return ((u08b_t *) Skein1024_Process_Block_CodeSize) -
-		   ((u08b_t *) Skein1024_Process_Block);
-	}
+{
+	return ((u08b_t*)Skein1024_Process_Block_CodeSize) -
+		   ((u08b_t*)Skein1024_Process_Block); /* byte distance between the two function addresses; NOTE(review): equals the code size only if the functions are laid out adjacently in this order -- confirm for the toolchain */
+}
 static uint_t Skein1024_Unroll_Cnt(void)
-	{
+{ /* reports the SKEIN_UNROLL_1024 value this file was compiled with */
 	return SKEIN_UNROLL_1024;
-	}
+}
 #endif
 #endif
 
-
 #if 0
 /*****************************************************************/
 /*     256-bit Skein                                             */
@@ -1289,93 +1432,93 @@ static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process the input bytes */
-static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
-	{
+static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt)
+{
 	size_t n;
 
-	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
 	/* process full blocks, if any */
-	if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES)
+	{
+		if(ctx->h.bCnt) /* finish up any buffered message data */
 		{
-		if (ctx->h.bCnt)                              /* finish up any buffered message data */
+			n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
+			if(n)
 			{
-			n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
-			if (n)
-				{
-				Skein_assert(n < msgByteCnt);         /* check on our logic here */
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				Skein_assert(n < msgByteCnt); /* logic check: msgByteCnt + bCnt > BLOCK_BYTES was tested above, hence msgByteCnt > n */
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
-				}
+			}
 			Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
-			Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES);
+			Skein_256_Process_Block(ctx, ctx->b, 1, SKEIN_256_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
-			}
+		}
 		/* now process any remaining full blocks, directly from input message data */
-		if (msgByteCnt > SKEIN_256_BLOCK_BYTES)
-			{
-			n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES;   /* number of full blocks to process */
-			Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES);
+		if(msgByteCnt > SKEIN_256_BLOCK_BYTES)
+		{
+			n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process now; the -1 leaves 1..BLOCK_BYTES bytes to be buffered in b[] below */
+			Skein_256_Process_Block(ctx, msg, n, SKEIN_256_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
-			msg        += n * SKEIN_256_BLOCK_BYTES;
-			}
-		Skein_assert(ctx->h.bCnt == 0);
+			msg += n * SKEIN_256_BLOCK_BYTES;
 		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
 
 	/* copy any remaining source message data bytes into b[] */
-	if (msgByteCnt)
-		{
+	if(msgByteCnt)
+	{
 		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
-		}
+	}
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
-static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
-	{
-	size_t i,n,byteCnt;
+static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal)
+{
+	size_t i, n, byteCnt;
 	u64b_t X[SKEIN_256_STATE_WORDS];
-	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
-	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
-	if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)            /* zero pad b[] if necessary */
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;		/* tag as the final block */
+	if(ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
 
-	Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+	Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */
 
 	/* now output the result */
-	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes (hashBitLen rounded up to whole bytes) */
 
 	/* run Threefish in "counter mode" to generate output */
-	memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
-	memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
-	for (i=0;i < byteCnt;i += SKEIN_256_BLOCK_BYTES)
-		{
-		((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
-		n = byteCnt - i;   /* number of output bytes left to go */
-		if (n >= SKEIN_256_BLOCK_BYTES)
-			n  = SKEIN_256_BLOCK_BYTES;
-		Skein_Put64_LSB_First(hashVal+i,ctx->X,n);   /* "output" the ctr mode bytes */
-		Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
-		}
-	return SKEIN_SUCCESS;
+	memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+	memcpy(X, ctx->X, sizeof(X));	  /* keep a local copy of counter mode "key" */
+	for(i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) /* i counts output blocks (not bytes), same as Skein_512_Final / Skein1024_Final */
+	{
+		((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block: the counter is the output block index */
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_256_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */
+		n = byteCnt - i * SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */
+		if(n >= SKEIN_256_BLOCK_BYTES)
+			n = SKEIN_256_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */
+		Skein_Show_Final(256, &ctx->h, n, hashVal + i * SKEIN_256_BLOCK_BYTES);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
+	return SKEIN_SUCCESS;
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_256_API_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_256_API_CodeSize) -
-		   ((u08b_t *) Skein_256_Init);
-	}
+{
+	return ((u08b_t*)Skein_256_API_CodeSize) -
+		   ((u08b_t*)Skein_256_Init); /* byte distance from Skein_256_Init to this function; NOTE(review): equals the API code size only if the functions are laid out in source order -- confirm for the toolchain */
+}
 #endif
 
 /*****************************************************************/
@@ -1384,47 +1527,54 @@ static size_t Skein_256_API_CodeSize(void)
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* init the context for a straight hashing operation  */
-static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
-	{
-	union
-		{
-		u08b_t  b[SKEIN_512_STATE_BYTES];
-		u64b_t  w[SKEIN_512_STATE_WORDS];
-		} cfg;                              /* config block */
+static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen)
+{
+	union {
+		u08b_t b[SKEIN_512_STATE_BYTES];
+		u64b_t w[SKEIN_512_STATE_WORDS];
+	} cfg; /* config block, addressable as bytes (b) or as words (w) */
 
-	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
-	ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
 
-	switch (hashBitLen)
-		{             /* use pre-computed values, where available */
+	switch(hashBitLen)
+	{ /* use pre-computed IVs where available (these cases are compiled out when SKEIN_NO_PRECOMP is defined) */
 #ifndef SKEIN_NO_PRECOMP
-		case  512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X));  break;
-		case  384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X));  break;
-		case  256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X));  break;
-		case  224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X));  break;
+	case 512:
+		memcpy(ctx->X, SKEIN_512_IV_512, sizeof(ctx->X));
+		break;
+	case 384:
+		memcpy(ctx->X, SKEIN_512_IV_384, sizeof(ctx->X));
+		break;
+	case 256:
+		memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X));
+		break;
+	case 224:
+		memcpy(ctx->X, SKEIN_512_IV_224, sizeof(ctx->X));
+		break;
 #endif
-		default:
-			/* here if there is no precomputed IV value available */
-			/* build/process the config block, type == CONFIG (could be precomputed) */
-			Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
-
-			cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
-			cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
-			cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
-			memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
-
-			/* compute the initial chaining values from config block */
-			memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
-			Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
-			break;
-		}
+	default:
+		/* here if there is no precomputed IV value available */
+		/* build/process the config block, type == CONFIG (could be precomputed) */
+		Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
+
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
+		cfg.w[1] = Skein_Swap64(hashBitLen);	   /* hash result length in bits */
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero pad the config block after its first three words */
+
+		/* compute the initial chaining values from config block */
+		memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */
+		Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
 
 	/* The chaining vars ctx->X are now initialized for the given hashBitLen. */
 	/* Set up to process the data message portion of the hash (default) */
-	Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+	Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 #if 0
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -1489,93 +1639,93 @@ static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process the input bytes */
-static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
-	{
+static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt)
+{
 	size_t n;
 
-	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
 	/* process full blocks, if any */
-	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+	{
+		if(ctx->h.bCnt) /* finish up any buffered message data */
 		{
-		if (ctx->h.bCnt)                              /* finish up any buffered message data */
+			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
+			if(n)
 			{
-			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
-			if (n)
-				{
-				Skein_assert(n < msgByteCnt);         /* check on our logic here */
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				Skein_assert(n < msgByteCnt); /* logic check: msgByteCnt + bCnt > BLOCK_BYTES was tested above, hence msgByteCnt > n */
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
-				}
+			}
 			Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
-			Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+			Skein_512_Process_Block(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
-			}
+		}
 		/* now process any remaining full blocks, directly from input message data */
-		if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
-			{
-			n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;   /* number of full blocks to process */
-			Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+		if(msgByteCnt > SKEIN_512_BLOCK_BYTES)
+		{
+			n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process now; the -1 leaves 1..BLOCK_BYTES bytes to be buffered in b[] below */
+			Skein_512_Process_Block(ctx, msg, n, SKEIN_512_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
-			msg        += n * SKEIN_512_BLOCK_BYTES;
-			}
-		Skein_assert(ctx->h.bCnt == 0);
+			msg += n * SKEIN_512_BLOCK_BYTES;
 		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
 
 	/* copy any remaining source message data bytes into b[] */
-	if (msgByteCnt)
-		{
+	if(msgByteCnt)
+	{
 		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
-		}
+	}
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
-static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
-	{
-	size_t i,n,byteCnt;
+static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal)
+{
+	size_t i, n, byteCnt;
 	u64b_t X[SKEIN_512_STATE_WORDS];
-	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
-	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
-	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)            /* zero pad b[] if necessary */
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;		/* tag as the final block */
+	if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 
-	Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+	Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */
 
 	/* now output the result */
-	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes (hashBitLen rounded up to whole bytes) */
 
 	/* run Threefish in "counter mode" to generate output */
-	memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
-	memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
-	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
-		{
-		((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
-		n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
-		if (n >= SKEIN_512_BLOCK_BYTES)
-			n  = SKEIN_512_BLOCK_BYTES;
-		Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
-		Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
-		}
-	return SKEIN_SUCCESS;
+	memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+	memcpy(X, ctx->X, sizeof(X));	  /* keep a local copy of counter mode "key" */
+	for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++)
+	{
+		((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block: the counter is the output block index */
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_512_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */
+		n = byteCnt - i * SKEIN_512_BLOCK_BYTES;				 /* number of output bytes left to go */
+		if(n >= SKEIN_512_BLOCK_BYTES)
+			n = SKEIN_512_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */
+		Skein_Show_Final(512, &ctx->h, n, hashVal + i * SKEIN_512_BLOCK_BYTES);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
+	return SKEIN_SUCCESS;
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein_512_API_CodeSize(void)
-	{
-	return ((u08b_t *) Skein_512_API_CodeSize) -
-		   ((u08b_t *) Skein_512_Init);
-	}
+{
+	return ((u08b_t*)Skein_512_API_CodeSize) -
+		   ((u08b_t*)Skein_512_Init); /* byte distance from Skein_512_Init to this function; NOTE(review): equals the API code size only if the functions are laid out in source order -- confirm for the toolchain */
+}
 #endif
 
 /*****************************************************************/
@@ -1583,46 +1733,51 @@ static size_t Skein_512_API_CodeSize(void)
 /*****************************************************************/
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* init the context for a straight hashing operation  */
-static int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
-	{
-	union
-		{
-		u08b_t  b[SKEIN1024_STATE_BYTES];
-		u64b_t  w[SKEIN1024_STATE_WORDS];
-		} cfg;                              /* config block */
+static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen)
+{
+	union {
+		u08b_t b[SKEIN1024_STATE_BYTES];
+		u64b_t w[SKEIN1024_STATE_WORDS];
+	} cfg; /* config block, addressable as bytes (b) or as words (w) */
 
-	Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
-	ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
 
-	switch (hashBitLen)
-		{              /* use pre-computed values, where available */
+	switch(hashBitLen)
+	{ /* use pre-computed IVs where available (these cases are compiled out when SKEIN_NO_PRECOMP is defined) */
 #ifndef SKEIN_NO_PRECOMP
-		case  512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break;
-		case  384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break;
-		case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break;
+	case 512:
+		memcpy(ctx->X, SKEIN1024_IV_512, sizeof(ctx->X));
+		break;
+	case 384:
+		memcpy(ctx->X, SKEIN1024_IV_384, sizeof(ctx->X));
+		break;
+	case 1024:
+		memcpy(ctx->X, SKEIN1024_IV_1024, sizeof(ctx->X));
+		break;
 #endif
-		default:
-			/* here if there is no precomputed IV value available */
-			/* build/process the config block, type == CONFIG (could be precomputed) */
-			Skein_Start_New_Type(ctx,CFG_FINAL);        /* set tweaks: T0=0; T1=CFG | FINAL */
-
-			cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);  /* set the schema, version */
-			cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
-			cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
-			memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
-
-			/* compute the initial chaining values from config block */
-			memset(ctx->X,0,sizeof(ctx->X));            /* zero the chaining variables */
-			Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
-			break;
-		}
+	default:
+		/* here if there is no precomputed IV value available */
+		/* build/process the config block, type == CONFIG (could be precomputed) */
+		Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */
+
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */
+		cfg.w[1] = Skein_Swap64(hashBitLen);	   /* hash result length in bits */
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero pad the config block after its first three words */
+
+		/* compute the initial chaining values from config block */
+		memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */
+		Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
 
 	/* The chaining vars ctx->X are now initialized for the given hashBitLen. */
 	/* Set up to process the data message portion of the hash (default) */
-	Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+	Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 #if 0
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
@@ -1687,93 +1842,93 @@ static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process the input bytes */
-static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
-	{
+static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt)
+{
 	size_t n;
 
-	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
 	/* process full blocks, if any */
-	if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES)
+	{
+		if(ctx->h.bCnt) /* finish up any buffered message data */
 		{
-		if (ctx->h.bCnt)                              /* finish up any buffered message data */
+			n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */
+			if(n)
 			{
-			n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
-			if (n)
-				{
-				Skein_assert(n < msgByteCnt);         /* check on our logic here */
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				Skein_assert(n < msgByteCnt); /* logic check: msgByteCnt + bCnt > BLOCK_BYTES was tested above, hence msgByteCnt > n */
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
-				}
+			}
 			Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
-			Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES);
+			Skein1024_Process_Block(ctx, ctx->b, 1, SKEIN1024_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
-			}
+		}
 		/* now process any remaining full blocks, directly from input message data */
-		if (msgByteCnt > SKEIN1024_BLOCK_BYTES)
-			{
-			n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES;   /* number of full blocks to process */
-			Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES);
+		if(msgByteCnt > SKEIN1024_BLOCK_BYTES)
+		{
+			n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES; /* number of full blocks to process now; the -1 leaves 1..BLOCK_BYTES bytes to be buffered in b[] below */
+			Skein1024_Process_Block(ctx, msg, n, SKEIN1024_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN1024_BLOCK_BYTES;
-			msg        += n * SKEIN1024_BLOCK_BYTES;
-			}
-		Skein_assert(ctx->h.bCnt == 0);
+			msg += n * SKEIN1024_BLOCK_BYTES;
 		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
 
 	/* copy any remaining source message data bytes into b[] */
-	if (msgByteCnt)
-		{
+	if(msgByteCnt)
+	{
 		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
-		}
+	}
 
 	return SKEIN_SUCCESS;
-	}
+}
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize the hash computation and output the result */
-static int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
-	{
-	size_t i,n,byteCnt;
+static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal)
+{
+	size_t i, n, byteCnt;
 	u64b_t X[SKEIN1024_STATE_WORDS];
-	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */
 
-	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
-	if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)            /* zero pad b[] if necessary */
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;		/* tag as the final block */
+	if(ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
 
-	Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+	Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */
 
 	/* now output the result */
-	byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes (hashBitLen rounded up to whole bytes) */
 
 	/* run Threefish in "counter mode" to generate output */
-	memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
-	memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
-	for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)
-		{
-		((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
-		n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
-		if (n >= SKEIN1024_BLOCK_BYTES)
-			n  = SKEIN1024_BLOCK_BYTES;
-		Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
-		Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
-		}
-	return SKEIN_SUCCESS;
+	memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */
+	memcpy(X, ctx->X, sizeof(X));	  /* keep a local copy of counter mode "key" */
+	for(i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++)
+	{
+		((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block: the counter is the output block index */
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein1024_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */
+		n = byteCnt - i * SKEIN1024_BLOCK_BYTES;				 /* number of output bytes left to go */
+		if(n >= SKEIN1024_BLOCK_BYTES)
+			n = SKEIN1024_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */
+		Skein_Show_Final(1024, &ctx->h, n, hashVal + i * SKEIN1024_BLOCK_BYTES);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
+	return SKEIN_SUCCESS;
+}
 
 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 static size_t Skein1024_API_CodeSize(void)
-	{
-	return ((u08b_t *) Skein1024_API_CodeSize) -
-		   ((u08b_t *) Skein1024_Init);
-	}
+{
+	return ((u08b_t*)Skein1024_API_CodeSize) -
+		   ((u08b_t*)Skein1024_Init); /* byte distance from Skein1024_Init to this function; NOTE(review): equals the API code size only if the functions are laid out in source order -- confirm for the toolchain */
+}
 #endif
 
 /**************** Functions to support MAC/tree hashing ***************/
@@ -1828,7 +1983,6 @@ static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
 	return SKEIN_SUCCESS;
 	}
 
-
 #if SKEIN_TREE_HASH
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* just do the OUTPUT stage                                       */
@@ -1921,116 +2075,126 @@ static int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
 
 typedef struct
 {
-  uint_t  statebits;                      /* 256, 512, or 1024 */
-  union
-  {
-	Skein_Ctxt_Hdr_t h;                 /* common header "overlay" */
-	Skein_256_Ctxt_t ctx_256;
-	Skein_512_Ctxt_t ctx_512;
-	Skein1024_Ctxt_t ctx1024;
-  } u;
-}
-hashState;
+	uint_t statebits; /* 256, 512, or 1024 */
+	union {
+		Skein_Ctxt_Hdr_t h; /* common header "overlay" */
+		Skein_256_Ctxt_t ctx_256;
+		Skein_512_Ctxt_t ctx_512;
+		Skein1024_Ctxt_t ctx1024;
+	} u;
+} hashState;
 
 /* "incremental" hashing API */
-static SkeinHashReturn Init  (hashState *state, int hashbitlen);
-static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen);
-static SkeinHashReturn Final (hashState *state,       SkeinBitSequence *hashval);
+static SkeinHashReturn Init(hashState* state, int hashbitlen);
+static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen);
+static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval);
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* select the context size and init the context */
-static SkeinHashReturn Init(hashState *state, int hashbitlen)
+static SkeinHashReturn Init(hashState* state, int hashbitlen)
 {
 #if SKEIN_256_NIST_MAX_HASH_BITS
-  if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
-  {
-	Skein_Assert(hashbitlen > 0,BAD_HASHLEN);
-	state->statebits = 64*SKEIN_256_STATE_WORDS;
-	return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen);
-  }
-#endif
-  if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
-  {
-	state->statebits = 64*SKEIN_512_STATE_WORDS;
-	return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen);
-  }
-  else
-  {
-	state->statebits = 64*SKEIN1024_STATE_WORDS;
-	return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen);
-  }
+	if(hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS)
+	{
+		Skein_Assert(hashbitlen > 0, BAD_HASHLEN);
+		state->statebits = 64 * SKEIN_256_STATE_WORDS;
+		return Skein_256_Init(&state->u.ctx_256, (size_t)hashbitlen);
+	}
+#endif
+	if(hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS)
+	{
+		state->statebits = 64 * SKEIN_512_STATE_WORDS;
+		return Skein_512_Init(&state->u.ctx_512, (size_t)hashbitlen);
+	}
+	else
+	{
+		state->statebits = 64 * SKEIN1024_STATE_WORDS;
+		return Skein1024_Init(&state->u.ctx1024, (size_t)hashbitlen);
+	}
 }
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* process data to be hashed */
-static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen)
+static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen)
 {
-  /* only the final Update() call is allowed do partial bytes, else assert an error */
-  Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL);
+	/* only the final Update() call is allowed to do partial bytes, else assert an error */
+	Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL);
 
-  Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL);
-  if ((databitlen & 7) == 0)  /* partial bytes? */
-  {
-	switch ((state->statebits >> 8) & 3)
+	Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, SKEIN_FAIL);
+	if((databitlen & 7) == 0) /* whole bytes only (no partial final byte)? */
 	{
-	case 2:  return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3);
-	case 1:  return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3);
-	case 0:  return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3);
-	default: return SKEIN_FAIL;
+		switch((state->statebits >> 8) & 3)
+		{
+		case 2:
+			return Skein_512_Update(&state->u.ctx_512, data, databitlen >> 3);
+		case 1:
+			return Skein_256_Update(&state->u.ctx_256, data, databitlen >> 3);
+		case 0:
+			return Skein1024_Update(&state->u.ctx1024, data, databitlen >> 3);
+		default:
+			return SKEIN_FAIL;
+		}
 	}
-  }
-  else
-  {   /* handle partial final byte */
-	size_t bCnt = (databitlen >> 3) + 1;                  /* number of bytes to handle (nonzero here!) */
-	u08b_t b,mask;
+	else
+	{										 /* handle partial final byte */
+		size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */
+		u08b_t b, mask;
 
-	mask = (u08b_t) (1u << (7 - (databitlen & 7)));       /* partial byte bit mask */
-	b    = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask);   /* apply bit padding on final byte */
+		mask = (u08b_t)(1u << (7 - (databitlen & 7)));		/* partial byte bit mask */
+		b = (u08b_t)((data[bCnt - 1] & (0 - mask)) | mask); /* apply bit padding on final byte */
 
-	switch ((state->statebits >> 8) & 3)
-	{
-	case 2:  Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte    */
-	  Skein_512_Update(&state->u.ctx_512,&b  ,  1   ); /* process the (masked) partial byte */
-	  break;
-	case 1:  Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte    */
-	  Skein_256_Update(&state->u.ctx_256,&b  ,  1   ); /* process the (masked) partial byte */
-	  break;
-	case 0:  Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte    */
-	  Skein1024_Update(&state->u.ctx1024,&b  ,  1   ); /* process the (masked) partial byte */
-	  break;
-	default: return SKEIN_FAIL;
-	}
-	Skein_Set_Bit_Pad_Flag(state->u.h);                    /* set tweak flag for the final call */
+		switch((state->statebits >> 8) & 3)
+		{
+		case 2:
+			Skein_512_Update(&state->u.ctx_512, data, bCnt - 1); /* process all but the final byte    */
+			Skein_512_Update(&state->u.ctx_512, &b, 1);			 /* process the (masked) partial byte */
+			break;
+		case 1:
+			Skein_256_Update(&state->u.ctx_256, data, bCnt - 1); /* process all but the final byte    */
+			Skein_256_Update(&state->u.ctx_256, &b, 1);			 /* process the (masked) partial byte */
+			break;
+		case 0:
+			Skein1024_Update(&state->u.ctx1024, data, bCnt - 1); /* process all but the final byte    */
+			Skein1024_Update(&state->u.ctx1024, &b, 1);			 /* process the (masked) partial byte */
+			break;
+		default:
+			return SKEIN_FAIL;
+		}
+		Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */
 
-	return SKEIN_SUCCESS;
-  }
+		return SKEIN_SUCCESS;
+	}
 }
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* finalize hash computation and output the result (hashbitlen bits) */
-static SkeinHashReturn Final(hashState *state, SkeinBitSequence *hashval)
+static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval)
 {
-  Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
-  switch ((state->statebits >> 8) & 3)
-  {
-  case 2:  return Skein_512_Final(&state->u.ctx_512,hashval);
-  case 1:  return Skein_256_Final(&state->u.ctx_256,hashval);
-  case 0:  return Skein1024_Final(&state->u.ctx1024,hashval);
-  default: return SKEIN_FAIL;
-  }
+	Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, FAIL);
+	switch((state->statebits >> 8) & 3)
+	{
+	case 2:
+		return Skein_512_Final(&state->u.ctx_512, hashval);
+	case 1:
+		return Skein_256_Final(&state->u.ctx_256, hashval);
+	case 0:
+		return Skein1024_Final(&state->u.ctx1024, hashval);
+	default:
+		return SKEIN_FAIL;
+	}
 }
 
 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
 /* all-in-one hash function */
-SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, /* all-in-one call */
-				SkeinDataLength databitlen,SkeinBitSequence *hashval)
+SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data, /* all-in-one call */
+	SkeinDataLength databitlen, SkeinBitSequence* hashval)
 {
-  hashState  state;
-  SkeinHashReturn r = Init(&state,hashbitlen);
-  if (r == SKEIN_SUCCESS)
-  { /* these calls do not fail when called properly */
-	r = Update(&state,data,databitlen);
-	Final(&state,hashval);
-  }
-  return r;
+	hashState state;
+	SkeinHashReturn r = Init(&state, hashbitlen);
+	if(r == SKEIN_SUCCESS)
+	{ /* these calls do not fail when called properly */
+		r = Update(&state, data, databitlen);
+		Final(&state, hashval);
+	}
+	return r;
 }
diff --git a/xmrstak/backend/cpu/crypto/c_skein.h b/xmrstak/backend/cpu/crypto/c_skein.h
index 1aa11dea3561f143cccd849babbbf76a124350cf..52f359e82618a66dba54f5132fbf666818984c78 100644
--- a/xmrstak/backend/cpu/crypto/c_skein.h
+++ b/xmrstak/backend/cpu/crypto/c_skein.h
@@ -1,5 +1,5 @@
 #ifndef _SKEIN_H_
-#define _SKEIN_H_     1
+#define _SKEIN_H_ 1
 /**************************************************************************
 **
 ** Interface declarations and internal definitions for Skein hashing.
@@ -27,21 +27,20 @@
 **                                1: return SKEIN_FAIL to flag errors
 **
 ***************************************************************************/
-#include "skein_port.h"                      /* get platform-specific definitions */
+#include "skein_port.h" /* get platform-specific definitions */
 
 typedef enum
 {
-  SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
-  SKEIN_FAIL            =      1,
-  SKEIN_BAD_HASHLEN     =      2
-}
-SkeinHashReturn;
+	SKEIN_SUCCESS = 0, /* return codes from Skein calls */
+	SKEIN_FAIL = 1,
+	SKEIN_BAD_HASHLEN = 2
+} SkeinHashReturn;
 
-typedef uint32_t SkeinDataLength;                /* bit count  type */
-typedef u08b_t   SkeinBitSequence;               /* bit stream type */
+typedef uint32_t SkeinDataLength; /* bit count  type */
+typedef u08b_t SkeinBitSequence;  /* bit stream type */
 
 /* "all-in-one" call */
-SkeinHashReturn skein_hash(int hashbitlen,   const SkeinBitSequence *data,
-                           SkeinDataLength databitlen, SkeinBitSequence *hashval);
+SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data,
+	SkeinDataLength databitlen, SkeinBitSequence* hashval);
 
-#endif  /* ifndef _SKEIN_H_ */
+#endif /* ifndef _SKEIN_H_ */
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
index 5844d381461915090db552d22a47bf1f40b2ed10..2d333d1181ad04939641ac3b9b39f9d3050ed729 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu.hpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
@@ -4,8 +4,8 @@
 #include <stdint.h>
 
 #if defined(_WIN32) || defined(_WIN64)
-#include <malloc.h>
 #include <intrin.h>
+#include <malloc.h>
 #define HAS_WIN_INTRIN_API
 #endif
 
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
index 79b38373a0260cf332d643b3a04fd6a6b5905249..efded74c8f04734afc8686f6dcab8d40ec19d03d 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
@@ -1,12 +1,12 @@
-#include "cn_gpu.hpp"
 #include "../../cryptonight.hpp"
+#include "cn_gpu.hpp"
 
-#pragma GCC target ("avx2")
+#pragma GCC target("avx2")
 #ifndef _mm256_bslli_epi128
-	#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
 #endif
 #ifndef _mm256_bsrli_epi128
-	#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
 #endif
 
 inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01)
@@ -67,7 +67,7 @@ inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2,
 // 112×4 = 448
 template <bool add>
 inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
-							  float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
+	float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
 {
 	__m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1);
 	__m256 r = _mm256_setzero_ps();
@@ -92,7 +92,7 @@ inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256&
 
 template <size_t rot>
 inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3,
-								float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
+	float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
 {
 	__m256i r = double_comupte<rot % 2 != 0>(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum);
 	if(rot != 0)
@@ -101,9 +101,7 @@ inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256
 	out = _mm256_xor_si256(out, r);
 }
 
-
-inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n*16); }
-
+inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n * 16); }
 
 void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo)
 {
@@ -155,7 +153,7 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& al
 		sum1 = _mm256_add_ps(suma, sumb);
 
 		out2 = _mm256_xor_si256(out2, out);
-		out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2);
+		out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2, out2, 0x41), out2);
 		suma = _mm256_permute2f128_ps(sum0, sum1, 0x30);
 		sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21);
 		sum0 = _mm256_add_ps(suma, sumb);
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
index c8627d8b848adf3fb041ed2ebe9c2196f4dc1dbe..d65d9651ebd3b795268bab15ca4a824166033322 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
@@ -1,7 +1,7 @@
-#include "cn_gpu.hpp"
 #include "../../cryptonight.hpp"
+#include "cn_gpu.hpp"
 
-#pragma GCC target ("sse2")
+#pragma GCC target("sse2")
 
 inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
 {
@@ -21,13 +21,13 @@ inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c,
 {
 	n1 = _mm_add_ps(n1, c);
 	__m128 nn = _mm_mul_ps(n0, c);
-	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn));
 	nn = fma_break(nn);
 	n = _mm_add_ps(n, nn);
 
 	n3 = _mm_sub_ps(n3, c);
 	__m128 dd = _mm_mul_ps(n2, c);
-	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd));
 	dd = fma_break(dd);
 	d = _mm_add_ps(d, dd);
 
@@ -57,12 +57,12 @@ inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd
 	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
 	d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d);
 	d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d);
-	r =_mm_add_ps(r, _mm_div_ps(n,d));
+	r = _mm_add_ps(r, _mm_div_ps(n, d));
 }
 
 // 112×4 = 448
-template<bool add>
-inline __m128i single_comupte(__m128 n0, __m128 n1,  __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+template <bool add>
+inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
 {
 	__m128 c = _mm_set1_ps(cnt);
 	__m128 r = _mm_setzero_ps();
@@ -85,8 +85,8 @@ inline __m128i single_comupte(__m128 n0, __m128 n1,  __m128 n2,  __m128 n3, floa
 	return _mm_cvttps_epi32(r);
 }
 
-template<size_t rot>
-inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+template <size_t rot>
+inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
 {
 	__m128i r = single_comupte<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
 	if(rot != 0)
@@ -94,7 +94,7 @@ inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, flo
 	out = _mm_xor_si128(out, r);
 }
 
-inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n*16); }
+inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n * 16); }
 
 void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo)
 {
diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h
index 488805ec05516c07118f3bc4d652d0ca2eac66f1..c8b8320b0474349a6b89ac413e879f91ba63c09e 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight.h
@@ -1,6 +1,6 @@
 #pragma once
-#include <stddef.h>
 #include <inttypes.h>
+#include <stddef.h>
 
 #include "variant4_random_math.h"
 
@@ -12,8 +12,8 @@
 
 struct cryptonight_ctx;
 
-typedef void  (*cn_mainloop_fun)(cryptonight_ctx *ctx);
-typedef void  (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*);
+typedef void (*cn_mainloop_fun)(cryptonight_ctx* ctx);
+typedef void (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*);
 typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size);
@@ -36,11 +36,9 @@ struct cryptonight_ctx
 	int asm_version = 0;
 	xmrstak_algo last_algo = invalid_algo;
 
-	union
-	{
+	union {
 		extra_ctx_r cn_r_ctx;
 	};
-
 };
 
 struct alloc_msg
@@ -51,5 +49,3 @@ struct alloc_msg
 size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
 cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg);
 void cryptonight_free_ctx(cryptonight_ctx* ctx);
-
-
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index d7316b25e4ac2a783b590a7598f0b3aa16afcc49..6c9e3390cbe24e142ab4c1c6ea9e8c338ac865c0 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -15,22 +15,24 @@
   */
 #pragma once
 
-#include "cryptonight.h"
-#include "xmrstak/backend/cryptonight.hpp"
 #include "../../miner_work.hpp"
 #include "cn_gpu.hpp"
+#include "cryptonight.h"
+#include "xmrstak/backend/cryptonight.hpp"
+#include <cfenv>
 #include <memory.h>
 #include <stdio.h>
-#include <cfenv>
 #include <utility>
 
 #ifdef _WIN64
-#	include <winsock2.h>
-#	include <windows.h>
-#	include <ntsecapi.h>
-#	include <tchar.h>
+#include <winsock2.h>
+// this comment disables clang-format include reordering
+#include <ntsecapi.h>
+#include <tchar.h>
+// this comment disables clang-format include reordering for windows.h
+#include <windows.h>
 #else
-#	include <sys/mman.h>
+#include <sys/mman.h>
 #endif
 
 #ifdef __GNUC__
@@ -54,9 +56,9 @@ static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi)
 
 extern "C"
 {
-	void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen);
+	void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen);
 	void keccakf(uint64_t st[25], int rounds);
-	extern void(*const extra_hashes[4])(const void *, uint32_t, char *);
+	extern void (*const extra_hashes[4])(const void*, uint32_t, char*);
 }
 
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
@@ -73,7 +75,7 @@ static inline __m128i sl_xor(__m128i tmp1)
 	return tmp1;
 }
 
-template<uint8_t rcon>
+template <uint8_t rcon>
 static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2)
 {
 	__m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon);
@@ -98,14 +100,14 @@ static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t r
 	*xout2 = _mm_xor_si128(*xout2, xout1);
 }
 
-template<bool SOFT_AES>
+template <bool SOFT_AES>
 static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3,
 	__m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
 {
 	__m128i xout0, xout2;
 
 	xout0 = _mm_load_si128(memory);
-	xout2 = _mm_load_si128(memory+1);
+	xout2 = _mm_load_si128(memory + 1);
 	*k0 = xout0;
 	*k1 = xout2;
 
@@ -175,7 +177,7 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3
 	x7 = _mm_xor_si128(x7, tmp0);
 }
 
-template<bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
+template <bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
 void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo)
 {
 	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast;
@@ -197,7 +199,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 
 	if(HEAVY_MIX)
 	{
-		for(size_t i=0; i < 16; i++)
+		for(size_t i = 0; i < 16; i++)
 		{
 			if(SOFT_AES)
 			{
@@ -230,7 +232,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 	}
 
 	const size_t MEM = algo.Mem();
-	for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+	for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 	{
 		if(SOFT_AES)
 		{
@@ -277,29 +279,29 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 	}
 }
 
-template<bool PREFETCH, xmrstak_algo_id ALGO>
+template <bool PREFETCH, xmrstak_algo_id ALGO>
 void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrstak_algo& algo)
 {
 	constexpr size_t hash_size = 200; // 25x8 bytes
 	alignas(128) uint64_t hash[25];
 	const size_t mem = algo.Mem();
 
-	for (uint64_t i = 0; i < mem / 512; i++)
+	for(uint64_t i = 0; i < mem / 512; i++)
 	{
 		memcpy(hash, input, hash_size);
 		hash[0] ^= i;
 
 		keccakf(hash, 24);
 		memcpy(output, hash, 160);
-		output+=160;
+		output += 160;
 
 		keccakf(hash, 24);
 		memcpy(output, hash, 176);
-		output+=176;
+		output += 176;
 
 		keccakf(hash, 24);
 		memcpy(output, hash, 176);
-		output+=176;
+		output += 176;
 
 		if(PREFETCH)
 		{
@@ -311,11 +313,11 @@ void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrs
 	}
 }
 
-template<bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
+template <bool SOFT_AES, bool PREFETCH, xmrstak_algo_id ALGO>
 void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo)
 {
 	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
-		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu;
+							   ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu;
 
 	// This is more than we have registers, compiler will assign 2 keys on the stack
 	__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
@@ -333,7 +335,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 	xout7 = _mm_load_si128(output + 11);
 
 	const size_t MEM = algo.Mem();
-	for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+	for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 	{
 		if(PREFETCH)
 			_mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
@@ -384,7 +386,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 
 	if(HEAVY_MIX)
 	{
-		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+		for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 		{
 			if(PREFETCH)
 				_mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
@@ -433,7 +435,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_
 				mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 		}
 
-		for(size_t i=0; i < 16; i++)
+		for(size_t i = 0; i < 16; i++)
 		{
 			if(SOFT_AES)
 			{
@@ -494,7 +496,8 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
 #else
 	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
 	// Fallback to simpler code
-	if (x2 < n0) ++r;
+	if(x2 < n0)
+		++r;
 #endif
 	return r;
 }
@@ -505,7 +508,7 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 	alignas(16) uint32_t x[4];
 	_mm_store_si128((__m128i*)k, key);
 	_mm_store_si128((__m128i*)x, _mm_xor_si128(val, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))); // x = ~val
-	#define BYTE(p, i) ((unsigned char*)&p)[i]
+#define BYTE(p, i) ((unsigned char*)&p)[i]
 	k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
 	x[0] ^= k[0];
 	k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
@@ -513,11 +516,11 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 	k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
 	x[2] ^= k[2];
 	k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
-	#undef BYTE
+#undef BYTE
 	return _mm_load_si128((__m128i*)k);
 }
 
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 {
 	mem_out[0] = _mm_cvtsi128_si64(tmp);
@@ -541,7 +544,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 
 		mem_out[1] = vh;
 	}
-
 }
 
 /** optimal type for sqrt
@@ -550,18 +552,18 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
  *
  * @tparam N number of hashes per thread
  */
-template<size_t N>
+template <size_t N>
 struct GetOptimalSqrtType
 {
 	using type = __m128i;
 };
 
-template<>
+template <>
 struct GetOptimalSqrtType<1u>
 {
 	using type = uint64_t;
 };
-template<size_t N>
+template <size_t N>
 using GetOptimalSqrtType_t = typename GetOptimalSqrtType<N>::type;
 
 /** assign a value and convert if necessary
@@ -625,273 +627,275 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 	cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc));
 }
 
-#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \
-	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx)                              \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */              \
 	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
-		if (ALGO == cryptonight_r) \
-			cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \
-	} \
-	if(ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	{                                                                                       \
+		const uint64_t idx1 = idx0 & MASK;                                                  \
+		const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]);                  \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                  \
+		const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));            \
+		if(ALGO == cryptonight_r)                                                           \
+			cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2));   \
+	}                                                                                       \
+	if(ALGO == cryptonight_v8_reversewaltz)                                                 \
+	{                                                                                       \
+		const uint64_t idx1 = idx0 & MASK;                                                  \
+		const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]);                  \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                  \
+		const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));            \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));            \
 	}
 
-#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
-	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		hi ^= ((uint64_t*)&chunk2)[0]; \
-		lo ^= ((uint64_t*)&chunk2)[1]; \
-		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
-	} \
-	if(ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		const uint64_t idx1 = idx0 & MASK; \
-		const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
-		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
-		hi ^= ((uint64_t*)&chunk2)[0]; \
-		lo ^= ((uint64_t*)&chunk2)[1]; \
-		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
-		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi)                                                \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */                                    \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow)                                                \
+	{                                                                                                             \
+		const uint64_t idx1 = idx0 & MASK;                                                                        \
+		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                                        \
+		hi ^= ((uint64_t*)&chunk2)[0];                                                                            \
+		lo ^= ((uint64_t*)&chunk2)[1];                                                                            \
+		const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                                        \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));                                  \
+	}                                                                                                             \
+	if(ALGO == cryptonight_v8_reversewaltz)                                                                       \
+	{                                                                                                             \
+		const uint64_t idx1 = idx0 & MASK;                                                                        \
+		const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]);                                        \
+		hi ^= ((uint64_t*)&chunk2)[0];                                                                            \
+		lo ^= ((uint64_t*)&chunk2)[1];                                                                            \
+		const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]);                                        \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0));                                  \
+		_mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0));                                  \
 	}
 
-#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		uint64_t sqrt_result_tmp; \
-		assign(sqrt_result_tmp, sqrt_result); \
-		/* Use division and square root results from the _previous_ iteration to hide the latency */ \
-		const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
-		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \
-		const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \
-		/* Most and least significant bits in the divisor are set to 1 \
-		 * to make sure we don't divide by a small or even number, \
-		 * so there are no shortcuts for such cases \
-		 * \
-		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \
-		 * We drop the highest bit to fit both quotient and remainder in 32 bits \
-		 */  \
-		/* Compiler will optimize it to a single div instruction */ \
-		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
-		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
-		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl)                                            \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)                                     \
+	{                                                                                                            \
+		uint64_t sqrt_result_tmp;                                                                                \
+		assign(sqrt_result_tmp, sqrt_result);                                                                    \
+		/* Use division and square root results from the _previous_ iteration to hide the latency */             \
+		const uint64_t cx_64 = _mm_cvtsi128_si64(cx);                                                            \
+		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32);           \
+		const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL;                                      \
+		/* Most and least significant bits in the divisor are set to 1                                           \
+		 * to make sure we don't divide by a small or even number,                                               \
+		 * so there are no shortcuts for such cases                                                              \
+		 *                                                                                                       \
+		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4                             \
+		 * We drop the highest bit to fit both quotient and remainder in 32 bits                                 \
+		 */                                                                                                      \
+		/* Compiler will optimize it to a single div instruction */                                              \
+		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8));                                          \
+		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32);                   \
+		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result));                          \
 		/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
-		assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \
+		assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result));                             \
 	}
 
-#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \
-	if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \
-	{ \
+#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data)                                   \
+	if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow)                                     \
+	{                                                                                          \
 		cl ^= (cn_r_data[0] + cn_r_data[1]) | ((uint64_t)(cn_r_data[2] + cn_r_data[3]) << 32); \
-		cn_r_data[4] = static_cast<uint32_t>(al); \
-		cn_r_data[5] = static_cast<uint32_t>(ah); \
-		cn_r_data[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0)); \
-		cn_r_data[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1)); \
-		cn_r_data[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
-		v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \
-	} \
-	if (ALGO == cryptonight_r) \
-	{ \
-		al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \
-		ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \
+		cn_r_data[4] = static_cast<uint32_t>(al);                                              \
+		cn_r_data[5] = static_cast<uint32_t>(ah);                                              \
+		cn_r_data[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0));                          \
+		cn_r_data[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1));                          \
+		cn_r_data[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8)));       \
+		v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data);                                      \
+	}                                                                                          \
+	if(ALGO == cryptonight_r)                                                                  \
+	{                                                                                          \
+		al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32);                                 \
+		ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32);                                 \
 	}
 
-#define CN_INIT_SINGLE \
+#define CN_INIT_SINGLE                                                                                                                                                                                 \
 	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \
-	{ \
-		memset(output, 0, 32 * N); \
-		return; \
+	{                                                                                                                                                                                                  \
+		memset(output, 0, 32 * N);                                                                                                                                                                     \
+		return;                                                                                                                                                                                        \
 	}
 
-#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \
-	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
-	uint64_t monero_const; \
+#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data)                                                                   \
+	keccak((const uint8_t*)input + len * n, len, ctx[n]->hash_state, 200);                                                                                                               \
+	uint64_t monero_const;                                                                                                                                                               \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-	{ \
-		monero_const =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len * n + 35); \
-		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24); \
-	} \
-	/* Optim - 99% time boundary */ \
-	cn_explode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \
-	\
-	__m128i ax0; \
-	uint64_t idx0; \
-	__m128i bx0; \
-	uint8_t* l0 = ctx[n]->long_state; \
-	/* BEGIN cryptonight_monero_v8 variables */ \
-	__m128i bx1; \
-	__m128i division_result_xmm; \
-	__m128 conc_var; \
-	if(ALGO == cryptonight_conceal) \
-	{\
-		set_float_rounding_mode_nearest(); \
-		conc_var = _mm_setzero_ps(); \
-	}\
-	GetOptimalSqrtType_t<N> sqrt_result; \
-	uint32_t cn_r_data[9]; \
-	/* END cryptonight_monero_v8 variables */ \
-	{ \
-		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
-		idx0 = h0[0] ^ h0[4]; \
-		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
-		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
-		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \
-		{ \
-			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
-			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
-			assign(sqrt_result, h0[13]); \
-			set_float_rounding_mode(); \
-		} \
-		if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \
-		{ \
-			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
-			cn_r_data[0] = (uint32_t)(h0[12]); \
-			cn_r_data[1] = (uint32_t)(h0[12] >> 32); \
-			cn_r_data[2] = (uint32_t)(h0[13]); \
-			cn_r_data[3] = (uint32_t)(h0[13] >> 32); \
-		} \
-	} \
-	__m128i *ptr0
+	{                                                                                                                                                                                    \
+		monero_const = *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len * n + 35);                                                                       \
+		monero_const ^= *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24);                                                                                                   \
+	}                                                                                                                                                                                    \
+	/* Optim - 99% time boundary */                                                                                                                                                      \
+	cn_explode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo);                                                                   \
+                                                                                                                                                                                         \
+	__m128i ax0;                                                                                                                                                                         \
+	uint64_t idx0;                                                                                                                                                                       \
+	__m128i bx0;                                                                                                                                                                         \
+	uint8_t* l0 = ctx[n]->long_state;                                                                                                                                                    \
+	/* BEGIN cryptonight_monero_v8 variables */                                                                                                                                          \
+	__m128i bx1;                                                                                                                                                                         \
+	__m128i division_result_xmm;                                                                                                                                                         \
+	__m128 conc_var;                                                                                                                                                                     \
+	if(ALGO == cryptonight_conceal)                                                                                                                                                      \
+	{                                                                                                                                                                                    \
+		set_float_rounding_mode_nearest();                                                                                                                                               \
+		conc_var = _mm_setzero_ps();                                                                                                                                                     \
+	}                                                                                                                                                                                    \
+	GetOptimalSqrtType_t<N> sqrt_result;                                                                                                                                                 \
+	uint32_t cn_r_data[9];                                                                                                                                                               \
+	/* END cryptonight_monero_v8 variables */                                                                                                                                            \
+	{                                                                                                                                                                                    \
+		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state;                                                                                                                                    \
+		idx0 = h0[0] ^ h0[4];                                                                                                                                                            \
+		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0);                                                                                                                                       \
+		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);                                                                                                                              \
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)                                                                                                         \
+		{                                                                                                                                                                                \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);                                                                                                                        \
+			division_result_xmm = _mm_cvtsi64_si128(h0[12]);                                                                                                                             \
+			assign(sqrt_result, h0[13]);                                                                                                                                                 \
+			set_float_rounding_mode();                                                                                                                                                   \
+		}                                                                                                                                                                                \
+		if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow)                                                                                                                           \
+		{                                                                                                                                                                                \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);                                                                                                                        \
+			cn_r_data[0] = (uint32_t)(h0[12]);                                                                                                                                           \
+			cn_r_data[1] = (uint32_t)(h0[12] >> 32);                                                                                                                                     \
+			cn_r_data[2] = (uint32_t)(h0[13]);                                                                                                                                           \
+			cn_r_data[3] = (uint32_t)(h0[13] >> 32);                                                                                                                                     \
+		}                                                                                                                                                                                \
+	}                                                                                                                                                                                    \
+	__m128i* ptr0
 
 #define CN_STEP1(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1) \
-	__m128i cx; \
-	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-	cx = _mm_load_si128(ptr0); \
-	if (ALGO == cryptonight_conceal) \
-		cryptonight_conceal_tweak(cx, conc_var); \
-	if (ALGO == cryptonight_bittube2) \
-	{ \
-		cx = aes_round_bittube2(cx, ax0); \
-	} \
-	else \
-	{ \
-		if(SOFT_AES) \
-			cx = soft_aesenc(cx, ax0); \
-		else \
-			cx = _mm_aesenc_si128(cx, ax0); \
-	} \
+	__m128i cx;                                                                \
+	ptr0 = (__m128i*)&l0[idx0 & MASK];                                         \
+	cx = _mm_load_si128(ptr0);                                                 \
+	if(ALGO == cryptonight_conceal)                                            \
+		cryptonight_conceal_tweak(cx, conc_var);                               \
+	if(ALGO == cryptonight_bittube2)                                           \
+	{                                                                          \
+		cx = aes_round_bittube2(cx, ax0);                                      \
+	}                                                                          \
+	else                                                                       \
+	{                                                                          \
+		if(SOFT_AES)                                                           \
+			cx = soft_aesenc(cx, ax0);                                         \
+		else                                                                   \
+			cx = _mm_aesenc_si128(cx, ax0);                                    \
+	}                                                                          \
 	CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx)
 
-#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) /* store bx0 ^ cx at ptr0, then re-point idx0/ptr0 using cx */                                                           \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \
-	else \
-		_mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \
-	idx0 = _mm_cvtsi128_si64(cx); \
-	\
-	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-	if(PREFETCH) \
-		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
-	if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz) \
-		bx0 = cx
+		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr0, _mm_xor_si128(bx0, cx));                                                                                                         \
+	else                                                                                                                                                                                 \
+		_mm_store_si128((__m128i*)ptr0, _mm_xor_si128(bx0, cx));                                                                                                                         \
+	idx0 = _mm_cvtsi128_si64(cx); /* low 64 bits of cx */                                                                                                                                \
+                                                                                                                                                                                         \
+	ptr0 = (__m128i*)&l0[idx0 & MASK];                                                                                                                                                   \
+	if(PREFETCH)                                                                                                                                                                         \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0);                                                                                                                                    \
+	if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz)                                                       \
+		bx0 = cx
 
 #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data) \
-	uint64_t lo, cl, ch; \
-	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
-	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
-	cl = ((uint64_t*)ptr0)[0]; \
-	ch = ((uint64_t*)ptr0)[1]; \
-	CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \
-	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
-	{ \
-		uint64_t hi; \
-		lo = _umul128(idx0, cl, &hi); \
-		if(ALGO == cryptonight_r) \
-		{ \
-			CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \
-		} \
-		else \
-		{ \
-			CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \
-		} \
-		ah0 += lo; \
-		al0 += hi; \
-	} \
-	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz) \
-	{ \
-		bx1 = bx0; \
-		bx0 = cx; \
-	} \
-	((uint64_t*)ptr0)[0] = al0; \
-	if(PREFETCH) \
-		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
-
-#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
-	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-	{ \
-		if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
-			((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \
-		else \
-			((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \
-	} \
-	else \
-		((uint64_t*)ptr0)[1] = ah0; \
-	al0 ^= cl; \
-	ah0 ^= ch; \
-	ax0 = _mm_set_epi64x(ah0, al0); \
+	uint64_t lo, cl, ch;                                                                                                                \
+	uint64_t al0 = _mm_cvtsi128_si64(ax0);                                                                                              \
+	uint64_t ah0 = ((uint64_t*)&ax0)[1];                                                                                                \
+	cl = ((uint64_t*)ptr0)[0];                                                                                                          \
+	ch = ((uint64_t*)ptr0)[1];                                                                                                          \
+	CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data);                                                                             \
+	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl);                                                                      \
+	{                                                                                                                                   \
+		uint64_t hi;                                                                                                                    \
+		lo = _umul128(idx0, cl, &hi);                                                                                                   \
+		if(ALGO == cryptonight_r)                                                                                                       \
+		{                                                                                                                               \
+			CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx);                                                                     \
+		}                                                                                                                               \
+		else                                                                                                                            \
+		{                                                                                                                               \
+			CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi);                                                                 \
+		}                                                                                                                               \
+		ah0 += lo;                                                                                                                      \
+		al0 += hi;                                                                                                                      \
+	}                                                                                                                                   \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz)      \
+	{                                                                                                                                   \
+		bx1 = bx0;                                                                                                                      \
+		bx0 = cx;                                                                                                                       \
+	}                                                                                                                                   \
+	((uint64_t*)ptr0)[0] = al0;                                                                                                         \
+	if(PREFETCH)                                                                                                                        \
+	_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
+
+#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0)                                                                                                        \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{                                                                                                                                                                                    \
+		if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)                                                                                                                     \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0];                                                                                                            \
+		else                                                                                                                                                                             \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const;                                                                                                                                   \
+	}                                                                                                                                                                                    \
+	else                                                                                                                                                                                 \
+		((uint64_t*)ptr0)[1] = ah0;                                                                                                                                                      \
+	al0 ^= cl;                                                                                                                                                                           \
+	ah0 ^= ch;                                                                                                                                                                           \
+	ax0 = _mm_set_epi64x(ah0, al0);                                                                                                                                                      \
 	idx0 = al0;
 
-#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
-	{ \
-		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-		int64_t u  = ((int64_t*)ptr0)[0]; \
-		int32_t d  = ((int32_t*)ptr0)[2]; \
-		int64_t q = u / (d | 0x5); \
-		\
-		((int64_t*)ptr0)[0] = u ^ q; \
-		idx0 = d ^ q; \
-	} \
+#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0)             \
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)       \
+	{                                                                   \
+		ptr0 = (__m128i*)&l0[idx0 & MASK];                              \
+		int64_t u = ((int64_t*)ptr0)[0];                                \
+		int32_t d = ((int32_t*)ptr0)[2];                                \
+		int64_t q = u / (d | 0x5);                                      \
+                                                                        \
+		((int64_t*)ptr0)[0] = u ^ q;                                    \
+		idx0 = d ^ q;                                                   \
+	}                                                                   \
 	else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \
-	{ \
-		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
-		int64_t u  = ((int64_t*)ptr0)[0]; \
-		int32_t d  = ((int32_t*)ptr0)[2]; \
-		int64_t q = u / (d | 0x5); \
-		\
-		((int64_t*)ptr0)[0] = u ^ q; \
-		idx0 = (~d) ^ q; \
+	{                                                                   \
+		ptr0 = (__m128i*)&l0[idx0 & MASK];                              \
+		int64_t u = ((int64_t*)ptr0)[0];                                \
+		int32_t d = ((int32_t*)ptr0)[2];                                \
+		int64_t q = u / (d | 0x5);                                      \
+                                                                        \
+		((int64_t*)ptr0)[0] = u ^ q;                                    \
+		idx0 = (~d) ^ q;                                                \
 	}
 
-#define CN_FINALIZE(n) \
-	/* Optim - 90% time boundary */ \
+#define CN_FINALIZE(n)                                                                                                 \
+	/* Optim - 90% time boundary */                                                                                    \
 	cn_implode_scratchpad<SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state, algo); \
-	/* Optim - 99% time boundary */ \
-	keccakf((uint64_t*)ctx[n]->hash_state, 24); \
+	/* Optim - 99% time boundary */                                                                                    \
+	keccakf((uint64_t*)ctx[n]->hash_state, 24);                                                                        \
 	extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n)
 
 //! defer the evaluation of an macro
 #ifndef _MSC_VER
-#	define CN_DEFER(...) __VA_ARGS__
+#define CN_DEFER(...) __VA_ARGS__
 #else
-#	define CN_EMPTY(...)
-#	define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
+#define CN_EMPTY(...)
+#define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
 #endif
 
 //! execute the macro f with the passed arguments
-#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__)
+#define CN_EXEC(f, ...) \
+	CN_DEFER(f)         \
+	(__VA_ARGS__)
 
 /** add append n to all arguments and keeps n as first argument
  *
@@ -904,22 +908,22 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
  * @endcode
  */
 #define CN_ENUM_0(n, ...) n
-#define CN_ENUM_1(n, x1) n, x1 ## n
-#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n
-#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n
-#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n
-#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n
-#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n
-#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n
-#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n
-#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n
-#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
-#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
-#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
-#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n
-#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n
-#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n
-#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n, x16 ## n
+#define CN_ENUM_1(n, x1) n, x1##n
+#define CN_ENUM_2(n, x1, x2) n, x1##n, x2##n
+#define CN_ENUM_3(n, x1, x2, x3) n, x1##n, x2##n, x3##n
+#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1##n, x2##n, x3##n, x4##n
+#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1##n, x2##n, x3##n, x4##n, x5##n
+#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n
+#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n
+#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n
+#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n
+#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n
+#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n
+#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n
+#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n
+#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n
+#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n
+#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n, x16##n
 
 /** repeat a macro call multiple times
  *
@@ -933,21 +937,35 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
  * f(0, foo0, bar); f(1, foo1, bar1)
  * @endcode
  */
-#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__))
-#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__))
-#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__))
-#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__))
-#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__))
-
-template< size_t N>
+#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__))
+#define REPEAT_2(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__))
+#define REPEAT_3(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__))
+#define REPEAT_4(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__))
+#define REPEAT_5(n, f, ...)                  \
+	CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__)); \
+	CN_EXEC(f, CN_ENUM_##n(4, __VA_ARGS__))
+
+template <size_t N>
 struct Cryptonight_hash;
 
-template< >
+template <>
 struct Cryptonight_hash<1>
 {
 	static constexpr size_t N = 1;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -971,12 +989,12 @@ struct Cryptonight_hash<1>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<2>
 {
 	static constexpr size_t N = 2;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1000,12 +1018,12 @@ struct Cryptonight_hash<2>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<3>
 {
 	static constexpr size_t N = 3;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1029,12 +1047,12 @@ struct Cryptonight_hash<3>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<4>
 {
 	static constexpr size_t N = 4;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1058,12 +1076,12 @@ struct Cryptonight_hash<4>
 	}
 };
 
-template< >
+template <>
 struct Cryptonight_hash<5>
 {
 	static constexpr size_t N = 5;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const uint32_t MASK = algo.Mask();
@@ -1087,26 +1105,25 @@ struct Cryptonight_hash<5>
 	}
 };
 
-extern "C" void  cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
-extern "C" void  cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
 extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
 
-
-template< size_t N, size_t asm_version>
+template <size_t N, size_t asm_version>
 struct Cryptonight_hash_asm
 {
-	template<xmrstak_algo_id ALGO>
+	template <xmrstak_algo_id ALGO>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		for(size_t i = 0; i < N; ++i)
 		{
-			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+			keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200);
 			cn_explode_scratchpad<false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo);
 		}
 		if(ALGO == cryptonight_r)
 		{
 			// API ATTRIBUTE is only required for cryptonight_r
-			typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx *ctx);
+			typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx * ctx);
 			for(size_t i = 0; i < N; ++i)
 				reinterpret_cast<cn_r_mainloop_fun>(ctx[0]->loop_fn)(ctx[i]); // use always loop_fn from ctx[0]!!
 		}
@@ -1126,19 +1143,19 @@ struct Cryptonight_hash_asm
 };
 
 // double hash with specialized asm only for intel
-template< >
+template <>
 struct Cryptonight_hash_asm<2, 0>
 {
 	static constexpr size_t N = 2;
 
-	template<xmrstak_algo_id ALGO>
+	template <xmrstak_algo_id ALGO>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		const size_t MEM = algo.Mem();
 
 		for(size_t i = 0; i < N; ++i)
 		{
-			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+			keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200);
 			/* Optim - 99% time boundary */
 			cn_explode_scratchpad<false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo);
 		}
@@ -1167,89 +1184,90 @@ struct Cryptonight_hash_asm<2, 0>
 namespace
 {
 
-template<typename T, typename U>
+template <typename T, typename U>
 static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask)
 {
-    const uint8_t* p = reinterpret_cast<const uint8_t*>(src);
-
-    // Workaround for Visual Studio placing trampoline in debug builds.
-#   if defined(_MSC_VER)
-    if (p[0] == 0xE9) {
-        p += *(int32_t*)(p + 1) + 5;
-    }
-#   endif
-
-    size_t size = 0;
-    while (*(uint32_t*)(p + size) != 0xDEADC0DE) {
-        ++size;
-    }
-    size += sizeof(uint32_t);
-
-    memcpy((void*) dst, (const void*) src, size);
-
-    uint8_t* patched_data = reinterpret_cast<uint8_t*>(dst);
-    for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) {
-        switch (*(uint32_t*)(patched_data + i)) {
-        case CN_ITER:
-            *(uint32_t*)(patched_data + i) = iterations;
-            break;
-
-        case CN_MASK:
-            *(uint32_t*)(patched_data + i) = mask;
-            break;
-        }
-    }
-}
+	const uint8_t* p = reinterpret_cast<const uint8_t*>(src);
+
+	// Workaround for Visual Studio placing trampoline in debug builds.
+#if defined(_MSC_VER)
+	if(p[0] == 0xE9)
+	{
+		p += *(int32_t*)(p + 1) + 5;
+	}
+#endif
+
+	size_t size = 0;
+	while(*(uint32_t*)(p + size) != 0xDEADC0DE)
+	{
+		++size;
+	}
+	size += sizeof(uint32_t);
+
+	memcpy((void*)dst, (const void*)src, size);
+
+	uint8_t* patched_data = reinterpret_cast<uint8_t*>(dst);
+	for(size_t i = 0; i + sizeof(uint32_t) <= size; ++i)
+	{
+		switch(*(uint32_t*)(patched_data + i))
+		{
+		case CN_ITER:
+			*(uint32_t*)(patched_data + i) = iterations;
+			break;
 
+		case CN_MASK:
+			*(uint32_t*)(patched_data + i) = mask;
+			break;
+		}
+	}
+}
 
 void* allocateExecutableMemory(size_t size)
 {
 
 #ifdef _WIN64
-return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+	return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+#else
+#if defined(__APPLE__)
+	return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
 #else
-#   if defined(__APPLE__)
-    return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
-#   else
-    return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#   endif
+	return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#endif
 #endif
 }
 
-
-void protectExecutableMemory(void *p, size_t size)
+void protectExecutableMemory(void* p, size_t size)
 {
 #ifdef _WIN64
-    DWORD oldProtect;
-    VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect);
+	DWORD oldProtect;
+	VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect);
 #else
-    mprotect(p, size, PROT_READ | PROT_EXEC);
+	mprotect(p, size, PROT_READ | PROT_EXEC);
 #endif
 }
 
-void unprotectExecutableMemory(void *p, size_t size)
+void unprotectExecutableMemory(void* p, size_t size)
 {
 #ifdef _WIN64
-    DWORD oldProtect;
-    VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect);
+	DWORD oldProtect;
+	VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect);
 #else
-    mprotect(p, size, PROT_WRITE | PROT_EXEC);
+	mprotect(p, size, PROT_WRITE | PROT_EXEC);
 #endif
 }
 
-
-void flushInstructionCache(void *p, size_t size)
+void flushInstructionCache(void* p, size_t size)
 {
 #ifdef _WIN64
-    ::FlushInstructionCache(GetCurrentProcess(), p, size);
+	::FlushInstructionCache(GetCurrentProcess(), p, size);
 #else
-#   ifndef __FreeBSD__
-    __builtin___clear_cache(reinterpret_cast<char*>(p), reinterpret_cast<char*>(p) + size);
-#   endif
+#ifndef __FreeBSD__
+	__builtin___clear_cache(reinterpret_cast<char*>(p), reinterpret_cast<char*>(p) + size);
+#endif
 #endif
 }
 
-template<size_t N>
+template <size_t N>
 void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 {
 	const uint32_t Iter = algo.Iter();
@@ -1270,7 +1288,8 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr
 		if(N == 2)
 			src_code = reinterpret_cast<cn_mainloop_fun>(cryptonight_v8_double_mainloop_sandybridge_asm);
 		else
-			src_code = cryptonight_v8_mainloop_ivybridge_asm;;
+			src_code = cryptonight_v8_mainloop_ivybridge_asm;
+		;
 	}
 	// supports only 1 thread per hash
 	if(selected_asm == "amd_avx")
@@ -1295,19 +1314,17 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr
 		flushInstructionCache(ctx[0]->fun_data, allocation_size);
 	}
 }
-} // namespace (anonymous)
-
-
+} // namespace
 
 struct Cryptonight_hash_gpu
 {
 	static constexpr size_t N = 1;
 
-	template<xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
+	template <xmrstak_algo_id ALGO, bool SOFT_AES, bool PREFETCH>
 	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo)
 	{
 		set_float_rounding_mode_nearest();
-		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+		keccak((const uint8_t*)input, len, ctx[0]->hash_state, 200);
 		cn_explode_scratchpad_gpu<PREFETCH, ALGO>(ctx[0]->hash_state, ctx[0]->long_state, algo);
 
 		if(cngpu_check_avx2())
@@ -1321,16 +1338,15 @@ struct Cryptonight_hash_gpu
 	}
 };
 
-template<size_t N>
+template <size_t N>
 struct Cryptonight_R_generator
 {
-	template<xmrstak_algo_id ALGO>
+	template <xmrstak_algo_id ALGO>
 	static void cn_on_new_job(const xmrstak::miner_work& work, cryptonight_ctx** ctx)
 	{
 		if(ctx[0]->cn_r_ctx.height == work.iBlockHeight &&
 			ctx[0]->last_algo == POW(cryptonight_r) &&
-			reinterpret_cast<void*>(ctx[0]->hash_fn) == ctx[0]->fun_data
-		)
+			reinterpret_cast<void*>(ctx[0]->hash_fn) == ctx[0]->fun_data)
 			return;
 
 		ctx[0]->last_algo = POW(cryptonight_r);
@@ -1346,7 +1362,7 @@ struct Cryptonight_R_generator
 				ctx[0]->hash_fn = Cryptonight_hash_asm<N, 1u>::template hash<cryptonight_r>;
 		}
 
-		for(size_t i=1; i < N; i++)
+		for(size_t i = 1; i < N; i++)
 		{
 			ctx[i]->cn_r_ctx = ctx[0]->cn_r_ctx;
 			ctx[i]->loop_fn = ctx[0]->loop_fn;
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index a9d1c96fd9117e1933bc4de8c979294d24dbcfc6..e35c7c7b8b418eed6fe8c9cefbad13e400d84f16 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -23,19 +23,19 @@
 
 extern "C"
 {
-#include "c_groestl.h"
 #include "c_blake256.h"
+#include "c_groestl.h"
 #include "c_jh.h"
 #include "c_skein.h"
 }
-#include "xmrstak/backend/cryptonight.hpp"
 #include "cryptonight.h"
 #include "cryptonight_aesni.h"
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/console.hpp"
+#include <algorithm>
 #include <stdio.h>
 #include <stdlib.h>
-#include <algorithm>
 
 #ifdef __GNUC__
 #include <mm_malloc.h>
@@ -49,30 +49,35 @@ extern "C"
 
 #ifdef _WIN32
 #include <windows.h>
+// this comment prevents clang-format from reordering the includes (keeps <windows.h> ahead of <ntsecapi.h>)
 #include <ntsecapi.h>
 #else
-#include <sys/mman.h>
 #include <errno.h>
 #include <string.h>
+#include <sys/mman.h>
 #endif // _WIN32
 
-void do_blake_hash(const void* input, uint32_t len, char* output) {
+void do_blake_hash(const void* input, uint32_t len, char* output)
+{
 	blake256_hash((uint8_t*)output, (const uint8_t*)input, len);
 }
 
-void do_groestl_hash(const void* input, uint32_t len, char* output) {
+void do_groestl_hash(const void* input, uint32_t len, char* output)
+{
 	groestl((const uint8_t*)input, len * 8, (uint8_t*)output);
 }
 
-void do_jh_hash(const void* input, uint32_t len, char* output) {
+void do_jh_hash(const void* input, uint32_t len, char* output)
+{
 	jh_hash(32 * 8, (const uint8_t*)input, 8 * len, (uint8_t*)output);
 }
 
-void do_skein_hash(const void* input, uint32_t len, char* output) {
+void do_skein_hash(const void* input, uint32_t len, char* output)
+{
 	skein_hash(8 * 32, (const uint8_t*)input, 8 * len, (uint8_t*)output);
 }
 
-void (* const extra_hashes[4])(const void *, uint32_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+void (*const extra_hashes[4])(const void*, uint32_t, char*) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
 
 #ifdef _WIN32
 #include "xmrstak/misc/uac.hpp"
@@ -81,21 +86,21 @@ BOOL bRebootDesirable = FALSE; //If VirtualAlloc fails, suggest a reboot
 
 BOOL AddPrivilege(TCHAR* pszPrivilege)
 {
-	HANDLE           hToken;
+	HANDLE hToken;
 	TOKEN_PRIVILEGES tp;
-	BOOL             status;
+	BOOL status;
 
-	if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
+	if(!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
 		return FALSE;
 
-	if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid))
+	if(!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid))
 		return FALSE;
 
 	tp.PrivilegeCount = 1;
 	tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
 	status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
 
-	if (!status || (GetLastError() != ERROR_SUCCESS))
+	if(!status || (GetLastError() != ERROR_SUCCESS))
 		return FALSE;
 
 	CloseHandle(hToken);
@@ -107,19 +112,19 @@ BOOL AddLargePageRights()
 	HANDLE hToken;
 	PTOKEN_USER user = NULL;
 
-	if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE)
+	if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE)
 	{
 		TOKEN_ELEVATION Elevation;
 		DWORD cbSize = sizeof(TOKEN_ELEVATION);
 		BOOL bIsElevated = FALSE;
 
-		if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
+		if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
 			bIsElevated = Elevation.TokenIsElevated;
 
 		DWORD size = 0;
 		GetTokenInformation(hToken, TokenUser, NULL, 0, &size);
 
-		if (size > 0 && bIsElevated)
+		if(size > 0 && bIsElevated)
 		{
 			user = (PTOKEN_USER)LocalAlloc(LPTR, size);
 			GetTokenInformation(hToken, TokenUser, user, size, &size);
@@ -128,7 +133,7 @@ BOOL AddLargePageRights()
 		CloseHandle(hToken);
 	}
 
-	if (!user)
+	if(!user)
 		return FALSE;
 
 	LSA_HANDLE handle;
@@ -136,7 +141,7 @@ BOOL AddLargePageRights()
 	ZeroMemory(&attributes, sizeof(attributes));
 
 	BOOL result = FALSE;
-	if (LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0)
+	if(LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0)
 	{
 		LSA_UNICODE_STRING lockmem;
 		lockmem.Buffer = L"SeLockMemoryPrivilege";
@@ -146,11 +151,11 @@ BOOL AddLargePageRights()
 		PLSA_UNICODE_STRING rights = NULL;
 		ULONG cnt = 0;
 		BOOL bHasRights = FALSE;
-		if (LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0)
+		if(LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0)
 		{
-			for (size_t i = 0; i < cnt; i++)
+			for(size_t i = 0; i < cnt; i++)
 			{
-				if (rights[i].Length == lockmem.Length &&
+				if(rights[i].Length == lockmem.Length &&
 					memcmp(rights[i].Buffer, lockmem.Buffer, 42) == 0)
 				{
 					bHasRights = TRUE;
@@ -220,7 +225,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		ptr->ctx_info[0] = 0;
 		ptr->ctx_info[1] = 0;
 		if(ptr->long_state == NULL)
-			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str());
+			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte", std::to_string(hashMemSize).c_str());
 		return ptr;
 	}
 
@@ -250,7 +255,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 #else
 //http://man7.org/linux/man-pages/man2/mmap.2.html
 #if defined(__APPLE__)
-	ptr->long_state  = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
 #elif defined(__FreeBSD__)
 	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
@@ -261,7 +266,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 #else
 	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
-	if (ptr->long_state == MAP_FAILED)
+	if(ptr->long_state == MAP_FAILED)
 	{
 		// try without MAP_HUGETLB for crappy kernels
 		msg->warning = "mmap with HUGETLB failed, attempting without it (you should fix your kernel)";
@@ -270,7 +275,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 	}
 #endif
 
-	if (ptr->long_state == MAP_FAILED)
+	if(ptr->long_state == MAP_FAILED)
 	{
 		_mm_free(ptr);
 		msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'";
@@ -279,7 +284,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 
 	ptr->ctx_info[0] = 1;
 
-	if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM|MADV_WILLNEED) != 0)
+	if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM | MADV_WILLNEED) != 0)
 		msg->warning = "madvise failed";
 
 	ptr->ctx_info[1] = 0;
diff --git a/xmrstak/backend/cpu/crypto/groestl_tables.h b/xmrstak/backend/cpu/crypto/groestl_tables.h
index a23295c3508577c38b18c02ee150e86f79f8b5c3..85dd25f3d57ea0ccce32e61ee292df254122df00 100644
--- a/xmrstak/backend/cpu/crypto/groestl_tables.h
+++ b/xmrstak/backend/cpu/crypto/groestl_tables.h
@@ -1,38 +1,6 @@
 #ifndef __tables_h
 #define __tables_h
 
-
-const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc
-, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5
-, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d
-, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded
-, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1
-, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441
-, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4
-, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba
-, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616
-, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2
-, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c
-, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de
-, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7
-, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e
-, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c
-, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7
-, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b
-, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4
-, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e
-, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a
-, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37
-, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86
-, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b
-, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028
-, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3
-, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94
-, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836
-, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0
-, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2
-, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e
-, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3
-, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
+const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 
0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 
0x486c90b4, 0xe4378fb8, 0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 
0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
 
 #endif /* __tables_h */
diff --git a/xmrstak/backend/cpu/crypto/hash.h b/xmrstak/backend/cpu/crypto/hash.h
index 2af330932eddc362e6fa0b648ff571f62b66355c..57458137692f172feb914e56d4955850f43cca26 100644
--- a/xmrstak/backend/cpu/crypto/hash.h
+++ b/xmrstak/backend/cpu/crypto/hash.h
@@ -4,4 +4,9 @@
 
 typedef unsigned char BitSequence;
 typedef uint32_t DataLength;
-typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn;
+typedef enum
+{
+	SUCCESS = 0,
+	FAIL = 1,
+	BAD_HASHLEN = 2
+} HashReturn;
diff --git a/xmrstak/backend/cpu/crypto/int-util.h b/xmrstak/backend/cpu/crypto/int-util.h
index 8748976c1d2aeb54bb8f8fde01e75cc6988c51fd..393b4f3d2d3a931d0bd180be9d44a0a79d47c234 100644
--- a/xmrstak/backend/cpu/crypto/int-util.h
+++ b/xmrstak/backend/cpu/crypto/int-util.h
@@ -12,43 +12,51 @@
 #if defined(_MSC_VER)
 #include <stdlib.h>
 
-static inline uint32_t rol32(uint32_t x, int r) {
+static inline uint32_t rol32(uint32_t x, int r)
+{
 	static_assert(sizeof(uint32_t) == sizeof(unsigned int), "this code assumes 32-bit integers");
 	return _rotl(x, r);
 }
 
-static inline uint64_t rol64(uint64_t x, int r) {
+static inline uint64_t rol64(uint64_t x, int r)
+{
 	return _rotl64(x, r);
 }
 
 #else
 
-static inline uint32_t rol32(uint32_t x, int r) {
+static inline uint32_t rol32(uint32_t x, int r)
+{
 	return (x << (r & 31)) | (x >> (-r & 31));
 }
 
-static inline uint64_t rol64(uint64_t x, int r) {
+static inline uint64_t rol64(uint64_t x, int r)
+{
 	return (x << (r & 63)) | (x >> (-r & 63));
 }
 
 #endif
 
-static inline uint64_t hi_dword(uint64_t val) {
+static inline uint64_t hi_dword(uint64_t val)
+{
 	return val >> 32;
 }
 
-static inline uint64_t lo_dword(uint64_t val) {
+static inline uint64_t lo_dword(uint64_t val)
+{
 	return val & 0xFFFFFFFF;
 }
 
-static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) {
+static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder)
+{
 	dividend |= ((uint64_t)*remainder) << 32;
 	*remainder = dividend % divisor;
 	return dividend / divisor;
 }
 
 // Long division with 2^32 base
-static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) {
+static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo)
+{
 	uint64_t dividend_dwords[4];
 	uint32_t remainder = 0;
 
@@ -65,30 +73,35 @@ static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uin
 	return remainder;
 }
 
-#define IDENT32(x) ((uint32_t) (x))
-#define IDENT64(x) ((uint64_t) (x))
+#define IDENT32(x) ((uint32_t)(x))
+#define IDENT64(x) ((uint64_t)(x))
 
-#define SWAP32(x) ((((uint32_t) (x) & 0x000000ff) << 24) | \
-  (((uint32_t) (x) & 0x0000ff00) <<  8) | \
-  (((uint32_t) (x) & 0x00ff0000) >>  8) | \
-  (((uint32_t) (x) & 0xff000000) >> 24))
-#define SWAP64(x) ((((uint64_t) (x) & 0x00000000000000ff) << 56) | \
-  (((uint64_t) (x) & 0x000000000000ff00) << 40) | \
-  (((uint64_t) (x) & 0x0000000000ff0000) << 24) | \
-  (((uint64_t) (x) & 0x00000000ff000000) <<  8) | \
-  (((uint64_t) (x) & 0x000000ff00000000) >>  8) | \
-  (((uint64_t) (x) & 0x0000ff0000000000) >> 24) | \
-  (((uint64_t) (x) & 0x00ff000000000000) >> 40) | \
-  (((uint64_t) (x) & 0xff00000000000000) >> 56))
+#define SWAP32(x) ((((uint32_t)(x)&0x000000ff) << 24) | \
+				   (((uint32_t)(x)&0x0000ff00) << 8) |  \
+				   (((uint32_t)(x)&0x00ff0000) >> 8) |  \
+				   (((uint32_t)(x)&0xff000000) >> 24))
+#define SWAP64(x) ((((uint64_t)(x)&0x00000000000000ff) << 56) | \
+				   (((uint64_t)(x)&0x000000000000ff00) << 40) | \
+				   (((uint64_t)(x)&0x0000000000ff0000) << 24) | \
+				   (((uint64_t)(x)&0x00000000ff000000) << 8) |  \
+				   (((uint64_t)(x)&0x000000ff00000000) >> 8) |  \
+				   (((uint64_t)(x)&0x0000ff0000000000) >> 24) | \
+				   (((uint64_t)(x)&0x00ff000000000000) >> 40) | \
+				   (((uint64_t)(x)&0xff00000000000000) >> 56))
 
-static inline uint32_t ident32(uint32_t x) { return x; }
+static inline uint32_t ident32(uint32_t x)
+{
+	return x;
+}
 static inline uint64_t ident64(uint64_t x) { return x; }
 
-static inline uint32_t swap32(uint32_t x) {
+static inline uint32_t swap32(uint32_t x)
+{
 	x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8);
 	return (x << 16) | (x >> 16);
 }
-static inline uint64_t swap64(uint64_t x) {
+static inline uint64_t swap64(uint64_t x)
+{
 	x = ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8);
 	x = ((x & 0x0000ffff0000ffff) << 16) | ((x & 0xffff0000ffff0000) >> 16);
 	return (x << 32) | (x >> 32);
@@ -99,39 +112,51 @@ static inline uint64_t swap64(uint64_t x) {
 #else
 #define UNUSED
 #endif
-static inline void mem_inplace_ident(void *mem UNUSED, size_t n UNUSED) { }
+static inline void mem_inplace_ident(void* mem UNUSED, size_t n UNUSED)
+{
+}
 #undef UNUSED
 
-static inline void mem_inplace_swap32(void *mem, size_t n) {
+static inline void mem_inplace_swap32(void* mem, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint32_t *)mem)[i] = swap32(((const uint32_t *)mem)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint32_t*)mem)[i] = swap32(((const uint32_t*)mem)[i]);
 	}
 }
-static inline void mem_inplace_swap64(void *mem, size_t n) {
+static inline void mem_inplace_swap64(void* mem, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint64_t *)mem)[i] = swap64(((const uint64_t *)mem)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint64_t*)mem)[i] = swap64(((const uint64_t*)mem)[i]);
 	}
 }
 
-static inline void memcpy_ident32(void *dst, const void *src, size_t n) {
+static inline void memcpy_ident32(void* dst, const void* src, size_t n)
+{
 	memcpy(dst, src, 4 * n);
 }
-static inline void memcpy_ident64(void *dst, const void *src, size_t n) {
+static inline void memcpy_ident64(void* dst, const void* src, size_t n)
+{
 	memcpy(dst, src, 8 * n);
 }
 
-static inline void memcpy_swap32(void *dst, const void *src, size_t n) {
+static inline void memcpy_swap32(void* dst, const void* src, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint32_t *)dst)[i] = swap32(((const uint32_t *)src)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint32_t*)dst)[i] = swap32(((const uint32_t*)src)[i]);
 	}
 }
-static inline void memcpy_swap64(void *dst, const void *src, size_t n) {
+static inline void memcpy_swap64(void* dst, const void* src, size_t n)
+{
 	size_t i;
-	for (i = 0; i < n; i++) {
-		((uint64_t *)dst)[i] = swap64(((const uint64_t *)src)[i]);
+	for(i = 0; i < n; i++)
+	{
+		((uint64_t*)dst)[i] = swap64(((const uint64_t*)src)[i]);
 	}
 }
 
diff --git a/xmrstak/backend/cpu/crypto/skein_port.h b/xmrstak/backend/cpu/crypto/skein_port.h
index 99641bcdf9227a2cd6c971a652c44e58c7f6e0e5..1648cdc7d0ca04046f039479a52ba528c8373792 100644
--- a/xmrstak/backend/cpu/crypto/skein_port.h
+++ b/xmrstak/backend/cpu/crypto/skein_port.h
@@ -2,38 +2,38 @@
 #define _SKEIN_PORT_H_
 
 #include <limits.h>
-#include <stdint.h>
 #include <stddef.h>
+#include <stdint.h>
 
 #ifndef RETURN_VALUES
-#  define RETURN_VALUES
-#  if defined( DLL_EXPORT )
-#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
-#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
-#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
-#    elif defined( __GNUC__ )
-#      define VOID_RETURN    __declspec( __dllexport__ ) void
-#      define INT_RETURN     __declspec( __dllexport__ ) int
-#    else
-#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
-#    endif
-#  elif defined( DLL_IMPORT )
-#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
-#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
-#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
-#    elif defined( __GNUC__ )
-#      define VOID_RETURN    __declspec( __dllimport__ ) void
-#      define INT_RETURN     __declspec( __dllimport__ ) int
-#    else
-#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
-#    endif
-#  elif defined( __WATCOMC__ )
-#    define VOID_RETURN  void __cdecl
-#    define INT_RETURN   int  __cdecl
-#  else
-#    define VOID_RETURN  void
-#    define INT_RETURN   int
-#  endif
+#define RETURN_VALUES
+#if defined(DLL_EXPORT)
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#define VOID_RETURN __declspec(dllexport) void __stdcall
+#define INT_RETURN __declspec(dllexport) int __stdcall
+#elif defined(__GNUC__)
+#define VOID_RETURN __declspec(__dllexport__) void
+#define INT_RETURN __declspec(__dllexport__) int
+#else
+#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#endif
+#elif defined(DLL_IMPORT)
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#define VOID_RETURN __declspec(dllimport) void __stdcall
+#define INT_RETURN __declspec(dllimport) int __stdcall
+#elif defined(__GNUC__)
+#define VOID_RETURN __declspec(__dllimport__) void
+#define INT_RETURN __declspec(__dllimport__) int
+#else
+#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#endif
+#elif defined(__WATCOMC__)
+#define VOID_RETURN void __cdecl
+#define INT_RETURN int __cdecl
+#else
+#define VOID_RETURN void
+#define INT_RETURN int
+#endif
 #endif
 
 /*  These defines are used to declare buffers in a way that allows
@@ -52,17 +52,17 @@
 								variable of length 'size' bits
 */
 
-#define ui_type(size)               uint##size##_t
-#define dec_unit_type(size,x)       typedef ui_type(size) x
-#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)]
-#define ptr_cast(x,size)            ((ui_type(size)*)(x))
+#define ui_type(size) uint##size##_t
+#define dec_unit_type(size, x) typedef ui_type(size) x
+#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)]
+#define ptr_cast(x, size) ((ui_type(size)*)(x))
 
-typedef unsigned int    uint_t;             /* native unsigned integer */
-typedef uint8_t         u08b_t;             /*  8-bit unsigned integer */
-typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
+typedef unsigned int uint_t; /* native unsigned integer */
+typedef uint8_t u08b_t;		 /*  8-bit unsigned integer */
+typedef uint64_t u64b_t;	 /* 64-bit unsigned integer */
 
 #ifndef RotL_64
-#define RotL_64(x,N)    (((x) << (N)) | ((x) >> (64-(N))))
+#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64 - (N))))
 #endif
 
 /*
@@ -91,26 +91,25 @@ typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
 /* special handler for IA64, which may be either endianness (?)  */
 /* here we assume little-endian, but this may need to be changed */
 #if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-#  define PLATFORM_MUST_ALIGN (1)
+#define PLATFORM_MUST_ALIGN (1)
 #ifndef PLATFORM_BYTE_ORDER
-#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 #endif
 #endif
 
-#ifndef   PLATFORM_MUST_ALIGN
-#  define PLATFORM_MUST_ALIGN (0)
+#ifndef PLATFORM_MUST_ALIGN
+#define PLATFORM_MUST_ALIGN (0)
 #endif
 
-
-#if   PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
-	/* here for big-endian CPUs */
-#define SKEIN_NEED_SWAP   (1)
+#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+/* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP (1)
 #elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
-	/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
-#define SKEIN_NEED_SWAP   (0)
-#if   PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
-#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt)
-#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt))
+/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP (0)
+#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */
+#define Skein_Put64_LSB_First(dst08, src64, bCnt) memcpy(dst08, src64, bCnt)
+#define Skein_Get64_LSB_First(dst64, src08, wCnt) memcpy(dst64, src08, 8 * (wCnt))
 #endif
 #else
 #error "Skein needs endianness setting!"
@@ -123,57 +122,55 @@ typedef uint64_t        u64b_t;             /* 64-bit unsigned integer */
  *      Provide any definitions still needed.
  ******************************************************************
  */
-#ifndef Skein_Swap64  /* swap for big-endian, nop for little-endian */
-#if     SKEIN_NEED_SWAP
-#define Skein_Swap64(w64)                       \
-  ( (( ((u64b_t)(w64))       & 0xFF) << 56) |   \
-	(((((u64b_t)(w64)) >> 8) & 0xFF) << 48) |   \
-	(((((u64b_t)(w64)) >>16) & 0xFF) << 40) |   \
-	(((((u64b_t)(w64)) >>24) & 0xFF) << 32) |   \
-	(((((u64b_t)(w64)) >>32) & 0xFF) << 24) |   \
-	(((((u64b_t)(w64)) >>40) & 0xFF) << 16) |   \
-	(((((u64b_t)(w64)) >>48) & 0xFF) <<  8) |   \
-	(((((u64b_t)(w64)) >>56) & 0xFF)      ) )
+#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */
+#if SKEIN_NEED_SWAP
+#define Skein_Swap64(w64)                          \
+	(((((u64b_t)(w64)) & 0xFF) << 56) |            \
+		(((((u64b_t)(w64)) >> 8) & 0xFF) << 48) |  \
+		(((((u64b_t)(w64)) >> 16) & 0xFF) << 40) | \
+		(((((u64b_t)(w64)) >> 24) & 0xFF) << 32) | \
+		(((((u64b_t)(w64)) >> 32) & 0xFF) << 24) | \
+		(((((u64b_t)(w64)) >> 40) & 0xFF) << 16) | \
+		(((((u64b_t)(w64)) >> 48) & 0xFF) << 8) |  \
+		(((((u64b_t)(w64)) >> 56) & 0xFF)))
 #else
-#define Skein_Swap64(w64)  (w64)
+#define Skein_Swap64(w64) (w64)
 #endif
-#endif  /* ifndef Skein_Swap64 */
-
+#endif /* ifndef Skein_Swap64 */
 
 #ifndef Skein_Put64_LSB_First
-void    Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt)
-#ifdef  SKEIN_PORT_CODE /* instantiate the function code here? */
-	{ /* this version is fully portable (big-endian or little-endian), but slow */
+void Skein_Put64_LSB_First(u08b_t* dst, const u64b_t* src, size_t bCnt)
+#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
+{					   /* this version is fully portable (big-endian or little-endian), but slow */
 	size_t n;
 
-	for (n=0;n<bCnt;n++)
-		dst[n] = (u08b_t) (src[n>>3] >> (8*(n&7)));
-	}
+	for(n = 0; n < bCnt; n++)
+		dst[n] = (u08b_t)(src[n >> 3] >> (8 * (n & 7)));
+}
 #else
-	;    /* output only the function prototype */
+	; /* output only the function prototype */
 #endif
-#endif   /* ifndef Skein_Put64_LSB_First */
-
+#endif /* ifndef Skein_Put64_LSB_First */
 
 #ifndef Skein_Get64_LSB_First
-void    Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt)
-#ifdef  SKEIN_PORT_CODE /* instantiate the function code here? */
-	{ /* this version is fully portable (big-endian or little-endian), but slow */
+void Skein_Get64_LSB_First(u64b_t* dst, const u08b_t* src, size_t wCnt)
+#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */
+{					   /* this version is fully portable (big-endian or little-endian), but slow */
 	size_t n;
 
-	for (n=0;n<8*wCnt;n+=8)
-		dst[n/8] = (((u64b_t) src[n  ])      ) +
-				   (((u64b_t) src[n+1]) <<  8) +
-				   (((u64b_t) src[n+2]) << 16) +
-				   (((u64b_t) src[n+3]) << 24) +
-				   (((u64b_t) src[n+4]) << 32) +
-				   (((u64b_t) src[n+5]) << 40) +
-				   (((u64b_t) src[n+6]) << 48) +
-				   (((u64b_t) src[n+7]) << 56) ;
-	}
+	for(n = 0; n < 8 * wCnt; n += 8)
+		dst[n / 8] = (((u64b_t)src[n])) +
+					 (((u64b_t)src[n + 1]) << 8) +
+					 (((u64b_t)src[n + 2]) << 16) +
+					 (((u64b_t)src[n + 3]) << 24) +
+					 (((u64b_t)src[n + 4]) << 32) +
+					 (((u64b_t)src[n + 5]) << 40) +
+					 (((u64b_t)src[n + 6]) << 48) +
+					 (((u64b_t)src[n + 7]) << 56);
+}
 #else
-	;    /* output only the function prototype */
+	; /* output only the function prototype */
 #endif
-#endif   /* ifndef Skein_Get64_LSB_First */
+#endif /* ifndef Skein_Get64_LSB_First */
 
-#endif   /* ifndef _SKEIN_PORT_H_ */
+#endif /* ifndef _SKEIN_PORT_H_ */
diff --git a/xmrstak/backend/cpu/crypto/soft_aes.hpp b/xmrstak/backend/cpu/crypto/soft_aes.hpp
index 9b4ae0ab5724a8bc7c778a43e85fe6895de86bf6..3ea75c5e69095ad48b75ca740c7a289a056fa9a9 100644
--- a/xmrstak/backend/cpu/crypto/soft_aes.hpp
+++ b/xmrstak/backend/cpu/crypto/soft_aes.hpp
@@ -34,56 +34,58 @@
 
 #include <inttypes.h>
 
-#define saes_data(w) {\
-	w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
-	w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
-	w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
-	w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
-	w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
-	w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
-	w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
-	w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
-	w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
-	w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
-	w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
-	w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
-	w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
-	w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
-	w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
-	w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
-	w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
-	w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
-	w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
-	w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
-	w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
-	w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
-	w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
-	w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
-	w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
-	w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
-	w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
-	w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
-	w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
-	w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
-	w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
-	w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
+#define saes_data(w)                                                                \
+	{                                                                               \
+		w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),     \
+			w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76), \
+			w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0), \
+			w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0), \
+			w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), \
+			w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), \
+			w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+			w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75), \
+			w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0), \
+			w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84), \
+			w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b), \
+			w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), \
+			w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), \
+			w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+			w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5), \
+			w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2), \
+			w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17), \
+			w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73), \
+			w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), \
+			w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), \
+			w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+			w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79), \
+			w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9), \
+			w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08), \
+			w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6), \
+			w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), \
+			w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), \
+			w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+			w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94), \
+			w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf), \
+			w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68), \
+			w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16)  \
+	}
 
-#define SAES_WPOLY           0x011b
+#define SAES_WPOLY 0x011b // XORed in by saes_f2 when bit 7 of its argument is set (bits 8, 4, 3, 1 and 0)
 
 #define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
-	((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
+								  ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
 
-#define saes_f2(x)   ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY))
-#define saes_f3(x)   (saes_f2(x) ^ x)
-#define saes_h0(x)   (x)
+#define saes_f2(x) ((x << 1) ^ (((x >> 7) & 1) * SAES_WPOLY)) // x shifted left by one bit, XOR SAES_WPOLY if bit 7 of x is set; x is expanded unparenthesized
+#define saes_f3(x) (saes_f2(x) ^ x) // saes_f2(x) XOR x
+#define saes_h0(x) (x) // identity
 
-#define saes_u0(p)   saes_b2w(saes_f2(p),          p,          p, saes_f3(p))
-#define saes_u1(p)   saes_b2w(saes_f3(p), saes_f2(p),          p,          p)
-#define saes_u2(p)   saes_b2w(         p, saes_f3(p), saes_f2(p),          p)
-#define saes_u3(p)   saes_b2w(         p,          p, saes_f3(p), saes_f2(p))
+#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p)) // bytes, low to high: f2(p), p, p, f3(p)
+#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p) // bytes, low to high: f3(p), f2(p), p, p
+#define saes_u2(p) saes_b2w(p, saes_f3(p), saes_f2(p), p) // bytes, low to high: p, f3(p), f2(p), p
+#define saes_u3(p) saes_b2w(p, p, saes_f3(p), saes_f2(p)) // bytes, low to high: p, p, f3(p), f2(p)
 
-alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };
-alignas(16) const uint8_t  saes_sbox[256] = saes_data(saes_h0);
+alignas(16) const uint32_t saes_table[4][256] = {saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3)};
+alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0);
 
 static inline __m128i soft_aesenc(__m128i in, __m128i key)
 {
@@ -104,10 +106,10 @@ static inline __m128i soft_aesenc(__m128i in, __m128i key)
 
 static inline uint32_t sub_word(uint32_t key)
 {
-	return (saes_sbox[key >> 24 ] << 24)   |
-		(saes_sbox[(key >> 16) & 0xff] << 16 ) |
-		(saes_sbox[(key >> 8)  & 0xff] << 8  ) |
-		 saes_sbox[key & 0xff];
+	return (saes_sbox[key >> 24] << 24) |
+		   (saes_sbox[(key >> 16) & 0xff] << 16) |
+		   (saes_sbox[(key >> 8) & 0xff] << 8) |
+		   saes_sbox[key & 0xff];
 }
 
 #ifdef __clang__
@@ -121,5 +123,5 @@ static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
 {
 	uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)));
 	uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)));
-	return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1);
+	return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3, _rotr(X1, 8) ^ rcon, X1);
 }
diff --git a/xmrstak/backend/cpu/crypto/variant4_random_math.h b/xmrstak/backend/cpu/crypto/variant4_random_math.h
index 50228adf2fe14ab7ad3b2ca7b301af72a640bc98..9fe61db51e05908bb2db0c1c010998dea310a0bb 100644
--- a/xmrstak/backend/cpu/crypto/variant4_random_math.h
+++ b/xmrstak/backend/cpu/crypto/variant4_random_math.h
@@ -1,12 +1,12 @@
 #pragma once
 
-#include <string.h>
 #include "../../cryptonight.hpp"
 #include "xmrstak/misc/console.hpp"
+#include <string.h>
 
 extern "C"
 {
-    #include "c_blake256.h"
+#include "c_blake256.h"
 }
 
 enum V4_Settings
@@ -31,13 +31,13 @@ enum V4_Settings
 
 enum V4_InstructionList
 {
-	MUL,	// a*b
-	ADD,	// a+b + C, C is an unsigned 32-bit constant
-	SUB,	// a-b
-	ROR,	// rotate right "a" by "b & 31" bits
-	ROL,	// rotate left "a" by "b & 31" bits
-	XOR,	// a^b
-	RET,	// finish execution
+	MUL, // a*b
+	ADD, // a+b + C, C is an unsigned 32-bit constant
+	SUB, // a-b
+	ROR, // rotate right "a" by "b % register width in bits" ("b & 31" for 32-bit registers)
+	ROL, // rotate left "a" by "b % register width in bits" ("b & 31" for 32-bit registers)
+	XOR, // a^b
+	RET, // finish execution (v4_random_math returns); its value is reused as V4_INSTRUCTION_COUNT
 	V4_INSTRUCTION_COUNT = RET,
 };
 
@@ -87,7 +87,7 @@ struct V4_Instruction
 // every switch-case will point to the same destination on every iteration of Cryptonight main loop
 //
 // This is about as fast as it can get without using low-level machine code generation
-template<typename v4_reg>
+template <typename v4_reg>
 static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 {
 	enum
@@ -95,55 +95,55 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 		REG_BITS = sizeof(v4_reg) * 8,
 	};
 
-#define V4_EXEC(i) \
-	{ \
-		const struct V4_Instruction* op = code + i; \
-		const v4_reg src = r[op->src_index]; \
-		v4_reg* dst = r + op->dst_index; \
-		switch (op->opcode) \
-		{ \
-		case MUL: \
-			*dst *= src; \
-			break; \
-		case ADD: \
-			*dst += src + op->C; \
-			break; \
-		case SUB: \
-			*dst -= src; \
-			break; \
-		case ROR: \
-			{ \
-				const uint32_t shift = src % REG_BITS; \
-				*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
-			} \
-			break; \
-		case ROL: \
-			{ \
-				const uint32_t shift = src % REG_BITS; \
-				*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
-			} \
-			break; \
-		case XOR: \
-			*dst ^= src; \
-			break; \
-		case RET: \
-			return; \
-		default: \
-			UNREACHABLE_CODE; \
-			break; \
-		} \
+#define V4_EXEC(i)                                                              \
+	{                                                                           \
+		const struct V4_Instruction* op = code + i;                             \
+		const v4_reg src = r[op->src_index];                                    \
+		v4_reg* dst = r + op->dst_index;                                        \
+		switch(op->opcode)                                                      \
+		{                                                                       \
+		case MUL:                                                               \
+			*dst *= src;                                                        \
+			break;                                                              \
+		case ADD:                                                               \
+			*dst += src + op->C;                                                \
+			break;                                                              \
+		case SUB:                                                               \
+			*dst -= src;                                                        \
+			break;                                                              \
+		case ROR:                                                               \
+		{                                                                       \
+			const uint32_t shift = src % REG_BITS;                              \
+			*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
+		}                                                                       \
+		break;                                                                  \
+		case ROL:                                                               \
+		{                                                                       \
+			const uint32_t shift = src % REG_BITS;                              \
+			*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
+		}                                                                       \
+		break;                                                                  \
+		case XOR:                                                               \
+			*dst ^= src;                                                        \
+			break;                                                              \
+		case RET:                                                               \
+			return;                                                             \
+		default:                                                                \
+			UNREACHABLE_CODE;                                                   \
+			break;                                                              \
+		}                                                                       \
 	}
 
 #define V4_EXEC_10(j) \
-	V4_EXEC(j + 0) \
-	V4_EXEC(j + 1) \
-	V4_EXEC(j + 2) \
-	V4_EXEC(j + 3) \
-	V4_EXEC(j + 4) \
-	V4_EXEC(j + 5) \
-	V4_EXEC(j + 6) \
-	V4_EXEC(j + 7) \
-	V4_EXEC(j + 8) \
+	V4_EXEC(j + 0)    \
+	V4_EXEC(j + 1)    \
+	V4_EXEC(j + 2)    \
+	V4_EXEC(j + 3)    \
+	V4_EXEC(j + 4)    \
+	V4_EXEC(j + 5)    \
+	V4_EXEC(j + 6)    \
+	V4_EXEC(j + 7)    \
+	V4_EXEC(j + 8)    \
 	V4_EXEC(j + 9)
 
 	// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
@@ -161,13 +161,13 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 	// 69      102
 
 	// Unroll 70 instructions here
-	V4_EXEC_10(0);		// instructions 0-9
-	V4_EXEC_10(10);		// instructions 10-19
-	V4_EXEC_10(20);		// instructions 20-29
-	V4_EXEC_10(30);		// instructions 30-39
-	V4_EXEC_10(40);		// instructions 40-49
-	V4_EXEC_10(50);		// instructions 50-59
-	V4_EXEC_10(60);		// instructions 60-69
+	V4_EXEC_10(0);  // instructions 0-9
+	V4_EXEC_10(10); // instructions 10-19
+	V4_EXEC_10(20); // instructions 20-29
+	V4_EXEC_10(30); // instructions 30-39
+	V4_EXEC_10(40); // instructions 40-49
+	V4_EXEC_10(50); // instructions 50-59
+	V4_EXEC_10(60); // instructions 60-69
 
 #undef V4_EXEC_10
 #undef V4_EXEC
@@ -176,7 +176,7 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
 // If we don't have enough data available, generate more
 static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
 {
-	if (*data_index + bytes_needed > data_size)
+	if(*data_index + bytes_needed > data_size)
 	{
 		blake256_hash((uint8_t*)data, (uint8_t*)data, data_size);
 		*data_index = 0;
@@ -188,7 +188,7 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed
 
 // Generates as many random math operations as possible with given latency and ALU restrictions
 // "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
 {
 	printer::inst()->print_msg(LDEBUG, "CryptonightR create random math for block %llu", height);
@@ -199,13 +199,13 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 	// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
 	// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
 	// Source: https://www.agner.org/optimize/instruction_tables.pdf
-	const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
+	const int op_latency[V4_INSTRUCTION_COUNT] = {3, 2, 1, 2, 2, 1};
 
 	// Instruction latencies for theoretical ASIC implementation
-	const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
+	const int asic_op_latency[V4_INSTRUCTION_COUNT] = {3, 1, 1, 1, 1, 1};
 
 	// Available ALUs for each instruction
-	const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
+	const int op_ALUs[V4_INSTRUCTION_COUNT] = {ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT};
 
 	int8_t data[32];
 	memset(data, 0, sizeof(data));
@@ -226,7 +226,8 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 	// There is a small chance (1.8%) that register R8 won't be used in the generated program
 	// So we keep track of it and try again if it's not used
 	bool r8_used;
-	do {
+	do
+	{
 		int latency[9];
 		int asic_latency[9];
 
@@ -237,7 +238,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 		//
 		// Registers R4-R8 are constant and are treated as having the same value because when we do
 		// the same operation twice with two constant source registers, it can be optimized into a single operation
-		uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
+		uint32_t inst_data[9] = {0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF};
 
 		bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
 		bool is_rotation[V4_INSTRUCTION_COUNT];
@@ -260,11 +261,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 
 		// Generate random code to achieve minimal required latency for our abstract CPU
 		// Try to get this latency for all 4 registers
-		while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
+		while(((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
 		{
 			// Fail-safe to guarantee loop termination
 			++total_iterations;
-			if (total_iterations > 256)
+			if(total_iterations > 256)
 				break;
 
 			check_data(&data_index, 1, data, sizeof(data));
@@ -277,12 +278,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			// ROR/ROL = opcode 5, shift direction is selected randomly
 			// XOR = opcodes 6-7
 			uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
-			if (opcode == 5)
+			if(opcode == 5)
 			{
 				check_data(&data_index, 1, data, sizeof(data));
 				opcode = (data[data_index++] >= 0) ? ROR : ROL;
 			}
-			else if (opcode >= 6)
+			else if(opcode >= 6)
 			{
 				opcode = XOR;
 			}
@@ -298,7 +299,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			int b = src_index;
 
 			// Don't do ADD/SUB/XOR with the same register
-			if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
+			if(((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
 			{
 				// a is always < 4, so we don't need to check bounds here
 				b = (ALGO == cryptonight_r_wow) ? (a + 4) : 8;
@@ -306,7 +307,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			}
 
 			// Don't do rotation with the same destination twice because it's equal to a single rotation
-			if (is_rotation[opcode] && rotated[a])
+			if(is_rotation[opcode] && rotated[a])
 			{
 				continue;
 			}
@@ -314,7 +315,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
 			// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
 			// 2xXOR(a, b) = NOP
-			if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
+			if((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
 			{
 				continue;
 			}
@@ -322,20 +323,20 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			// Find which ALU is available (and when) for this instruction
 			int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
 			int alu_index = -1;
-			while (next_latency < TOTAL_LATENCY)
+			while(next_latency < TOTAL_LATENCY)
 			{
-				for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
+				for(int i = op_ALUs[opcode] - 1; i >= 0; --i)
 				{
-					if (!alu_busy[next_latency][i])
+					if(!alu_busy[next_latency][i])
 					{
 						// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
-						if ((opcode == ADD) && alu_busy[next_latency + 1][i])
+						if((opcode == ADD) && alu_busy[next_latency + 1][i])
 						{
 							continue;
 						}
 
 						// Rotation can only start when previous rotation is finished, so do an additional availability check
-						if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
+						if(is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
 						{
 							continue;
 						}
@@ -344,7 +345,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 						break;
 					}
 				}
-				if (alu_index >= 0)
+				if(alu_index >= 0)
 				{
 					break;
 				}
@@ -352,16 +353,16 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			}
 
 			// Don't generate instructions that leave some register unchanged for more than 7 cycles
-			if (next_latency > latency[a] + 7)
+			if(next_latency > latency[a] + 7)
 			{
 				continue;
 			}
 
 			next_latency += op_latency[opcode];
 
-			if (next_latency <= TOTAL_LATENCY)
+			if(next_latency <= TOTAL_LATENCY)
 			{
-				if (is_rotation[opcode])
+				if(is_rotation[opcode])
 				{
 					++rotate_count;
 				}
@@ -382,12 +383,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 				code[code_size].src_index = src_index;
 				code[code_size].C = 0;
 
-				if (src_index == 8)
+				if(src_index == 8)
 				{
 					r8_used = true;
 				}
 
-				if (opcode == ADD)
+				if(opcode == ADD)
 				{
 					// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
 					alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
@@ -401,7 +402,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 				}
 
 				++code_size;
-				if (code_size >= NUM_INSTRUCTIONS_MIN)
+				if(code_size >= NUM_INSTRUCTIONS_MIN)
 				{
 					break;
 				}
@@ -416,17 +417,19 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 		// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
 		// Get this latency for at least 1 of the 4 registers
 		const int prev_code_size = code_size;
-		while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
+		while((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
 		{
 			int min_idx = 0;
 			int max_idx = 0;
-			for (int i = 1; i < 4; ++i)
+			for(int i = 1; i < 4; ++i)
 			{
-				if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
-				if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
+				if(asic_latency[i] < asic_latency[min_idx])
+					min_idx = i;
+				if(asic_latency[i] > asic_latency[max_idx])
+					max_idx = i;
 			}
 
-			const uint8_t pattern[3] = { ROR, MUL, MUL };
+			const uint8_t pattern[3] = {ROR, MUL, MUL};
 			const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
 			latency[min_idx] = latency[max_idx] + op_latency[opcode];
 			asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
@@ -438,9 +441,9 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
 			++code_size;
 		}
 
-	// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
-	// It never does more than 4 iterations for all block heights < 10,000,000
-	}  while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
+		// There is a ~98.15% chance that the loop condition is false, so this loop will execute only 1 iteration most of the time
+		// It never does more than 4 iterations for any block height < 10,000,000
+	} while(!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
 
 	// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
 	// Add final instruction to stop the interpreter
diff --git a/xmrstak/backend/cpu/hwlocMemory.cpp b/xmrstak/backend/cpu/hwlocMemory.cpp
index 089570fc01bfa137d3d7ae55093851f480a260f3..804edc55d5254ee3857dcb3c9cc5b13ba0739f0d 100644
--- a/xmrstak/backend/cpu/hwlocMemory.cpp
+++ b/xmrstak/backend/cpu/hwlocMemory.cpp
@@ -13,7 +13,7 @@
  *
  * @param puId core id
  */
-void bindMemoryToNUMANode( size_t puId )
+void bindMemoryToNUMANode(size_t puId)
 {
 	int depth;
 	hwloc_topology_t topology;
@@ -30,18 +30,18 @@ void bindMemoryToNUMANode( size_t puId )
 
 	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
 
-	for( uint32_t i = 0;
+	for(uint32_t i = 0;
 		i < hwloc_get_nbobjs_by_depth(topology, depth);
-		i++ )
+		i++)
 	{
 		hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i);
-		if(  pu->os_index == puId )
+		if(pu->os_index == puId)
 		{
-			if( 0 > hwloc_set_membind_nodeset(
-				topology,
-				pu->nodeset,
-				HWLOC_MEMBIND_BIND,
-				HWLOC_MEMBIND_THREAD))
+			if(0 > hwloc_set_membind_nodeset(
+					   topology,
+					   pu->nodeset,
+					   HWLOC_MEMBIND_BIND,
+					   HWLOC_MEMBIND_THREAD))
 			{
 				printer::inst()->print_msg(L0, "hwloc: can't bind memory");
 			}
@@ -57,7 +57,7 @@ void bindMemoryToNUMANode( size_t puId )
 }
 #else
 
-void bindMemoryToNUMANode( size_t )
+void bindMemoryToNUMANode(size_t)
 {
 }
 
diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp
index 2130c2ced86cab8a05ee2f00d71210af3488460a..42fa3456f23a8e9e390bab6f82ede216348f1f48 100644
--- a/xmrstak/backend/cpu/hwlocMemory.hpp
+++ b/xmrstak/backend/cpu/hwlocMemory.hpp
@@ -9,4 +9,4 @@
  *
  * @param puId core id
  */
-void bindMemoryToNUMANode( size_t puId );
+void bindMemoryToNUMANode(size_t puId);
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index a14be1732b9fca69f9ab2d96941102339c320635..a7bb91d619210e2c2118e8c39d30b5a0c284d115 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -37,7 +37,6 @@
 #include <cpuid.h>
 #endif
 
-
 namespace xmrstak
 {
 namespace cpu
@@ -48,9 +47,14 @@ using namespace rapidjson;
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum { aCpuThreadsConf, sUseSlowMem };
+enum configEnum
+{
+	aCpuThreadsConf, // index 0: the "cpu_threads_conf" entry of oConfigValues
+	sUseSlowMem // NOTE(review): oConfigValues below has no entry at this index - confirm whether this enumerator is still used
+};
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -59,10 +63,9 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aCpuThreadsConf, "cpu_threads_conf", kNullType }
-};
+	{aCpuThreadsConf, "cpu_threads_conf", kNullType}};
 
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
 inline bool checkType(Type have, Type want)
 {
@@ -95,7 +98,7 @@ jconf::jconf()
 	prv = new opaque_private();
 }
 
-bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
+bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg)
 {
 	if(!prv->configValues[aCpuThreadsConf]->IsArray())
 		return false;
@@ -148,7 +151,6 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	return true;
 }
 
-
 size_t jconf::GetThreadCount()
 {
 	if(prv->configValues[aCpuThreadsConf]->IsArray())
@@ -159,22 +161,22 @@ size_t jconf::GetThreadCount()
 
 bool jconf::parse_config(const char* sFilename)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -189,7 +191,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -211,7 +213,7 @@ bool jconf::parse_config(const char* sFilename)
 	buffer[flen] = '}';
 	buffer[flen + 1] = '\0';
 
-	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	prv->jsonDoc.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(prv->jsonDoc.HasParseError())
@@ -251,7 +253,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	thd_cfg c;
-	for(size_t i=0; i < GetThreadCount(); i++)
+	for(size_t i = 0; i < GetThreadCount(); i++)
 	{
 		if(!GetThreadConfig(i, c))
 		{
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index 4ec9165d59ec11b0a3ad5ed929d08218e95ca7ed..67dbd02758fbbf198d30b9b337a7fc90f5990158 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -12,16 +12,18 @@ namespace cpu
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
-		if (oInst == nullptr) oInst = new jconf;
+		if(oInst == nullptr)
+			oInst = new jconf;
 		return oInst;
 	};
 
 	bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str());
 
-	struct thd_cfg {
+	struct thd_cfg
+	{
 		int iMultiway;
 		bool bNoPrefetch;
 		std::string asm_version_str;
@@ -29,10 +31,10 @@ public:
 	};
 
 	size_t GetThreadCount();
-	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	bool GetThreadConfig(size_t id, thd_cfg& cfg);
 	bool NeedsAutoconf();
 
-private:
+  private:
 	jconf();
 	static jconf* oInst;
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index e90b59500b09305c149c37ef790e56410672d7b7..463be1aabea509de74784912ae05e1e452456035 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -23,33 +23,33 @@
 
 #include "crypto/cryptonight_aesni.h"
 
-#include "xmrstak/misc/console.hpp"
-#include "xmrstak/backend/iBackend.hpp"
+#include "jconf.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include "xmrstak/backend/globalStates.hpp"
+#include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/misc/configEditor.hpp"
-#include "xmrstak/backend/cpu/cpuType.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
-#include "jconf.hpp"
 
-#include "xmrstak/misc/executor.hpp"
 #include "minethd.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/executor.hpp"
 
 #include "hwlocMemory.hpp"
 #include "xmrstak/backend/miner_work.hpp"
 
 #ifndef CONF_NO_HWLOC
-#   include "autoAdjustHwloc.hpp"
+#include "autoAdjustHwloc.hpp"
 #else
-#   include "autoAdjust.hpp"
+#include "autoAdjust.hpp"
 #endif
 
 #include <assert.h>
-#include <cmath>
+#include <bitset>
 #include <chrono>
+#include <cmath>
 #include <cstring>
 #include <thread>
-#include <bitset>
 #include <unordered_map>
 
 #ifdef _WIN32
@@ -58,9 +58,9 @@
 #include <pthread.h>
 
 #if defined(__APPLE__)
-#include <mach/thread_policy.h>
 #include <mach/thread_act.h>
-#define SYSCTL_CORE_COUNT   "machdep.cpu.core_count"
+#include <mach/thread_policy.h>
+#define SYSCTL_CORE_COUNT "machdep.cpu.core_count"
 #elif defined(__FreeBSD__)
 #include <pthread_np.h>
 #endif //__APPLE__
@@ -87,7 +87,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 	}
 #elif defined(__APPLE__)
 	thread_port_t mach_thread;
-	thread_affinity_policy_data_t policy = { static_cast<integer_t>(cpu_id) };
+	thread_affinity_policy_data_t policy = {static_cast<integer_t>(cpu_id)};
 	mach_thread = pthread_mach_thread_np(h);
 	return thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1) == KERN_SUCCESS;
 #elif defined(__FreeBSD__)
@@ -96,8 +96,8 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 	CPU_SET(cpu_id, &mn);
 	return pthread_setaffinity_np(h, sizeof(cpuset_t), &mn) == 0;
 #elif defined(__OpenBSD__)
-        printer::inst()->print_msg(L0,"WARNING: thread pinning is not supported under OPENBSD.");
-        return true;
+	printer::inst()->print_msg(L0, "WARNING: thread pinning is not supported under OPENBSD.");
+	return true;
 #else
 	cpu_set_t mn;
 	CPU_ZERO(&mn);
@@ -120,7 +120,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
 
-	switch (iMultiway)
+	switch(iMultiway)
 	{
 	case 5:
 		oWorkThd = std::thread(&minethd::penta_work_main, this);
@@ -150,13 +150,13 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 cryptonight_ctx* minethd::minethd_alloc_ctx()
 {
 	cryptonight_ctx* ctx;
-	alloc_msg msg = { 0 };
+	alloc_msg msg = {0};
 
-	switch (::jconf::inst()->GetSlowMemSetting())
+	switch(::jconf::inst()->GetSlowMemSetting())
 	{
 	case ::jconf::never_use:
 		ctx = cryptonight_alloc_ctx(1, 1, &msg);
-		if (ctx == NULL)
+		if(ctx == NULL)
 			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning);
 		else
 		{
@@ -170,7 +170,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
 
 	case ::jconf::no_mlck:
 		ctx = cryptonight_alloc_ctx(1, 0, &msg);
-		if (ctx == NULL)
+		if(ctx == NULL)
 			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning);
 		else
 		{
@@ -184,12 +184,12 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
 
 	case ::jconf::print_warning:
 		ctx = cryptonight_alloc_ctx(1, 1, &msg);
-		if (msg.warning != NULL)
+		if(msg.warning != NULL)
 			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning);
-		if (ctx == NULL)
+		if(ctx == NULL)
 			ctx = cryptonight_alloc_ctx(0, 0, NULL);
 
-		if (ctx != NULL)
+		if(ctx != NULL)
 		{
 			ctx->hash_fn = nullptr;
 			ctx->loop_fn = nullptr;
@@ -220,11 +220,11 @@ cryptonight_ctx* minethd::minethd_alloc_ctx()
 static constexpr size_t MAX_N = 5;
 bool minethd::self_test()
 {
-	alloc_msg msg = { 0 };
+	alloc_msg msg = {0};
 	size_t res;
 	bool fatal = false;
 
-	switch (::jconf::inst()->GetSlowMemSetting())
+	switch(::jconf::inst()->GetSlowMemSetting())
 	{
 	case ::jconf::never_use:
 		res = cryptonight_init(1, 1, &msg);
@@ -255,13 +255,13 @@ bool minethd::self_test()
 	if(res == 0 && fatal)
 		return false;
 
-	cryptonight_ctx *ctx[MAX_N] = {0};
-	for (int i = 0; i < MAX_N; i++)
+	cryptonight_ctx* ctx[MAX_N] = {0};
+	for(int i = 0; i < MAX_N; i++)
 	{
-		if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
+		if((ctx[i] = minethd_alloc_ctx()) == nullptr)
 		{
 			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
-			for (int j = 0; j < i; j++)
+			for(int j = 0; j < i; j++)
 				cryptonight_free_ctx(ctx[j]);
 			return false;
 		}
@@ -279,63 +279,68 @@ bool minethd::self_test()
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
 			minethd::cn_on_new_job dm;
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
 
 			func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+			bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+											 "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22",
+									 64) == 0;
 
 			func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+			bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+											 "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22",
+									 64) == 0;
 
 			func_multi_selector<3>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a testThis is a testThis is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05",
+									 96) == 0;
 
 			func_multi_selector<4>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05",
+									 128) == 0;
 
 			func_multi_selector<5>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
+			bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+											 "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05",
+									 160) == 0;
 		}
 		else if(algo == POW(cryptonight_lite))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+			bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+			bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_monero))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+			bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+			bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_monero_v8))
 		{
@@ -351,61 +356,61 @@ bool minethd::self_test()
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+			bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+			bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_ipbc))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_stellite))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_masari))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+			bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_heavy))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+			bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+			bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_haven))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+			bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+			bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_bittube2))
 		{
@@ -415,7 +420,7 @@ bool minethd::self_test()
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 
 			ctx[0]->hash_fn("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
+			bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
 
 			ctx[0]->hash_fn("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
@@ -427,29 +432,29 @@ bool minethd::self_test()
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
+			bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_gpu))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
+			bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
+			bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_conceal))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
 
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
+			bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
 		}
-		else if (algo == POW(cryptonight_turtle))
+		else if(algo == POW(cryptonight_turtle))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
 			ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo);
@@ -467,7 +472,7 @@ bool minethd::self_test()
 			work.iBlockHeight = 1806260;
 			set_job(work, ctx);
 			ctx[0]->hash_fn("\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74", 44, out, ctx, algo);
-			bResult = bResult &&  memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0;
+			bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0;
 		}
 		else if(algo == POW(cryptonight_v8_reversewaltz))
 		{
@@ -498,7 +503,7 @@ bool minethd::self_test()
 				"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
 	}
 
-	for (int i = 0; i < MAX_N; i++)
+	for(int i = 0; i < MAX_N; i++)
 		cryptonight_free_ctx(ctx[i]);
 
 	return bResult;
@@ -520,14 +525,13 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		win_exit();
 	}
 
-
 	//Launch the requested number of single and double threads, to distribute
 	//load evenly we need to alternate single and double threads
 	size_t i, n = jconf::inst()->GetThreadCount();
 	pvThreads.reserve(n);
 
 	jconf::thd_cfg cfg;
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		jconf::inst()->GetThreadConfig(i, cfg);
 
@@ -572,11 +576,11 @@ static std::string getAsmName(const uint32_t num_hashes)
 	return asm_type;
 }
 
-template<size_t N>
+template <size_t N>
 void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& on_new_job,
 	bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str)
 {
-	static_assert(N >= 1, "number of threads must be >= 1" );
+	static_assert(N >= 1, "number of threads must be >= 1");
 
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
@@ -717,21 +721,20 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, true>
-	};
+		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, true>};
 
 	std::bitset<2> digit;
 	digit.set(0, !bHaveAes);
 	digit.set(1, !bNoPrefetch);
 
-	ctx[0]->hash_fn = func_table[ algv << 2 | digit.to_ulong() ];
+	ctx[0]->hash_fn = func_table[algv << 2 | digit.to_ulong()];
 
 	// check for asm optimized version for cryptonight_v8
 	if(algo == cryptonight_monero_v8)
 	{
 		std::string selected_asm = asm_version_str;
 		if(selected_asm == "auto")
-				selected_asm = cpu::getAsmName(N);
+			selected_asm = cpu::getAsmName(N);
 
 		if(selected_asm != "off")
 		{
@@ -747,7 +750,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 	{
 		std::string selected_asm = asm_version_str;
 		if(selected_asm == "auto")
-				selected_asm = cpu::getAsmName(N);
+			selected_asm = cpu::getAsmName(N);
 		if(selected_asm == "off")
 		{
 			for(int h = 0; h < N; ++h)
@@ -769,7 +772,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 	};
 
 	auto it = on_new_job_map.find(algo.Id());
-	if (it != on_new_job_map.end())
+	if(it != on_new_job_map.end())
 		on_new_job = it->second;
 	else
 		on_new_job = nullptr;
@@ -806,18 +809,18 @@ void minethd::penta_work_main()
 	multiway_work_main<5u>();
 }
 
-template<size_t N>
-void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce)
+template <size_t N>
+void minethd::prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce)
 {
-	for (size_t i = 0; i < N; i++)
+	for(size_t i = 0; i < N; i++)
 	{
 		memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize);
-		if (i > 0)
+		if(i > 0)
 			piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39);
 	}
 }
 
-template<uint32_t N>
+template <uint32_t N>
 void minethd::multiway_work_main()
 {
 	if(affinity >= 0) //-1 means no affinity
@@ -828,22 +831,23 @@ void minethd::multiway_work_main()
 	lck.release();
 	std::this_thread::yield();
 
-	cryptonight_ctx *ctx[MAX_N];
+	cryptonight_ctx* ctx[MAX_N];
 	uint64_t iCount = 0;
-	uint64_t *piHashVal[MAX_N];
-	uint32_t *piNonce[MAX_N];
+	uint64_t iLastCount = 0;
+	uint64_t* piHashVal[MAX_N];
+	uint32_t* piNonce[MAX_N];
 	uint8_t bHashOut[MAX_N * 32];
 	uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N];
 	uint32_t iNonce;
 	job_result res;
 
-	for (size_t i = 0; i < N; i++)
+	for(size_t i = 0; i < N; i++)
 	{
 		ctx[i] = minethd_alloc_ctx();
 		if(ctx[i] == nullptr)
 		{
 			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
-			for (int j = 0; j < i; j++)
+			for(int j = 0; j < i; j++)
 				cryptonight_free_ctx(ctx[j]);
 			win_exit(1);
 		}
@@ -863,15 +867,15 @@ void minethd::multiway_work_main()
 	size_t lastPoolId = 0;
 
 	func_multi_selector<N>(ctx, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
-	while (bQuit == 0)
+	while(bQuit == 0)
 	{
-		if (oWork.bStall)
+		if(oWork.bStall)
 		{
 			/*	We are stalled here because the executor didn't find a job for us yet,
 			either because of network latency, or a socket problem. Since we are
 			raison d'etre of this software it us sensible to just wait until we have something*/
 
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			globalStates::inst().consume_work(oWork, iJobNo);
@@ -908,13 +912,12 @@ void minethd::multiway_work_main()
 		if(on_new_job != nullptr)
 			on_new_job(oWork, ctx);
 
-		while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 		{
-			if ((iCount++ & 0x7) == 0)  //Store stats every 8*N hashes
+			if((iCount++ & 0x7) == 0) //Store stats every 8*N hashes
 			{
-				uint64_t iStamp = get_timestamp_ms();
-				iHashCount.store(iCount * N, std::memory_order_relaxed);
-				iTimestamp.store(iStamp, std::memory_order_relaxed);
+				updateStats((iCount - iLastCount) * N, oWork.iPoolId);
+				iLastCount = iCount;
 			}
 
 			nonce_ctr -= N;
@@ -927,19 +930,18 @@ void minethd::multiway_work_main()
 					break;
 			}
 
-			for (size_t i = 0; i < N; i++)
+			for(size_t i = 0; i < N; i++)
 				*piNonce[i] = iNonce++;
 
 			ctx[0]->hash_fn(bWorkBlob, oWork.iWorkSize, bHashOut, ctx, miner_algo);
 
-			for (size_t i = 0; i < N; i++)
+			for(size_t i = 0; i < N; i++)
 			{
-				if (*piHashVal[i] < oWork.iTarget)
+				if(*piHashVal[i] < oWork.iTarget)
 				{
 					executor::inst()->push_event(
 						ex_event(job_result(oWork.sJobID, iNonce - N + i, bHashOut + 32 * i, iThreadNo, miner_algo),
-						oWork.iPoolId)
-					);
+							oWork.iPoolId));
 				}
 			}
 
@@ -950,7 +952,7 @@ void minethd::multiway_work_main()
 		prep_multiway_work<N>(bWorkBlob, piNonce);
 	}
 
-	for (int i = 0; i < N; i++)
+	for(int i = 0; i < N; i++)
 		cryptonight_free_ctx(ctx[i]);
 }
 
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 1e25f5d4fda0ce114f53a8978f8c24375a7b4c0d..a5201f37ae94f3cff16ed312871684649b9af901 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -1,15 +1,15 @@
 #pragma once
 
-#include "xmrstak/jconf.hpp"
 #include "crypto/cryptonight.h"
-#include "xmrstak/backend/miner_work.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/backend/miner_work.hpp"
+#include "xmrstak/jconf.hpp"
 
+#include <atomic>
+#include <future>
 #include <iostream>
 #include <thread>
 #include <vector>
-#include <atomic>
-#include <future>
 
 namespace xmrstak
 {
@@ -18,7 +18,7 @@ namespace cpu
 
 class minethd : public iBackend
 {
-public:
+  public:
 	static std::vector<iBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
@@ -29,19 +29,18 @@ public:
 
 	static cryptonight_ctx* minethd_alloc_ctx();
 
-	template<size_t N>
+	template <size_t N>
 	static void func_multi_selector(cryptonight_ctx**, minethd::cn_on_new_job& on_new_job,
-			bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off");
+		bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off");
 
-	private:
-		
+  private:
 	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
-	template<uint32_t N>
+	template <uint32_t N>
 	void multiway_work_main();
 
-	template<size_t N>
-	void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce);
+	template <size_t N>
+	void prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce);
 
 	void work_main();
 	void double_work_main();
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index e58665922bb6b7da8ea987153dee3b3297fb4654..262865ea0d21c156a83e3aa129f9fea724826920 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -1,9 +1,9 @@
 #pragma once
-#include <stddef.h>
+#include <array>
 #include <inttypes.h>
-#include <type_traits>
+#include <stddef.h>
 #include <string>
-#include <array>
+#include <type_traits>
 
 constexpr size_t start_derived_algo_id = 1000;
 
@@ -15,10 +15,10 @@ enum xmrstak_algo_id
 	cryptonight_monero = 3,
 	cryptonight_heavy = 4,
 	cryptonight_aeon = 5,
-	cryptonight_ipbc = 6, // equal to cryptonight_aeon with a small tweak in the miner code
-	cryptonight_stellite = 7, //equal to cryptonight_monero but with one tiny change
-	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
-	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
+	cryptonight_ipbc = 6,	  // equal to cryptonight_aeon with a small tweak in the miner code
+	cryptonight_stellite = 7,  //equal to cryptonight_monero but with one tiny change
+	cryptonight_masari = 8,	//equal to cryptonight_monero but with less iterations, used by masari
+	cryptonight_haven = 9,	 // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
 	cryptonight_monero_v8 = 11,
 	cryptonight_superfast = 12,
@@ -42,35 +42,32 @@ enum xmrstak_algo_id
 inline std::string get_algo_name(xmrstak_algo_id algo_id)
 {
 	static std::array<std::string, 18> base_algo_names =
-	{{
-		"invalid_algo",
-		"cryptonight",
-		"cryptonight_lite",
-		"cryptonight_v7",
-		"cryptonight_heavy",
-		"cryptonight_lite_v7",
-		"cryptonight_lite_v7_xor",
-		"cryptonight_v7_stellite",
-		"cryptonight_masari",
-		"cryptonight_haven",
-		"cryptonight_bittube2",
-		"cryptonight_v8",
-		"cryptonight_superfast",
-		"cryptonight_gpu",
-		"cryptonight_conceal",
-		"cryptonight_r_wow",
-		"cryptonight_r",
-		"cryptonight_v8_reversewaltz" // used by graft
-	}};
+		{{
+			"invalid_algo",
+			"cryptonight",
+			"cryptonight_lite",
+			"cryptonight_v7",
+			"cryptonight_heavy",
+			"cryptonight_lite_v7",
+			"cryptonight_lite_v7_xor",
+			"cryptonight_v7_stellite",
+			"cryptonight_masari",
+			"cryptonight_haven",
+			"cryptonight_bittube2",
+			"cryptonight_v8",
+			"cryptonight_superfast",
+			"cryptonight_gpu",
+			"cryptonight_conceal",
+			"cryptonight_r_wow",
+			"cryptonight_r",
+			"cryptonight_v8_reversewaltz" // used by graft
+		}};
 
 	static std::array<std::string, 4> derived_algo_names =
-	{{
-		"cryptonight_turtle",
-		"cryptonight_v8_half", // used by masari and stellite
-		"cryptonight_v8_zelerius",
-		"cryptonight_v8_double"
-	}};
-
+		{{"cryptonight_turtle",
+			"cryptonight_v8_half", // used by masari and stellite
+			"cryptonight_v8_zelerius",
+			"cryptonight_v8_double"}};
 
 	if(algo_id < start_derived_algo_id)
 		return base_algo_names[algo_id];
@@ -80,19 +77,35 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id)
 
 struct xmrstak_algo
 {
-	xmrstak_algo(xmrstak_algo_id name_id) : algo_name(name_id), base_algo(name_id)
+	xmrstak_algo(xmrstak_algo_id name_id) :
+		algo_name(name_id),
+		base_algo(name_id)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : algo_name(name_id), base_algo(algorithm)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) :
+		algo_name(name_id),
+		base_algo(algorithm)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : algo_name(name_id), base_algo(algorithm), iter(iteration)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) :
+		algo_name(name_id),
+		base_algo(algorithm),
+		iter(iteration)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) :
+		algo_name(name_id),
+		base_algo(algorithm),
+		iter(iteration),
+		mem(memory)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory), mask(mem_mask)
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) :
+		algo_name(name_id),
+		base_algo(algorithm),
+		iter(iteration),
+		mem(memory),
+		mask(mem_mask)
 	{
 	}
 
@@ -187,35 +200,33 @@ constexpr uint32_t CN_DOUBLE_ITER = 0x100000;
 
 inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 {
-	static std::array<xmrstak_algo, 18> pow = {{
-		{invalid_algo, invalid_algo},
+	static std::array<xmrstak_algo, 18> pow = {{{invalid_algo, invalid_algo},
 		{cryptonight, cryptonight, CN_ITER, CN_MEMORY},
-		{cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2},
+		{cryptonight_lite, cryptonight_lite, CN_ITER / 2, CN_MEMORY / 2},
 		{cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY},
-		{cryptonight_heavy, cryptonight_heavy, CN_ITER/2, CN_MEMORY*2},
-		{cryptonight_aeon, cryptonight_aeon, CN_ITER/2, CN_MEMORY/2},
-		{cryptonight_ipbc, cryptonight_ipbc, CN_ITER/2, CN_MEMORY/2}, // equal to cryptonight_aeon with a small tweak in the miner code
-		{cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change
-		{cryptonight_masari, cryptonight_masari, CN_ITER/2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari
-		{cryptonight_haven, cryptonight_haven, CN_ITER/2, CN_MEMORY*2}, // equal to cryptonight_heavy with a small tweak
-		{cryptonight_bittube2, cryptonight_bittube2, CN_ITER/2, CN_MEMORY*2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+		{cryptonight_heavy, cryptonight_heavy, CN_ITER / 2, CN_MEMORY * 2},
+		{cryptonight_aeon, cryptonight_aeon, CN_ITER / 2, CN_MEMORY / 2},
+		{cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2},		  // equal to cryptonight_aeon with a small tweak in the miner code
+		{cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY},		  //equal to cryptonight_monero but with one tiny change
+		{cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY},		  //equal to cryptonight_monero but with less iterations, used by masari
+		{cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2},		  // equal to cryptonight_heavy with a small tweak
+		{cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
 		{cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY},
-		{cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY},
+		{cryptonight_superfast, cryptonight_superfast, CN_ITER / 4, CN_MEMORY},
 		{cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK},
-		{cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY},
+		{cryptonight_conceal, cryptonight_conceal, CN_ITER / 2, CN_MEMORY},
 		{cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY},
 		{cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY},
-		{cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}
-	}};
+		{cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}}};
 
 	static std::array<xmrstak_algo, 4> derived_pow =
-	{{
-		{cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK},
-		{cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY},
-		{cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY},
-		{cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY}
-		// {cryptonight_derived}
-	}};
+		{{
+			{cryptonight_turtle, cryptonight_monero_v8, CN_ITER / 8, CN_MEMORY / 8, CN_TURTLE_MASK},
+			{cryptonight_v8_half, cryptonight_monero_v8, CN_ITER / 2, CN_MEMORY},
+			{cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY},
+			{cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY}
+			// {cryptonight_derived}
+		}};
 
 	if(algo_id < start_derived_algo_id)
 		return pow[algo_id];
diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp
index 52ef3f39150ea9bb9b4607c89fd867a15dfc764f..5b4332ba41e877436a0bcc5ea2eead6394bbb749 100644
--- a/xmrstak/backend/globalStates.cpp
+++ b/xmrstak/backend/globalStates.cpp
@@ -21,15 +21,14 @@
   *
   */
 
-#include "miner_work.hpp"
 #include "globalStates.hpp"
+#include "miner_work.hpp"
 
 #include <assert.h>
-#include <cmath>
 #include <chrono>
+#include <cmath>
 #include <cstring>
 
-
 namespace xmrstak
 {
 
diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp
index d6966c4a2ed68f91acd63f2c801aca2327eaabea..a3ff30eea6ae829b88574d25c8c5d19be58ed400 100644
--- a/xmrstak/backend/globalStates.hpp
+++ b/xmrstak/backend/globalStates.hpp
@@ -1,10 +1,10 @@
 #pragma once
 
 #include "xmrstak/backend/miner_work.hpp"
-#include "xmrstak/misc/environment.hpp"
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/backend/pool_data.hpp"
 #include "xmrstak/cpputil/read_write_lock.h"
+#include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/environment.hpp"
 
 #include <atomic>
 
@@ -32,7 +32,7 @@ struct globalStates
 			nonce = iGlobalNonce.fetch_add(reserve_count);
 	}
 
-	void consume_work( miner_work& threadWork, uint64_t& currentJobId);
+	void consume_work(miner_work& threadWork, uint64_t& currentJobId);
 
 	miner_work oGlobalWork;
 	std::atomic<uint64_t> iGlobalJobNo;
@@ -41,8 +41,11 @@ struct globalStates
 	uint64_t iThreadCount;
 	size_t pool_id = invalid_pool_id;
 
-private:
-	globalStates() : iThreadCount(0), iGlobalJobNo(0), iConsumeCnt(0)
+  private:
+	globalStates() :
+		iThreadCount(0),
+		iGlobalJobNo(0),
+		iConsumeCnt(0)
 	{
 	}
 
diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp
index 18411b79c0c1cd9062a183c3b78f9b32ddd3927c..dd59b6c52e906a7da61402dbc86c842b91f3ca43 100644
--- a/xmrstak/backend/iBackend.hpp
+++ b/xmrstak/backend/iBackend.hpp
@@ -1,12 +1,13 @@
 #pragma once
 
 #include "xmrstak/backend/globalStates.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 
 #include <atomic>
-#include <cstdint>
 #include <climits>
-#include <vector>
+#include <cstdint>
 #include <string>
+#include <vector>
 
 template <typename T, std::size_t N>
 constexpr std::size_t countof(T const (&)[N]) noexcept
@@ -16,35 +17,65 @@ constexpr std::size_t countof(T const (&)[N]) noexcept
 
 namespace xmrstak
 {
-	struct iBackend
+struct iBackend
+{
+
+	enum BackendType : uint32_t
 	{
+		UNKNOWN = 0u,
+		CPU = 1u,
+		AMD = 2u,
+		NVIDIA = 3u
+	};
 
-		enum BackendType : uint32_t { UNKNOWN = 0u, CPU = 1u, AMD = 2u, NVIDIA = 3u };
+	static const char* getName(const BackendType type)
+	{
+		const char* backendNames[] = {
+			"unknown",
+			"cpu",
+			"amd",
+			"nvidia"};
 
-		static const char* getName(const BackendType type)
-		{
-			const char* backendNames[] = {
-				"unknown",
-				"cpu",
-				"amd",
-				"nvidia"
-			};
-
-			uint32_t i = static_cast<uint32_t>(type);
-			if(i >= countof(backendNames))
-				i = 0;
-
-			return backendNames[i];
-		}
+		uint32_t i = static_cast<uint32_t>(type);
+		if(i >= countof(backendNames))
+			i = 0;
+
+		return backendNames[i];
+	}
 
-		std::atomic<uint64_t> iHashCount;
-		std::atomic<uint64_t> iTimestamp;
-		uint32_t iThreadNo;
-		BackendType backendType = UNKNOWN;
+	std::atomic<uint64_t> iHashCount;
+	std::atomic<uint64_t> iTimestamp;
+	uint32_t iThreadNo;
+	BackendType backendType = UNKNOWN;
+	uint64_t iLastStamp = get_timestamp_ms();
+	double avgHashPerMsec = 0.0;
 
-		iBackend() : iHashCount(0), iTimestamp(0)
+	void updateStats(uint64_t numNewHashes, size_t poolId) // add numNewHashes to iHashCount and refresh iTimestamp
+	{
+		uint64_t iStamp = get_timestamp_ms();
+		double timeDiff = static_cast<double>(iStamp - iLastStamp); // ms since the previous call; 0 if both calls hit the same ms
+		iLastStamp = iStamp;
+
+		if(poolId == 0)
 		{
+			// if dev pool is active interpolate the number of hashes (avoid hash rate drops)
+			numNewHashes = static_cast<uint64_t>(avgHashPerMsec * timeDiff);
 		}
-	};
+		else if(timeDiff > 0.0) // a zero-length interval would divide by zero and poison the average with inf/NaN
+		{
+			const double hashRatePerMs = static_cast<double>(numNewHashes) / timeDiff;
+			constexpr double averagingBias = 0.1; // weight of the newest sample in the running average
+			avgHashPerMsec = avgHashPerMsec * (1.0 - averagingBias) + hashRatePerMs * averagingBias;
+		}
+		iHashCount.fetch_add(numNewHashes, std::memory_order_relaxed);
+		iTimestamp.store(iStamp, std::memory_order_relaxed);
+	}
+
+	iBackend() :
+		iHashCount(0),
+		iTimestamp(0)
+	{
+	}
+};
 
 } // namespace xmrstak
diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp
index d0e5237f27b3a632e3a9d501492325d02714d0c6..114f2db8eb080d7b7d0eef893a6aad1e6595bd8f 100644
--- a/xmrstak/backend/miner_work.hpp
+++ b/xmrstak/backend/miner_work.hpp
@@ -2,95 +2,110 @@
 
 #include "xmrstak/backend/pool_data.hpp"
 
-#include <thread>
 #include <atomic>
-#include <mutex>
-#include <cstdint>
-#include <iostream>
 #include <cassert>
+#include <cstdint>
 #include <cstring>
+#include <iostream>
+#include <mutex>
+#include <thread>
 
 namespace xmrstak
 {
-	struct miner_work
+struct miner_work
+{
+	char sJobID[64];
+	uint8_t bWorkBlob[128];
+	uint32_t iWorkSize;
+	uint64_t iTarget;
+	bool bNiceHash;
+	bool bStall;
+	size_t iPoolId;
+	uint64_t iBlockHeight;
+	uint8_t* ref_ptr;
+
+	miner_work() :
+		iWorkSize(0),
+		bNiceHash(false),
+		bStall(true),
+		iPoolId(invalid_pool_id),
+		ref_ptr((uint8_t*)&iBlockHeight) {}
+
+	miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
+		uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) :
+		iWorkSize(iWorkSize),
+		iTarget(iTarget),
+		bNiceHash(bNiceHash),
+		bStall(false),
+		iPoolId(iPoolId),
+		iBlockHeight(iBlockHeiht),
+		ref_ptr((uint8_t*)&iBlockHeight)
 	{
-		char        sJobID[64];
-		uint8_t     bWorkBlob[128];
-		uint32_t    iWorkSize;
-		uint64_t    iTarget;
-		bool        bNiceHash;
-		bool        bStall;
-		size_t      iPoolId;
-		uint64_t	iBlockHeight;
-		uint8_t*	ref_ptr;
-
-		miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id), ref_ptr((uint8_t*)&iBlockHeight) { }
-
-		miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
-			uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize),
-			iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight)
-		{
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(this->bWorkBlob, bWork, iWorkSize);
-			memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
-		}
-
-		miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget),
-			bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight)
-		{
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
-			memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
-		}
-
-		miner_work(miner_work const&) = delete;
-
-		miner_work& operator=(miner_work&& from)
-		{
-			assert(this != &from);
-
-			iBlockHeight = from.iBlockHeight;
-			iPoolId = from.iPoolId;
-			bStall = from.bStall;
-			iWorkSize = from.iWorkSize;
-			bNiceHash = from.bNiceHash;
-			iTarget = from.iTarget;
-
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(sJobID, from.sJobID, sizeof(sJobID));
-			memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(this->bWorkBlob, bWork, iWorkSize);
+		memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
+	}
+
+	miner_work(miner_work&& from) : // member-wise copy of `from`; `from` itself is not modified
+		iWorkSize(from.iWorkSize),
+		iTarget(from.iTarget),
+		bNiceHash(from.bNiceHash), bStall(from.bStall), // bNiceHash used to be left uninitialized here
+		iPoolId(from.iPoolId),
+		iBlockHeight(from.iBlockHeight),
+		ref_ptr((uint8_t*)&iBlockHeight) // always points at this object's own iBlockHeight
+	{
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
+		memcpy(this->sJobID, from.sJobID, sizeof(miner_work::sJobID)); // take the job id from the source, not from ourselves
+	}
 
-			return *this;
-		}
+	miner_work(miner_work const&) = delete;
+
+	miner_work& operator=(miner_work&& from)
+	{
+		assert(this != &from);
 
-		miner_work& operator=(miner_work const& from)
-		{
-			assert(this != &from);
+		iBlockHeight = from.iBlockHeight;
+		iPoolId = from.iPoolId;
+		bStall = from.bStall;
+		iWorkSize = from.iWorkSize;
+		bNiceHash = from.bNiceHash;
+		iTarget = from.iTarget;
 
-			iBlockHeight = from.iBlockHeight;
-			iPoolId = from.iPoolId;
-			bStall = from.bStall;
-			iWorkSize = from.iWorkSize;
-			bNiceHash = from.bNiceHash;
-			iTarget = from.iTarget;
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(sJobID, from.sJobID, sizeof(sJobID));
+		memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
 
-			if(!ref_ptr)
-				return *this;
+		return *this;
+	}
 
-			for(size_t i=0; i <= 7 && iPoolId; i++)
-				ref_ptr[i] = from.ref_ptr[7-i];
+	miner_work& operator=(miner_work const& from)
+	{
+		assert(this != &from);
 
-			assert(iWorkSize <= sizeof(bWorkBlob));
-			memcpy(sJobID, from.sJobID, sizeof(sJobID));
-			memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
+		iBlockHeight = from.iBlockHeight;
+		iPoolId = from.iPoolId;
+		bStall = from.bStall;
+		iWorkSize = from.iWorkSize;
+		bNiceHash = from.bNiceHash;
+		iTarget = from.iTarget;
 
+		if(!ref_ptr)
 			return *this;
-		}
 
-		uint8_t getVersion() const
-		{
-			return bWorkBlob[0];
-		}
+		for(size_t i = 0; i <= 7 && iPoolId; i++)
+			ref_ptr[i] = from.ref_ptr[7 - i];
+
+		assert(iWorkSize <= sizeof(bWorkBlob));
+		memcpy(sJobID, from.sJobID, sizeof(sJobID));
+		memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
 
-	};
+		return *this;
+	}
+
+	uint8_t getVersion() const
+	{
+		return bWorkBlob[0];
+	}
+};
 } // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
index f1bf7581995c95d45a030a690f1b8a82b3a924f6..a7587cbe0387cdeb62ff010944d046181e701e7c 100644
--- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
+++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp
@@ -14,17 +14,17 @@
  *
  */
 
-#include <string>
-#include <sstream>
-#include <mutex>
 #include <cstring>
+#include <mutex>
 #include <nvrtc.h>
+#include <sstream>
+#include <string>
 #include <thread>
 
-#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp"
 #include "xmrstak/backend/cpu/crypto/variant4_random_math.h"
-#include "xmrstak/misc/console.hpp"
+#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp"
 #include "xmrstak/cpputil/read_write_lock.h"
+#include "xmrstak/misc/console.hpp"
 
 namespace xmrstak
 {
@@ -33,80 +33,82 @@ namespace nvidia
 
 static std::string get_code(const V4_Instruction* code, int code_size)
 {
-    std::stringstream s;
+	std::stringstream s;
 
-    for (int i = 0; i < code_size; ++i)
-    {
-        const V4_Instruction inst = code[i];
+	for(int i = 0; i < code_size; ++i)
+	{
+		const V4_Instruction inst = code[i];
 
-        const uint32_t a = inst.dst_index;
-        const uint32_t b = inst.src_index;
+		const uint32_t a = inst.dst_index;
+		const uint32_t b = inst.src_index;
 
-        switch (inst.opcode)
-        {
-        case MUL:
-            s << 'r' << a << "*=r" << b << ';';
-            break;
+		switch(inst.opcode)
+		{
+		case MUL:
+			s << 'r' << a << "*=r" << b << ';';
+			break;
 
-        case ADD:
-            s << 'r' << a << "+=r" << b << '+' << inst.C << "U;";
-            break;
+		case ADD:
+			s << 'r' << a << "+=r" << b << '+' << inst.C << "U;";
+			break;
 
-        case SUB:
-            s << 'r' << a << "-=r" << b << ';';
-            break;
+		case SUB:
+			s << 'r' << a << "-=r" << b << ';';
+			break;
 
-        case ROR:
-            s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");";
-            break;
+		case ROR:
+			s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");";
+			break;
 
-        case ROL:
-            s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");";
-            break;
+		case ROL:
+			s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");";
+			break;
 
-        case XOR:
-            s << 'r' << a << "^=r" << b << ';';
-            break;
-        }
+		case XOR:
+			s << 'r' << a << "^=r" << b << ';';
+			break;
+		}
 
-        s << '\n';
-    }
+		s << '\n';
+	}
 
-    return s.str();
+	return s.str();
 }
 
 struct CacheEntry
 {
-    CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector<char>& ptx, const std::string& lowered_name) :
-        algo(algo),
-        height(height),
-        arch_major(arch_major),
-        arch_minor(arch_minor),
-        ptx(ptx),
-        lowered_name(lowered_name)
-    {}
-
-    xmrstak_algo algo;
-    uint64_t height;
-    int arch_major;
-    int arch_minor;
-    std::vector<char> ptx;
-    std::string lowered_name;
+	CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector<char>& ptx, const std::string& lowered_name) :
+		algo(algo),
+		height(height),
+		arch_major(arch_major),
+		arch_minor(arch_minor),
+		ptx(ptx),
+		lowered_name(lowered_name)
+	{
+	}
+
+	xmrstak_algo algo;
+	uint64_t height;
+	int arch_major;
+	int arch_minor;
+	std::vector<char> ptx;
+	std::string lowered_name;
 };
 
 struct BackgroundTaskBase
 {
-    virtual ~BackgroundTaskBase() {}
-    virtual void exec() = 0;
+	virtual ~BackgroundTaskBase() {}
+	virtual void exec() = 0;
 };
 
-template<typename T>
+template <typename T>
 struct BackgroundTask : public BackgroundTaskBase
 {
-    BackgroundTask(T&& func) : m_func(std::move(func)) {}
-    void exec() override { m_func(); }
+	BackgroundTask(T&& func) :
+		m_func(std::move(func)) {}
+	void exec() override { m_func(); }
 
-    T m_func;
+	T m_func;
 };
 
 static ::cpputil::RWLock CryptonightR_cache_mutex;
@@ -119,155 +121,165 @@ static std::thread* background_thread = nullptr;
 
 static void background_thread_proc()
 {
-    std::vector<BackgroundTaskBase*> tasks;
-    for (;;) {
-        tasks.clear();
-        {
-            std::lock_guard<std::mutex> g(background_tasks_mutex);
-            background_tasks.swap(tasks);
-        }
-
-        for (BackgroundTaskBase* task : tasks) {
-            task->exec();
-            delete task;
-        }
-
-        std::this_thread::sleep_for(std::chrono::milliseconds(500));
-    }
+	std::vector<BackgroundTaskBase*> tasks;
+	for(;;)
+	{
+		tasks.clear();
+		{
+			std::lock_guard<std::mutex> g(background_tasks_mutex);
+			background_tasks.swap(tasks);
+		}
+
+		for(BackgroundTaskBase* task : tasks)
+		{
+			task->exec();
+			delete task;
+		}
+
+		std::this_thread::sleep_for(std::chrono::milliseconds(500));
+	}
 }
 
-template<typename T>
+template <typename T>
 static void background_exec(T&& func)
 {
-    BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
-
-    std::lock_guard<std::mutex> g(background_tasks_mutex);
-    background_tasks.push_back(task);
-    if (!background_thread) {
-        background_thread = new std::thread(background_thread_proc);
-    }
+	BackgroundTaskBase* task = new BackgroundTask<T>(std::move(func));
+
+	std::lock_guard<std::mutex> g(background_tasks_mutex);
+	background_tasks.push_back(task);
+	if(!background_thread)
+	{
+		background_thread = new std::thread(background_thread_proc);
+	}
 }
 
 static void CryptonightR_build_program(
-    std::vector<char>& ptx,
-    std::string& lowered_name,
-    const xmrstak_algo& algo,
-    uint64_t height,
-    uint32_t precompile_count,
-    int arch_major,
-    int arch_minor,
-    std::string source)
+	std::vector<char>& ptx,
+	std::string& lowered_name,
+	const xmrstak_algo& algo,
+	uint64_t height,
+	uint32_t precompile_count,
+	int arch_major,
+	int arch_minor,
+	std::string source)
 {
-    {
+	{
 		CryptonightR_cache_mutex.WriteLock();
 
-        // Remove old programs from cache
-        for (size_t i = 0; i < CryptonightR_cache.size();)
-        {
-            const CacheEntry& entry = CryptonightR_cache[i];
-            if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
-                CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
-                CryptonightR_cache.pop_back();
-            }
-            else
-            {
-                ++i;
-            }
-        }
+		// Remove old programs from cache
+		for(size_t i = 0; i < CryptonightR_cache.size();)
+		{
+			const CacheEntry& entry = CryptonightR_cache[i];
+			if((entry.algo == algo) && (entry.height + 2 + precompile_count < height))
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height);
+				CryptonightR_cache[i] = std::move(CryptonightR_cache.back());
+				CryptonightR_cache.pop_back();
+			}
+			else
+			{
+				++i;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
+	}
 
-    ptx.clear();
-    ptx.reserve(65536);
+	ptx.clear();
+	ptx.reserve(65536);
 
-    std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
-    {
+	std::lock_guard<std::mutex> g1(CryptonightR_build_mutex);
+	{
 		CryptonightR_cache_mutex.ReadLock();
 
-        // Check if the cache already has this program (some other thread might have added it first)
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
-            {
-                ptx = entry.ptx;
-                lowered_name = entry.lowered_name;
+		// Check if the cache already has this program (some other thread might have added it first)
+		for(const CacheEntry& entry : CryptonightR_cache)
+		{
+			if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
+			{
+				ptx = entry.ptx;
+				lowered_name = entry.lowered_name;
 				CryptonightR_cache_mutex.UnLock();
-                return;
-            }
-        }
+				return;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
-
-    nvrtcProgram prog;
-    nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result));
-        return;
-    }
-
-    result = nvrtcAddNameExpression(prog, "CryptonightR_phase2");
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    char opt0[64];
-    sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor);
-
-    char opt1[64];
-    sprintf(opt1, "-DALGO=%d", static_cast<int>(algo.Id()));
-
-	const char* opts[2] = { opt0, opt1 };
-
-    result = nvrtcCompileProgram(prog, 2, opts);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result));
-
-        size_t logSize;
-        if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) {
-            char *log = new char[logSize];
-            if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) {
-                printer::inst()->print_msg(L0, "Program compile log: %s", log);
-            }
-            delete[]log;
-        }
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-
-    const char* name;
-    result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    size_t ptxSize;
-    result = nvrtcGetPTXSize(prog, &ptxSize);
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    ptx.resize(ptxSize);
-    result = nvrtcGetPTX(prog, ptx.data());
-    if (result != NVRTC_SUCCESS) {
-        printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result));
-        nvrtcDestroyProgram(&prog);
-        return;
-    }
-
-    lowered_name = name;
-
-    nvrtcDestroyProgram(&prog);
-
-    printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height);
+	}
+
+	nvrtcProgram prog;
+	nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result));
+		return;
+	}
+
+	result = nvrtcAddNameExpression(prog, "CryptonightR_phase2");
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	char opt0[64];
+	sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor);
+
+	char opt1[64];
+	sprintf(opt1, "-DALGO=%d", static_cast<int>(algo.Id()));
+
+	const char* opts[2] = {opt0, opt1};
+
+	result = nvrtcCompileProgram(prog, 2, opts);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result));
+
+		size_t logSize;
+		if(nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS)
+		{
+			char* log = new char[logSize];
+			if(nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS)
+			{
+				printer::inst()->print_msg(L0, "Program compile log: %s", log);
+			}
+			delete[] log;
+		}
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	const char* name;
+	result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	size_t ptxSize;
+	result = nvrtcGetPTXSize(prog, &ptxSize);
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	ptx.resize(ptxSize);
+	result = nvrtcGetPTX(prog, ptx.data());
+	if(result != NVRTC_SUCCESS)
+	{
+		printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result));
+		nvrtcDestroyProgram(&prog);
+		return;
+	}
+
+	lowered_name = name;
+
+	nvrtcDestroyProgram(&prog);
+
+	printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height);
 
 	CryptonightR_cache_mutex.WriteLock();
 	CryptonightR_cache.emplace_back(algo, height, arch_major, arch_minor, ptx, lowered_name);
@@ -276,62 +288,72 @@ static void CryptonightR_build_program(
 
+/** Fetch the CryptonightR PTX program for one block height, building it on a cache miss.
+ *
+ * When background is true the request is handed to background_exec and this call
+ * returns immediately without touching ptx or lowered_name. Otherwise ptx is cleared
+ * first; it is left empty if the source template lacks the include marker or the
+ * algorithm is not one of the variants handled by the switch below.
+ */
 void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background)
 {
-    if (background) {
-        background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); });
-        return;
-    }
-
-    ptx.clear();
-
-    const char* source_code_template =
-        #include "nvcc_code/cuda_cryptonight_r.curt"
-    ;
-    const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
-    const char* offset = strstr(source_code_template, include_name);
-    if (!offset)
-    {
-        printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt");
-        return;
-    }
-
-    V4_Instruction code[256];
-    int code_size;
-    switch (algo.Id())
-    {
-    case cryptonight_r_wow:
-        code_size = v4_random_math_init<cryptonight_r_wow>(code, height);
-        break;
-    case cryptonight_r:
-        code_size = v4_random_math_init<cryptonight_r>(code, height);
-        break;
-        printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo);
-        return;
-    }
-
-    std::string source_code(source_code_template, offset);
-    source_code.append(get_code(code, code_size));
-    source_code.append(offset + sizeof(include_name) - 1);
-
-    {
+	if(background)
+	{
+		background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); });
+		return;
+	}
+
+	ptx.clear();
+
+	const char* source_code_template =
+#include "nvcc_code/cuda_cryptonight_r.curt"
+		;
+	const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH";
+	const char* offset = strstr(source_code_template, include_name);
+	if(!offset)
+	{
+		printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt");
+		return;
+	}
+
+	V4_Instruction code[256];
+	int code_size;
+	switch(algo.Id())
+	{
+	case cryptonight_r_wow:
+		code_size = v4_random_math_init<cryptonight_r_wow>(code, height);
+		break;
+	case cryptonight_r:
+		code_size = v4_random_math_init<cryptonight_r>(code, height);
+		break;
+	default:
+		// No random-math generator is selected for any other algorithm; bail out with ptx left empty.
+		printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", static_cast<int>(algo.Id()));
+		return;
+	}
+
+	std::string source_code(source_code_template, offset);
+	source_code.append(get_code(code, code_size));
+	source_code.append(offset + sizeof(include_name) - 1);
+
+	{
 		CryptonightR_cache_mutex.ReadLock();
 
-        // Check if the cache has this program
-        for (const CacheEntry& entry : CryptonightR_cache)
-        {
-            if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
-            {
-                printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height);
-                ptx = entry.ptx;
-                lowered_name = entry.lowered_name;
+		// Check if the cache has this program
+		for(const CacheEntry& entry : CryptonightR_cache)
+		{
+			if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor))
+			{
+				printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height);
+				ptx = entry.ptx;
+				lowered_name = entry.lowered_name;
 				CryptonightR_cache_mutex.UnLock();
-                return;
-            }
-        }
+				return;
+			}
+		}
 		CryptonightR_cache_mutex.UnLock();
-    }
+	}
 
-    CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code);
+	CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code);
 }
 
+} // namespace nvidia
 } // namespace xmrstak
-} //namespace nvidia
diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
index c3d8827b064276157f1b596878381f206d68c463..30abf2e59825664af659bcff3a286006d3b1186f 100644
--- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
+++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp
@@ -19,9 +19,8 @@
 #include "xmrstak/backend/cryptonight.hpp"
 
 #include <stdint.h>
-#include <vector>
 #include <string>
-
+#include <vector>
 
 namespace xmrstak
 {
@@ -29,9 +28,7 @@ namespace nvidia
 {
 
 void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name,
-	const xmrstak_algo algo, uint64_t height,  uint32_t precompile_count, int arch_major, int arch_minor, bool background = false);
-
+	const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background = false);
 
+} // namespace nvidia
 } // namespace xmrstak
-} //namespace nvidia
-
diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index 2755e03d2f66fbc9668f1b1e2a0d0f3d165d6e91..a7f35b18b8632aeca1e1b6df3d71ba7ebb9b126b 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -3,17 +3,16 @@
 
 #include "autoAdjust.hpp"
 
-#include "nvcc_code/cryptonight.hpp"
 #include "jconf.hpp"
-#include "xmrstak/misc/console.hpp"
+#include "nvcc_code/cryptonight.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
 
-#include <vector>
 #include <cstdio>
 #include <sstream>
 #include <string>
-
+#include <vector>
 
 namespace xmrstak
 {
@@ -22,11 +21,9 @@ namespace nvidia
 
 class autoAdjust
 {
-public:
-
+  public:
 	autoAdjust()
 	{
-
 	}
 
 	/** print the adjusted values if needed
@@ -63,25 +60,22 @@ public:
 				nvidCtxVec.push_back(ctx);
 			else
 				printer::inst()->print_msg(L0, "WARNING: NVIDIA setup failed for GPU %d.\n", i);
-
 		}
 
 		generateThreadConfig();
 		return true;
-
 	}
 
-private:
-
+  private:
 	void generateThreadConfig()
 	{
 		// load the template of the backend config into a char variable
-		const char *tpl =
-			#include "./config.tpl"
-		;
+		const char* tpl =
+#include "./config.tpl"
+			;
 
 		configEditor configTpl{};
-		configTpl.set( std::string(tpl) );
+		configTpl.set(std::string(tpl));
 
 		constexpr size_t byte2mib = 1024u * 1024u;
 		std::string conf;
@@ -90,18 +84,18 @@ private:
 			if(ctx.device_threads * ctx.device_blocks > 0)
 			{
 				conf += std::string("  // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n";
-				conf += std::string("  //      memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/"  + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n";
+				conf += std::string("  //      memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n";
 				conf += std::string("  //      smx: ") + std::to_string(ctx.device_mpcount) + "\n";
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" +
-					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
-					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
-					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
-					"    \"mem_mode\" : 1,\n" +
-					"  },\n";
+						"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
+						"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
+						"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
+						"    \"mem_mode\" : 1,\n" +
+						"  },\n";
 			}
 		}
 
-		configTpl.replace("GPUCONFIG",conf);
+		configTpl.replace("GPUCONFIG", conf);
 		configTpl.write(params::inst().configFileNVIDIA);
 		printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", params::inst().configFileNVIDIA.c_str());
 	}
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index 6c443343b91310d9698d9466f241b16018c884e4..1cd113c4d25915be4b2a63c62fd5656d9216e6ee 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -22,8 +22,8 @@
   */
 
 #include "jconf.hpp"
-#include "xmrstak/misc/jext.hpp"
 #include "xmrstak/misc/console.hpp"
+#include "xmrstak/misc/jext.hpp"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -36,7 +36,6 @@
 #include <cpuid.h>
 #endif
 
-
 namespace xmrstak
 {
 namespace nvidia
@@ -47,9 +46,13 @@ using namespace rapidjson;
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum { aGpuThreadsConf };
+enum configEnum
+{
+	aGpuThreadsConf
+};
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -58,8 +61,7 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aGpuThreadsConf, "gpu_threads_conf", kNullType }
-};
+	{aGpuThreadsConf, "gpu_threads_conf", kNullType}};
 
 inline bool checkType(Type have, Type want)
 {
@@ -75,9 +77,7 @@ inline bool checkType(Type have, Type want)
 		return false;
 }
 
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
-
-
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
 struct jconf::opaque_private
 {
@@ -89,7 +89,6 @@ struct jconf::opaque_private
 	}
 };
 
-
 bool jconf::NeedsAutoconf()
 {
 	return !prv->configValues[aGpuThreadsConf]->IsArray();
@@ -110,7 +109,7 @@ size_t jconf::GetGPUThreadCount()
 		return 0;
 }
 
-bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
+bool jconf::GetGPUThreadConfig(size_t id, thd_cfg& cfg)
 {
 	if(!prv->configValues[aGpuThreadsConf]->IsArray())
 		return false;
@@ -170,7 +169,6 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
-
 	cfg.id = gid->GetInt();
 	cfg.blocks = blocks->GetInt();
 	cfg.threads = threads->GetInt();
@@ -178,7 +176,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	cfg.bsleep = bsleep->GetInt();
 	cfg.syncMode = syncMode->GetInt();
 	cfg.memMode = memMode->GetInt();
-	
+
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt();
 	else
@@ -189,22 +187,22 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 
 bool jconf::parse_config(const char* sFilename)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -219,7 +217,7 @@ bool jconf::parse_config(const char* sFilename)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -241,7 +239,7 @@ bool jconf::parse_config(const char* sFilename)
 	buffer[flen] = '}';
 	buffer[flen + 1] = '\0';
 
-	prv->jsonDoc.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	prv->jsonDoc.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(prv->jsonDoc.HasParseError())
@@ -251,7 +249,6 @@ bool jconf::parse_config(const char* sFilename)
 		return false;
 	}
 
-
 	if(!prv->jsonDoc.IsObject())
 	{ //This should never happen as we created the root ourselves
 		printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename);
@@ -262,7 +259,7 @@ bool jconf::parse_config(const char* sFilename)
 	{
 		if(oConfigValues[i].iName != i)
 		{
-			printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s",oConfigValues[i].sName);
+			printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s", oConfigValues[i].sName);
 			return false;
 		}
 
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index 40b72f880898485058c2775b11afdc9c0dfaff1d..e924c75a9a90a7bb42d62f5ce27c51b52f281024 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -1,7 +1,7 @@
 #pragma once
+#include "xmrstak/params.hpp"
 #include <stdlib.h>
 #include <string>
-#include "xmrstak/params.hpp"
 
 namespace xmrstak
 {
@@ -10,16 +10,18 @@ namespace nvidia
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
-		if (oInst == nullptr) oInst = new jconf;
+		if(oInst == nullptr)
+			oInst = new jconf;
 		return oInst;
 	};
 
 	bool parse_config(const char* sFilename = params::inst().configFileNVIDIA.c_str());
 
-	struct thd_cfg {
+	struct thd_cfg
+	{
 		uint32_t id;
 		uint32_t blocks;
 		uint32_t threads;
@@ -36,17 +38,16 @@ public:
 
 	size_t GetGPUThreadCount();
 
-	bool GetGPUThreadConfig(size_t id, thd_cfg &cfg);
+	bool GetGPUThreadConfig(size_t id, thd_cfg& cfg);
 
 	bool NeedsAutoconf();
 
-private:
+  private:
 	jconf();
 	static jconf* oInst;
 
 	struct opaque_private;
 	opaque_private* prv;
-
 };
 
 } // namespace nvidia
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 80615d7a34262f2809302b15c29327e00f629b1b..32b21dc7105079ccad14e67baedea9772f5d062d 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -23,23 +23,23 @@
 
 #include "minethd.hpp"
 #include "autoAdjust.hpp"
-#include "xmrstak/misc/console.hpp"
-#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
+#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
+#include "xmrstak/backend/cpu/hwlocMemory.hpp"
 #include "xmrstak/backend/cpu/minethd.hpp"
-#include "xmrstak/params.hpp"
-#include "xmrstak/misc/executor.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/environment.hpp"
-#include "xmrstak/backend/cpu/hwlocMemory.hpp"
-#include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/misc/utility.hpp"
+#include "xmrstak/params.hpp"
 
 #include <assert.h>
-#include <cmath>
+#include <bitset>
 #include <chrono>
+#include <cmath>
 #include <thread>
-#include <bitset>
 #include <vector>
 
 #ifndef USE_PRECOMPILED_HEADERS
@@ -47,8 +47,8 @@
 #include <direct.h>
 #include <windows.h>
 #else
-#include <sys/types.h>
 #include <dlfcn.h>
+#include <sys/types.h>
 #endif
 #include <iostream>
 #endif
@@ -59,9 +59,9 @@ namespace nvidia
 {
 
 #ifdef WIN32
-	HINSTANCE lib_handle;
+HINSTANCE lib_handle;
 #else
-	void *lib_handle;
+void* lib_handle;
 #endif
 
 minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
@@ -101,23 +101,21 @@ void minethd::start_mining()
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
 }
 
-
 bool minethd::self_test()
 {
 	return true;
 }
 
-
 extern "C"
 {
 #ifdef WIN32
-__declspec(dllexport)
+	__declspec(dllexport)
 #endif
-std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
-{
-	environment::inst(&env);
-	return nvidia::minethd::thread_starter(threadOffset, pWork);
-}
+		std::vector<iBackend*>* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env)
+	{
+		environment::inst(&env);
+		return nvidia::minethd::thread_starter(threadOffset, pWork);
+	}
 } // extern "C"
 
 std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork)
@@ -141,12 +139,12 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	int deviceCount = 0;
 	if(cuda_get_devicecount(&deviceCount) != 1)
 	{
-		std::cout<<"WARNING: NVIDIA no device found"<<std::endl;
+		std::cout << "WARNING: NVIDIA no device found" << std::endl;
 		return pvThreads;
 	}
 	else
 	{
-		std::cout<<"NVIDIA: found "<< deviceCount <<" potential device's"<<std::endl;
+		std::cout << "NVIDIA: found " << deviceCount << " potential device's" << std::endl;
 	}
 
 	size_t i, n = jconf::inst()->GetGPUThreadCount();
@@ -155,7 +153,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 	cuInit(0);
 
 	jconf::thd_cfg cfg;
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		jconf::inst()->GetGPUThreadConfig(i, cfg);
 
@@ -172,10 +170,9 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 
 		minethd* thd = new minethd(pWork, i + threadOffset, cfg);
 		pvThreads->push_back(thd);
-
 	}
 
-	for (i = 0; i < n; i++)
+	for(i = 0; i < n; i++)
 	{
 		static_cast<minethd*>((*pvThreads)[i])->start_mining();
 	}
@@ -201,7 +198,6 @@ void minethd::work_main()
 	// wait until all NVIDIA devices are initialized
 	thread_work_guard.wait();
 
-	uint64_t iCount = 0;
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 
@@ -216,16 +212,16 @@ void minethd::work_main()
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
-	while (bQuit == 0)
+	while(bQuit == 0)
 	{
-		if (oWork.bStall)
+		if(oWork.bStall)
 		{
 			/* We are stalled here because the executor didn't find a job for us yet,
 			 * either because of network latency, or a socket problem. Since we are
 			 * raison d'etre of this software it us sensible to just wait until we have something
 			 */
 
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+			while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
 				std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
 			globalStates::inst().consume_work(oWork, iJobNo);
@@ -285,8 +281,8 @@ void minethd::work_main()
 			for(size_t i = 0; i < foundCount; i++)
 			{
 
-				uint8_t	bWorkBlob[128];
-				uint8_t	bResult[32];
+				uint8_t bWorkBlob[128];
+				uint8_t bResult[32];
 
 				memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
 				memset(bResult, 0, sizeof(job_result::bResult));
@@ -294,19 +290,14 @@ void minethd::work_main()
 				*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
 
 				cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
-				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
+				if((*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
 					executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId));
 			}
 
-			iCount += h_per_round;
 			iNonce += h_per_round;
-
-			using namespace std::chrono;
-			uint64_t iStamp = get_timestamp_ms();
-			iHashCount.store(iCount, std::memory_order_relaxed);
-			iTimestamp.store(iStamp, std::memory_order_relaxed);
+			updateStats(h_per_round, oWork.iPoolId);
 			std::this_thread::yield();
 		}
 
@@ -314,5 +305,5 @@ void minethd::work_main()
 	}
 }
 
+} // namespace nvidia
 } // namespace xmrstak
-} //namespace nvidia
diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp
index 3863c93e8721e099b7fcdcdd9b9c3a85902178a9..66c49bb1fa0501536aa6142412b2e277bb100d07 100644
--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -1,19 +1,18 @@
 #pragma once
 
-#include "xmrstak/jconf.hpp"
 #include "jconf.hpp"
 #include "nvcc_code/cryptonight.hpp"
+#include "xmrstak/jconf.hpp"
 
 #include "xmrstak/backend/cpu/minethd.hpp"
 #include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/misc/environment.hpp"
 
+#include <atomic>
+#include <future>
 #include <iostream>
 #include <thread>
-#include <atomic>
 #include <vector>
-#include <future>
-
 
 namespace xmrstak
 {
@@ -22,12 +21,11 @@ namespace nvidia
 
 class minethd : public iBackend
 {
-public:
-
+  public:
 	static std::vector<iBackend*>* thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
-private:
+  private:
 	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&);
 
 	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 906701893e21cffb7da46104ea287f58768ea0fc..78abd7a3d84a04a7017554bef5a11b368e48ebe4 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -3,14 +3,15 @@
 #include <stdint.h>
 #include <string>
 
-#include "xmrstak/jconf.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/jconf.hpp"
 
 #include <cuda.h>
 
-typedef struct {
+typedef struct
+{
 	int device_id;
-	const char *device_name;
+	const char* device_name;
 	int device_arch[2];
 	int device_mpcount;
 	int device_blocks;
@@ -20,18 +21,18 @@ typedef struct {
 	int syncMode;
 	bool memMode;
 
-	uint32_t *d_input;
+	uint32_t* d_input;
 	uint32_t inputlen;
-	uint32_t *d_result_count;
-	uint32_t *d_result_nonce;
-	uint32_t *d_long_state;
-	uint32_t *d_ctx_state;
-	uint32_t *d_ctx_state2;
-	uint32_t *d_ctx_a;
-	uint32_t *d_ctx_b;
-	uint32_t *d_ctx_key1;
-	uint32_t *d_ctx_key2;
-	uint32_t *d_ctx_text;
+	uint32_t* d_result_count;
+	uint32_t* d_result_nonce;
+	uint32_t* d_long_state;
+	uint32_t* d_ctx_state;
+	uint32_t* d_ctx_state2;
+	uint32_t* d_ctx_a;
+	uint32_t* d_ctx_b;
+	uint32_t* d_ctx_key1;
+	uint32_t* d_ctx_key2;
+	uint32_t* d_ctx_text;
 	std::string name;
 	size_t free_device_memory;
 	size_t total_device_memory;
@@ -43,19 +44,20 @@ typedef struct {
 	xmrstak_algo cached_algo = {xmrstak_algo_id::invalid_algo};
 } nvid_ctx;
 
-extern "C" {
+extern "C"
+{
 
-/** get device count
+	/** get device count
  *
  * @param deviceCount[out] cuda device count
  * @return error code: 0 == error is occurred, 1 == no error
  */
-int cuda_get_devicecount( int* deviceCount);
-int cuda_get_deviceinfo(nvid_ctx *ctx);
-int cryptonight_extra_cpu_init(nvid_ctx *ctx);
-void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
-void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo);
-void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo);
+	int cuda_get_devicecount(int* deviceCount);
+	int cuda_get_deviceinfo(nvid_ctx* ctx);
+	int cryptonight_extra_cpu_init(nvid_ctx* ctx);
+	void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len);
+	void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo);
+	void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo);
 }
 
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
index 1990256351abd47fa70ef813208eabb8d4bf796e..d33e76715f14656fdf7547b2e6efa609587ef370 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -3,287 +3,285 @@
 
 #include <stdint.h>
 
-#define N_COLS          4
-#define WPOLY           0x011b
+#define N_COLS 4
+#define WPOLY 0x011b
 
 static __constant__ uint32_t d_t_fn[1024] =
-{
-	0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U,
-	0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U,
-	0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U,
-	0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU,
-	0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU,
-	0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU,
-	0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U,
-	0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU,
-	0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU,
-	0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U,
-	0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U,
-	0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU,
-	0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU,
-	0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU,
-	0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU,
-	0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU,
-	0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U,
-	0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU,
-	0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU,
-	0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U,
-	0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U,
-	0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U,
-	0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U,
-	0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U,
-	0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU,
-	0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U,
-	0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU,
-	0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU,
-	0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U,
-	0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U,
-	0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U,
-	0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU,
-	0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U,
-	0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU,
-	0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU,
-	0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U,
-	0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U,
-	0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU,
-	0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U,
-	0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU,
-	0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U,
-	0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U,
-	0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U,
-	0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U,
-	0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU,
-	0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U,
-	0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU,
-	0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U,
-	0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU,
-	0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U,
-	0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU,
-	0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU,
-	0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU,
-	0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU,
-	0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U,
-	0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U,
-	0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U,
-	0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U,
-	0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U,
-	0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U,
-	0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU,
-	0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U,
-	0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU,
-	0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU,
-	0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU,
-	0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U,
-	0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU,
-	0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU,
-	0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U,
-	0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU,
-	0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU,
-	0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU,
-	0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU,
-	0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU,
-	0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U,
-	0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU,
-	0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU,
-	0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U,
-	0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU,
-	0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU,
-	0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU,
-	0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU,
-	0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU,
-	0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U,
-	0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU,
-	0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU,
-	0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU,
-	0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU,
-	0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U,
-	0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U,
-	0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U,
-	0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U,
-	0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU,
-	0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U,
-	0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U,
-	0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU,
-	0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU,
-	0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U,
-	0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U,
-	0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U,
-	0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU,
-	0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U,
-	0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU,
-	0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U,
-	0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU,
-	0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U,
-	0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U,
-	0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU,
-	0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U,
-	0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U,
-	0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U,
-	0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U,
-	0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U,
-	0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U,
-	0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U,
-	0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U,
-	0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU,
-	0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U,
-	0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U,
-	0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U,
-	0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U,
-	0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U,
-	0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U,
-	0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU,
-	0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U,
-	0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U,
-	0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U,
-	0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU,
-	0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU,
-	0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U,
-	0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU,
-	0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U,
-	0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU,
-	0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U,
-	0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU,
-	0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U,
-	0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U,
-	0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU,
-	0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U,
-	0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U,
-	0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U,
-	0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU,
-	0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U,
-	0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U,
-	0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU,
-	0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U,
-	0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U,
-	0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U,
-	0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU,
-	0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU,
-	0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U,
-	0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU,
-	0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU,
-	0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U,
-	0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU,
-	0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U,
-	0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU,
-	0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U,
-	0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U,
-	0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U,
-	0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU,
-	0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U,
-	0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU,
-	0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U,
-	0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU,
-	0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U,
-	0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U,
-	0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU,
-	0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU,
-	0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU,
-	0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U,
-	0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U,
-	0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU,
-	0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U,
-	0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU,
-	0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U,
-	0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU,
-	0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U,
-	0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU,
-	0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU,
-	0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U,
-	0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU,
-	0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U,
-	0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU,
-	0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U,
-	0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U,
-	0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U,
-	0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU,
-	0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU,
-	0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U,
-	0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU,
-	0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U,
-	0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU,
-	0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U,
-	0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU,
-	0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U,
-	0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU,
-	0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U,
-	0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU,
-	0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U,
-	0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U,
-	0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU,
-	0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U,
-	0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U,
-	0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U,
-	0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU,
-	0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U,
-	0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U,
-	0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU,
-	0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U,
-	0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U,
-	0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U,
-	0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU,
-	0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU,
-	0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U,
-	0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU,
-	0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU,
-	0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U,
-	0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU,
-	0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U,
-	0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU,
-	0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U,
-	0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U,
-	0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U,
-	0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU,
-	0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U,
-	0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU,
-	0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U,
-	0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU,
-	0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U,
-	0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U,
-	0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU,
-	0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU,
-	0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU,
-	0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U,
-	0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U,
-	0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU,
-	0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U,
-	0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU,
-	0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U,
-	0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU,
-	0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U,
-	0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU,
-	0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU,
-	0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U,
-	0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU,
-	0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U,
-	0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU,
-	0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U,
-	0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U,
-	0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U,
-	0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU,
-	0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU,
-	0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U,
-	0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU,
-	0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U
-};
+	{
+		0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U,
+		0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U,
+		0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U,
+		0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU,
+		0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU,
+		0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU,
+		0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U,
+		0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU,
+		0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU,
+		0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U,
+		0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U,
+		0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU,
+		0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU,
+		0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU,
+		0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU,
+		0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU,
+		0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U,
+		0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU,
+		0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU,
+		0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U,
+		0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U,
+		0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U,
+		0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U,
+		0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U,
+		0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU,
+		0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U,
+		0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU,
+		0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU,
+		0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U,
+		0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U,
+		0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U,
+		0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU,
+		0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U,
+		0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU,
+		0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU,
+		0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U,
+		0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U,
+		0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU,
+		0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U,
+		0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU,
+		0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U,
+		0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U,
+		0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U,
+		0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U,
+		0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU,
+		0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U,
+		0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU,
+		0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U,
+		0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU,
+		0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U,
+		0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU,
+		0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU,
+		0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU,
+		0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU,
+		0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U,
+		0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U,
+		0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U,
+		0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U,
+		0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U,
+		0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U,
+		0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU,
+		0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U,
+		0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU,
+		0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU,
+		0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU,
+		0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U,
+		0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU,
+		0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU,
+		0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U,
+		0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU,
+		0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU,
+		0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU,
+		0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU,
+		0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU,
+		0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U,
+		0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU,
+		0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU,
+		0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U,
+		0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU,
+		0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU,
+		0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU,
+		0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU,
+		0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU,
+		0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U,
+		0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU,
+		0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU,
+		0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU,
+		0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU,
+		0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U,
+		0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U,
+		0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U,
+		0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U,
+		0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU,
+		0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U,
+		0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U,
+		0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU,
+		0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU,
+		0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U,
+		0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U,
+		0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U,
+		0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU,
+		0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U,
+		0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU,
+		0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U,
+		0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU,
+		0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U,
+		0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U,
+		0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU,
+		0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U,
+		0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U,
+		0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U,
+		0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U,
+		0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U,
+		0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U,
+		0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U,
+		0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U,
+		0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU,
+		0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U,
+		0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U,
+		0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U,
+		0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U,
+		0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U,
+		0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U,
+		0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU,
+		0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U,
+		0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U,
+		0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U,
+		0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU,
+		0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU,
+		0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U,
+		0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU,
+		0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U,
+		0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU,
+		0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U,
+		0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU,
+		0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U,
+		0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U,
+		0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU,
+		0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U,
+		0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U,
+		0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U,
+		0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU,
+		0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U,
+		0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U,
+		0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU,
+		0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U,
+		0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U,
+		0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U,
+		0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU,
+		0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU,
+		0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U,
+		0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU,
+		0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU,
+		0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U,
+		0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU,
+		0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U,
+		0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU,
+		0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U,
+		0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U,
+		0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U,
+		0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU,
+		0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U,
+		0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU,
+		0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U,
+		0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU,
+		0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U,
+		0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U,
+		0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU,
+		0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU,
+		0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU,
+		0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U,
+		0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U,
+		0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU,
+		0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U,
+		0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU,
+		0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U,
+		0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU,
+		0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U,
+		0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU,
+		0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU,
+		0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U,
+		0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU,
+		0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U,
+		0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU,
+		0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U,
+		0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U,
+		0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U,
+		0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU,
+		0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU,
+		0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U,
+		0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU,
+		0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U,
+		0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU,
+		0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U,
+		0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU,
+		0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U,
+		0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU,
+		0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U,
+		0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU,
+		0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U,
+		0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U,
+		0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU,
+		0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U,
+		0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U,
+		0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U,
+		0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU,
+		0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U,
+		0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U,
+		0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU,
+		0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U,
+		0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U,
+		0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U,
+		0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU,
+		0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU,
+		0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U,
+		0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU,
+		0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU,
+		0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U,
+		0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU,
+		0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U,
+		0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU,
+		0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U,
+		0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U,
+		0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U,
+		0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU,
+		0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U,
+		0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU,
+		0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U,
+		0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU,
+		0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U,
+		0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U,
+		0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU,
+		0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU,
+		0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU,
+		0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U,
+		0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U,
+		0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU,
+		0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U,
+		0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU,
+		0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U,
+		0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU,
+		0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U,
+		0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU,
+		0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU,
+		0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U,
+		0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU,
+		0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U,
+		0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU,
+		0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U,
+		0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U,
+		0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U,
+		0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU,
+		0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU,
+		0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U,
+		0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU,
+		0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U};
 
-#define t_fn0(x) (sharedMemory[      (x)])
+#define t_fn0(x) (sharedMemory[(x)])
 #define t_fn1(x) (sharedMemory[256 + (x)])
 #define t_fn2(x) (sharedMemory[512 + (x)])
 #define t_fn3(x) (sharedMemory[768 + (x)])
 
+#define round(dummy, y, x, k)                                                                                            \
+	y[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \
+	y[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \
+	y[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \
+	y[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24)));
 
-#define round(dummy,y,x,k) \
-	y[0] = (k)[0]  ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \
-	y[1] = (k)[1]  ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \
-	y[2] = (k)[2]  ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \
-	y[3] = (k)[3]  ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) ));
-
-__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey)
+__device__ __forceinline__ static void cn_aes_single_round(uint32_t* __restrict__ sharedMemory, const uint32_t* __restrict__ in, uint32_t* __restrict__ out, const uint32_t* __restrict__ expandedKey)
 {
 	round(sharedMemory, out, in, expandedKey);
 }
 
-__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey)
+__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t* __restrict__ sharedMemory, uint32_t* __restrict__ val, const uint32_t* __restrict__ expandedKey)
 {
 	uint32_t b1[4];
 	round(sharedMemory, b1, val, expandedKey);
@@ -298,14 +296,14 @@ __device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t *
 	round(sharedMemory, val, b1, expandedKey + 9 * N_COLS);
 }
 
-__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
+__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t* sharedMemory)
 {
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
 
-__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t* sharedMemory)
 {
-        for(int i = threadIdx.x; i < 512; i += blockDim.x)
-                sharedMemory[i] = d_t_fn[i];
+	for(int i = threadIdx.x; i < 512; i += blockDim.x)
+		sharedMemory[i] = d_t_fn[i];
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
index 611fe1c8c60e243cbf6c9e7445a082f8b9047a38..efd57c94454fefc8f69cf43668b570e875ba3b28 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp
@@ -1,64 +1,68 @@
 #pragma once
 
-typedef struct {
+#include "cuda_extra.hpp"
+
+typedef struct
+{
 	uint32_t h[8], s[4], t[2];
 	uint32_t buflen;
 	int nullt;
 	uint8_t buf[64];
 } blake_state;
 
-#define U8TO32(p) \
+#define U8TO32(p)                                              \
 	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
-	((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
-
-#define U32TO8(p, v) \
-	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
-	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
-
-#define BLAKE_ROT(x,n) ROTR32(x, n)
-#define BLAKE_G(a,b,c,d,e) \
-	v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \
-	v[d] = BLAKE_ROT(v[d] ^ v[a],16); \
-	v[c] += v[d];                     \
-	v[b] = BLAKE_ROT(v[b] ^ v[c],12); \
-	v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \
-	v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \
-	v[c] += v[d];                     \
+		((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3])))
+
+#define U32TO8(p, v)               \
+	(p)[0] = (uint8_t)((v) >> 24); \
+	(p)[1] = (uint8_t)((v) >> 16); \
+	(p)[2] = (uint8_t)((v) >> 8);  \
+	(p)[3] = (uint8_t)((v));
+
+#define BLAKE_ROT(x, n) ROTR32(x, n)
+#define BLAKE_G(a, b, c, d, e)                                                      \
+	v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e + 1]]) + v[b]; \
+	v[d] = BLAKE_ROT(v[d] ^ v[a], 16);                                              \
+	v[c] += v[d];                                                                   \
+	v[b] = BLAKE_ROT(v[b] ^ v[c], 12);                                              \
+	v[a] += (m[d_blake_sigma[i][e + 1]] ^ d_blake_cst[d_blake_sigma[i][e]]) + v[b]; \
+	v[d] = BLAKE_ROT(v[d] ^ v[a], 8);                                               \
+	v[c] += v[d];                                                                   \
 	v[b] = BLAKE_ROT(v[b] ^ v[c], 7);
 
 __constant__ uint8_t d_blake_sigma[14][16] =
-{
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
-	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
-	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
-	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
-	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
-	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
-	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
-	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
-	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}
-};
-__constant__ uint32_t d_blake_cst[16]
-= {
+	{
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+		{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+		{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+		{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+		{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+		{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+		{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+		{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+		{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+		{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+		{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}};
+__constant__ uint32_t d_blake_cst[16] = {
 	0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
 	0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
 	0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
-	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
-};
+	0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917};
 
-__device__ void cn_blake_compress(blake_state *  S, const uint8_t *  block)
+__device__ void cn_blake_compress(blake_state* S, const uint8_t* block)
 {
 	uint32_t v[16], m[16], i;
 
-	for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4);
-	for (i = 0; i < 8;  ++i) v[i] = S->h[i];
-	v[ 8] = S->s[0] ^ 0x243F6A88;
-	v[ 9] = S->s[1] ^ 0x85A308D3;
+	for(i = 0; i < 16; ++i)
+		m[i] = U8TO32(block + i * 4);
+	for(i = 0; i < 8; ++i)
+		v[i] = S->h[i];
+	v[8] = S->s[0] ^ 0x243F6A88;
+	v[9] = S->s[1] ^ 0x85A308D3;
 	v[10] = S->s[2] ^ 0x13198A2E;
 	v[11] = S->s[3] ^ 0x03707344;
 	v[12] = 0xA4093822;
@@ -66,7 +70,7 @@ __device__ void cn_blake_compress(blake_state *  S, const uint8_t *  block)
 	v[14] = 0x082EFA98;
 	v[15] = 0xEC4E6C89;
 
-	if (S->nullt == 0)
+	if(S->nullt == 0)
 	{
 		v[12] ^= S->t[0];
 		v[13] ^= S->t[0];
@@ -74,50 +78,54 @@ __device__ void cn_blake_compress(blake_state *  S, const uint8_t *  block)
 		v[15] ^= S->t[1];
 	}
 
-	for (i = 0; i < 14; ++i)
+	for(i = 0; i < 14; ++i)
 	{
-		BLAKE_G(0, 4,  8, 12,  0);
-		BLAKE_G(1, 5,  9, 13,  2);
-		BLAKE_G(2, 6, 10, 14,  4);
-		BLAKE_G(3, 7, 11, 15,  6);
-		BLAKE_G(3, 4,  9, 14, 14);
-		BLAKE_G(2, 7,  8, 13, 12);
-		BLAKE_G(0, 5, 10, 15,  8);
+		BLAKE_G(0, 4, 8, 12, 0);
+		BLAKE_G(1, 5, 9, 13, 2);
+		BLAKE_G(2, 6, 10, 14, 4);
+		BLAKE_G(3, 7, 11, 15, 6);
+		BLAKE_G(3, 4, 9, 14, 14);
+		BLAKE_G(2, 7, 8, 13, 12);
+		BLAKE_G(0, 5, 10, 15, 8);
 		BLAKE_G(1, 6, 11, 12, 10);
 	}
 
-	for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i];
-	for (i = 0; i < 8;  ++i) S->h[i] ^= S->s[i % 4];
+	for(i = 0; i < 16; ++i)
+		S->h[i % 8] ^= v[i];
+	for(i = 0; i < 8; ++i)
+		S->h[i] ^= S->s[i % 4];
 }
 
-__device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_t datalen)
+__device__ void cn_blake_update(blake_state* S, const uint8_t* data, uint64_t datalen)
 {
 	uint32_t left = S->buflen >> 3;
 	uint32_t fill = 64 - left;
 
-	if (left && (((datalen >> 3) & 0x3F) >= fill))
+	if(left && (((datalen >> 3) & 0x3F) >= fill))
 	{
-		memcpy((void *) (S->buf + left), (void *) data, fill);
+		memcpy((void*)(S->buf + left), (void*)data, fill);
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0)
+			S->t[1]++;
 		cn_blake_compress(S, S->buf);
 		data += fill;
 		datalen -= (fill << 3);
 		left = 0;
 	}
 
-	while (datalen >= 512)
+	while(datalen >= 512)
 	{
 		S->t[0] += 512;
-		if (S->t[0] == 0) S->t[1]++;
+		if(S->t[0] == 0)
+			S->t[1]++;
 		cn_blake_compress(S, data);
 		data += 64;
 		datalen -= 512;
 	}
 
-	if (datalen > 0)
+	if(datalen > 0)
 	{
-		memcpy((void *) (S->buf + left), (void *) data, datalen >> 3);
+		memcpy((void*)(S->buf + left), (void*)data, datalen >> 3);
 		S->buflen = (left << 3) + datalen;
 	}
 	else
@@ -126,31 +134,32 @@ __device__ void cn_blake_update(blake_state *  S, const uint8_t *  data, uint64_
 	}
 }
 
-__device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
+__device__ void cn_blake_final(blake_state* S, uint8_t* digest)
 {
 	const uint8_t padding[] =
-	{
-		0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-	};
+		{
+			0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
 	uint8_t pa = 0x81, pb = 0x01;
 	uint8_t msglen[8];
 	uint32_t lo = S->t[0] + S->buflen, hi = S->t[1];
-	if (lo < (unsigned) S->buflen) hi++;
+	if(lo < (unsigned)S->buflen)
+		hi++;
 	U32TO8(msglen + 0, hi);
 	U32TO8(msglen + 4, lo);
 
-	if (S->buflen == 440)
+	if(S->buflen == 440)
 	{
 		S->t[0] -= 8;
 		cn_blake_update(S, &pa, 8);
 	}
 	else
 	{
-		if (S->buflen < 440)
+		if(S->buflen < 440)
 		{
-			if (S->buflen == 0) S->nullt = 1;
+			if(S->buflen == 0)
+				S->nullt = 1;
 			S->t[0] -= 440 - S->buflen;
 			cn_blake_update(S, padding, 440 - S->buflen);
 		}
@@ -168,9 +177,9 @@ __device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
 	S->t[0] -= 64;
 	cn_blake_update(S, msglen, 64);
 
-	U32TO8(digest +  0, S->h[0]);
-	U32TO8(digest +  4, S->h[1]);
-	U32TO8(digest +  8, S->h[2]);
+	U32TO8(digest + 0, S->h[0]);
+	U32TO8(digest + 4, S->h[1]);
+	U32TO8(digest + 8, S->h[2]);
 	U32TO8(digest + 12, S->h[3]);
 	U32TO8(digest + 16, S->h[4]);
 	U32TO8(digest + 20, S->h[5]);
@@ -178,17 +187,22 @@ __device__ void cn_blake_final(blake_state *  S, uint8_t *  digest)
 	U32TO8(digest + 28, S->h[7]);
 }
 
-__device__ void cn_blake(const uint8_t *  in, uint64_t inlen, uint8_t *  out)
+__device__ void cn_blake(const uint8_t* in, uint64_t inlen, uint8_t* out)
 {
 	blake_state bs;
-	blake_state *S = (blake_state *)&bs;
-
-	S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372;
-	S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C;
-	S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19;
+	blake_state* S = (blake_state*)&bs;
+
+	S->h[0] = 0x6A09E667;
+	S->h[1] = 0xBB67AE85;
+	S->h[2] = 0x3C6EF372;
+	S->h[3] = 0xA54FF53A;
+	S->h[4] = 0x510E527F;
+	S->h[5] = 0x9B05688C;
+	S->h[6] = 0x1F83D9AB;
+	S->h[7] = 0x5BE0CD19;
 	S->t[0] = S->t[1] = S->buflen = S->nullt = 0;
 	S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0;
 
-	cn_blake_update(S, (uint8_t *)in, inlen * 8);
-	cn_blake_final(S, (uint8_t *)out);
+	cn_blake_update(S, (uint8_t*)in, inlen * 8);
+	cn_blake_final(S, (uint8_t*)out);
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 718cff0c765eeb619d23b5fc7024e0a71d6222c9..7f610f9dc61fc174c289482213bdccea91f96557 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -1,55 +1,55 @@
 #include "xmrstak/backend/cryptonight.hpp"
 
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
+#include <bitset>
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <bitset>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
 
-#include "xmrstak/jconf.hpp"
-#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
-#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
-#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp"
 #include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp"
-
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+#include "xmrstak/jconf.hpp"
 
 #ifdef _WIN32
 #include <windows.h>
 extern "C" void compat_usleep(uint64_t waitTime)
 {
-    if (waitTime > 0)
-    {
-        if (waitTime > 100)
-        {
-            // use a waitable timer for larger intervals > 0.1ms
-
-            HANDLE timer;
-            LARGE_INTEGER ft;
-
-            ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time
-
-            timer = CreateWaitableTimer(NULL, TRUE, NULL);
-            SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0);
-            WaitForSingleObject(timer, INFINITE);
-            CloseHandle(timer);
-        }
-        else
-        {
-            // use a polling loop for short intervals <= 100ms
-
-            LARGE_INTEGER perfCnt, start, now;
-            __int64 elapsed;
-
-            QueryPerformanceFrequency(&perfCnt);
-            QueryPerformanceCounter(&start);
-            do {
-		SwitchToThread();
-                QueryPerformanceCounter((LARGE_INTEGER*) &now);
-                elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000);
-            } while ( elapsed < waitTime );
-        }
-    }
+	if(waitTime > 0)
+	{
+		if(waitTime > 100)
+		{
+			// use a waitable timer for larger intervals > 0.1ms
+
+			HANDLE timer;
+			LARGE_INTEGER ft;
+
+			ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time
+
+			timer = CreateWaitableTimer(NULL, TRUE, NULL);
+			SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0);
+			WaitForSingleObject(timer, INFINITE);
+			CloseHandle(timer);
+		}
+		else
+		{
+			// use a polling loop for short intervals <= 0.1ms (100 microseconds)
+
+			LARGE_INTEGER perfCnt, start, now;
+			__int64 elapsed;
+
+			QueryPerformanceFrequency(&perfCnt);
+			QueryPerformanceCounter(&start);
+			do
+			{
+				SwitchToThread();
+				QueryPerformanceCounter((LARGE_INTEGER*)&now);
+				elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000);
+			} while(elapsed < waitTime);
+		}
+	}
 }
 #else
 #include <unistd.h>
@@ -60,9 +60,9 @@ extern "C" void compat_usleep(uint64_t waitTime)
 #endif
 
 #include "cryptonight.hpp"
-#include "cuda_extra.hpp"
 #include "cuda_aes.hpp"
 #include "cuda_device.hpp"
+#include "cuda_extra.hpp"
 
 /* sm_2X is limited to 2GB due to the small TLB
  * therefore we never use 64bit indices
@@ -73,106 +73,113 @@ typedef uint64_t IndexType;
 typedef int IndexType;
 #endif
 
-__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi )
+__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi)
 {
-	product_hi = __umul64hi( multiplier, multiplicand );
-	return (multiplier * multiplicand );
+	product_hi = __umul64hi(multiplier, multiplicand);
+	return (multiplier * multiplicand);
 }
 
-template< typename T >
-__device__ __forceinline__ T loadGlobal64( T * const addr )
+template <typename T>
+__device__ __forceinline__ T loadGlobal64(T* const addr)
 {
-#if (__CUDA_ARCH__ < 700)
+#if(__CUDA_ARCH__ < 700)
 	T x;
-	asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) );
+	asm volatile("ld.global.cg.u64 %0, [%1];"
+				 : "=l"(x)
+				 : "l"(addr));
 	return x;
 #else
 	return *addr;
 #endif
 }
 
-template< typename T >
-__device__ __forceinline__ T loadGlobal32( T * const addr )
+template <typename T>
+__device__ __forceinline__ T loadGlobal32(T* const addr)
 {
-#if (__CUDA_ARCH__ < 700)
+#if(__CUDA_ARCH__ < 700)
 	T x;
-	asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) );
+	asm volatile("ld.global.cg.u32 %0, [%1];"
+				 : "=r"(x)
+				 : "l"(addr));
 	return x;
 #else
 	return *addr;
 #endif
 }
 
-
-template< typename T >
-__device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
+template <typename T>
+__device__ __forceinline__ void storeGlobal32(T* addr, T const& val)
 {
-#if (__CUDA_ARCH__ < 700)
-	asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) );
+#if(__CUDA_ARCH__ < 700)
+	asm volatile("st.global.cg.u32 [%0], %1;"
+				 :
+				 : "l"(addr), "r"(val));
 #else
 	*addr = val;
 #endif
 }
 
-template< typename T >
-__device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
+template <typename T>
+__device__ __forceinline__ void storeGlobal64(T* addr, T const& val)
 {
-#if (__CUDA_ARCH__ < 700)
-	asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) );
+#if(__CUDA_ARCH__ < 700)
+	asm volatile("st.global.cg.u64 [%0], %1;"
+				 :
+				 : "l"(addr), "l"(val));
 #else
 	*addr = val;
 #endif
 }
 
-__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
+__device__ __forceinline__ uint32_t rotate16(const uint32_t n)
 {
 	return (n >> 16u) | (n << 16u);
 }
 
 __global__ void cryptonight_core_gpu_phase1(
-	const uint32_t ITERATIONS,  const size_t MEMORY,
-	int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
+	const uint32_t ITERATIONS, const size_t MEMORY,
+	int threads, int bfactor, int partidx, uint32_t* __restrict__ long_state, uint32_t* __restrict__ ctx_state2, uint32_t* __restrict__ ctx_key1)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	cn_aes_gpu_init( sharedMemory );
-	__syncthreads( );
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
 
-	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
-	const int sub = ( threadIdx.x & 7 ) << 2;
+	const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
+	const int sub = (threadIdx.x & 7) << 2;
 
 	const int batchsize = MEMORY >> bfactor;
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	uint32_t key[40], text[4];
 
-	MEMCPY8( key, ctx_key1 + thread * 40, 20 );
+	MEMCPY8(key, ctx_key1 + thread * 40, 20);
 
-	if( partidx == 0 )
+	if(partidx == 0)
 	{
 		// first round
-		MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 );
+		MEMCPY8(text, ctx_state2 + thread * 50 + sub + 16, 2);
 	}
 	else
 	{
 		// load previous text data
-		MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 );
+		MEMCPY8(text, &long_state[((uint64_t)thread * MEMORY) + sub + start - 32], 2);
 	}
-	__syncthreads( );
-	for ( int i = start; i < end; i += 32 )
+	__syncthreads();
+	for(int i = start; i < end; i += 32)
 	{
-		cn_aes_pseudo_round_mut( sharedMemory, text, key );
-		MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2);
+		cn_aes_pseudo_round_mut(sharedMemory, text, key);
+		MEMCPY8(&long_state[((uint64_t)thread * MEMORY) + (sub + i)], text, 2);
 	}
 }
 
 /** avoid warning `unused parameter` */
-template< typename T >
-__forceinline__ __device__ void unusedVar( const T& )
+template <typename T>
+__forceinline__ __device__ void unusedVar(const T&)
 {
 }
 
@@ -189,25 +196,25 @@ __forceinline__ __device__ void unusedVar( const T& )
  * @param value value to share with other threads within the group
  * @param src thread number within the group from where the data is read, range [0:group_n]
  */
-template<size_t group_n>
-__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src)
+template <size_t group_n>
+__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src)
 {
-#if( __CUDA_ARCH__ < 300 )
-    ptr[sub] = val;
-    return ptr[src & (group_n-1)];
+#if(__CUDA_ARCH__ < 300)
+	ptr[sub] = val;
+	return ptr[src & (group_n - 1)];
+#else
+	unusedVar(ptr);
+	unusedVar(sub);
+#if(__CUDACC_VER_MAJOR__ >= 9)
+	return __shfl_sync(__activemask(), val, src, group_n);
 #else
-    unusedVar( ptr );
-    unusedVar( sub );
-#   if(__CUDACC_VER_MAJOR__ >= 9)
-    return __shfl_sync(__activemask(), val, src, group_n );
-#	else
-	return __shfl( val, src, group_n );
-#	endif
+	return __shfl(val, src, group_n);
+#endif
 #endif
 }
 
-template<size_t group_n>
-__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2)
+template <size_t group_n>
+__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src, const uint32_t src2)
 {
 	uint64_t tmp;
 	((uint32_t*)&tmp)[0] = shuffle<group_n>(ptr, sub, val, src);
@@ -218,9 +225,9 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3
 struct u64 : public uint2
 {
 
-	__forceinline__ __device__ u64(){}
+	__forceinline__ __device__ u64() {}
 
-	__forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1)
+	__forceinline__ __device__ u64(const uint32_t x0, const uint32_t x1)
 	{
 		uint2::x = x0;
 		uint2::y = x1;
@@ -231,7 +238,7 @@ struct u64 : public uint2
 		return *((uint64_t*)this);
 	}
 
-	__forceinline__ __device__ u64( const uint64_t x0)
+	__forceinline__ __device__ u64(const uint64_t x0)
 	{
 		((uint64_t*)&this->x)[0] = x0;
 	}
@@ -259,7 +266,7 @@ struct u64 : public uint2
 
 	__forceinline__ __device__ void print(int i) const
 	{
-		if(i<2)
+		if(i < 2)
 			printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]);
 	}
 };
@@ -269,42 +276,42 @@ struct u64 : public uint2
  * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory
  *                   else if `1` 256bit operations will be used
  */
-template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+template <xmrstak_algo_id ALGO, uint32_t MEM_MODE>
 #ifdef XMR_STAK_THREADS
-__launch_bounds__( XMR_STAK_THREADS * 2 )
+__launch_bounds__(XMR_STAK_THREADS * 2)
 #endif
-__global__ void cryptonight_core_gpu_phase2_double(
-	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
-	int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
-		uint32_t startNonce, uint32_t * __restrict__ d_input )
+	__global__ void cryptonight_core_gpu_phase2_double(
+		const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
+		int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state,
+		uint32_t startNonce, uint32_t* __restrict__ d_input)
 {
 	__shared__ uint32_t sharedMemory[512];
 
-	cn_aes_gpu_init_half( sharedMemory );
+	cn_aes_gpu_init_half(sharedMemory);
 
-#if( __CUDA_ARCH__ < 300 )
+#if(__CUDA_ARCH__ < 300)
 	extern __shared__ uint64_t externShared[];
 	// 8 x 64bit values
 	volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8);
-    volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8)  + (threadIdx.x & 0xFFFFFFFE);
+	volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE);
 #else
 	extern __shared__ uint64_t chunkMem[];
-    volatile uint32_t* sPtr = NULL;
+	volatile uint32_t* sPtr = NULL;
 	// 8 x 64bit values
 	volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8);
 
 #endif
 
-	__syncthreads( );
+	__syncthreads();
 
 	const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x);
 	const uint32_t thread = tid >> 1;
 	const uint32_t sub = tid & 1;
 
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
-	uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY];
+	uint8_t* l0 = (uint8_t*)&d_long_state[(IndexType)thread * MEMORY];
 
 	uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub];
 	uint64_t bx0;
@@ -324,22 +331,22 @@ __global__ void cryptonight_core_gpu_phase2_double(
 		sqrt_result = (d_ctx_b + thread * 16 + 4 * 2 + 2)[0];
 	}
 	else
-		 bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
+		bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
 
-	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+	const int batchsize = (ITERATIONS * 2) >> (1 + bfactor);
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
 
 	for(int i = start; i < end; ++i)
 	{
-		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
+		ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0];
 
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				myChunks[x + sub] = ptr0[ x + sub ];
+				myChunks[x + sub] = ptr0[x + sub];
 			}
 		}
 		else
@@ -347,52 +354,51 @@ __global__ void cryptonight_core_gpu_phase2_double(
 
 		uint32_t idx1 = (idx0 & 0x30) >> 3;
 
-		const u64 cx = myChunks[ idx1 + sub ];
-		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
+		const u64 cx = myChunks[idx1 + sub];
+		const u64 cx2 = myChunks[idx1 + ((sub + 1) & 1)];
 
 		u64 cx_aes = ax0 ^ u64(
-			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
-			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
-		);
+							   t_fn0(cx.x & 0xff) ^ t_fn1((cx.y >> 8) & 0xff) ^ rotate16(t_fn0((cx2.x >> 16) & 0xff) ^ t_fn1((cx2.y >> 24))),
+							   t_fn0(cx.y & 0xff) ^ t_fn1((cx2.x >> 8) & 0xff) ^ rotate16(t_fn0((cx2.y >> 16) & 0xff) ^ t_fn1((cx.x >> 24))));
 
 		if(ALGO == cryptonight_monero_v8)
 		{
 
-			const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ];
-			const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
-			const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+			const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub];
+			const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
+			const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 			__syncwarp();
 #else
-			__syncthreads( );
+			__syncthreads();
 #endif
-			myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+			myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+			myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+			myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 		}
 		else if(ALGO == cryptonight_v8_reversewaltz)
 		{
 
-			const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ];
-			const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
-			const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+			const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub];
+			const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
+			const uint64_t chunk1 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 			__syncwarp();
 #else
-			__syncthreads( );
+			__syncthreads();
 #endif
-			myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+			myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+			myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+			myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 		}
 
-		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
+		myChunks[idx1 + sub] = cx_aes ^ bx0;
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				ptr0[ x + sub ] = myChunks[x + sub];
+				ptr0[x + sub] = myChunks[x + sub];
 			}
 		}
 		else
@@ -400,14 +406,14 @@ __global__ void cryptonight_core_gpu_phase2_double(
 
 		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
 		idx1 = (idx0 & 0x30) >> 3;
-		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
+		ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0];
 
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				myChunks[x + sub] = ptr0[ x + sub ];
+				myChunks[x + sub] = ptr0[x + sub];
 			}
 		}
 		else
@@ -417,15 +423,15 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			bx0 = cx_aes;
 
 		uint64_t cx_mul;
-		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
-		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
+		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x, 0);
+		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y, 0);
 
 		if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && sub == 1)
 		{
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			((uint32_t*)&division_result)[1] ^= sqrt_result;
 
-			((uint64_t*)myChunks)[ idx1 ] ^= division_result;
+			((uint64_t*)myChunks)[idx1] ^= division_result;
 
 			const uint32_t dd = (static_cast<uint32_t>(cx_mul) + (sqrt_result << 1)) | 0x80000001UL;
 			division_result = fast_div_v2(cx_aes, dd);
@@ -433,46 +439,46 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
 			sqrt_result = fast_sqrt_v2(cx_mul + division_result);
 		}
-#if (__CUDACC_VER_MAJOR__ >= 9)
-				__syncwarp();
+#if(__CUDACC_VER_MAJOR__ >= 9)
+		__syncwarp();
 #else
-				__syncthreads( );
+		__syncthreads();
 #endif
-		uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ];
+		uint64_t c = ((uint64_t*)myChunks)[idx1 + sub];
 
 		{
-			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
+			uint64_t cl = ((uint64_t*)myChunks)[idx1];
 			// sub 0 -> hi, sub 1 -> lo
-			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
+			uint64_t res = sub == 0 ? __umul64hi(cx_mul, cl) : cx_mul * cl;
 			if(ALGO == cryptonight_monero_v8)
 			{
-				const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
-				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub] ^ res;
+				uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
 				res ^= ((uint64_t*)&chunk2)[0];
-				const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+				const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 				__syncwarp();
 #else
-				__syncthreads( );
+				__syncthreads();
 #endif
-				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-				myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+				myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+				myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+				myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 			}
 			if(ALGO == cryptonight_v8_reversewaltz)
 			{
-				const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ] ^ res;
-				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub] ^ res;
+				uint64_t chunk2 = myChunks[idx1 ^ 4 + sub];
 				res ^= ((uint64_t*)&chunk2)[0];
-				const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ];
-#if (__CUDACC_VER_MAJOR__ >= 9)
+				const uint64_t chunk1 = myChunks[idx1 ^ 6 + sub];
+#if(__CUDACC_VER_MAJOR__ >= 9)
 				__syncwarp();
 #else
-				__syncthreads( );
+				__syncthreads();
 #endif
-				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
-				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
-				myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+				myChunks[idx1 ^ 2 + sub] = chunk3 + bx1;
+				myChunks[idx1 ^ 4 + sub] = chunk1 + bx0;
+				myChunks[idx1 ^ 6 + sub] = chunk2 + ax0;
 			}
 			ax0 += res;
 		}
@@ -481,13 +487,13 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			bx1 = bx0;
 			bx0 = cx_aes;
 		}
-		myChunks[ idx1 + sub ] = ax0;
+		myChunks[idx1 + sub] = ax0;
 		if(MEM_MODE == 0)
 		{
-			#pragma unroll 4
+#pragma unroll 4
 			for(int x = 0; x < 8; x += 2)
 			{
-				ptr0[ x + sub ] = myChunks[x + sub];
+				ptr0[x + sub] = myChunks[x + sub];
 			}
 		}
 		else
@@ -496,7 +502,7 @@ __global__ void cryptonight_core_gpu_phase2_double(
 		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
 	}
 
-	if ( bfactor > 0 )
+	if(bfactor > 0)
 	{
 		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
 		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
@@ -516,41 +522,41 @@ __global__ void cryptonight_core_gpu_phase2_double(
 	}
 }
 
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 #ifdef XMR_STAK_THREADS
-__launch_bounds__( XMR_STAK_THREADS * 4 )
+__launch_bounds__(XMR_STAK_THREADS * 4)
 #endif
-__global__ void cryptonight_core_gpu_phase2_quad(
-	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
-	int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
-		uint32_t startNonce, uint32_t * __restrict__ d_input )
+	__global__ void cryptonight_core_gpu_phase2_quad(
+		const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
+		int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state,
+		uint32_t startNonce, uint32_t* __restrict__ d_input)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init(sharedMemory);
 
-	__syncthreads( );
+	__syncthreads();
 
-	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
+	const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
 	const uint32_t nonce = startNonce + thread;
 	const int sub = threadIdx.x & 3;
 	const int sub2 = sub & 2;
 
-#if( __CUDA_ARCH__ < 300 )
-        extern __shared__ uint32_t shuffleMem[];
-        volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
+#if(__CUDA_ARCH__ < 300)
+	extern __shared__ uint32_t shuffleMem[];
+	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFFC));
 #else
-        volatile uint32_t* sPtr = NULL;
+	volatile uint32_t* sPtr = NULL;
 #endif
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	int i, k;
 	uint32_t j;
-	const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
+	const int batchsize = (ITERATIONS * 2) >> (2 + bfactor);
 	const int start = partidx * batchsize;
 	const int end = start + batchsize;
-	uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
+	uint32_t* long_state = &d_long_state[(IndexType)thread * MEMORY];
 	uint32_t a, d[2], idx0;
 	uint32_t t1[2], t2[2], res;
 
@@ -564,9 +570,9 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 	}
 
 	uint32_t tweak1_2[2];
-	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 	{
-		uint32_t * state = d_ctx_state + thread * 50;
+		uint32_t* state = d_ctx_state + thread * 50;
 		tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
 		tweak1_2[0] ^= state[48];
 		tweak1_2[1] = nonce;
@@ -574,7 +580,7 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 	}
 
 	a = (d_ctx_a + thread * 4)[sub];
-	idx0 = shuffle<4>(sPtr,sub, a, 0);
+	idx0 = shuffle<4>(sPtr, sub, a, 0);
 	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		if(partidx != 0)
@@ -585,33 +591,33 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 	}
 	d[1] = (d_ctx_b + thread * 4)[sub];
 
-	#pragma unroll 2
-	for ( i = start; i < end; ++i )
+#pragma unroll 2
+	for(i = start; i < end; ++i)
 	{
-		#pragma unroll 2
-		for ( int x = 0; x < 2; ++x )
+#pragma unroll 2
+		for(int x = 0; x < 2; ++x)
 		{
-			j = ( ( idx0 & MASK ) >> 2 ) + sub;
+			j = ((idx0 & MASK) >> 2) + sub;
 
 			if(ALGO == cryptonight_bittube2)
 			{
 				uint32_t k[4];
-				k[0] = ~loadGlobal32<uint32_t>( long_state + j );
-				k[1] = shuffle<4>(sPtr,sub, k[0], sub + 1);
-				k[2] = shuffle<4>(sPtr,sub, k[0], sub + 2);
-				k[3] = shuffle<4>(sPtr,sub, k[0], sub + 3);
+				k[0] = ~loadGlobal32<uint32_t>(long_state + j);
+				k[1] = shuffle<4>(sPtr, sub, k[0], sub + 1);
+				k[2] = shuffle<4>(sPtr, sub, k[0], sub + 2);
+				k[3] = shuffle<4>(sPtr, sub, k[0], sub + 3);
 
-				#pragma unroll 4
+#pragma unroll 4
 				for(int i = 0; i < 4; ++i)
 				{
 					// only calculate the key if all data are up to date
 					if(i == sub)
 					{
 						d[x] = a ^
-							t_fn0( k[0] & 0xff ) ^
-							t_fn1( (k[1] >> 8) & 0xff ) ^
-							t_fn2( (k[2] >> 16) & 0xff ) ^
-							t_fn3( (k[3] >> 24 ) );
+							   t_fn0(k[0] & 0xff) ^
+							   t_fn1((k[1] >> 8) & 0xff) ^
+							   t_fn2((k[2] >> 16) & 0xff) ^
+							   t_fn3((k[3] >> 24));
 					}
 					// the last shuffle is not needed
 					if(i != 3)
@@ -619,13 +625,13 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 						/* avoid negative number for modulo
 						 * load valid key (k) depending on the round
 						 */
-						k[(4 - sub + i)%4] = shuffle<4>(sPtr,sub, k[0] ^ d[x], i);
+						k[(4 - sub + i) % 4] = shuffle<4>(sPtr, sub, k[0] ^ d[x], i);
 					}
 				}
 			}
 			else
 			{
-				uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
+				uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
 
 				if(ALGO == cryptonight_conceal)
 				{
@@ -642,18 +648,18 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 					x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old));
 				}
 
-				const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
-				const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
-				const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
+				const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1);
+				const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2);
+				const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3);
 				d[x] = a ^
-					t_fn0( x_0 & 0xff ) ^
-					t_fn1( (x_1 >> 8) & 0xff ) ^
-					t_fn2( (x_2 >> 16) & 0xff ) ^
-					t_fn3( ( x_3 >> 24 ) );
+					   t_fn0(x_0 & 0xff) ^
+					   t_fn1((x_1 >> 8) & 0xff) ^
+					   t_fn2((x_2 >> 16) & 0xff) ^
+					   t_fn3((x_3 >> 24));
 			}
 
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
-			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
+			t1[0] = shuffle<4>(sPtr, sub, d[x], 0);
 
 			const uint32_t z = d[0] ^ d[1];
 			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
@@ -663,157 +669,157 @@ __global__ void cryptonight_core_gpu_phase2_quad(
 				{
 					const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
 					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
-					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+					storeGlobal32(long_state + j, sub == 2 ? fork_7 : z);
 				}
 				else if(ALGO == cryptonight_stellite)
 				{
 					const uint32_t index = ((z >> 27) & 12) | ((z >> 23) & 2);
 					const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
-					storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+					storeGlobal32(long_state + j, sub == 2 ? fork_7 : z);
 				}
 			}
 			else
-				storeGlobal32( long_state + j, z );
+				storeGlobal32(long_state + j, z);
 
 			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
-			j = ( ( *t1 & MASK ) >> 2 ) + sub;
+			j = ((*t1 & MASK) >> 2) + sub;
 
 			uint32_t yy[2];
-			*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
+			*((uint64_t*)yy) = loadGlobal64<uint64_t>(((uint64_t*)long_state) + (j >> 1));
 			uint32_t zz[2];
-			zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
-			zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
+			zz[0] = shuffle<4>(sPtr, sub, yy[0], 0);
+			zz[1] = shuffle<4>(sPtr, sub, yy[1], 0);
 
-			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
-			#pragma unroll
-			for ( k = 0; k < 2; k++ )
-				t2[k] = shuffle<4>(sPtr,sub, a, k + sub2);
+			t1[1] = shuffle<4>(sPtr, sub, d[x], 1);
+#pragma unroll
+			for(k = 0; k < 2; k++)
+				t2[k] = shuffle<4>(sPtr, sub, a, k + sub2);
 
-            *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) );
+			*((uint64_t*)t2) += sub2 ? (*((uint64_t*)t1) * *((uint64_t*)zz)) : __umul64hi(*((uint64_t*)t1), *((uint64_t*)zz));
 
-			res = *( (uint64_t *) t2 )  >> ( sub & 1 ? 32 : 0 );
+			res = *((uint64_t*)t2) >> (sub & 1 ? 32 : 0);
 
 			if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 			{
 				const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
 				uint32_t long_state_update = sub2 ? tweaked_res : res;
 
-				if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
+				if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
 				{
-					uint32_t value = shuffle<4>(sPtr,sub, long_state_update, sub & 1) ^ long_state_update;
+					uint32_t value = shuffle<4>(sPtr, sub, long_state_update, sub & 1) ^ long_state_update;
 					long_state_update = sub >= 2 ? value : long_state_update;
 				}
 
-				storeGlobal32( long_state + j, long_state_update );
+				storeGlobal32(long_state + j, long_state_update);
 			}
 			else
-				storeGlobal32( long_state + j, res );
+				storeGlobal32(long_state + j, res);
 
-			a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
-			idx0 = shuffle<4>(sPtr,sub, a, 0);
+			a = (sub & 1 ? yy[1] : yy[0]) ^ res;
+			idx0 = shuffle<4>(sPtr, sub, a, 0);
 			if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
 			{
-				int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
-				int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+				int64_t n = loadGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3));
+				int32_t d = loadGlobal32<uint32_t>((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u));
 				int64_t q = fast_div_heavy(n, (d | 0x5));
 
-				if(sub&1)
-					storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+				if(sub & 1)
+					storeGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q);
 
 				idx0 = d ^ q;
 			}
 			else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
 			{
-				int64_t n = loadGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
-				int32_t d = loadGlobal32<uint32_t>( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
+				int64_t n = loadGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3));
+				int32_t d = loadGlobal32<uint32_t>((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u));
 				int64_t q = fast_div_heavy(n, (d | 0x5));
 
-				if(sub&1)
-					storeGlobal64<uint64_t>( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
+				if(sub & 1)
+					storeGlobal64<uint64_t>(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q);
 
 				idx0 = (~d) ^ q;
 			}
 		}
 	}
 
-	if ( bfactor > 0 )
+	if(bfactor > 0)
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
 		(d_ctx_b + thread * 4)[sub] = d[1];
 		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
-			if(sub&1)
+			if(sub & 1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
 		if(ALGO == cryptonight_conceal)
 			*(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var);
 	}
 }
 
-template<xmrstak_algo_id ALGO>
+template <xmrstak_algo_id ALGO>
 __global__ void cryptonight_core_gpu_phase3(
-	const uint32_t ITERATIONS,  const size_t MEMORY,
-	int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 )
+	const uint32_t ITERATIONS, const size_t MEMORY,
+	int threads, int bfactor, int partidx, const uint32_t* __restrict__ long_state, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_key2)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	cn_aes_gpu_init( sharedMemory );
-	__syncthreads( );
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
 
-	int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
-	int subv = ( threadIdx.x & 7 );
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
+	int subv = (threadIdx.x & 7);
 	int sub = subv << 2;
 
 	const int batchsize = MEMORY >> bfactor;
 	const int start = (partidx % (1 << bfactor)) * batchsize;
 	const int end = start + batchsize;
 
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	uint32_t key[40], text[4];
-	MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
-	MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 );
+	MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
+	MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
 
-	__syncthreads( );
+	__syncthreads();
 
-#if( __CUDA_ARCH__ < 300 )
+#if(__CUDA_ARCH__ < 300)
 	extern __shared__ uint32_t shuffleMem[];
-	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8));
+	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFF8));
 #else
 	volatile uint32_t* sPtr = NULL;
 #endif
 
-	for ( int i = start; i < end; i += 32 )
+	for(int i = start; i < end; i += 32)
 	{
-		#pragma unroll
-		for ( int j = 0; j < 4; ++j )
-			text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)];
+#pragma unroll
+		for(int j = 0; j < 4; ++j)
+			text[j] ^= long_state[((IndexType)thread * MEMORY) + (sub + i + j)];
 
-		cn_aes_pseudo_round_mut( sharedMemory, text, key );
+		cn_aes_pseudo_round_mut(sharedMemory, text, key);
 
 		if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
 			ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
-			#pragma unroll
-			for ( int j = 0; j < 4; ++j )
-				text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7);
+#pragma unroll
+			for(int j = 0; j < 4; ++j)
+				text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv + 1) & 7);
 		}
 	}
 
-	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
+	MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
 }
 
-template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+template <xmrstak_algo_id ALGO, uint32_t MEM_MODE>
 void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo)
 {
 	uint32_t MASK = algo.Mask();
 	uint32_t ITERATIONS = algo.Iter();
-	size_t MEM = algo.Mem()/4;
+	size_t MEM = algo.Mem() / 4;
 
-	dim3 grid( ctx->device_blocks );
-	dim3 block( ctx->device_threads );
-	dim3 block2( ctx->device_threads << 1 );
-	dim3 block4( ctx->device_threads << 2 );
-	dim3 block8( ctx->device_threads << 3 );
+	dim3 grid(ctx->device_blocks);
+	dim3 block(ctx->device_threads);
+	dim3 block2(ctx->device_threads << 1);
+	dim3 block4(ctx->device_threads << 2);
+	dim3 block8(ctx->device_threads << 3);
 
 	int partcount = 1 << ctx->device_bfactor;
 
@@ -823,27 +829,29 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 	 * kernel splitting if the user defined a `bfactor >= 5`
 	 */
 	int bfactorOneThree = ctx->device_bfactor - 4;
-	if( bfactorOneThree < 0 )
+	if(bfactorOneThree < 0)
 		bfactorOneThree = 0;
 
 	int partcountOneThree = 1 << bfactorOneThree;
 
-	for ( int i = 0; i < partcountOneThree; i++ )
+	for(int i = 0; i < partcountOneThree; i++)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>(
-			ITERATIONS,
-			MEM,
-			ctx->device_blocks*ctx->device_threads,
-			bfactorOneThree, i,
-			ctx->d_long_state,
-			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
-			ctx->d_ctx_key1 ));
-
-		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<<grid, block8>>>(
+											  ITERATIONS,
+											  MEM,
+											  ctx->device_blocks * ctx->device_threads,
+											  bfactorOneThree, i,
+											  ctx->d_long_state,
+											  (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+											  ctx->d_ctx_key1));
+
+		if(partcount > 1 && ctx->device_bsleep > 0)
+			compat_usleep(ctx->device_bsleep);
 	}
-	if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+	if(partcount > 1 && ctx->device_bsleep > 0)
+		compat_usleep(ctx->device_bsleep);
 
-	for ( int i = 0; i < partcount; i++ )
+	for(int i = 0; i < partcount; i++)
 	{
 		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 		{
@@ -856,12 +864,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 					block2,
 					sizeof(uint64_t) * block.x * 8 +
 						// shuffle memory for fermi gpus
-						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-				>>>(
+						block2.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
 					ITERATIONS,
 					MEM,
 					MASK,
-					ctx->device_blocks*ctx->device_threads,
+					ctx->device_blocks * ctx->device_threads,
 					ctx->device_bfactor,
 					i,
 					ctx->d_long_state,
@@ -869,28 +876,24 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 					ctx->d_ctx_b,
 					ctx->d_ctx_state,
 					nonce,
-					ctx->d_input
-				)
-			);
+					ctx->d_input));
 		}
 		else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r)
 		{
-			int numThreads = ctx->device_blocks*ctx->device_threads;
+			int numThreads = ctx->device_blocks * ctx->device_threads;
 			void* args[] = {
 				&ITERATIONS, &MEM, &MASK,
 				&numThreads, &ctx->device_bfactor, &i,
-				&ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input
-			};
+				&ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input};
 			CU_CHECK(ctx->device_id, cuLaunchKernel(
-				ctx->kernel,
-				grid.x, grid.y, grid.z,
-				block2.x, block2.y, block2.z,
-				sizeof(uint64_t) * block.x * 8 +
-						// shuffle memory for fermi gpus
-						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ),
-				nullptr,
-				args, 0
-			));
+										 ctx->kernel,
+										 grid.x, grid.y, grid.z,
+										 block2.x, block2.y, block2.z,
+										 sizeof(uint64_t) * block.x * 8 +
+											 // shuffle memory for fermi gpus
+											 block2.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3),
+										 nullptr,
+										 args, 0));
 			CU_CHECK(ctx->device_id, cuCtxSynchronize());
 		}
 		else
@@ -901,12 +904,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 				cryptonight_core_gpu_phase2_quad<ALGO><<<
 					grid,
 					block4,
-					block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-				>>>(
+					block4.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
 					ITERATIONS,
 					MEM,
 					MASK,
-					ctx->device_blocks*ctx->device_threads,
+					ctx->device_blocks * ctx->device_threads,
 					ctx->device_bfactor,
 					i,
 					ctx->d_long_state,
@@ -914,57 +916,54 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 					ctx->d_ctx_b,
 					ctx->d_ctx_state,
 					nonce,
-					ctx->d_input
-				)
-			);
+					ctx->d_input));
 		}
 
-		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
+		if(partcount > 1 && ctx->device_bsleep > 0)
+			compat_usleep(ctx->device_bsleep);
 	}
 
 	int roundsPhase3 = partcountOneThree;
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven|| ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast )
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		// cryptonight_heavy used two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
 	}
 
-	for ( int i = 0; i < roundsPhase3; i++ )
+	for(int i = 0; i < roundsPhase3; i++)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
-			grid,
-			block8,
-			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-		>>>(
-			ITERATIONS,
-			MEM,
-			ctx->device_blocks*ctx->device_threads,
-			bfactorOneThree, i,
-			ctx->d_long_state,
-			ctx->d_ctx_state, ctx->d_ctx_key2 ));
+											  grid,
+											  block8,
+											  block8.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
+											  ITERATIONS,
+											  MEM,
+											  ctx->device_blocks * ctx->device_threads,
+											  bfactorOneThree, i,
+											  ctx->d_long_state,
+											  ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 }
 
-template<xmrstak_algo_id ALGO, uint32_t MEM_MODE>
+template <xmrstak_algo_id ALGO, uint32_t MEM_MODE>
 void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo)
 {
 	const uint32_t MASK = algo.Mask();
 	const uint32_t ITERATIONS = algo.Iter();
 	const size_t MEM = algo.Mem();
 
-	dim3 grid( ctx->device_blocks );
-	dim3 block( ctx->device_threads );
-	dim3 block2( ctx->device_threads << 1 );
-	dim3 block4( ctx->device_threads << 2 );
-	dim3 block8( ctx->device_threads << 3 );
+	dim3 grid(ctx->device_blocks);
+	dim3 block(ctx->device_threads);
+	dim3 block2(ctx->device_threads << 1);
+	dim3 block4(ctx->device_threads << 2);
+	dim3 block8(ctx->device_threads << 3);
 
 	size_t intensity = ctx->device_blocks * ctx->device_threads;
 
 	CUDA_CHECK_KERNEL(
 		ctx->device_id,
-		xmrstak::nvidia::cn_explode_gpu<<<intensity,32>>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state)
-	);
+		xmrstak::nvidia::cn_explode_gpu<<<intensity, 32>>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state));
 
 	int partcount = 1 << ctx->device_bfactor;
 	for(int i = 0; i < partcount; i++)
@@ -972,20 +971,16 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_
 		CUDA_CHECK_KERNEL(
 			ctx->device_id,
 			// 36 x 16byte x numThreads
-			xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu
-				<<<ctx->device_blocks, ctx->device_threads * 16,  32 * 16 * ctx->device_threads>>>
-				(
-					ITERATIONS,
-					MEM,
-					MASK,
-					(int*)ctx->d_ctx_state,
-					(int*)ctx->d_long_state,
-					ctx->device_bfactor,
-					i,
-					ctx->d_ctx_a,
-					ctx->d_ctx_b
-				)
-		);
+			xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu<<<ctx->device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>>(
+				ITERATIONS,
+				MEM,
+				MASK,
+				(int*)ctx->d_ctx_state,
+				(int*)ctx->d_long_state,
+				ctx->device_bfactor,
+				i,
+				ctx->d_ctx_a,
+				ctx->d_ctx_b));
 	}
 
 	/* bfactor for phase 3
@@ -994,32 +989,31 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_
 	 * kernel splitting if the user defined a `bfactor >= 5`
 	 */
 	int bfactorOneThree = ctx->device_bfactor - 4;
-	if( bfactorOneThree < 0 )
+	if(bfactorOneThree < 0)
 		bfactorOneThree = 0;
 
 	int partcountOneThree = 1 << bfactorOneThree;
 	int roundsPhase3 = partcountOneThree;
 
 	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
-		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast )
+		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		// cryptonight_heavy used two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
 	}
 
-	for ( int i = 0; i < roundsPhase3; i++ )
+	for(int i = 0; i < roundsPhase3; i++)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
-			grid,
-			block8,
-			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-		>>>(
-			ITERATIONS,
-			MEM/4,
-			ctx->device_blocks*ctx->device_threads,
-			bfactorOneThree, i,
-			ctx->d_long_state,
-			ctx->d_ctx_state, ctx->d_ctx_key2 ));
+											  grid,
+											  block8,
+											  block8.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
+											  ITERATIONS,
+											  MEM / 4,
+											  ctx->device_blocks * ctx->device_threads,
+											  bfactorOneThree, i,
+											  ctx->d_long_state,
+											  ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 }
 
@@ -1030,7 +1024,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 	{
 		if(ctx->kernel_height != chain_height || ctx->cached_algo != miner_algo)
 		{
-			 if(ctx->module)
+			if(ctx->module)
 				cuModuleUnload(ctx->module);
 
 			uint32_t PRECOMPILATION_DEPTH = 4;
@@ -1045,15 +1039,16 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 			ctx->kernel_height = chain_height;
 			ctx->cached_algo = miner_algo;
 
-			for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
+			for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i)
 				xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo,
 					chain_height + i, PRECOMPILATION_DEPTH, ctx->device_arch[0], ctx->device_arch[1], true);
 		}
 	}
 
-	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo);
+	typedef void (*cuda_hash_fn)(nvid_ctx * ctx, uint32_t nonce, const xmrstak_algo& algo);
 
-	if(miner_algo == invalid_algo) return;
+	if(miner_algo == invalid_algo)
+		return;
 
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash<cryptonight, 0>,
@@ -1105,13 +1100,11 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 		cryptonight_core_gpu_hash<cryptonight_r, 1>,
 
 		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 0>,
-		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 1>
-	};
+		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 1>};
 
 	std::bitset<1> digit;
 	digit.set(0, ctx->memMode == 1);
 
-	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
+	cuda_hash_fn selected_function = func_table[((miner_algo - 1u) << 1) | digit.to_ulong()];
 	selected_function(ctx, startNonce, miner_algo);
-
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
index fee7e13d14daac353893114f35a035b5479eed24..a66804ecffd9a622881c2bba891432b6a5fc681f 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -1,11 +1,11 @@
 #pragma once
 
+#include <cstdint>
 #include <cuda_runtime.h>
 #include <stdio.h>
-#include <cstdint>
 
-#include "cuda_keccak.hpp"
 #include "cuda_extra.hpp"
+#include "cuda_keccak.hpp"
 
 namespace xmrstak
 {
@@ -15,7 +15,7 @@ namespace nvidia
 struct __m128i : public int4
 {
 
-	__forceinline__ __device__ __m128i(){}
+	__forceinline__ __device__ __m128i() {}
 
 	__forceinline__ __device__ __m128i(
 		const uint32_t x0, const uint32_t x1,
@@ -27,7 +27,7 @@ struct __m128i : public int4
 		w = x3;
 	}
 
-	__forceinline__ __device__ __m128i( const int x0)
+	__forceinline__ __device__ __m128i(const int x0)
 	{
 		x = x0;
 		y = x0;
@@ -41,8 +41,7 @@ struct __m128i : public int4
 			x | other.x,
 			y | other.y,
 			z | other.z,
-			w | other.w
-		);
+			w | other.w);
 	}
 
 	__forceinline__ __device__ __m128i operator^(const __m128i& other)
@@ -51,15 +50,14 @@ struct __m128i : public int4
 			x ^ other.x,
 			y ^ other.y,
 			z ^ other.z,
-			w ^ other.w
-		);
+			w ^ other.w);
 	}
 };
 
 struct __m128 : public float4
 {
 
-	__forceinline__ __device__ __m128(){}
+	__forceinline__ __device__ __m128() {}
 
 	__forceinline__ __device__ __m128(
 		const float x0, const float x1,
@@ -71,7 +69,7 @@ struct __m128 : public float4
 		float4::w = x3;
 	}
 
-	__forceinline__ __device__ __m128( const float x0)
+	__forceinline__ __device__ __m128(const float x0)
 	{
 		float4::x = x0;
 		float4::y = x0;
@@ -79,7 +77,7 @@ struct __m128 : public float4
 		float4::w = x0;
 	}
 
-	__forceinline__ __device__ __m128( const __m128i& x0)
+	__forceinline__ __device__ __m128(const __m128i& x0)
 	{
 		float4::x = int2float(x0.x);
 		float4::y = int2float(x0.y);
@@ -87,14 +85,13 @@ struct __m128 : public float4
 		float4::w = int2float(x0.w);
 	}
 
-	__forceinline__ __device__ __m128i get_int( )
+	__forceinline__ __device__ __m128i get_int()
 	{
 		return __m128i(
 			(int)x,
 			(int)y,
 			(int)z,
-			(int)w
-		);
+			(int)w);
 	}
 
 	__forceinline__ __device__ __m128 operator+(const __m128& other)
@@ -103,8 +100,7 @@ struct __m128 : public float4
 			x + other.x,
 			y + other.y,
 			z + other.z,
-			w + other.w
-		);
+			w + other.w);
 	}
 
 	__forceinline__ __device__ __m128 operator-(const __m128& other)
@@ -113,8 +109,7 @@ struct __m128 : public float4
 			x - other.x,
 			y - other.y,
 			z - other.z,
-			w - other.w
-		);
+			w - other.w);
 	}
 
 	__forceinline__ __device__ __m128 operator*(const __m128& other)
@@ -123,8 +118,7 @@ struct __m128 : public float4
 			x * other.x,
 			y * other.y,
 			z * other.z,
-			w * other.w
-		);
+			w * other.w);
 	}
 
 	__forceinline__ __device__ __m128 operator/(const __m128& other)
@@ -133,67 +127,64 @@ struct __m128 : public float4
 			x / other.x,
 			y / other.y,
 			z / other.z,
-			w / other.w
-		);
+			w / other.w);
 	}
 
 	__forceinline__ __device__ __m128& trunc()
 	{
-		x=::truncf(x);
-		y=::truncf(y);
-		z=::truncf(z);
-		w=::truncf(w);
+		x = ::truncf(x);
+		y = ::truncf(y);
+		z = ::truncf(z);
+		w = ::truncf(w);
 
 		return *this;
 	}
 
 	__forceinline__ __device__ __m128& abs()
 	{
-		x=::fabsf(x);
-		y=::fabsf(y);
-		z=::fabsf(z);
-		w=::fabsf(w);
+		x = ::fabsf(x);
+		y = ::fabsf(y);
+		z = ::fabsf(z);
+		w = ::fabsf(w);
 
 		return *this;
 	}
 
 	__forceinline__ __device__ __m128& floor()
 	{
-		x=::floorf(x);
-		y=::floorf(y);
-		z=::floorf(z);
-		w=::floorf(w);
+		x = ::floorf(x);
+		y = ::floorf(y);
+		z = ::floorf(z);
+		w = ::floorf(w);
 
 		return *this;
 	}
 };
 
-
-template<typename T>
+template <typename T>
 __device__ void print(const char* name, T value)
 {
 	printf("g %s: ", name);
 	for(int i = 0; i < 4; ++i)
 	{
-		printf("%08X ",((uint32_t*)&value)[i]);
+		printf("%08X ", ((uint32_t*)&value)[i]);
 	}
 	printf("\n");
 }
 
-template<>
+template <>
 __device__ void print<__m128>(const char* name, __m128 value)
 {
 	printf("g %s: ", name);
 	for(int i = 0; i < 4; ++i)
 	{
-		printf("%f ",((float*)&value)[i]);
+		printf("%f ", ((float*)&value)[i]);
 	}
 	printf("\n");
 }
 
 #define SHOW(name) print(#name, name)
 
-
 __forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b)
 {
 	return a + b;
@@ -220,8 +211,7 @@ __forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b)
 		int_as_float(float_as_int(a.x) & b),
 		int_as_float(float_as_int(a.y) & b),
 		int_as_float(float_as_int(a.z) & b),
-		int_as_float(float_as_int(a.w) & b)
-	);
+		int_as_float(float_as_int(a.w) & b));
 }
 
 __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b)
@@ -230,8 +220,7 @@ __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b)
 		int_as_float(float_as_int(a.x) | b),
 		int_as_float(float_as_int(a.y) | b),
 		int_as_float(float_as_int(a.z) | b),
-		int_as_float(float_as_int(a.w) | b)
-	);
+		int_as_float(float_as_int(a.w) | b));
 }
 
 __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b)
@@ -240,20 +229,18 @@ __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b)
 		int_as_float(float_as_int(a.x) ^ b),
 		int_as_float(float_as_int(a.y) ^ b),
 		int_as_float(float_as_int(a.z) ^ b),
-		int_as_float(float_as_int(a.w) ^ b)
-	);
+		int_as_float(float_as_int(a.w) ^ b));
 }
 
 __forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc)
 {
 	__m128 d(dc);
 	__m128 c = _mm_div_ps(v, d);
-	c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+	c.trunc(); //_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
 	// c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2
 	c = _mm_mul_ps(c, d);
 	return _mm_sub_ps(v, c);
 
-
 	//return a.fmodf(b);
 }
 
@@ -262,23 +249,20 @@ __forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b)
 	return a ^ b;
 }
 
-
 __forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot)
 {
 	const uint32_t right = 8 * rot;
 	const uint32_t left = (32 - 8 * rot);
 	return __m128i(
-		((uint32_t)a.x >> right) | ( a.y << left ),
-		((uint32_t)a.y >> right) | ( a.z << left ),
-		((uint32_t)a.z >> right) | ( a.w << left ),
-		((uint32_t)a.w >> right) | ( a.x << left )
-	);
+		((uint32_t)a.x >> right) | (a.y << left),
+		((uint32_t)a.y >> right) | (a.z << left),
+		((uint32_t)a.z >> right) | (a.w << left),
+		((uint32_t)a.w >> right) | (a.x << left));
 }
 
-__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); }
-
+__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int* lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); }
 
-__forceinline__ __device__  __m128 fma_break(__m128 x)
+__forceinline__ __device__ __m128 fma_break(__m128 x)
 {
 	// Break the dependency chain by setitng the exp to ?????01
 	x = _mm_and_ps(x, 0xFEFFFFFF);
@@ -290,13 +274,13 @@ __forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m12
 {
 	n1 = _mm_add_ps(n1, c);
 	__m128 nn = _mm_mul_ps(n0, c);
-	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn));
 	nn = fma_break(nn);
 	n = _mm_add_ps(n, nn);
 
 	n3 = _mm_sub_ps(n3, c);
 	__m128 dd = _mm_mul_ps(n2, c);
-	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd));
 	dd = fma_break(dd);
 	d = _mm_add_ps(d, dd);
 
@@ -326,7 +310,7 @@ __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, _
 	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
 	d = _mm_and_ps(d, 0xFF7FFFFF);
 	d = _mm_or_ps(d, 0x40000000);
-	r =_mm_add_ps(r, _mm_div_ps(n,d));
+	r = _mm_add_ps(r, _mm_div_ps(n, d));
 }
 
 // 74*8 = 595
@@ -335,15 +319,14 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n
 	__m128 c(cnt);
 	// 35 maths calls follow (140 FLOPS)
 	__m128 r = __m128(0.0f);
-	for(int i=0; i< 4; ++i)
+	for(int i = 0; i < 4; ++i)
 		round_compute(n0, n1, n2, n3, rnd_c, c, r);
 	// do a quick fmod by setting exp to 2
 	r = _mm_and_ps(r, 0x807FFFFF);
 	r = _mm_or_ps(r, 0x40000000);
-	sum = r; // 34
+	sum = r;								 // 34
 	r = _mm_mul_ps(r, __m128(536870880.0f)); // 35
 	return r.get_int();
-
 }
 
 __forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
@@ -376,8 +359,7 @@ __constant__ uint32_t look[16][4] = {
 	{3, 1, 2, 0},
 	{3, 2, 0, 1},
 	{3, 0, 1, 2},
-	{3, 0, 2, 1}
-};
+	{3, 0, 2, 1}};
 
 __constant__ float ccnt[16] = {
 	1.34375f,
@@ -398,16 +380,14 @@ __constant__ float ccnt[16] = {
 	1.3203125f,
 	1.3515625f,
 	1.3359375f,
-	1.4609375f
-};
-
+	1.4609375f};
 
 __forceinline__ __device__ void sync()
 {
-#if (__CUDACC_VER_MAJOR__ >= 9)
+#if(__CUDACC_VER_MAJOR__ >= 9)
 	__syncwarp();
 #else
-	__syncthreads( );
+	__syncthreads();
 #endif
 }
 
@@ -418,11 +398,11 @@ struct SharedMemChunk
 };
 
 __global__ void cryptonight_core_gpu_phase2_gpu(
-	const uint32_t ITERATIONS,  const size_t MEMORY, const uint32_t MASK,
-	int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
+	const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK,
+	int32_t* spad, int* lpad_in, int bfactor, int partidx, uint32_t* roundVs, uint32_t* roundS)
 {
 
-	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+	const int batchsize = (ITERATIONS * 2) >> (1 + bfactor);
 
 	extern __shared__ SharedMemChunk smemExtern_in[];
 
@@ -435,7 +415,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 
 	uint32_t tid = threadIdx.x % 16;
 
-	const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16;
+	const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x / 16;
 	uint32_t s = 0;
 
 	__m128 vs(0);
@@ -470,8 +450,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 			*(smem->out + look[tid][2]),
 			*(smem->out + look[tid][3]),
 			ccnt[tid], rc, smem->va[tid],
-			smem->out[tid]
-		);
+			smem->out[tid]);
 
 		sync();
 
@@ -483,7 +462,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 		((int*)smem->out)[tid] = outXor;
 
 		float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4];
-		float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12];
+		float va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12];
 		((float*)smem->va)[tid] = va_tmp1 + va_tmp2;
 
 		sync();
@@ -505,10 +484,10 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 		vs = _mm_div_ps(vs, __m128(64.0f));
 		s = out2.x ^ out2.y ^ out2.z ^ out2.w;
 	}
-	if(partidx != ((1<<bfactor) - 1) && threadIdx.x % 16 == 0)
+	if(partidx != ((1 << bfactor) - 1) && threadIdx.x % 16 == 0)
 	{
 		const uint32_t numHashPerBlock2 = blockDim.x / 16;
-		const uint32_t idxHash2 = blockIdx.x * numHashPerBlock2 + threadIdx.x/16;
+		const uint32_t idxHash2 = blockIdx.x * numHashPerBlock2 + threadIdx.x / 16;
 		((__m128*)roundVs)[idxHash2] = vs;
 		roundS[idxHash2] = s;
 	}
@@ -519,30 +498,29 @@ __forceinline__ __device__ void generate_512(uint64_t idx, const uint64_t* in, u
 	uint64_t hash[25];
 
 	hash[0] = in[0] ^ idx;
-	#pragma unroll 24
+#pragma unroll 24
 	for(int i = 1; i < 25; ++i)
 		hash[i] = in[i];
 
 	cn_keccakf2(hash);
-	#pragma unroll 10
+#pragma unroll 10
 	for(int i = 0; i < 10; ++i)
 		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
-	out+=160;
+	out += 160;
 
 	cn_keccakf2(hash);
-	#pragma unroll 11
+#pragma unroll 11
 	for(int i = 0; i < 11; ++i)
 		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
-	out+=176;
+	out += 176;
 
 	cn_keccakf2(hash);
-	#pragma unroll 11
+#pragma unroll 11
 	for(int i = 0; i < 11; ++i)
 		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
 }
 
-
-__global__ void cn_explode_gpu(const size_t MEMORY, int32_t *spad_in, int *lpad_in)
+__global__ void cn_explode_gpu(const size_t MEMORY, int32_t* spad_in, int* lpad_in)
 {
 	__shared__ uint64_t state[25];
 
@@ -554,11 +532,11 @@ __global__ void cn_explode_gpu(const size_t MEMORY, int32_t *spad_in, int *lpad_
 
 	sync();
 
-	for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x)
+	for(uint64_t i = threadIdx.x; i < MEMORY / 512; i += blockDim.x)
 	{
-		generate_512(i, state, (uint8_t*)lpad + i*512);
+		generate_512(i, state, (uint8_t*)lpad + i * 512);
 	}
 }
 
-} // namespace xmrstak
 } // namespace nvidia
+} // namespace xmrstak
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt
index bcf49508009f691fdafbdf48f14690c91ab501e2..214114c7ec0a83dd166800d6024dc9006c185b16 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_r.curt
@@ -462,10 +462,10 @@ __global__ void CryptonightR_phase2(
     uint64_t bx0             = ((uint64_t*)(d_ctx_b + thread * 16))[sub];
     uint64_t bx1             = ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub];
 
-    uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2];
-    uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1];
-    uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2];
-    uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3];
+    volatile uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2]; // r0..r3 are elements 8..11 of this thread's 16-element stride in d_ctx_b
+    volatile uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1]; // NOTE(review): the reason for `volatile` is not visible here - presumably a compiler workaround; confirm
+    volatile uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2];
+    volatile uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3];
 
     const int batchsize      = (ITERATIONS * 2) >> ( 1 + bfactor );
     const int start          = partidx * batchsize;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
index 96cb679f5a165be4f3f82cea2a6d0ace88f47ee6..48ebe4bd7a0bf783e40ebfd895e868df86f84e03 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp
@@ -2,8 +2,8 @@
 #pragma once
 
 #include <cuda_runtime.h>
-#include <stdexcept>
 #include <iostream>
+#include <stdexcept>
 #include <string>
 
 /** execute and check a CUDA api command
@@ -12,27 +12,30 @@
  * @param msg message string which should be added to the error message
  * @param ... CUDA api command
  */
-#define CUDA_CHECK_MSG(id, msg, ...) { \
-	cudaError_t error = __VA_ARGS__; \
-	if(error!=cudaSuccess){	\
-		std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \
-		std::cerr << msg << std::endl;                                         \
-		throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
-	} \
-} \
-( (void) 0 )
-
-#define CU_CHECK(id, ...) {                                                                             \
-    CUresult result = __VA_ARGS__;                                                                      \
-    if(result != CUDA_SUCCESS){                                                                         \
-        const char* s;                                                                                  \
-        cuGetErrorString(result, &s);                                                                   \
-        std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \
-        throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error")); \
-    }                                                                                                   \
-}                                                                                                       \
-( (void) 0 )
+#define CUDA_CHECK_MSG(id, msg, ...)                                                                          \
+	do /* do/while(0): the macro expands to ONE statement, so `if(c) CUDA_CHECK_MSG(...); else ...` compiles */ \
+	{                                                                                                         \
+		cudaError_t error = __VA_ARGS__; /* evaluate the wrapped CUDA runtime call exactly once */            \
+		if(error != cudaSuccess)                                                                              \
+		{                                                                                                     \
+			std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__;                  \
+			std::cerr << msg << std::endl;                                                                    \
+			throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \
+		}                                                                                                     \
+	} while(0)
 
+#define CU_CHECK(id, ...)                                                                                                                                   \
+	do /* do/while(0): the macro expands to ONE statement, so it is safe inside an unbraced if/else */                                                      \
+	{                                                                                                                                                       \
+		CUresult result = __VA_ARGS__; /* evaluate the wrapped CUDA driver call exactly once */                                                             \
+		if(result != CUDA_SUCCESS)                                                                                                                          \
+		{                                                                                                                                                   \
+			const char* s = nullptr; /* stays null if cuGetErrorString does not provide a text */                                                           \
+			cuGetErrorString(result, &s);                                                                                                                   \
+			std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \
+			throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error"));                                                 \
+		}                                                                                                                                                   \
+	} while(0)
 
 /** execute and check a CUDA api command
  *
@@ -47,7 +50,7 @@
  * @param ... CUDA kernel call
  */
 #define CUDA_CHECK_KERNEL(id, ...) \
-	__VA_ARGS__; \
+	__VA_ARGS__;                   \
 	CUDA_CHECK(id, cudaGetLastError())
 
 /** execute and check a CUDA kernel
@@ -57,5 +60,5 @@
  * @param ... CUDA kernel call
  */
 #define CUDA_CHECK_MSG_KERNEL(id, msg, ...) \
-	__VA_ARGS__; \
+	__VA_ARGS__;                            \
 	CUDA_CHECK_MSG(id, msg, cudaGetLastError())
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index b6e41c61960aac53f04e5788921da7b1573fcc09..aa7c1705726864f4e8b0e2072403c2402ac34253 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -1,83 +1,80 @@
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <sstream>
+#include "xmrstak/jconf.hpp"
 #include <algorithm>
-#include <vector>
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include  <algorithm>
-#include "xmrstak/jconf.hpp"
-
+#include <sstream>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <vector>
 
 typedef unsigned char BitSequence;
 typedef unsigned long long DataLength;
 
-#include "xmrstak/backend/cryptonight.hpp"
 #include "cryptonight.hpp"
-#include "cuda_extra.hpp"
-#include "cuda_keccak.hpp"
+#include "cuda_aes.hpp"
 #include "cuda_blake.hpp"
+#include "cuda_device.hpp"
+#include "cuda_extra.hpp"
 #include "cuda_groestl.hpp"
 #include "cuda_jh.hpp"
+#include "cuda_keccak.hpp"
 #include "cuda_skein.hpp"
-#include "cuda_device.hpp"
-#include "cuda_aes.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
 
-__constant__ uint8_t d_sub_byte[16][16] ={
-	{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
-	{0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 },
-	{0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 },
-	{0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 },
-	{0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 },
-	{0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf },
-	{0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 },
-	{0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 },
-	{0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 },
-	{0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb },
-	{0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 },
-	{0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 },
-	{0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a },
-	{0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e },
-	{0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf },
-	{0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }
-};
-
-__device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__ key, const uint32_t * __restrict__ data )
+__constant__ uint8_t d_sub_byte[16][16] = {
+	{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76},
+	{0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0},
+	{0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15},
+	{0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75},
+	{0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84},
+	{0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf},
+	{0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8},
+	{0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2},
+	{0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73},
+	{0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb},
+	{0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79},
+	{0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08},
+	{0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a},
+	{0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e},
+	{0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf},
+	{0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}};
+
+__device__ __forceinline__ void cryptonight_aes_set_key(uint32_t* __restrict__ key, const uint32_t* __restrict__ data)
 {
 	int i, j;
 	uint8_t temp[4];
-	const uint32_t aes_gf[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
+	const uint32_t aes_gf[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36};
 
-	MEMSET4( key, 0, 40 );
-	MEMCPY4( key, data, 8 );
+	MEMSET4(key, 0, 40);
+	MEMCPY4(key, data, 8);
 
 #pragma unroll
-	for ( i = 8; i < 40; i++ )
+	for(i = 8; i < 40; i++)
 	{
-		*(uint32_t *) temp = key[i - 1];
-		if ( i % 8 == 0 )
+		*(uint32_t*)temp = key[i - 1];
+		if(i % 8 == 0)
 		{
-			*(uint32_t *) temp = ROTR32( *(uint32_t *) temp, 8 );
-			for ( j = 0; j < 4; j++ )
-				temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f];
-			*(uint32_t *) temp ^= aes_gf[i / 8 - 1];
+			*(uint32_t*)temp = ROTR32(*(uint32_t*)temp, 8);
+			for(j = 0; j < 4; j++)
+				temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f];
+			*(uint32_t*)temp ^= aes_gf[i / 8 - 1];
 		}
 		else
 		{
-			if ( i % 8 == 4 )
+			if(i % 8 == 4)
 			{
 #pragma unroll
-				for ( j = 0; j < 4; j++ )
-					temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f];
+				for(j = 0; j < 4; j++)
+					temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f];
 			}
 		}
 
-		key[i] = key[( i - 8 )] ^ *(uint32_t *) temp;
+		key[i] = key[(i - 8)] ^ *(uint32_t*)temp;
 	}
 }
 
-__device__ __forceinline__ void mix_and_propagate( uint32_t* state )
+__device__ __forceinline__ void mix_and_propagate(uint32_t* state)
 {
 	uint32_t tmp0[4];
 	for(size_t x = 0; x < 4; ++x)
@@ -93,18 +90,18 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state )
 		(state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x];
 }
 
-template<xmrstak_algo_id ALGO>
-__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
+template <xmrstak_algo_id ALGO>
+__global__ void cryptonight_extra_gpu_prepare(int threads, uint32_t* __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_state2, uint32_t* __restrict__ d_ctx_a, uint32_t* __restrict__ d_ctx_b, uint32_t* __restrict__ d_ctx_key1, uint32_t* __restrict__ d_ctx_key2)
 {
-	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	__shared__ uint32_t sharedMemory[1024];
 
 	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
-		cn_aes_gpu_init( sharedMemory );
-		__syncthreads( );
+		cn_aes_gpu_init(sharedMemory);
+		__syncthreads();
 	}
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	uint32_t ctx_state[50];
@@ -114,29 +111,29 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	uint32_t ctx_key2[40];
 	uint32_t input[32];
 
-	memcpy( input, d_input, len );
+	memcpy(input, d_input, len);
 	//*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
 	uint32_t nonce = startNonce + thread;
-	for ( int i = 0; i < sizeof (uint32_t ); ++i )
-		( ( (char *) input ) + 39 )[i] = ( (char*) ( &nonce ) )[i]; //take care of pointer alignment
+	for(int i = 0; i < sizeof(uint32_t); ++i)
+		(((char*)input) + 39)[i] = ((char*)(&nonce))[i]; //take care of pointer alignment
 
-	cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state );
-	cryptonight_aes_set_key( ctx_key1, ctx_state );
-	cryptonight_aes_set_key( ctx_key2, ctx_state + 8 );
+	cn_keccak((uint8_t*)input, len, (uint8_t*)ctx_state);
+	cryptonight_aes_set_key(ctx_key1, ctx_state);
+	cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
 
-	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
-	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
-	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
+	XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
+	XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
+	memcpy(d_ctx_a + thread * 4, ctx_a, 4 * 4);
 	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 	{
-		memcpy( d_ctx_b + thread * 16, ctx_b, 4 * 4 );
+		memcpy(d_ctx_b + thread * 16, ctx_b, 4 * 4);
 		// bx1
-		XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b );
-		memcpy( d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4 );
+		XOR_BLOCKS_DST(ctx_state + 16, ctx_state + 20, ctx_b);
+		memcpy(d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4);
 		// division_result
-		memcpy( d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2 );
+		memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2);
 		// sqrt_result
-		memcpy( d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2 );
+		memcpy(d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2);
 	}
 	else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r)
 	{
@@ -148,31 +145,31 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 		memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 8);
 	}
 	else
-		memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+		memcpy(d_ctx_b + thread * 4, ctx_b, 4 * 4);
 
-	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
-	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
-	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
+	memcpy(d_ctx_key1 + thread * 40, ctx_key1, 40 * 4);
+	memcpy(d_ctx_key2 + thread * 40, ctx_key2, 40 * 4);
+	memcpy(d_ctx_state + thread * 50, ctx_state, 50 * 4);
 
 	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 
-		for(int i=0; i < 16; i++)
+		for(int i = 0; i < 16; i++)
 		{
 			for(size_t t = 4; t < 12; ++t)
 			{
-				cn_aes_pseudo_round_mut( sharedMemory, ctx_state + 4u * t, ctx_key1 );
+				cn_aes_pseudo_round_mut(sharedMemory, ctx_state + 4u * t, ctx_key1);
 			}
 			// scipt first 4 * 128bit blocks = 4 * 4 uint32_t values
 			mix_and_propagate(ctx_state + 4 * 4);
 		}
 		// double buffer to move manipulated state into phase1
-		memcpy( d_ctx_state2 + thread * 50, ctx_state, 50 * 4 );
+		memcpy(d_ctx_state2 + thread * 50, ctx_state, 50 * 4);
 	}
 }
 
-template<xmrstak_algo_id ALGO>
-__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 )
+template <xmrstak_algo_id ALGO>
+__global__ void cryptonight_extra_gpu_final(int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t* __restrict__ d_res_nonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_key2)
 {
 	const int thread = blockDim.x * blockIdx.x + threadIdx.x;
 
@@ -181,19 +178,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
 		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
-		cn_aes_gpu_init( sharedMemory );
-		__syncthreads( );
+		cn_aes_gpu_init(sharedMemory);
+		__syncthreads();
 	}
-	if ( thread >= threads )
+	if(thread >= threads)
 		return;
 
 	int i;
-	uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50;
+	uint32_t* __restrict__ ctx_state = d_ctx_state + thread * 50;
 	uint64_t hash[4];
 	uint32_t state[50];
 
-	#pragma unroll
-	for ( i = 0; i < 50; i++ )
+#pragma unroll
+	for(i = 0; i < 50; i++)
 		state[i] = ctx_state[i];
 
 	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
@@ -202,25 +199,25 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 		uint32_t key[40];
 
 		// load keys
-		MEMCPY8( key, d_ctx_key2 + thread * 40, 20 );
+		MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
 
-		for(int i=0; i < 16; i++)
+		for(int i = 0; i < 16; i++)
 		{
 			for(size_t t = 4; t < 12; ++t)
 			{
-				cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key );
+				cn_aes_pseudo_round_mut(sharedMemory, state + 4u * t, key);
 			}
 			// scipt first 4 * 128bit blocks = 4 * 4 uint32_t values
 			mix_and_propagate(state + 4 * 4);
 		}
 	}
-	cn_keccakf2( (uint64_t *) state );
+	cn_keccakf2((uint64_t*)state);
 
 	if(ALGO == cryptonight_gpu)
 	{
-		if ( ((uint64_t*)state)[3] < target )
+		if(((uint64_t*)state)[3] < target)
 		{
-			uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
+			uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF);
 
 			if(idx < 10)
 				d_res_nonce[idx] = thread;
@@ -228,19 +225,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	}
 	else
 	{
-		switch ( ( (uint8_t *) state )[0] & 0x03 )
+		switch(((uint8_t*)state)[0] & 0x03)
 		{
 		case 0:
-			cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash );
+			cn_blake((const uint8_t*)state, 200, (uint8_t*)hash);
 			break;
 		case 1:
-			cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash );
+			cn_groestl((const BitSequence*)state, 200, (BitSequence*)hash);
 			break;
 		case 2:
-			cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash );
+			cn_jh((const BitSequence*)state, 200, (BitSequence*)hash);
 			break;
 		case 3:
-			cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash );
+			cn_skein((const BitSequence*)state, 200, (BitSequence*)hash);
 			break;
 		default:
 			break;
@@ -249,9 +246,9 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
 		// and expect an accurate result for target > 32-bit without implementing carries
 
-		if ( hash[3] < target )
+		if(hash[3] < target)
 		{
-			uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
+			uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF);
 
 			if(idx < 10)
 				d_res_nonce[idx] = thread;
@@ -259,10 +256,10 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	}
 }
 
-extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len )
+extern "C" void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len)
 {
 	ctx->inputlen = len;
-	CUDA_CHECK(ctx->device_id, cudaMemcpy( ctx->d_input, data, len, cudaMemcpyHostToDevice ));
+	CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, data, len, cudaMemcpyHostToDevice));
 }
 
 extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
@@ -290,7 +287,6 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	case 3:
 		CUDA_CHECK(ctx->device_id, cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 		break;
-
 	};
 
 	// prefer shared memory over L1 cache
@@ -314,8 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() ||
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() ||
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() ||
-		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()
-	)
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end())
 	{
 		// extent ctx_b to hold the state of idx0
 		ctx_b_size += sizeof(uint32_t) * wsize;
@@ -326,16 +321,14 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	{
 		ctx_b_size += sizeof(uint32_t) * 4 * wsize;
 	}
-	else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
-		|| (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end()))
+	else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end()))
 	{
 		// bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte)
 		ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize;
 	}
 	else if(
 		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() ||
-		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end()
-	)
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end())
 	{
 		// bx0 (16byte), bx1 (16byte), and [r0, r1, r2, r3] (a 8byte)
 		ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize;
@@ -349,9 +342,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size));
 	// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof (uint32_t ) ));
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
-	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof(uint32_t)));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof(uint32_t)));
+	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof(uint32_t)));
 	CUDA_CHECK_MSG(
 		ctx->device_id,
 		"\n**suggestion: Try to reduce the value of the attribute 'threads' in the NVIDIA config file.**",
@@ -364,106 +357,102 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
 
-	dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
-	dim3 block( threadsperblock );
+	dim3 grid((wsize + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
 
 	if(miner_algo == cryptonight_heavy)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_heavy><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_haven)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_haven><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_haven><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_superfast)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_monero_v8)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_gpu)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_r)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_r_wow)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r_wow><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_r_wow><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_v8_reversewaltz)
 	{
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_v8_reversewaltz><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_v8_reversewaltz><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
 		 * the first is used than in phase3
 		 */
-		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<invalid_algo><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
-			ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<invalid_algo><<<grid, block>>>(wsize, ctx->d_input, ctx->inputlen, startNonce,
+											  ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
 }
 
-extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo)
+extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo)
 {
 	int threadsperblock = 128;
 	uint32_t wsize = ctx->device_blocks * ctx->device_threads;
 
-	dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock );
-	dim3 block( threadsperblock );
+	dim3 grid((wsize + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
 
-	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) ));
-	CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) ));
+	CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0xFF, 10 * sizeof(uint32_t)));
+	CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_count, 0, sizeof(uint32_t)));
 
 	if(miner_algo == cryptonight_heavy)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_heavy><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_haven)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_haven><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_haven><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_superfast)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_bittube2><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_bittube2><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else if(miner_algo == cryptonight_gpu)
 	{
@@ -471,8 +460,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<cryptonight_gpu><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<cryptonight_gpu><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 	else
 	{
@@ -480,16 +468,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 		CUDA_CHECK_MSG_KERNEL(
 			ctx->device_id,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
-			cryptonight_extra_gpu_final<invalid_algo><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
-		);
+			cryptonight_extra_gpu_final<invalid_algo><<<grid, block>>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2));
 	}
 
-	CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
+	CUDA_CHECK(ctx->device_id, cudaMemcpy(rescount, ctx->d_result_count, sizeof(uint32_t), cudaMemcpyDeviceToHost));
 	CUDA_CHECK_MSG(
 		ctx->device_id,
 		"\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**",
-		cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )
-	);
+		cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 	/* There is only a 32bit limit for the counter on the device side
 	 * therefore this value can be greater than 10, in that case limit rescount
@@ -497,11 +483,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 	 */
 	if(*rescount > 10)
 		*rescount = 10;
-	for(int i=0; i < *rescount; i++)
+	for(int i = 0; i < *rescount; i++)
 		resnonce[i] += startNonce;
 }
 
-extern "C" int cuda_get_devicecount( int* deviceCount)
+extern "C" int cuda_get_devicecount(int* deviceCount)
 {
 	cudaError_t err;
 	*deviceCount = 0;
@@ -587,17 +573,17 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	std::vector<int> arch;
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
-	char const * archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST);
+	char const* archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST);
 #undef XMRSTAK_PP_TOSTRING
 #undef XMRSTAK_PP_TOSTRING1
 	std::stringstream ss(archStringList);
 
 	//transform string list separated with `+` into a vector of integers
 	int tmpArch;
-	while ( ss >> tmpArch )
-		arch.push_back( tmpArch );
+	while(ss >> tmpArch)
+		arch.push_back(tmpArch);
 
-	#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n"
+#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n"
 	if(gpuArch >= 20 && gpuArch < 30)
 	{
 		// compiled binary must support sm_20 for fermi
@@ -618,7 +604,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 *   with a sm_20 only compiled binary
 		 */
 		for(int i = 0; i < arch.size(); ++i)
-			if(arch[i] >= 30  && (minSupportedArch == 0 || arch[i] < minSupportedArch))
+			if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch))
 				minSupportedArch = arch[i];
 		if(minSupportedArch < 30 || gpuArch < minSupportedArch)
 		{
@@ -630,7 +616,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
 	bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
 
-
 	// set all device option those marked as auto (-1) to a valid value
 	if(ctx->device_blocks == -1)
 	{
@@ -700,7 +685,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 		int* tmp;
 		cudaError_t err;
-		#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n"
+#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n"
 		// a device must be selected to get the right memory usage later on
 		err = cudaSetDevice(ctx->device_id);
 		if(err != cudaSuccess)
@@ -716,7 +701,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			return 3;
 		}
 
-
 		size_t freeMemory = 0;
 		size_t totalMemory = 0;
 		CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -746,7 +730,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		size_t usedMem = totalMemory - freeMemory;
 		if(usedMem >= maxMemUsage)
 		{
-			printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str());
+			printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem / byteToMiB).c_str());
 			return 4;
 		}
 		else
@@ -764,8 +748,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() ||
 			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() ||
 			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() ||
-			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()
-		)
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end())
 			perThread += 50 * 4; // state double buffer
 
 		size_t max_intensity = limitedMemory / perThread;
@@ -806,19 +789,18 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			size_t blockOptimal = 8 * ctx->device_mpcount;
 
 			// the following values are calculated with CUDA10 and the occupancy calculator
-			if(gpuArch == 35 || gpuArch/10 == 5 || gpuArch/10 == 6)
-				blockOptimal = 7 *  ctx->device_mpcount;
+			if(gpuArch == 35 || gpuArch / 10 == 5 || gpuArch / 10 == 6)
+				blockOptimal = 7 * ctx->device_mpcount;
 			if(gpuArch == 37)
-				blockOptimal = 14 *  ctx->device_mpcount;
+				blockOptimal = 14 * ctx->device_mpcount;
 			if(gpuArch >= 70)
-				blockOptimal = 6 *  ctx->device_mpcount;
+				blockOptimal = 6 * ctx->device_mpcount;
 
 			if(blockOptimal * threads * hashMemSize < limitedMemory)
 			{
 				ctx->device_threads = threads;
 				ctx->device_blocks = blockOptimal;
 			}
-
 		}
 	}
 	printf("device init succeeded\n");
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
index 4d369f843c3590a7dcd749fe2b023c766194a121..09cdd6646013d1c3d47b4a6176b7cd191370915e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp
@@ -11,22 +11,22 @@ struct uint3
 	unsigned int x, y, z;
 };
 
-struct uint3  threadIdx;
-struct uint3  blockIdx;
-struct uint3  blockDim;
-#define __funnelshift_r(a,b,c) 1
+struct uint3 threadIdx;
+struct uint3 blockIdx;
+struct uint3 blockDim;
+#define __funnelshift_r(a, b, c) 1
 #define __syncthreads()
 #define asm(x)
-#define __shfl(a,b,c) 1
+#define __shfl(a, b, c) 1
 #endif
 
-#define AES_BLOCK_SIZE  16
-#define AES_KEY_SIZE    32
-#define INIT_SIZE_BLK   8
+#define AES_BLOCK_SIZE 16
+#define AES_KEY_SIZE 32
+#define INIT_SIZE_BLK 8
 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B
 
-#define C32(x)    ((uint32_t)(x ## U))
-#define T32(x) ((x) & C32(0xFFFFFFFF))
+#define C32(x) ((uint32_t)(x##U))
+#define T32(x) ((x)&C32(0xFFFFFFFF))
 
 #if __CUDA_ARCH__ >= 350
 __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset)
@@ -34,71 +34,91 @@ __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int
 	uint2 result;
 	if(offset >= 32)
 	{
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.x)
+			: "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.y)
+			: "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
 	}
 	else
 	{
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.x)
+			: "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.y)
+			: "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 	}
-	return  __double_as_longlong(__hiloint2double(result.y, result.x));
+	return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
 #define ROTL64(x, n) (cuda_ROTL64(x, n))
 #else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
+#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
 
 #if __CUDA_ARCH__ < 350
 #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
 #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
 #else
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
+#define ROTL32(x, n) __funnelshift_l((x), (x), (n))
+#define ROTR32(x, n) __funnelshift_r((x), (x), (n))
 #endif
 
-#define MEMSET8(dst,what,cnt) { \
-	int i_memset8; \
-	uint64_t *out_memset8 = (uint64_t *)(dst); \
-	for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \
-		out_memset8[i_memset8] = (what); }
-
-#define MEMSET4(dst,what,cnt) { \
-	int i_memset4; \
-	uint32_t *out_memset4 = (uint32_t *)(dst); \
-	for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \
-		out_memset4[i_memset4] = (what); }
-
-#define MEMCPY8(dst,src,cnt) { \
-	int i_memcpy8; \
-	uint64_t *in_memcpy8 = (uint64_t *)(src); \
-	uint64_t *out_memcpy8 = (uint64_t *)(dst); \
-	for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \
-		out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; }
-
-#define MEMCPY4(dst,src,cnt) { \
-	int i_memcpy4; \
-	uint32_t *in_memcpy4 = (uint32_t *)(src); \
-	uint32_t *out_memcpy4 = (uint32_t *)(dst); \
-	for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \
-		out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; }
-
-#define XOR_BLOCKS(a,b) { \
-	((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \
-	((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; }
-
-#define XOR_BLOCKS_DST(x,y,z) { \
-	((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \
-	((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; }
-
-#define MUL_SUM_XOR_DST(a,c,dst) { \
-	const uint64_t dst0 = ((uint64_t *)dst)[0]; \
-	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \
-	hi += ((uint64_t *)c)[0]; \
-	((uint64_t *)c)[0] = dst0 ^ hi; \
-	((uint64_t *)dst)[0] = hi; \
-	((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \
+#define MEMSET8(dst, what, cnt)                          \
+	{                                                    \
+		int i_memset8;                                   \
+		uint64_t* out_memset8 = (uint64_t*)(dst);        \
+		for(i_memset8 = 0; i_memset8 < cnt; i_memset8++) \
+			out_memset8[i_memset8] = (what);             \
 	}
 
-#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))
+#define MEMSET4(dst, what, cnt)                          \
+	{                                                    \
+		int i_memset4;                                   \
+		uint32_t* out_memset4 = (uint32_t*)(dst);        \
+		for(i_memset4 = 0; i_memset4 < cnt; i_memset4++) \
+			out_memset4[i_memset4] = (what);             \
+	}
+
+#define MEMCPY8(dst, src, cnt)                              \
+	{                                                       \
+		int i_memcpy8;                                      \
+		uint64_t* in_memcpy8 = (uint64_t*)(src);            \
+		uint64_t* out_memcpy8 = (uint64_t*)(dst);           \
+		for(i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++)    \
+			out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; \
+	}
 
+#define MEMCPY4(dst, src, cnt)                              \
+	{                                                       \
+		int i_memcpy4;                                      \
+		uint32_t* in_memcpy4 = (uint32_t*)(src);            \
+		uint32_t* out_memcpy4 = (uint32_t*)(dst);           \
+		for(i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++)    \
+			out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; \
+	}
+
+#define XOR_BLOCKS(a, b)                        \
+	{                                           \
+		((uint64_t*)a)[0] ^= ((uint64_t*)b)[0]; \
+		((uint64_t*)a)[1] ^= ((uint64_t*)b)[1]; \
+	}
+
+#define XOR_BLOCKS_DST(x, y, z)                                        \
+	{                                                                  \
+		((uint64_t*)z)[0] = ((uint64_t*)(x))[0] ^ ((uint64_t*)(y))[0]; \
+		((uint64_t*)z)[1] = ((uint64_t*)(x))[1] ^ ((uint64_t*)(y))[1]; \
+	}
+
+#define MUL_SUM_XOR_DST(a, c, dst)                                                                           \
+	{                                                                                                        \
+		const uint64_t dst0 = ((uint64_t*)dst)[0];                                                           \
+		uint64_t hi, lo = cuda_mul128(((uint64_t*)a)[0], dst0, &hi) + ((uint64_t*)c)[1];                     \
+		hi += ((uint64_t*)c)[0];                                                                             \
+		((uint64_t*)c)[0] = dst0 ^ hi;                                                                       \
+		((uint64_t*)dst)[0] = hi;                                                                            \
+		((uint64_t*)c)[1] = atomicExch(((unsigned long long int*)dst) + 1, (unsigned long long int)lo) ^ lo; \
+	}
+
+#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff)))
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
index 555ccbef2a2f616a8816676e308154d39e281454..a8dd1fcb279e22edb89c20c46164ee5f89385883 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
@@ -2,7 +2,6 @@
 
 #include <stdint.h>
 
-
 __device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b)
 {
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 0d54f143640cd465e3a9d3d6afc1b78a69313d42..1fc85b2d0cad1454181e07db942dddd270897f33 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -18,19 +18,19 @@ __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
 {
 	const uint32_t r = get_reciprocal(b);
 	const uint32_t a1 = ((uint32_t*)&a)[1];
-	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a;
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r)*a1) + a;
 
 	const uint32_t q = ((uint32_t*)&k)[1];
-	int64_t tmp = a - ((uint64_t)(q) * b);
+	int64_t tmp = a - ((uint64_t)(q)*b);
 	((int32_t*)(&tmp))[1] -= q < a1 ? b : 0;
-	
+
 	const int overshoot = ((int*)(&tmp))[1] >> 31;
 	const int64_t tmp_u = (uint32_t)(b - 1) - tmp;
 	const int undershoot = ((int*)&tmp_u)[1] >> 31;
 
 	uint64_t result;
 	((uint32_t*)&result)[0] = q + overshoot - undershoot;
-	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b);
+	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot)&b) - ((uint32_t)(undershoot)&b);
 
 	return result;
 }
@@ -39,14 +39,18 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
 {
 	float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23));
 	float x1;
-	asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x));
-	asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x));
+	asm("rsqrt.approx.f32 %0, %1;"
+		: "=f"(x1)
+		: "f"(x));
+	asm("sqrt.approx.f32 %0, %1;"
+		: "=f"(x)
+		: "f"(x));
 
 	// The following line does x1 *= 4294967296.0f;
 	x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23));
 
 	const uint32_t x0 = __float_as_uint(x) - (158U << 23);
-	const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18);
+	const int64_t delta0 = n1 - (((int64_t)(x0)*x0) << 18);
 	const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1;
 
 	uint32_t result = (x0 << 10) + __float2int_rn(delta);
@@ -56,6 +60,6 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
 	const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1;
 	const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0;
 	const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0;
-	result += (overshoot+undershoot);
+	result += (overshoot + undershoot);
 	return result;
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
index d5a98b7da564fc4be9a539b180e5eb5dc2f95fb9..3bec5b1a2339f8a7cf83e38a1a9d6465e7180478 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp
@@ -4,173 +4,142 @@
 #define GROESTL_LENGTHFIELDLEN GROESTL_ROWS
 #define GROESTL_COLS512 8
 
-#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512)
+#define GROESTL_SIZE512 (GROESTL_ROWS * GROESTL_COLS512)
 
 #define GROESTL_ROUNDS512 10
 #define GROESTL_HASH_BIT_LEN 256
 
 #define GROESTL_ROTL32(v, n) ROTL32(v, n)
 
-
 #define li_32(h) 0x##h##u
-#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n)))
+#define GROESTL_EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n)))
 
-#define u32BIG(a)	\
-	((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a,24) & li_32(FF00FF00)))
+#define u32BIG(a) \
+	((GROESTL_ROTL32(a, 8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a, 24) & li_32(FF00FF00)))
 
-typedef struct {
-	uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)];            /* actual state */
+typedef struct
+{
+	uint32_t chaining[GROESTL_SIZE512 / sizeof(uint32_t)]; /* actual state */
 	uint32_t block_counter1,
-	block_counter2;         /* message block counter(s) */
-	BitSequence buffer[GROESTL_SIZE512];      /* data buffer */
-	int buf_ptr;              /* data buffer pointer */
-	int bits_in_last_byte;    /* no. of message bits in last byte of data buffer */
+		block_counter2;					 /* message block counter(s) */
+	BitSequence buffer[GROESTL_SIZE512]; /* data buffer */
+	int buf_ptr;						 /* data buffer pointer */
+	int bits_in_last_byte;				 /* no. of message bits in last byte of data buffer */
 } groestlHashState;
 
-
 __constant__ uint32_t d_groestl_T[512] =
-{
-  0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc
-, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5
-, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d
-, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded
-, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1
-, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441
-, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4
-, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba
-, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616
-, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2
-, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c
-, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de
-, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7
-, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e
-, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c
-, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7
-, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b
-, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4
-, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e
-, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a
-, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37
-, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86
-, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b
-, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028
-, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3
-, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94
-, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836
-, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0
-, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2
-, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e
-, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3
-, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e
-};
-
-#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \
-	{ temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \
-		v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \
-		v1 = temp_var; }
-
-#define GROESTL_COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \
-	tu = d_groestl_T[2*(uint32_t)x[4*c0+0]];	\
-	tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1];	\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t)		\
-	tu ^= tv1;									\
-	tl ^= tv2;									\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t)		\
-	tu ^= tv1;									\
-	tl ^= tv2;   								\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t)		\
-	tu ^= tv1;									\
-	tl ^= tv2;									\
-	tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]];	\
-	tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1];	\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t)		\
-	tl ^= tv1;									\
-	tu ^= tv2;									\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t)		\
-	tl ^= tv1;									\
-	tu ^= tv2;   								\
-	tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]];	\
-	tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1];	\
-	GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t)		\
-	tl ^= tv1;									\
-	tu ^= tv2;									\
-	y[i] = tu;									\
-	y[i+1] = tl;
-
-__device__ void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
+	{
+		0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 
0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 
0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 
0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e};
+
+#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var)                \
+	{                                                                             \
+		temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \
+		v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes)));       \
+		v1 = temp_var;                                                            \
+	}
+
+#define GROESTL_COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \
+	tu = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0]];                                   \
+	tl = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0] + 1];                               \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tu ^= tv1;                                                                       \
+	tl ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tu ^= tv1;                                                                       \
+	tl ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tu ^= tv1;                                                                       \
+	tl ^= tv2;                                                                       \
+	tl ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0]];                                  \
+	tu ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0] + 1];                              \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t)                                       \
+	tl ^= tv1;                                                                       \
+	tu ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t)                                       \
+	tl ^= tv1;                                                                       \
+	tu ^= tv2;                                                                       \
+	tv1 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3]];                                  \
+	tv2 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3] + 1];                              \
+	GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t)                                       \
+	tl ^= tv1;                                                                       \
+	tu ^= tv2;                                                                       \
+	y[i] = tu;                                                                       \
+	y[i + 1] = tl;
+
+__device__ void cn_groestl_RND512P(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r)
 {
 	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
 	uint32_t* x32 = (uint32_t*)x;
-	x32[ 0] ^= 0x00000000^r;
-	x32[ 2] ^= 0x00000010^r;
-	x32[ 4] ^= 0x00000020^r;
-	x32[ 6] ^= 0x00000030^r;
-	x32[ 8] ^= 0x00000040^r;
-	x32[10] ^= 0x00000050^r;
-	x32[12] ^= 0x00000060^r;
-	x32[14] ^= 0x00000070^r;
-	GROESTL_COLUMN(x,y, 0,  0,  2,  4,  6,  9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 2,  2,  4,  6,  8, 11, 13, 15,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 4,  4,  6,  8, 10, 13, 15,  1,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 6,  6,  8, 10, 12, 15,  1,  3,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 8,  8, 10, 12, 14,  1,  3,  5,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,10, 10, 12, 14,  0,  3,  5,  7,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,12, 12, 14,  0,  2,  5,  7,  9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,14, 14,  0,  2,  4,  7,  9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	x32[0] ^= 0x00000000 ^ r;
+	x32[2] ^= 0x00000010 ^ r;
+	x32[4] ^= 0x00000020 ^ r;
+	x32[6] ^= 0x00000030 ^ r;
+	x32[8] ^= 0x00000040 ^ r;
+	x32[10] ^= 0x00000050 ^ r;
+	x32[12] ^= 0x00000060 ^ r;
+	x32[14] ^= 0x00000070 ^ r;
+	GROESTL_COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
-__device__ void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r)
+__device__ void cn_groestl_RND512Q(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r)
 {
 	uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp;
 	uint32_t* x32 = (uint32_t*)x;
-	x32[ 0] = ~x32[ 0];
-	x32[ 1] ^= 0xffffffff^r;
-	x32[ 2] = ~x32[ 2];
-	x32[ 3] ^= 0xefffffff^r;
-	x32[ 4] = ~x32[ 4];
-	x32[ 5] ^= 0xdfffffff^r;
-	x32[ 6] = ~x32[ 6];
-	x32[ 7] ^= 0xcfffffff^r;
-	x32[ 8] = ~x32[ 8];
-	x32[ 9] ^= 0xbfffffff^r;
+	x32[0] = ~x32[0];
+	x32[1] ^= 0xffffffff ^ r;
+	x32[2] = ~x32[2];
+	x32[3] ^= 0xefffffff ^ r;
+	x32[4] = ~x32[4];
+	x32[5] ^= 0xdfffffff ^ r;
+	x32[6] = ~x32[6];
+	x32[7] ^= 0xcfffffff ^ r;
+	x32[8] = ~x32[8];
+	x32[9] ^= 0xbfffffff ^ r;
 	x32[10] = ~x32[10];
-	x32[11] ^= 0xafffffff^r;
+	x32[11] ^= 0xafffffff ^ r;
 	x32[12] = ~x32[12];
-	x32[13] ^= 0x9fffffff^r;
+	x32[13] ^= 0x9fffffff ^ r;
 	x32[14] = ~x32[14];
-	x32[15] ^= 0x8fffffff^r;
-	GROESTL_COLUMN(x,y, 0,  2,  6, 10, 14,  1,  5,  9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 2,  4,  8, 12,  0,  3,  7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 4,  6, 10, 14,  2,  5,  9, 13,  1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 6,  8, 12,  0,  4,  7, 11, 15,  3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y, 8, 10, 14,  2,  6,  9, 13,  1,  5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,10, 12,  0,  4,  8, 11, 15,  3,  7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,12, 14,  2,  6, 10, 13,  1,  5,  9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
-	GROESTL_COLUMN(x,y,14,  0,  4,  8, 12, 15,  3,  7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	x32[15] ^= 0x8fffffff ^ r;
+	GROESTL_COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
+	GROESTL_COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp);
 }
 
-__device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m)
+__device__ void cn_groestl_F512(uint32_t* __restrict__ h, const uint32_t* __restrict__ m)
 {
 	int i;
-	uint32_t Ptmp[2*GROESTL_COLS512];
-	uint32_t Qtmp[2*GROESTL_COLS512];
-	uint32_t y[2*GROESTL_COLS512];
-	uint32_t z[2*GROESTL_COLS512];
+	uint32_t Ptmp[2 * GROESTL_COLS512];
+	uint32_t Qtmp[2 * GROESTL_COLS512];
+	uint32_t y[2 * GROESTL_COLS512];
+	uint32_t z[2 * GROESTL_COLS512];
 
-	for (i = 0; i < 2*GROESTL_COLS512; i++)
+	for(i = 0; i < 2 * GROESTL_COLS512; i++)
 	{
 		z[i] = m[i];
-		Ptmp[i] = h[i]^m[i];
+		Ptmp[i] = h[i] ^ m[i];
 	}
 
 	cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000);
@@ -195,18 +164,18 @@ __device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __re
 	cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
 	cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009);
 
-	for (i = 0; i < 2*GROESTL_COLS512; i++)
-		h[i] ^= Ptmp[i]^Qtmp[i];
+	for(i = 0; i < 2 * GROESTL_COLS512; i++)
+		h[i] ^= Ptmp[i] ^ Qtmp[i];
 }
 
-__device__ void cn_groestl_outputtransformation(groestlHashState *ctx)
+__device__ void cn_groestl_outputtransformation(groestlHashState* ctx)
 {
 	int j;
-	uint32_t temp[2*GROESTL_COLS512];
-	uint32_t y[2*GROESTL_COLS512];
-	uint32_t z[2*GROESTL_COLS512];
+	uint32_t temp[2 * GROESTL_COLS512];
+	uint32_t y[2 * GROESTL_COLS512];
+	uint32_t z[2 * GROESTL_COLS512];
 
-	for (j = 0; j < 2*GROESTL_COLS512; j++)
+	for(j = 0; j < 2 * GROESTL_COLS512; j++)
 		temp[j] = ctx->chaining[j];
 
 	cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000);
@@ -220,33 +189,33 @@ __device__ void cn_groestl_outputtransformation(groestlHashState *ctx)
 	cn_groestl_RND512P((uint8_t*)z, y, 0x00000008);
 	cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009);
 
-	for (j = 0; j < 2*GROESTL_COLS512; j++)
+	for(j = 0; j < 2 * GROESTL_COLS512; j++)
 		ctx->chaining[j] ^= temp[j];
 }
 
-__device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx,
-	const uint8_t * __restrict__ input, int msglen)
+__device__ void cn_groestl_transform(groestlHashState* __restrict__ ctx,
+	const uint8_t* __restrict__ input, int msglen)
 {
-	for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512)
+	for(; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512)
 	{
-		cn_groestl_F512(ctx->chaining,(uint32_t*)input);
+		cn_groestl_F512(ctx->chaining, (uint32_t*)input);
 		ctx->block_counter1++;
 
-		if (ctx->block_counter1 == 0)
+		if(ctx->block_counter1 == 0)
 			ctx->block_counter2++;
 	}
 }
 
-__device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
-	BitSequence* __restrict__  output)
+__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx,
+	BitSequence* __restrict__ output)
 {
-	int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8;
-	uint8_t *s = (BitSequence*)ctx->chaining;
+	int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN / 8;
+	uint8_t* s = (BitSequence*)ctx->chaining;
 
-	if (ctx->bits_in_last_byte)
+	if(ctx->bits_in_last_byte)
 	{
-		ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte);
-		ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte);
+		ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << ctx->bits_in_last_byte) - 1) << (8 - ctx->bits_in_last_byte);
+		ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - ctx->bits_in_last_byte);
 		ctx->bits_in_last_byte = 0;
 	}
 	else
@@ -254,29 +223,29 @@ __device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
 		ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
 	}
 
-	if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+	if(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN)
 	{
-		while (ctx->buf_ptr < GROESTL_SIZE512)
+		while(ctx->buf_ptr < GROESTL_SIZE512)
 			ctx->buffer[(int)ctx->buf_ptr++] = 0;
 
 		cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
 		ctx->buf_ptr = 0;
 	}
 
-	while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+	while(ctx->buf_ptr < GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN)
 		ctx->buffer[(int)ctx->buf_ptr++] = 0;
 
 	ctx->block_counter1++;
-	if (ctx->block_counter1 == 0)
+	if(ctx->block_counter1 == 0)
 		ctx->block_counter2++;
 	ctx->buf_ptr = GROESTL_SIZE512;
 
-	while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t))
+	while(ctx->buf_ptr > GROESTL_SIZE512 - (int)sizeof(uint32_t))
 	{
 		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1;
 		ctx->block_counter1 >>= 8;
 	}
-	while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN)
+	while(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN)
 	{
 		ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2;
 		ctx->block_counter2 >>= 8;
@@ -284,12 +253,12 @@ __device__ void cn_groestl_final(groestlHashState*  __restrict__ ctx,
 	cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
 	cn_groestl_outputtransformation(ctx);
 
-	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++)
+	for(i = GROESTL_SIZE512 - hashbytelen; i < GROESTL_SIZE512; i++, j++)
 		output[j] = s[i];
 
-	for (i = 0; i < GROESTL_COLS512; i++)
+	for(i = 0; i < GROESTL_COLS512; i++)
 		ctx->chaining[i] = 0;
-	for (i = 0; i < GROESTL_SIZE512; i++)
+	for(i = 0; i < GROESTL_SIZE512; i++)
 		ctx->buffer[i] = 0;
 }
 
@@ -297,17 +266,17 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx,
 	const BitSequence* __restrict__ input, DataLength databitlen)
 {
 	int index = 0;
-	int msglen = (int)(databitlen/8);
-	int rem = (int)(databitlen%8);
+	int msglen = (int)(databitlen / 8);
+	int rem = (int)(databitlen % 8);
 
-	if (ctx->buf_ptr)
+	if(ctx->buf_ptr)
 	{
-		while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen)
+		while(ctx->buf_ptr < GROESTL_SIZE512 && index < msglen)
 			ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
 
-		if (ctx->buf_ptr < GROESTL_SIZE512)
+		if(ctx->buf_ptr < GROESTL_SIZE512)
 		{
-			if (rem)
+			if(rem)
 			{
 				ctx->bits_in_last_byte = rem;
 				ctx->buffer[(int)ctx->buf_ptr++] = input[index];
@@ -319,13 +288,13 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx,
 		cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512);
 	}
 
-	cn_groestl_transform(ctx, input+index, msglen-index);
-	index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512;
+	cn_groestl_transform(ctx, input + index, msglen - index);
+	index += ((msglen - index) / GROESTL_SIZE512) * GROESTL_SIZE512;
 
-	while (index < msglen)
+	while(index < msglen)
 		ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
 
-	if (rem)
+	if(rem)
 	{
 		ctx->bits_in_last_byte = rem;
 		ctx->buffer[(int)ctx->buf_ptr++] = input[index];
@@ -336,17 +305,17 @@ __device__ void cn_groestl_init(groestlHashState* ctx)
 {
 	int i = 0;
 
-	for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++)
+	for(; i < (GROESTL_SIZE512 / sizeof(uint32_t)); i++)
 		ctx->chaining[i] = 0;
 
-	ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN);
+	ctx->chaining[2 * GROESTL_COLS512 - 1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN);
 	ctx->buf_ptr = 0;
 	ctx->block_counter1 = 0;
 	ctx->block_counter2 = 0;
 	ctx->bits_in_last_byte = 0;
 }
 
-__device__ void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+__device__ void cn_groestl(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval)
 {
 	DataLength databitlen = len << 3;
 	groestlHashState context;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
index 284039ff41f7179d9a57143cffbbbb6b00a7b82f..1019a9b9ce58afa074336ec728c481e505168e87 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp
@@ -1,6 +1,7 @@
 #include <stdint.h>
 
-typedef struct {
+typedef struct
+{
 	int hashbitlen;
 	unsigned long long databitlen;
 	unsigned long long datasize_in_buffer;
@@ -9,159 +10,175 @@ typedef struct {
 } jhHashState;
 
 __constant__ unsigned char d_JH256_H0[512] =
-{
-	0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1,
-	0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3,
-	0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77,
-	0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8,
-	0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62,
-	0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c,
-	0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf,
-	0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69
-};
+	{
+		0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1,
+		0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3,
+		0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77,
+		0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8,
+		0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62,
+		0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c,
+		0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf,
+		0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69};
 
 __constant__ unsigned char d_E8_rc[42][32] =
-{
-	{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
-	{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
-	{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
-	{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
-	{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
-	{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
-	{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
-	{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
-	{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
-	{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
-	{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
-	{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
-	{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
-	{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
-	{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
-	{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
-	{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
-	{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
-	{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
-	{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
-	{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
-	{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
-	{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
-	{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
-	{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
-	{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
-	{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
-	{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
-	{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
-	{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
-	{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
-	{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
-	{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
-	{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
-	{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
-	{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
-	{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
-	{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
-	{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
-	{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
-	{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
-	{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}
-};
-
-#define JH_SWAP1(x)   (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1));
-#define JH_SWAP2(x)   (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2));
-#define JH_SWAP4(x)   (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4));
-#define JH_SWAP8(x)   (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8));
-#define JH_SWAP16(x)  (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16));
-#define JH_SWAP32(x)  (x) = (((x) << 32) | ((x) >> 32));
-
-#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \
-	(m4) ^= (m1);                \
-	(m5) ^= (m2);                \
-	(m6) ^= (m0) ^ (m3);         \
-	(m7) ^= (m0);                \
-	(m0) ^= (m5);                \
-	(m1) ^= (m6);                \
-	(m2) ^= (m4) ^ (m7);         \
+	{
+		{0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40},
+		{0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31},
+		{0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc},
+		{0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3},
+		{0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23},
+		{0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97},
+		{0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14},
+		{0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4},
+		{0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36},
+		{0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f},
+		{0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b},
+		{0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62},
+		{0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5},
+		{0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f},
+		{0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a},
+		{0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf},
+		{0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0},
+		{0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a},
+		{0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6},
+		{0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67},
+		{0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18},
+		{0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e},
+		{0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1},
+		{0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83},
+		{0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef},
+		{0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65},
+		{0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c},
+		{0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71},
+		{0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0},
+		{0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f},
+		{0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad},
+		{0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6},
+		{0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63},
+		{0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f},
+		{0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a},
+		{0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5},
+		{0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48},
+		{0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e},
+		{0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7},
+		{0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde},
+		{0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a},
+		{0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}};
+
+#define JH_SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1));
+#define JH_SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2));
+#define JH_SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4));
+#define JH_SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8));
+#define JH_SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | (((x)&0xffff0000ffff0000ULL) >> 16));
+#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32));
+
+#define JH_L(m0, m1, m2, m3, m4, m5, m6, m7) \
+	(m4) ^= (m1);                            \
+	(m5) ^= (m2);                            \
+	(m6) ^= (m0) ^ (m3);                     \
+	(m7) ^= (m0);                            \
+	(m0) ^= (m5);                            \
+	(m1) ^= (m6);                            \
+	(m2) ^= (m4) ^ (m7);                     \
 	(m3) ^= (m4);
 
-#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1)   \
-	m3  = ~(m3);                  \
-	m7  = ~(m7);                  \
-	m0 ^= ((~(m2)) & (cc0));      \
-	m4 ^= ((~(m6)) & (cc1));      \
-	temp0 = (cc0) ^ ((m0) & (m1));\
-	temp1 = (cc1) ^ ((m4) & (m5));\
-	m0 ^= ((m2) & (m3));          \
-	m4 ^= ((m6) & (m7));          \
-	m3 ^= ((~(m1)) & (m2));       \
-	m7 ^= ((~(m5)) & (m6));       \
-	m1 ^= ((m0) & (m2));          \
-	m5 ^= ((m4) & (m6));          \
-	m2 ^= ((m0) & (~(m3)));       \
-	m6 ^= ((m4) & (~(m7)));       \
-	m0 ^= ((m1) | (m3));          \
-	m4 ^= ((m5) | (m7));          \
-	m3 ^= ((m1) & (m2));          \
-	m7 ^= ((m5) & (m6));          \
-	m1 ^= (temp0 & (m0));         \
-	m5 ^= (temp1 & (m4));         \
-	m2 ^= temp0;                  \
+#define JH_SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \
+	m3 = ~(m3);                                         \
+	m7 = ~(m7);                                         \
+	m0 ^= ((~(m2)) & (cc0));                            \
+	m4 ^= ((~(m6)) & (cc1));                            \
+	temp0 = (cc0) ^ ((m0) & (m1));                      \
+	temp1 = (cc1) ^ ((m4) & (m5));                      \
+	m0 ^= ((m2) & (m3));                                \
+	m4 ^= ((m6) & (m7));                                \
+	m3 ^= ((~(m1)) & (m2));                             \
+	m7 ^= ((~(m5)) & (m6));                             \
+	m1 ^= ((m0) & (m2));                                \
+	m5 ^= ((m4) & (m6));                                \
+	m2 ^= ((m0) & (~(m3)));                             \
+	m6 ^= ((m4) & (~(m7)));                             \
+	m0 ^= ((m1) | (m3));                                \
+	m4 ^= ((m5) | (m7));                                \
+	m3 ^= ((m1) & (m2));                                \
+	m7 ^= ((m5) & (m6));                                \
+	m1 ^= (temp0 & (m0));                               \
+	m5 ^= (temp1 & (m4));                               \
+	m2 ^= temp0;                                        \
 	m6 ^= temp1;
 
-__device__ void cn_jh_E8(jhHashState *state)
+__device__ void cn_jh_E8(jhHashState* state)
 {
-	uint64_t i,roundnumber,temp0,temp1;
+	uint64_t i, roundnumber, temp0, temp1;
 
-	for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7)
+	for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7)
 	{
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP1(state->x[1][i]);
+			JH_SWAP1(state->x[3][i]);
+			JH_SWAP1(state->x[5][i]);
+			JH_SWAP1(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP2(state->x[1][i]);
+			JH_SWAP2(state->x[3][i]);
+			JH_SWAP2(state->x[5][i]);
+			JH_SWAP2(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP4(state->x[1][i]);
+			JH_SWAP4(state->x[3][i]);
+			JH_SWAP4(state->x[5][i]);
+			JH_SWAP4(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP8(state->x[1][i]);
+			JH_SWAP8(state->x[3][i]);
+			JH_SWAP8(state->x[5][i]);
+			JH_SWAP8(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP16(state->x[1][i]);
+			JH_SWAP16(state->x[3][i]);
+			JH_SWAP16(state->x[5][i]);
+			JH_SWAP16(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
-			JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
+			JH_SWAP32(state->x[1][i]);
+			JH_SWAP32(state->x[3][i]);
+			JH_SWAP32(state->x[5][i]);
+			JH_SWAP32(state->x[7][i]);
 		}
 
-		for (i = 0; i < 2; i++)
+		for(i = 0; i < 2; i++)
 		{
-			JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] );
-			JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
+			JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i + 2]);
+			JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]);
 		}
 
-		for (i = 1; i < 8; i = i+2)
+		for(i = 1; i < 8; i = i + 2)
 		{
 			temp0 = state->x[i][0];
 			state->x[i][0] = state->x[i][1];
@@ -170,75 +187,75 @@ __device__ void cn_jh_E8(jhHashState *state)
 	}
 }
 
-__device__ void cn_jh_F8(jhHashState *state)
+__device__ void cn_jh_F8(jhHashState* state)
 {
 	uint64_t i;
 
-	for (i = 0; i < 8; i++)
-		state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i];
+	for(i = 0; i < 8; i++)
+		state->x[i >> 1][i & 1] ^= ((uint64_t*)state->buffer)[i];
 
 	cn_jh_E8(state);
 
-	for (i = 0; i < 8; i++)
-		state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i];
+	for(i = 0; i < 8; i++)
+		state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64_t*)state->buffer)[i];
 }
 
-__device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+__device__ void cn_jh_update(jhHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen)
 {
 	DataLength index;
 
 	state->databitlen += databitlen;
 	index = 0;
 
-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512)  )
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512))
 	{
-		if ( (databitlen & 7) == 0 )
-			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3));
+		if((databitlen & 7) == 0)
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3));
 		else
-			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1);
+			memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3) + 1);
 		state->datasize_in_buffer += databitlen;
 		databitlen = 0;
 	}
 
-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512)  )
+	if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512))
 	{
-		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
-		index = 64-(state->datasize_in_buffer >> 3);
+		memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3));
+		index = 64 - (state->datasize_in_buffer >> 3);
 		databitlen = databitlen - (512 - state->datasize_in_buffer);
 		cn_jh_F8(state);
 		state->datasize_in_buffer = 0;
 	}
 
-	for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512)
+	for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512)
 	{
-		memcpy(state->buffer, data+index, 64);
+		memcpy(state->buffer, data + index, 64);
 		cn_jh_F8(state);
 	}
 
-	if ( databitlen > 0)
+	if(databitlen > 0)
 	{
-		if ((databitlen & 7) == 0)
-			memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
+		if((databitlen & 7) == 0)
+			memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3);
 		else
-			memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
+			memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1);
 		state->datasize_in_buffer = databitlen;
 	}
 }
 
 /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
-__device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval)
+__device__ void cn_jh_final(jhHashState* __restrict__ state, BitSequence* __restrict__ hashval)
 {
 	unsigned int i;
 	//uint32_t *bufptr = (uint32_t *)state->buffer;
 
-	if ( (state->databitlen & 0x1ff) == 0 )
+	if((state->databitlen & 0x1ff) == 0)
 	{
 		/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
 		memset(state->buffer, 0, 64);
 		//for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000;
-		state->buffer[0]  = 0x80;
+		state->buffer[0] = 0x80;
 		state->buffer[63] = state->databitlen & 0xff;
-		state->buffer[62] = (state->databitlen >> 8)  & 0xff;
+		state->buffer[62] = (state->databitlen >> 8) & 0xff;
 		state->buffer[61] = (state->databitlen >> 16) & 0xff;
 		state->buffer[60] = (state->databitlen >> 24) & 0xff;
 		state->buffer[59] = (state->databitlen >> 32) & 0xff;
@@ -250,19 +267,19 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re
 	else
 	{
 		/*set the rest of the bytes in the buffer to 0*/
-		if ( (state->datasize_in_buffer & 7) == 0)
+		if((state->datasize_in_buffer & 7) == 0)
 		{
-			for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)
+			for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++)
 				state->buffer[i] = 0;
 		}
 		else
 		{
-			for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++)
+			for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++)
 				state->buffer[i] = 0;
 		}
 
 		/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
-		state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
+		state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7));
 
 		cn_jh_F8(state);
 		memset(state->buffer, 0, 64);
@@ -278,10 +295,10 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re
 		cn_jh_F8(state);
 	}
 
-	memcpy(hashval,(unsigned char*)state->x+64+32,32);
+	memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32);
 }
 
-__device__ void cn_jh_init(jhHashState *state, int hashbitlen)
+__device__ void cn_jh_init(jhHashState* state, int hashbitlen)
 {
 	state->databitlen = 0;
 	state->datasize_in_buffer = 0;
@@ -289,7 +306,7 @@ __device__ void cn_jh_init(jhHashState *state, int hashbitlen)
 	memcpy(state->x, d_JH256_H0, 128);
 }
 
-__device__ void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+__device__ void cn_jh(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
index 3f535631db0ceaceb30f6342d6a3ffa8dbf87425..0fe277bd5da31d3167fd0271cceb95a6f4111e9d 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
@@ -7,42 +7,49 @@ __constant__
 #else
 const
 #endif
-uint64_t keccakf_rndc[24] ={
-	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
-	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
-	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
-	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
-	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
-	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
-};
+	uint64_t keccakf_rndc[24] = {
+		0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+		0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+		0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+		0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+		0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+		0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+		0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+		0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
 
 #if __CUDA_ARCH__ >= 350
-	__forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset)
+__forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset)
+{
+	uint2 result;
+	if(offset >= 32)
+	{
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.x)
+			: "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.y)
+			: "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	}
+	else
 	{
-		uint2 result;
-		if(offset >= 32)
-		{
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		}
-		else
-		{
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-			asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		}
-		return  __double_as_longlong(__hiloint2double(result.y, result.x));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.x)
+			: "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+			: "=r"(result.y)
+			: "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
 	}
-	#define rotl64_1(x, y) (cuda_rotl64((x), (y)))
+	return __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#define rotl64_1(x, y) (cuda_rotl64((x), (y)))
 #else
-	#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y))))
+#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y))))
 #endif
 
 #define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y))
 #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
 
-__device__ __forceinline__ void cn_keccakf2(uint64_t *s)
+__device__ __forceinline__ void cn_keccakf2(uint64_t* s)
 {
 	uint8_t i;
 
@@ -90,16 +97,46 @@ __device__ __forceinline__ void cn_keccakf2(uint64_t *s)
 		s[7] = rotl64_1(s[10] ^ bc[4], 3);
 		s[10] = rotl64_1(tmp1, 1);
 
-		tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
-		tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
-		tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
-		tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
-		tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+		tmp1 = s[0];
+		tmp2 = s[1];
+		s[0] = bitselect(s[0] ^ s[2], s[0], s[1]);
+		s[1] = bitselect(s[1] ^ s[3], s[1], s[2]);
+		s[2] = bitselect(s[2] ^ s[4], s[2], s[3]);
+		s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]);
+		s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+		tmp1 = s[5];
+		tmp2 = s[6];
+		s[5] = bitselect(s[5] ^ s[7], s[5], s[6]);
+		s[6] = bitselect(s[6] ^ s[8], s[6], s[7]);
+		s[7] = bitselect(s[7] ^ s[9], s[7], s[8]);
+		s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]);
+		s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+		tmp1 = s[10];
+		tmp2 = s[11];
+		s[10] = bitselect(s[10] ^ s[12], s[10], s[11]);
+		s[11] = bitselect(s[11] ^ s[13], s[11], s[12]);
+		s[12] = bitselect(s[12] ^ s[14], s[12], s[13]);
+		s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]);
+		s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+		tmp1 = s[15];
+		tmp2 = s[16];
+		s[15] = bitselect(s[15] ^ s[17], s[15], s[16]);
+		s[16] = bitselect(s[16] ^ s[18], s[16], s[17]);
+		s[17] = bitselect(s[17] ^ s[19], s[17], s[18]);
+		s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]);
+		s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+		tmp1 = s[20];
+		tmp2 = s[21];
+		s[20] = bitselect(s[20] ^ s[22], s[20], s[21]);
+		s[21] = bitselect(s[21] ^ s[23], s[21], s[22]);
+		s[22] = bitselect(s[22] ^ s[24], s[22], s[23]);
+		s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]);
+		s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
 		s[0] ^= keccakf_rndc[i];
 	}
 }
 
-__device__ __forceinline__ void cn_keccakf(uint64_t *s)
+__device__ __forceinline__ void cn_keccakf(uint64_t* s)
 {
 	uint64_t bc[5], tmpxor[5], tmp1, tmp2;
 
@@ -145,16 +182,46 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s)
 		s[7] = rotl64_1(s[10] ^ bc[4], 3);
 		s[10] = rotl64_1(tmp1, 1);
 
-		tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
-		tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
-		tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
-		tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
-		tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
+		tmp1 = s[0];
+		tmp2 = s[1];
+		s[0] = bitselect(s[0] ^ s[2], s[0], s[1]);
+		s[1] = bitselect(s[1] ^ s[3], s[1], s[2]);
+		s[2] = bitselect(s[2] ^ s[4], s[2], s[3]);
+		s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]);
+		s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
+		tmp1 = s[5];
+		tmp2 = s[6];
+		s[5] = bitselect(s[5] ^ s[7], s[5], s[6]);
+		s[6] = bitselect(s[6] ^ s[8], s[6], s[7]);
+		s[7] = bitselect(s[7] ^ s[9], s[7], s[8]);
+		s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]);
+		s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
+		tmp1 = s[10];
+		tmp2 = s[11];
+		s[10] = bitselect(s[10] ^ s[12], s[10], s[11]);
+		s[11] = bitselect(s[11] ^ s[13], s[11], s[12]);
+		s[12] = bitselect(s[12] ^ s[14], s[12], s[13]);
+		s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]);
+		s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
+		tmp1 = s[15];
+		tmp2 = s[16];
+		s[15] = bitselect(s[15] ^ s[17], s[15], s[16]);
+		s[16] = bitselect(s[16] ^ s[18], s[16], s[17]);
+		s[17] = bitselect(s[17] ^ s[19], s[17], s[18]);
+		s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]);
+		s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
+		tmp1 = s[20];
+		tmp2 = s[21];
+		s[20] = bitselect(s[20] ^ s[22], s[20], s[21]);
+		s[21] = bitselect(s[21] ^ s[23], s[21], s[22]);
+		s[22] = bitselect(s[22] ^ s[24], s[22], s[23]);
+		s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]);
+		s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
 		s[0] ^= keccakf_rndc[i];
 	}
 }
 
-__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md)
+__device__ __forceinline__ void cn_keccak(const uint8_t* __restrict__ in, uint32_t len, uint8_t* __restrict__ md)
 {
 	uint64_t st[25];
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
index fc45db1ae8906ea47777039bc26fd4763fed1eeb..b8073f03b507213bf2121a4be87d78cf32bb4be6 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp
@@ -1,124 +1,146 @@
 #pragma once
 
-typedef unsigned int    uint_t;             /* native unsigned integer */
+typedef unsigned int uint_t; /* native unsigned integer */
 
-#define SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
+#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */
 
-#define SKEIN_256_STATE_WORDS ( 4)
-#define SKEIN_512_STATE_WORDS ( 8)
+#define SKEIN_256_STATE_WORDS (4)
+#define SKEIN_512_STATE_WORDS (8)
 #define SKEIN1024_STATE_WORDS (16)
 
-#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
-#define SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
-#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
+#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS)
+#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS)
+#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS)
 
-#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((uint64_t) (hi32)) << 32))
-#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32))
+#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
 
-#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word  */
 
-#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
-#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
-#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
-#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126)	/* bits 126     : first block flag         */
+#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119)  /* bit  119     : partial final input byte */
+#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127)	/* bit  127     : final block flag         */
+#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field               */
 
-#define SKEIN_T1_FLAG_FIRST     (((uint64_t)  1 ) << SKEIN_T1_POS_FIRST)
-#define SKEIN_T1_FLAG_BIT_PAD   (((uint64_t)  1 ) << SKEIN_T1_POS_BIT_PAD)
-#define SKEIN_T1_FLAG_FINAL     (((uint64_t)  1 ) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD)
+#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL)
 
-#define SKEIN_BLK_TYPE_MSG      (48)                    /* message processing */
-#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+#define SKEIN_BLK_TYPE_MSG (48) /* message processing */
+#define SKEIN_BLK_TYPE_OUT (63) /* output stage */
 
-#define SKEIN_T1_BLK_TYPE(T)   (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE(T) (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
 
-#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG)  /* message processing */
-#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)  /* output stage */
+#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */
 
-#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
 
-#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
-
-#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
-#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
-
-#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \
-  Skein_Set_T0(ctxPtr,(T0)); \
-  Skein_Set_T1(ctxPtr,(T1)); }
-
-#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)   \
-{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
-
-#define Skein_Set_Bit_Pad_Flag(hdr)      { (hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;     }
+#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \
+	{                                          \
+		(ctxPtr)->h.T[TWK_NUM] = (tVal);       \
+	}
 
-#define KW_TWK_BASE     (0)
-#define KW_KEY_BASE     (3)
-#define ks              (kw + KW_KEY_BASE)
-#define ts              (kw + KW_TWK_BASE)
+#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0)
+#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1)
 
-#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \
-	X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \
-	X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \
-	X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \
-	X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6;
+#define Skein_Set_T0_T1(ctxPtr, T0, T1) \
+	{                                   \
+		Skein_Set_T0(ctxPtr, (T0));     \
+		Skein_Set_T1(ctxPtr, (T1));     \
+	}
 
-#define I512(R) \
-	X0   += ks[((R)+1) % 9]; \
-	X1   += ks[((R)+2) % 9]; \
-	X2   += ks[((R)+3) % 9]; \
-	X3   += ks[((R)+4) % 9]; \
-	X4   += ks[((R)+5) % 9]; \
-	X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \
-	X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \
-	X7   += ks[((R)+8) % 9] + (R)+1;
+#define Skein_Start_New_Type(ctxPtr, BLK_TYPE)                                          \
+	{                                                                                   \
+		Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \
+		(ctxPtr)->h.bCnt = 0;                                                           \
+	}
 
+#define Skein_Set_Bit_Pad_Flag(hdr)          \
+	{                                        \
+		(hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \
+	}
 
-#define R512_8_rounds(R) \
-	R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \
-	R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \
-	R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \
-	R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \
-	I512(2*(R)); \
-	R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \
-	R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \
-	R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \
-	R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \
-	I512(2*(R)+1);
+#define KW_TWK_BASE (0)
+#define KW_KEY_BASE (3)
+#define ks (kw + KW_KEY_BASE)
+#define ts (kw + KW_TWK_BASE)
+
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, R512ROT, rNum) \
+	X##p0 += X##p1;                                         \
+	X##p1 = ROTL64(X##p1, R512ROT##_0);                     \
+	X##p1 ^= X##p0;                                         \
+	X##p2 += X##p3;                                         \
+	X##p3 = ROTL64(X##p3, R512ROT##_1);                     \
+	X##p3 ^= X##p2;                                         \
+	X##p4 += X##p5;                                         \
+	X##p5 = ROTL64(X##p5, R512ROT##_2);                     \
+	X##p5 ^= X##p4;                                         \
+	X##p6 += X##p7;                                         \
+	X##p7 = ROTL64(X##p7, R512ROT##_3);                     \
+	X##p7 ^= X##p6;
+
+#define I512(R)                                  \
+	X0 += ks[((R) + 1) % 9];                     \
+	X1 += ks[((R) + 2) % 9];                     \
+	X2 += ks[((R) + 3) % 9];                     \
+	X3 += ks[((R) + 4) % 9];                     \
+	X4 += ks[((R) + 5) % 9];                     \
+	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
+	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
+	X7 += ks[((R) + 8) % 9] + (R) + 1;
+
+#define R512_8_rounds(R)                                \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
+	I512(2 * (R));                                      \
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
+	I512(2 * (R) + 1);
 
 typedef struct
 {
-	size_t  hashBitLen;
-	size_t  bCnt;
-	uint64_t  T[SKEIN_MODIFIER_WORDS];
+	size_t hashBitLen;
+	size_t bCnt;
+	uint64_t T[SKEIN_MODIFIER_WORDS];
 } Skein_Ctxt_Hdr_t;
 
-typedef struct {
+typedef struct
+{
 	Skein_Ctxt_Hdr_t h;
-	uint64_t  X[SKEIN_256_STATE_WORDS];
-	uint8_t  b[SKEIN_256_BLOCK_BYTES];
+	uint64_t X[SKEIN_256_STATE_WORDS];
+	uint8_t b[SKEIN_256_BLOCK_BYTES];
 } Skein_256_Ctxt_t;
 
-typedef struct {
+typedef struct
+{
 	Skein_Ctxt_Hdr_t h;
-	uint64_t  X[SKEIN_512_STATE_WORDS];
-	uint8_t  b[SKEIN_512_BLOCK_BYTES];
+	uint64_t X[SKEIN_512_STATE_WORDS];
+	uint8_t b[SKEIN_512_BLOCK_BYTES];
 } Skein_512_Ctxt_t;
 
-typedef struct {
+typedef struct
+{
 	Skein_Ctxt_Hdr_t h;
-	uint64_t  X[SKEIN1024_STATE_WORDS];
-	uint8_t  b[SKEIN1024_BLOCK_BYTES];
+	uint64_t X[SKEIN1024_STATE_WORDS];
+	uint8_t b[SKEIN1024_BLOCK_BYTES];
 } Skein1024_Ctxt_t;
 
-typedef struct {
-	uint_t  statebits;
+typedef struct
+{
+	uint_t statebits;
 	union {
 		Skein_Ctxt_Hdr_t h;
 		Skein_256_Ctxt_t ctx_256;
@@ -127,21 +149,20 @@ typedef struct {
 	} u;
 } skeinHashState;
 
-__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen)
+__device__ void cn_skein_init(skeinHashState* state, size_t hashBitLen)
 {
 	const uint64_t SKEIN_512_IV_256[] =
-	{
-		SKEIN_MK_64(0xCCD044A1,0x2FDB3E13),
-		SKEIN_MK_64(0xE8359030,0x1A79A9EB),
-		SKEIN_MK_64(0x55AEA061,0x4F816E6F),
-		SKEIN_MK_64(0x2A2767A4,0xAE9B94DB),
-		SKEIN_MK_64(0xEC06025E,0x74DD7683),
-		SKEIN_MK_64(0xE7A436CD,0xC4746251),
-		SKEIN_MK_64(0xC36FBAF9,0x393AD185),
-		SKEIN_MK_64(0x3EEDBA18,0x33EDFC13)
-	};
+		{
+			SKEIN_MK_64(0xCCD044A1, 0x2FDB3E13),
+			SKEIN_MK_64(0xE8359030, 0x1A79A9EB),
+			SKEIN_MK_64(0x55AEA061, 0x4F816E6F),
+			SKEIN_MK_64(0x2A2767A4, 0xAE9B94DB),
+			SKEIN_MK_64(0xEC06025E, 0x74DD7683),
+			SKEIN_MK_64(0xE7A436CD, 0xC4746251),
+			SKEIN_MK_64(0xC36FBAF9, 0x393AD185),
+			SKEIN_MK_64(0x3EEDBA18, 0x33EDFC13)};
 
-	Skein_512_Ctxt_t *ctx = &state->u.ctx_512;
+	Skein_512_Ctxt_t* ctx = &state->u.ctx_512;
 
 	ctx->h.hashBitLen = hashBitLen;
 
@@ -150,22 +171,47 @@ __device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen)
 	Skein_Start_New_Type(ctx, MSG);
 }
 
-__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
+__device__ void cn_skein512_processblock(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
 {
-	enum {
-		R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
-		R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
-		R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
-		R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
-		R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
-		R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
-		R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
-		R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22
+	enum
+	{
+		R_512_0_0 = 46,
+		R_512_0_1 = 36,
+		R_512_0_2 = 19,
+		R_512_0_3 = 37,
+		R_512_1_0 = 33,
+		R_512_1_1 = 27,
+		R_512_1_2 = 14,
+		R_512_1_3 = 42,
+		R_512_2_0 = 17,
+		R_512_2_1 = 49,
+		R_512_2_2 = 36,
+		R_512_2_3 = 39,
+		R_512_3_0 = 44,
+		R_512_3_1 = 9,
+		R_512_3_2 = 54,
+		R_512_3_3 = 56,
+		R_512_4_0 = 39,
+		R_512_4_1 = 30,
+		R_512_4_2 = 34,
+		R_512_4_3 = 24,
+		R_512_5_0 = 13,
+		R_512_5_1 = 50,
+		R_512_5_2 = 10,
+		R_512_5_3 = 17,
+		R_512_6_0 = 25,
+		R_512_6_1 = 29,
+		R_512_6_2 = 39,
+		R_512_6_3 = 43,
+		R_512_7_0 = 8,
+		R_512_7_1 = 35,
+		R_512_7_2 = 56,
+		R_512_7_3 = 22
 	};
 
-	uint64_t X0,X1,X2,X3,X4,X5,X6,X7;
+	uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
 	uint64_t w[SKEIN_512_STATE_WORDS];
-	uint64_t kw[SKEIN_512_STATE_WORDS+4];
+	uint64_t kw[SKEIN_512_STATE_WORDS + 4];
 
 	ts[0] = ctx->h.T[0];
 	ts[1] = ctx->h.T[1];
@@ -184,7 +230,7 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co
 		ks[6] = ctx->X[6];
 		ks[7] = ctx->X[7];
 		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
-		ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+				ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 
 		ts[2] = ts[0] ^ ts[1];
 
@@ -201,15 +247,15 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co
 
 		blkPtr += SKEIN_512_BLOCK_BYTES;
 
-		R512_8_rounds( 0);
-		R512_8_rounds( 1);
-		R512_8_rounds( 2);
-		R512_8_rounds( 3);
-		R512_8_rounds( 4);
-		R512_8_rounds( 5);
-		R512_8_rounds( 6);
-		R512_8_rounds( 7);
-		R512_8_rounds( 8);
+		R512_8_rounds(0);
+		R512_8_rounds(1);
+		R512_8_rounds(2);
+		R512_8_rounds(3);
+		R512_8_rounds(4);
+		R512_8_rounds(5);
+		R512_8_rounds(6);
+		R512_8_rounds(7);
+		R512_8_rounds(8);
 
 		ctx->X[0] = X0 ^ w[0];
 		ctx->X[1] = X1 ^ w[1];
@@ -221,125 +267,124 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co
 		ctx->X[7] = X7 ^ w[7];
 
 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
-	}
-	while (--blkCnt);
+	} while(--blkCnt);
 
 	ctx->h.T[0] = ts[0];
 	ctx->h.T[1] = ts[1];
 }
 
-__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
+__device__ void cn_skein_final(skeinHashState* __restrict__ state, uint8_t* __restrict__ hashVal)
 {
-	size_t i,n,byteCnt;
+	size_t i, n, byteCnt;
 	uint64_t X[SKEIN_512_STATE_WORDS];
-	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
+	Skein_512_Ctxt_t* ctx = (Skein_512_Ctxt_t*)&state->u.ctx_512;
 	//size_t tmp;
 	//uint8_t *p8;
 	//uint64_t *p64;
 
 	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
 
-	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+	if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
 	{
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 		//p8 = &ctx->b[ctx->h.bCnt];
 		//tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
 		//for( i = 0; i < tmp; i++ ) *(p8+i) = 0;
 	}
 
-	cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt);
+	cn_skein512_processblock(ctx, ctx->b, 1, ctx->h.bCnt);
 
 	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
 
 	//uint8_t  b[SKEIN_512_BLOCK_BYTES] == 64
-	memset(ctx->b,0,sizeof(ctx->b));
+	memset(ctx->b, 0, sizeof(ctx->b));
 	//p64 = (uint64_t *)ctx->b;
 	//for( i = 0; i < 8; i++ ) *(p64+i) = 0;
 
-	memcpy(X,ctx->X,sizeof(X));
+	memcpy(X, ctx->X, sizeof(X));
 
-	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
+	for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++)
 	{
-		((uint64_t *)ctx->b)[0]= (uint64_t)i;
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t));
-		n = byteCnt - i*SKEIN_512_BLOCK_BYTES;
-		if (n >= SKEIN_512_BLOCK_BYTES)
-		n  = SKEIN_512_BLOCK_BYTES;
-		memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);
-		memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+		((uint64_t*)ctx->b)[0] = (uint64_t)i;
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		cn_skein512_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
+		n = byteCnt - i * SKEIN_512_BLOCK_BYTES;
+		if(n >= SKEIN_512_BLOCK_BYTES)
+			n = SKEIN_512_BLOCK_BYTES;
+		memcpy(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n);
+		memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */
 	}
 }
 
-__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
+__device__ void cn_skein512_update(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ msg, size_t msgByteCnt)
 {
 	size_t n;
 
-	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+	if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
 	{
 
-		if (ctx->h.bCnt)
+		if(ctx->h.bCnt)
 		{
 
 			n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
 
-			if (n)
+			if(n)
 			{
-				memcpy(&ctx->b[ctx->h.bCnt],msg,n);
-				msgByteCnt  -= n;
-				msg         += n;
+				memcpy(&ctx->b[ctx->h.bCnt], msg, n);
+				msgByteCnt -= n;
+				msg += n;
 				ctx->h.bCnt += n;
 			}
 
-			cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+			cn_skein512_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES);
 			ctx->h.bCnt = 0;
 		}
 
-		if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+		if(msgByteCnt > SKEIN_512_BLOCK_BYTES)
 		{
-			n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
-			cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+			n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
+			cn_skein512_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES);
 			msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
-			msg        += n * SKEIN_512_BLOCK_BYTES;
+			msg += n * SKEIN_512_BLOCK_BYTES;
 		}
 	}
 
-	if (msgByteCnt)
+	if(msgByteCnt)
 	{
-		memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+		memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt);
 		ctx->h.bCnt += msgByteCnt;
 	}
 }
 
-__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+__device__ void cn_skein_update(skeinHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen)
 {
-	if ((databitlen & 7) == 0)
+	if((databitlen & 7) == 0)
 	{
-		cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3);
+		cn_skein512_update(&state->u.ctx_512, data, databitlen >> 3);
 	}
 	else
 	{
 
 		size_t bCnt = (databitlen >> 3) + 1;
-		uint8_t b,mask;
+		uint8_t b, mask;
 
-		mask = (uint8_t) (1u << (7 - (databitlen & 7)));
-		b    = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask);
+		mask = (uint8_t)(1u << (7 - (databitlen & 7)));
+		b = (uint8_t)((data[bCnt - 1] & (0 - mask)) | mask);
 
-		cn_skein512_update(&state->u.ctx_512,data,bCnt-1);
-		cn_skein512_update(&state->u.ctx_512,&b  ,  1   );
+		cn_skein512_update(&state->u.ctx_512, data, bCnt - 1);
+		cn_skein512_update(&state->u.ctx_512, &b, 1);
 
 		Skein_Set_Bit_Pad_Flag(state->u.h);
 	}
 }
 
-__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+__device__ void cn_skein(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
 	skeinHashState state;
 
-	state.statebits = 64*SKEIN_512_STATE_WORDS;
+	state.statebits = 64 * SKEIN_512_STATE_WORDS;
 
 	cn_skein_init(&state, hashbitlen);
 	cn_skein_update(&state, data, databitlen);
diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp
index 5c7dfe16a38376c2700bea15991ca9ed522b8f10..902a662305e9c353535fa5175d6e0f4819c9a883 100644
--- a/xmrstak/backend/plugin.hpp
+++ b/xmrstak/backend/plugin.hpp
@@ -3,22 +3,22 @@
 #include "xmrstak/misc/environment.hpp"
 #include "xmrstak/params.hpp"
 
-#include <thread>
-#include <atomic>
-#include <vector>
-#include <string>
 #include "iBackend.hpp"
+#include <atomic>
 #include <iostream>
+#include <string>
+#include <thread>
+#include <vector>
 
 #ifndef USE_PRECOMPILED_HEADERS
-#	ifdef WIN32
-#		include <direct.h>
-#		include <windows.h>
-#	else
-#		include <sys/types.h>
-#		include <dlfcn.h>
-#	endif
-#	include <iostream>
+#ifdef WIN32
+#include <direct.h>
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#include <sys/types.h>
+#endif
+#include <iostream>
 #endif
 
 namespace xmrstak
@@ -36,16 +36,16 @@ struct plugin
 		libBackend = LoadLibrary(TEXT((libName + ".dll").c_str()));
 		if(!libBackend)
 		{
-			std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << (libName + ".dll") << std::endl;
+			std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << (libName + ".dll") << std::endl;
 			return;
 		}
 #else
 		// `.so` linux file extention for dynamic libraries
 		std::string fileExtension = ".so";
-#	if defined(__APPLE__)
+#if defined(__APPLE__)
 		// `.dylib` Mac OS X file extention for dynamic libraries
 		fileExtension = ".dylib";
-#	endif
+#endif
 		// search library in working directory
 		libBackend = dlopen(("./lib" + libName + fileExtension).c_str(), RTLD_LAZY);
 		// fallback to binary directory
@@ -56,21 +56,21 @@ struct plugin
 			libBackend = dlopen(("lib" + libName + fileExtension).c_str(), RTLD_LAZY);
 		if(!libBackend)
 		{
-			std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << dlerror() << std::endl;
+			std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << dlerror() << std::endl;
 			return;
 		}
 #endif
 
 #ifdef WIN32
-		fn_startBackend = (startBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend");
-		if (!fn_startBackend)
+		fn_startBackend = (startBackend_t)GetProcAddress(libBackend, "xmrstak_start_backend");
+		if(!fn_startBackend)
 		{
-			std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " <<GetLastError()<< std::endl;
+			std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " << GetLastError() << std::endl;
 		}
 #else
 		// reset last error
 		dlerror();
-		fn_startBackend = (startBackend_t) dlsym(libBackend, "xmrstak_start_backend");
+		fn_startBackend = (startBackend_t)dlsym(libBackend, "xmrstak_start_backend");
 		const char* dlsym_error = dlerror();
 		if(dlsym_error)
 		{
@@ -112,7 +112,7 @@ struct plugin
 #ifdef WIN32
 	HINSTANCE libBackend;
 #else
-	void *libBackend = nullptr;
+	void* libBackend = nullptr;
 #endif
 };
 
diff --git a/xmrstak/backend/pool_data.hpp b/xmrstak/backend/pool_data.hpp
index 4e92359ec193a2a095daea6ae64e9e76220e2fe3..632fc40ec1a8613a4dee40acaeeb76a7b53f0d6c 100644
--- a/xmrstak/backend/pool_data.hpp
+++ b/xmrstak/backend/pool_data.hpp
@@ -11,9 +11,11 @@ namespace xmrstak
 struct pool_data
 {
 	uint32_t iSavedNonce;
-	size_t   pool_id;
+	size_t pool_id;
 
-	pool_data() : iSavedNonce(0), pool_id(invalid_pool_id)
+	pool_data() :
+		iSavedNonce(0),
+		pool_id(invalid_pool_id)
 	{
 	}
 };
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index d6822cd63c544e07975d22ecd28474a6e2be904b..cc4b1b3d96db5b10ca2330c56abfc4101b1d44ed 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -1,4 +1,4 @@
- /*
+/*
   * This program is free software: you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation, either version 3 of the License, or
@@ -21,38 +21,37 @@
   *
   */
 
-#include "xmrstak/misc/executor.hpp"
-#include "xmrstak/backend/miner_work.hpp"
-#include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/backend/backendConnector.hpp"
+#include "xmrstak/backend/globalStates.hpp"
+#include "xmrstak/backend/miner_work.hpp"
+#include "xmrstak/donate-level.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/misc/console.hpp"
-#include "xmrstak/donate-level.hpp"
+#include "xmrstak/misc/executor.hpp"
+#include "xmrstak/misc/utility.hpp"
 #include "xmrstak/params.hpp"
-#include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/version.hpp"
-#include "xmrstak/misc/utility.hpp"
 
 #ifndef CONF_NO_HTTPD
-#	include "xmrstak/http/httpd.hpp"
+#include "xmrstak/http/httpd.hpp"
 #endif
 
-#include <stdlib.h>
+#include <iostream>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string>
-#include <iostream>
 #include <time.h>
-#include <iostream>
 
 #ifndef CONF_NO_TLS
-#include <openssl/ssl.h>
 #include <openssl/err.h>
+#include <openssl/ssl.h>
 #endif
 
 #ifdef _WIN32
-#	define strcasecmp _stricmp
-#	include <windows.h>
-#	include "xmrstak/misc/uac.hpp"
+#define strcasecmp _stricmp
+#include "xmrstak/misc/uac.hpp"
+#include <windows.h>
 #endif // _WIN32
 
 int do_benchmark(int block_version, int wait_sec, int work_sec);
@@ -62,71 +61,75 @@ void help()
 	using namespace std;
 	using namespace xmrstak;
 
-	cout<<"Usage: "<<params::inst().binaryName<<" [OPTION]..."<<endl;
-	cout<<" "<<endl;
-	cout<<"  -h, --help                 show this help"<<endl;
-	cout<<"  -v, --version              show version number"<<endl;
-	cout<<"  -V, --version-long         show long version number"<<endl;
-	cout<<"  -c, --config FILE          common miner configuration file"<<endl;
-	cout<<"  -C, --poolconf FILE        pool configuration file"<<endl;
+	cout << "Usage: " << params::inst().binaryName << " [OPTION]..." << endl;
+	cout << " " << endl;
+	cout << "  -h, --help                 show this help" << endl;
+	cout << "  -v, --version              show version number" << endl;
+	cout << "  -V, --version-long         show long version number" << endl;
+	cout << "  -c, --config FILE          common miner configuration file" << endl;
+	cout << "  -C, --poolconf FILE        pool configuration file" << endl;
 #ifdef _WIN32
-	cout<<"  --noUAC                    disable the UAC dialog"<<endl;
+	cout << "  --noUAC                    disable the UAC dialog" << endl;
 #endif
-	cout<<"  --benchmark BLOCKVERSION   ONLY do a benchmark and exit"<<endl;
-	cout<<"  --benchwait WAIT_SEC             ... benchmark wait time"<<endl;
-	cout<<"  --benchwork WORK_SEC             ... benchmark work time"<<endl;
+	cout << "  --benchmark BLOCKVERSION   ONLY do a benchmark and exit" << endl;
+	cout << "  --benchwait WAIT_SEC             ... benchmark wait time" << endl;
+	cout << "  --benchwork WORK_SEC             ... benchmark work time" << endl;
 #ifndef CONF_NO_CPU
-	cout<<"  --noCPU                    disable the CPU miner backend"<<endl;
-	cout<<"  --cpu FILE                 CPU backend miner config file"<<endl;
+	cout << "  --noCPU                    disable the CPU miner backend" << endl;
+	cout << "  --cpu FILE                 CPU backend miner config file" << endl;
 #endif
 #ifndef CONF_NO_OPENCL
-	cout<<"  --noAMD                    disable the AMD miner backend"<<endl;
-	cout<<"  --noAMDCache               disable the AMD(OpenCL) cache for precompiled binaries"<<endl;
-	cout<<"  --openCLVendor VENDOR      use OpenCL driver of VENDOR and devices [AMD,NVIDIA]"<<endl;
-	cout<<"                             default: AMD"<<endl;
-	cout<<"  --amd FILE                 AMD backend miner config file"<<endl;
+	cout << "  --noAMD                    disable the AMD miner backend" << endl;
+	cout << "  --noAMDCache               disable the AMD(OpenCL) cache for precompiled binaries" << endl;
+	cout << "  --openCLVendor VENDOR      use OpenCL driver of VENDOR and devices [AMD,NVIDIA]" << endl;
+	cout << "                             default: AMD" << endl;
+	cout << "  --amdCacheDir DIRECTORY    directory to store AMD binary files" << endl;
+	cout << "  --amd FILE                 AMD backend miner config file" << endl;
 #endif
 #ifndef CONF_NO_CUDA
-	cout<<"  --noNVIDIA                 disable the NVIDIA miner backend"<<endl;
-	cout<<"  --nvidia FILE              NVIDIA backend miner config file"<<endl;
+	cout << "  --noNVIDIA                 disable the NVIDIA miner backend" << endl;
+	cout << "  --nvidia FILE              NVIDIA backend miner config file" << endl;
 #endif
 #ifndef CONF_NO_HTTPD
-	cout<<"  -i --httpd HTTP_PORT       HTTP interface port"<<endl;
+	cout << "  -i --httpd HTTP_PORT       HTTP interface port" << endl;
 #endif
-	cout<<" "<<endl;
-	cout<<"The following options can be used for automatic start without a guided config,"<<endl;
-	cout<<"If config exists then this pool will be top priority."<<endl;
-	cout<<"  -o, --url URL              pool url and port, e.g. pool.usxmrpool.com:3333"<<endl;
-	cout<<"  -O, --tls-url URL          TLS pool url and port, e.g. pool.usxmrpool.com:10443"<<endl;
-	cout<<"  -u, --user USERNAME        pool user name or wallet address"<<endl;
-	cout<<"  -r, --rigid RIGID          rig identifier for pool-side statistics (needs pool support)"<<endl;
-	cout<<"  -p, --pass PASSWD          pool password, in the most cases x or empty \"\""<<endl;
-	cout<<"  --use-nicehash             the pool should run in nicehash mode"<<endl;
-	cout<<"  --currency NAME            currency to mine"<<endl;
-	cout<< endl;
+	cout << " " << endl;
+	cout << "The following options can be used for automatic start without a guided config," << endl;
+	cout << "If config exists then this pool will be top priority." << endl;
+	cout << "  -o, --url URL              pool url and port, e.g. pool.usxmrpool.com:3333" << endl;
+	cout << "  -O, --tls-url URL          TLS pool url and port, e.g. pool.usxmrpool.com:10443" << endl;
+	cout << "  -u, --user USERNAME        pool user name or wallet address" << endl;
+	cout << "  -r, --rigid RIGID          rig identifier for pool-side statistics (needs pool support)" << endl;
+	cout << "  -p, --pass PASSWD          pool password, in the most cases x or empty \"\"" << endl;
+	cout << "  --use-nicehash             the pool should run in nicehash mode" << endl;
+	cout << "  --currency NAME            currency to mine" << endl;
+	cout << endl;
 #ifdef _WIN32
-	cout<<"Environment variables:\n"<<endl;
-	cout<<"  XMRSTAK_NOWAIT             disable the dialog `Press any key to exit."<<std::endl;
-	cout<<"                	            for non UAC execution"<<endl;
-	cout<< endl;
+	cout << "Environment variables:\n"
+		 << endl;
+	cout << "  XMRSTAK_NOWAIT             disable the dialog `Press any key to exit." << std::endl;
+	cout << "                	            for non UAC execution" << endl;
+	cout << endl;
 #endif
 	std::string algos;
 	jconf::GetAlgoList(algos);
-	cout<< "Supported coin options: " << endl << algos << endl;
-	cout<< "Version: " << get_version_str_short() << endl;
-	cout<<"Brought to by fireice_uk and psychocrypt under GPLv3."<<endl;
+	cout << "Supported coin options: " << endl
+		 << algos << endl;
+	cout << "Version: " << get_version_str_short() << endl;
+	cout << "Brought to by fireice_uk and psychocrypt under GPLv3." << endl;
 }
 
-bool read_yes_no(const char* str)
+bool read_yes_no(const char* str, std::string default_value = "")
 {
 	std::string tmp;
 	do
 	{
 		std::cout << str << std::endl;
-		std::cin >> tmp;
+		getline(std::cin, tmp);
+		if(tmp.empty())
+			tmp = default_value;
 		std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
-	}
-	while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no");
+	} while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no");
 
 	return tmp == "y" || tmp == "yes";
 }
@@ -138,34 +141,37 @@ inline const char* bool_to_str(bool v)
 
 std::string get_multipool_entry(bool& final)
 {
-	std::cout<<std::endl<<"- Next Pool:"<<std::endl<<std::endl;
+	std::cout << std::endl
+			  << "- Next Pool:" << std::endl
+			  << std::endl;
 
 	std::string pool;
-	std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
+	std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
 	std::cin >> pool;
 
 	std::string userName;
-	std::cout<<"- Username (wallet address or pool login):"<<std::endl;
+	std::cout << "- Username (wallet address or pool login):" << std::endl;
 	std::cin >> userName;
 
 	std::string passwd;
-	std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
-	std::cout<<"- Password (mostly empty or x):"<<std::endl;
+	std::cin.clear();
+	std::cin.ignore(INT_MAX, '\n');
+	std::cout << "- Password (mostly empty or x):" << std::endl;
 	getline(std::cin, passwd);
 
 	std::string rigid;
-	std::cout<<"- Rig identifier for pool-side statistics (needs pool support). Can be empty:"<<std::endl;
+	std::cout << "- Rig identifier for pool-side statistics (needs pool support). Can be empty:" << std::endl;
 	getline(std::cin, rigid);
 
 #ifdef CONF_NO_TLS
 	bool tls = false;
 #else
-	bool tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)");
+	bool tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)", "N");
 #endif
-	bool nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/n)");
+	bool nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/N)", "N");
 
 	int64_t pool_weight;
-	std::cout << "- Please enter a weight for this pool: "<<std::endl;
+	std::cout << "- Please enter a weight for this pool: " << std::endl;
 	while(!(std::cin >> pool_weight) || pool_weight <= 0)
 	{
 		std::cin.clear();
@@ -173,30 +179,37 @@ std::string get_multipool_entry(bool& final)
 		std::cout << "Invalid weight.  Try 1, 10, 100, etc:" << std::endl;
 	}
 
-	final = !read_yes_no("- Do you want to add another pool? (y/n)");
+	final = !read_yes_no("- Do you want to add another pool? (y/N)", "N");
 
-	return "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid +
-		"\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
-		bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
+	return "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid +
+		   "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
+		   bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
 }
 
 inline void prompt_once(bool& prompted)
 {
 	if(!prompted)
 	{
-		std::cout<<"Please enter:"<<std::endl;
+		std::cout << "Please enter:" << std::endl;
 		prompted = true;
 	}
 }
 
+inline bool use_simple_start()
+{
+	// ask this question only once
+	static bool simple_start = read_yes_no("\nUse simple setup method? (Y/n)", "Y");
+	return simple_start;
+}
+
 void do_guided_pool_config()
 {
 	using namespace xmrstak;
 
 	// load the template of the backend config into a char variable
-	const char *tpl =
-		#include "../pools.tpl"
-	;
+	const char* tpl =
+#include "../pools.tpl"
+		;
 
 	configEditor configTpl{};
 	configTpl.set(std::string(tpl));
@@ -212,7 +225,7 @@ void do_guided_pool_config()
 		{
 			std::string list;
 			jconf::GetAlgoList(list);
-			std::cout << "- Please enter the currency that you want to mine: "<<std::endl;
+			std::cout << "- Please enter the currency that you want to mine: " << std::endl;
 			std::cout << list << std::endl;
 			std::cin >> tmp;
 		}
@@ -226,7 +239,7 @@ void do_guided_pool_config()
 		prompt_once(prompted);
 
 		userSetPool = false;
-		std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
+		std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
 		std::cin >> pool;
 	}
 
@@ -235,7 +248,7 @@ void do_guided_pool_config()
 	{
 		prompt_once(prompted);
 
-		std::cout<<"- Username (wallet address or pool login):"<<std::endl;
+		std::cout << "- Username (wallet address or pool login):" << std::endl;
 		std::cin >> userName;
 	}
 
@@ -246,63 +259,67 @@ void do_guided_pool_config()
 		prompt_once(prompted);
 
 		// clear everything from stdin to allow an empty password
-		std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
+		std::cin.clear();
+		std::cin.ignore(INT_MAX, '\n');
 		stdin_flushed = true;
 
-		std::cout<<"- Password (mostly empty or x):"<<std::endl;
+		std::cout << "- Password (mostly empty or x):" << std::endl;
 		getline(std::cin, passwd);
 	}
 
 	auto& rigid = params::inst().poolRigid;
 	if(rigid.empty() && !params::inst().userSetRigid)
 	{
-		prompt_once(prompted);
-
-		if(!stdin_flushed)
+		if(!use_simple_start())
 		{
-			// clear everything from stdin to allow an empty rigid
-			std::cin.clear(); std::cin.ignore(INT_MAX,'\n');
-		}
+			prompt_once(prompted);
 
-		std::cout<<"- Rig identifier for pool-side statistics (needs pool support). Can be empty:"<<std::endl;
-		getline(std::cin, rigid);
+			if(!stdin_flushed)
+			{
+				// clear everything from stdin to allow an empty rigid
+				std::cin.clear();
+				std::cin.ignore(INT_MAX, '\n');
+			}
+
+			std::cout << "- Rig identifier for pool-side statistics (needs pool support). Can be empty:" << std::endl;
+			getline(std::cin, rigid);
+		}
 	}
 
-	bool tls;
+	bool tls = params::inst().poolUseTls;
 #ifdef CONF_NO_TLS
 	tls = false;
 #else
 	if(!userSetPool)
 	{
 		prompt_once(prompted);
-		tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)");
+		tls = read_yes_no("- Does this pool port support TLS/SSL? Use no if unknown. (y/N)", "N");
 	}
-	else
-		tls = params::inst().poolUseTls;
+
 #endif
 
-	bool nicehash;
+	bool nicehash = params::inst().nicehashMode;
 	if(!userSetPool)
 	{
-		prompt_once(prompted);
-		nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/n)");
+		if(!use_simple_start())
+		{
+			prompt_once(prompted);
+			nicehash = read_yes_no("- Do you want to use nicehash on this pool? (y/N)", "N");
+		}
 	}
-	else
-		nicehash = params::inst().nicehashMode;
 
-	bool multipool;
+	bool multipool = false;
 	if(!userSetPool)
-		multipool = read_yes_no("- Do you want to use multiple pools? (y/n)");
-	else
-		multipool = false;
+		if(!use_simple_start())
+			multipool = read_yes_no("- Do you want to use multiple pools? (y/N)", "N");
 
-	int64_t pool_weight;
+	int64_t pool_weight = 1;
 	if(multipool)
 	{
 		std::cout << "Pool weight is a number telling the miner how important the pool is." << std::endl;
 		std::cout << "Miner will mine mostly at the pool with the highest weight, unless the pool fails." << std::endl;
 		std::cout << "Weight must be an integer larger than 0." << std::endl;
-		std::cout << "- Please enter a weight for this pool: "<<std::endl;
+		std::cout << "- Please enter a weight for this pool: " << std::endl;
 
 		while(!(std::cin >> pool_weight) || pool_weight <= 0)
 		{
@@ -311,13 +328,11 @@ void do_guided_pool_config()
 			std::cout << "Invalid weight.  Try 1, 10, 100, etc:" << std::endl;
 		}
 	}
-	else
-		pool_weight = 1;
 
 	std::string pool_table;
-	pool_table += "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName +  "\", \"rig_id\" : \"" + rigid +
-		"\", \"pool_password\" : \"" +  passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
-		bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
+	pool_table += "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid +
+				  "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " +
+				  bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n";
 
 	if(multipool)
 	{
@@ -325,14 +340,13 @@ void do_guided_pool_config()
 		do
 		{
 			pool_table += get_multipool_entry(final);
-		}
-		while(!final);
+		} while(!final);
 	}
 
 	configTpl.replace("CURRENCY", currency);
 	configTpl.replace("POOLCONF", pool_table);
 	configTpl.write(params::inst().configFilePools);
-	std::cout<<"Pool configuration stored in file '"<<params::inst().configFilePools<<"'"<<std::endl;
+	std::cout << "Pool configuration stored in file '" << params::inst().configFilePools << "'" << std::endl;
 }
 
 void do_guided_config()
@@ -340,9 +354,9 @@ void do_guided_config()
 	using namespace xmrstak;
 
 	// load the template of the backend config into a char variable
-	const char *tpl =
-		#include "../config.tpl"
-	;
+	const char* tpl =
+#include "../config.tpl"
+		;
 
 	configEditor configTpl{};
 	configTpl.set(std::string(tpl));
@@ -351,33 +365,34 @@ void do_guided_config()
 	auto& http_port = params::inst().httpd_port;
 	if(http_port == params::httpd_port_unset)
 	{
-#if defined(CONF_NO_HTTPD)
 		http_port = params::httpd_port_disabled;
-#else
-		prompt_once(prompted);
+#ifndef CONF_NO_HTTPD
+		if(!use_simple_start())
+		{
+			prompt_once(prompted);
 
-		std::cout<<"- Do you want to use the HTTP interface?" <<std::endl;
-		std::cout<<"Unlike the screen display, browser interface is not affected by the GPU lag." <<std::endl;
-		std::cout<<"If you don't want to use it, please enter 0, otherwise enter port number that the miner should listen on" <<std::endl;
+			std::cout << "- Do you want to use the HTTP interface?" << std::endl;
+			std::cout << "Unlike the screen display, browser interface is not affected by the GPU lag." << std::endl;
+			std::cout << "If you don't want to use it, please enter 0, otherwise enter port number that the miner should listen on" << std::endl;
 
-		int32_t port;
-		while(!(std::cin >> port) || port < 0 || port > 65535)
-		{
-			std::cin.clear();
-			std::cin.ignore(INT_MAX, '\n');
-			std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl;
+			int32_t port;
+			while(!(std::cin >> port) || port < 0 || port > 65535)
+			{
+				std::cin.clear();
+				std::cin.ignore(INT_MAX, '\n');
+				std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl;
+			}
+			http_port = port;
 		}
-
-		http_port = port;
 #endif
 	}
 
 	configTpl.replace("HTTP_PORT", std::to_string(http_port));
 	configTpl.write(params::inst().configFile);
-	std::cout<<"Configuration stored in file '"<<params::inst().configFile<<"'"<<std::endl;
+	std::cout << "Configuration stored in file '" << params::inst().configFile << "'" << std::endl;
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
 #ifndef CONF_NO_TLS
 	SSL_library_init();
@@ -418,7 +433,7 @@ int main(int argc, char *argv[])
 	}
 
 	bool pool_url_set = false;
-	for(size_t i = 1; i < argc-1; i++)
+	for(size_t i = 1; i < argc - 1; i++)
 	{
 		std::string opName(argv[i]);
 		if(opName == "-o" || opName == "-O" || opName == "--url" || opName == "--tls-url")
@@ -436,13 +451,13 @@ int main(int argc, char *argv[])
 		}
 		if(opName.compare("-v") == 0 || opName.compare("--version") == 0)
 		{
-			std::cout<< "Version: " << get_version_str_short() << std::endl;
+			std::cout << "Version: " << get_version_str_short() << std::endl;
 			win_exit();
 			return 0;
 		}
 		else if(opName.compare("-V") == 0 || opName.compare("--version-long") == 0)
 		{
-			std::cout<< "Version: " << get_version_str() << std::endl;
+			std::cout << "Version: " << get_version_str() << std::endl;
 			win_exit();
 			return 0;
 		}
@@ -457,7 +472,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--openCLVendor") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--openCLVendor' given");
 				win_exit();
@@ -483,7 +498,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--cpu") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--cpu' given");
 				win_exit();
@@ -494,7 +509,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--amd") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--amd' given");
 				win_exit();
@@ -502,10 +517,21 @@ int main(int argc, char *argv[])
 			}
 			params::inst().configFileAMD = argv[i];
 		}
+		else if(opName.compare("--amdCacheDir") == 0)
+		{
+			++i;
+			if(i >= argc)
+			{
+				printer::inst()->print_msg(L0, "No argument for parameter '--amdCacheDir' given");
+				win_exit();
+				return 1;
+			}
+			params::inst().rootAMDCacheDir = std::string(argv[i]) + "/";
+		}
 		else if(opName.compare("--nvidia") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--nvidia' given");
 				win_exit();
@@ -516,7 +542,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--currency") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--currency' given");
 				win_exit();
@@ -527,7 +553,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-o") == 0 || opName.compare("--url") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-o/--url' given");
 				win_exit();
@@ -539,7 +565,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-O") == 0 || opName.compare("--tls-url") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-O/--tls-url' given");
 				win_exit();
@@ -558,7 +584,7 @@ int main(int argc, char *argv[])
 			}
 
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-u/--user' given");
 				win_exit();
@@ -576,7 +602,7 @@ int main(int argc, char *argv[])
 			}
 
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-p/--pass' given");
 				win_exit();
@@ -595,7 +621,7 @@ int main(int argc, char *argv[])
 			}
 
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-r/--rigid' given");
 				win_exit();
@@ -612,7 +638,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-c") == 0 || opName.compare("--config") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-c/--config' given");
 				win_exit();
@@ -623,7 +649,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-C") == 0 || opName.compare("--poolconf") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-C/--poolconf' given");
 				win_exit();
@@ -634,7 +660,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("-i") == 0 || opName.compare("--httpd") == 0)
 		{
 			++i;
-			if( i >=argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '-i/--httpd' given");
 				win_exit();
@@ -660,7 +686,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--benchmark") == 0)
 		{
 			++i;
-			if( i >= argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--benchmark' given");
 				win_exit();
@@ -679,7 +705,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--benchwait") == 0)
 		{
 			++i;
-			if( i >= argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--benchwait' given");
 				win_exit();
@@ -698,7 +724,7 @@ int main(int argc, char *argv[])
 		else if(opName.compare("--benchwork") == 0)
 		{
 			++i;
-			if( i >= argc )
+			if(i >= argc)
 			{
 				printer::inst()->print_msg(L0, "No argument for parameter '--benchwork' given");
 				win_exit();
@@ -716,17 +742,20 @@ int main(int argc, char *argv[])
 		}
 		else
 		{
-			printer::inst()->print_msg(L0, "Parameter unknown '%s'",argv[i]);
+			printer::inst()->print_msg(L0, "Parameter unknown '%s'", argv[i]);
 			win_exit();
 			return 1;
 		}
 	}
 
+	bool hasConfigFile = configEditor::file_exist(params::inst().configFile);
+	bool hasPoolConfig = configEditor::file_exist(params::inst().configFilePools);
+
 	// check if we need a guided start
-	if(!configEditor::file_exist(params::inst().configFile))
+	if(!hasConfigFile)
 		do_guided_config();
 
-	if(!configEditor::file_exist(params::inst().configFilePools))
+	if(!hasPoolConfig)
 		do_guided_pool_config();
 
 	if(!jconf::inst()->parse_config(params::inst().configFile.c_str(), params::inst().configFilePools.c_str()))
@@ -747,7 +776,7 @@ int main(int argc, char *argv[])
 	if(strlen(jconf::inst()->GetOutputFile()) != 0)
 		printer::inst()->open_logfile(jconf::inst()->GetOutputFile());
 
-	if (!BackendConnector::self_test())
+	if(!BackendConnector::self_test())
 	{
 		printer::inst()->print_msg(L0, "Self test not passed!");
 		win_exit();
@@ -761,7 +790,7 @@ int main(int argc, char *argv[])
 		win_exit();
 		return 1;
 #else
-		if (!httpd::inst()->start_daemon())
+		if(!httpd::inst()->start_daemon())
 		{
 			win_exit();
 			return 1;
@@ -835,7 +864,7 @@ int main(int argc, char *argv[])
 		uint64_t currentTime = get_timestamp_ms();
 
 		/* Hard guard to make sure we never get called more than twice per second */
-		if( currentTime - lastTime < 500)
+		if(currentTime - lastTime < 500)
 			std::this_thread::sleep_for(std::chrono::milliseconds(500 - (currentTime - lastTime)));
 		lastTime = currentTime;
 	}
@@ -851,7 +880,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 	printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version);
 
 	uint8_t work[128];
-	memset(work,0,128);
+	memset(work, 0, 128);
 	work[0] = static_cast<uint8_t>(block_version);
 
 	xmrstak::pool_data dat;
@@ -859,12 +888,12 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 	xmrstak::miner_work oWork = xmrstak::miner_work();
 	pvThreads = xmrstak::BackendConnector::thread_starter(oWork);
 
-	printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec);
+	printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized", wait_sec);
 	std::this_thread::sleep_for(std::chrono::seconds(wait_sec));
 
 	/* AMD and NVIDIA is currently only supporting work sizes up to 128byte
 	 */
-	printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec);
+	printer::inst()->print_msg(L0, "Start a %d second benchmark...", work_sec);
 	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);
 	uint64_t iStartStamp = get_timestamp_ms();
 
@@ -872,7 +901,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);
 
 	double fTotalHps = 0.0;
-	for (uint32_t i = 0; i < pvThreads->size(); i++)
+	for(uint32_t i = 0; i < pvThreads->size(); i++)
 	{
 		double fHps = pvThreads->at(i)->iHashCount;
 		fHps /= (pvThreads->at(i)->iTimestamp - iStartStamp) / 1000.0;
@@ -880,7 +909,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 		auto bType = static_cast<xmrstak::iBackend::BackendType>(pvThreads->at(i)->backendType);
 		std::string name(xmrstak::iBackend::getName(bType));
 
-		printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i,name.c_str(), fHps);
+		printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i, name.c_str(), fHps);
 		fTotalHps += fHps;
 	}
 
diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp
index ed9abc2bc261c06ddbc891131989a45ef8f0b9f3..b4f0f547e4eefa23be62ed6683fab29e806df80a 100644
--- a/xmrstak/http/httpd.cpp
+++ b/xmrstak/http/httpd.cpp
@@ -23,16 +23,15 @@
 
 #ifndef CONF_NO_HTTPD
 
-
 #include "httpd.hpp"
 #include "webdesign.hpp"
-#include "xmrstak/net/msgstruct.hpp"
+#include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/executor.hpp"
-#include "xmrstak/jconf.hpp"
+#include "xmrstak/net/msgstruct.hpp"
 
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <string>
 
@@ -45,21 +44,20 @@ httpd* httpd::oInst = nullptr;
 
 httpd::httpd()
 {
-
 }
 
-int httpd::req_handler(void * cls,
-			MHD_Connection* connection,
-			const char* url,
-			const char* method,
-			const char* version,
-			const char* upload_data,
-			size_t* upload_data_size,
-			void ** ptr)
+int httpd::req_handler(void* cls,
+	MHD_Connection* connection,
+	const char* url,
+	const char* method,
+	const char* version,
+	const char* upload_data,
+	size_t* upload_data_size,
+	void** ptr)
 {
-	struct MHD_Response * rsp;
+	struct MHD_Response* rsp;
 
-	if (strcmp(method, "GET") != 0)
+	if(strcmp(method, "GET") != 0)
 		return MHD_NO;
 
 	if(strlen(jconf::inst()->GetHttpUsername()) != 0)
@@ -68,7 +66,7 @@ int httpd::req_handler(void * cls,
 		int ret;
 
 		username = MHD_digest_auth_get_username(connection);
-		if (username == NULL)
+		if(username == NULL)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
 			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO);
@@ -78,7 +76,7 @@ int httpd::req_handler(void * cls,
 		free(username);
 
 		ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
-		if (ret == MHD_INVALID_NONCE || ret == MHD_NO)
+		if(ret == MHD_INVALID_NONCE || ret == MHD_NO)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
 			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
@@ -174,4 +172,3 @@ bool httpd::start_daemon()
 }
 
 #endif
-
diff --git a/xmrstak/http/httpd.hpp b/xmrstak/http/httpd.hpp
index fe534f0388caeb1e90e9259cdc5ab9113db9fb04..dfad082ca62ec23ab577165be3eab8649806c21b 100644
--- a/xmrstak/http/httpd.hpp
+++ b/xmrstak/http/httpd.hpp
@@ -7,27 +7,28 @@ struct MHD_Connection;
 
 class httpd
 {
-public:
+  public:
 	static httpd* inst()
 	{
-		if (oInst == nullptr) oInst = new httpd;
+		if(oInst == nullptr)
+			oInst = new httpd;
 		return oInst;
 	};
 
 	bool start_daemon();
 
-private:
+  private:
 	httpd();
 	static httpd* oInst;
 
-	static int req_handler(void * cls,
-			MHD_Connection* connection,
-			const char* url,
-			const char* method,
-			const char* version,
-			const char* upload_data,
-			size_t* upload_data_size,
-			void ** ptr);
+	static int req_handler(void* cls,
+		MHD_Connection* connection,
+		const char* url,
+		const char* method,
+		const char* version,
+		const char* upload_data,
+		size_t* upload_data_size,
+		void** ptr);
 
-	MHD_Daemon *d;
+	MHD_Daemon* d;
 };
diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index 8f20078aa3538e722978317aab1d856de82a6c2f..fbd56526994e00dcd24f54ce351911840d27b75f 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -1,114 +1,114 @@
 #include <stdlib.h>
 
-extern const char sHtmlCssEtag [] = "00000009";
-extern const char sHtmlCssFile [] =
+extern const char sHtmlCssEtag[] = "00000009";
+extern const char sHtmlCssFile[] =
 	"body {"
-		"font-family: Tahoma, Arial, sans-serif;"
-		"font-size: 80%;"
-		"background-color: rgb(240, 240, 240);"
+	"font-family: Tahoma, Arial, sans-serif;"
+	"font-size: 80%;"
+	"background-color: rgb(240, 240, 240);"
 	"}"
 
 	"a {"
-		"color: rgb(44, 55, 66);"
+	"color: rgb(44, 55, 66);"
 	"}"
 
 	"a:link {"
-		"text-decoration: none;"
+	"text-decoration: none;"
 	"}"
 
 	"a:visited {"
-		"color: rgb(44, 55, 66);"
+	"color: rgb(44, 55, 66);"
 	"}"
 
 	"a:hover {"
-		"color: rgb(255, 153, 0);"
+	"color: rgb(255, 153, 0);"
 	"}"
 
 	"a:active {"
-		"color: rgb(204, 122, 0);"
+	"color: rgb(204, 122, 0);"
 	"}"
 
 	".all {"
-		"max-width:600px;"
-		"margin: auto;"
+	"max-width:600px;"
+	"margin: auto;"
 	"}"
 
 	".header {"
-		"background-color: rgb(30, 30, 30);"
-		"color: white;"
-		"padding: 10px;"
-		"font-weight: bold;"
-		"margin: 0px;"
-		"margin-bottom: 10px;"
+	"background-color: rgb(30, 30, 30);"
+	"color: white;"
+	"padding: 10px;"
+	"font-weight: bold;"
+	"margin: 0px;"
+	"margin-bottom: 10px;"
 	"}"
 
 	".version {"
-		"font-size: 75%;"
-		"text-align: right;"
+	"font-size: 75%;"
+	"text-align: right;"
 	"}"
 
 	".links {"
-		"padding: 7px;"
-		"text-align: center;"
-		"background-color: rgb(215, 215, 215);"
-		"box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);"
+	"padding: 7px;"
+	"text-align: center;"
+	"background-color: rgb(215, 215, 215);"
+	"box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);"
 	"}"
 
 	".data th, td {"
-		"padding: 5px 12px;"
-		"text-align: right;"
-		"border-bottom: 1px solid #ccc;"
+	"padding: 5px 12px;"
+	"text-align: right;"
+	"border-bottom: 1px solid #ccc;"
 	"}"
 
 	".data tr:nth-child(even) {"
-		"background-color: #ddd;"
+	"background-color: #ddd;"
 	"}"
 
 	".data th {"
-		"background-color: #ccc;"
+	"background-color: #ccc;"
 	"}"
 
 	".data table {"
-		"width: 100%;"
-		"max-width: 600px;"
+	"width: 100%;"
+	"max-width: 600px;"
 	"}"
 
 	".letter {"
-		"font-weight: bold;"
+	"font-weight: bold;"
 	"}"
 
 	"h4 {"
-		"background-color: rgb(0, 130, 130);"
-		"color: white;"
-		"padding: 10px;"
-		"margin: 10px 0px;"
+	"background-color: rgb(0, 130, 130);"
+	"color: white;"
+	"padding: 10px;"
+	"margin: 10px 0px;"
 	"}"
 
 	".flex-container {"
-		"display: -webkit-flex;"
-		"display: flex;"
+	"display: -webkit-flex;"
+	"display: flex;"
 	"}"
 
 	".flex-item {"
-		"width: 33%;"
-		"margin: 3px;"
+	"width: 33%;"
+	"margin: 3px;"
 	"}"
 
 	".motd-box {"
-		"background-color: #ccc;"
-		"padding: 0px 10px 5px 10px;"
-		"margin-bottom: 10px;"
+	"background-color: #ccc;"
+	"padding: 0px 10px 5px 10px;"
+	"margin-bottom: 10px;"
 	"}"
 
 	".motd-head {"
-		"border-bottom: 1px solid #000;"
-		"margin-bottom: 0.5em;"
-		"padding: 0.5em 0em;"
-		"font-weight: bold;"
+	"border-bottom: 1px solid #000;"
+	"margin-bottom: 0.5em;"
+	"padding: 0.5em 0em;"
+	"font-weight: bold;"
 	"}"
 
 	".motd-body {"
-		"overflow: hidden;"
+	"overflow: hidden;"
 	"}";
 
 size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1;
@@ -124,7 +124,7 @@ extern const char sHtmlAccessDenied[] =
 
 size_t sHtmlAccessDeniedSize = sizeof(sHtmlAccessDenied) - 1;
 
-extern const char sHtmlCommonHeader [] =
+extern const char sHtmlCommonHeader[] =
 	"<!DOCTYPE html>"
 	"<html>"
 	"<head><meta name='viewport' content='width=device-width' />"
@@ -135,15 +135,15 @@ extern const char sHtmlCommonHeader [] =
 	"<div class='header'><span style='color: rgb(255, 160, 0)'>XMR</span>-Stak Monero Miner</div>"
 
 	"<div class='flex-container'>"
-		"<div class='links flex-item'>"
-			"<a href='h'><div><span class='letter'>H</span>ashrate</div></a>"
-		"</div>"
-		"<div class='links flex-item'>"
-			"<a href='r'><div><span class='letter'>R</span>esults</div></a>"
-		"</div>"
-		"<div class='links flex-item'>"
-			"<a href='c'><div><span class='letter'>C</span>onnection</div></a>"
-		"</div>"
+	"<div class='links flex-item'>"
+	"<a href='h'><div><span class='letter'>H</span>ashrate</div></a>"
+	"</div>"
+	"<div class='links flex-item'>"
+	"<a href='r'><div><span class='letter'>R</span>esults</div></a>"
+	"</div>"
+	"<div class='links flex-item'>"
+	"<a href='c'><div><span class='letter'>C</span>onnection</div></a>"
+	"</div>"
 	"</div>"
 	"<h4>%s</h4>";
 
@@ -151,61 +151,61 @@ extern const char sHtmlMotdBoxStart[] = "<div class='motd-box'>";
 extern const char sHtmlMotdEntry[] = "<div class='motd-head'>Message from %s</div><div class='motd-body'>%s</div>";
 extern const char sHtmlMotdBoxEnd[] = "</div>";
 
-extern const char sHtmlHashrateBodyHigh [] =
+extern const char sHtmlHashrateBodyHigh[] =
 	"<div class='data'>"
 	"<table>"
-		"<tr><th>Thread ID</th><th>10s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</td></tr>";
+	"<tr><th>Thread ID</th><th>10s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</td></tr>";
 
-extern const char sHtmlHashrateTableRow [] =
+extern const char sHtmlHashrateTableRow[] =
 	"<tr><th>%s</th><td>%s</td><td>%s</td><td>%s</td></tr>";
 
-extern const char sHtmlHashrateBodyLow [] =
-		"<tr><th>Totals:</th><td>%s</td><td>%s</td><td>%s</td></tr>"
-		"<tr><th>Highest:</th><td>%s</td><td colspan='2'></td></tr>"
+extern const char sHtmlHashrateBodyLow[] =
+	"<tr><th>Totals:</th><td>%s</td><td>%s</td><td>%s</td></tr>"
+	"<tr><th>Highest:</th><td>%s</td><td colspan='2'></td></tr>"
 	"</table>"
 	"</div></div></body></html>";
 
-extern const char sHtmlConnectionBodyHigh [] =
+extern const char sHtmlConnectionBodyHigh[] =
 	"<div class='data'>"
 	"<table>"
-		"<tr><th>Rig ID</th><td>%s</td></tr>"
-		"<tr><th>Pool address</th><td>%s</td></tr>"
-		"<tr><th>Connected since</th><td>%s</td></tr>"
-		"<tr><th>Pool ping time</th><td>%u ms</td></tr>"
+	"<tr><th>Rig ID</th><td>%s</td></tr>"
+	"<tr><th>Pool address</th><td>%s</td></tr>"
+	"<tr><th>Connected since</th><td>%s</td></tr>"
+	"<tr><th>Pool ping time</th><td>%u ms</td></tr>"
 	"</table>"
 	"<h4>Network error log</h4>"
 	"<table>"
-		"<tr><th style='width: 20%; min-width: 10em;'>Date</th><th>Error</th></tr>";
+	"<tr><th style='width: 20%; min-width: 10em;'>Date</th><th>Error</th></tr>";
 
-extern const char sHtmlConnectionTableRow [] =
+extern const char sHtmlConnectionTableRow[] =
 	"<tr><td>%s</td><td>%s</td></tr>";
 
-extern const char sHtmlConnectionBodyLow [] =
+extern const char sHtmlConnectionBodyLow[] =
 	"</table></div></div></body></html>";
 
-extern const char sHtmlResultBodyHigh [] =
+extern const char sHtmlResultBodyHigh[] =
 	"<div class='data'>"
 	"<table>"
-		"<tr><th>Currency</th><td>%s</td></tr>"
-		"<tr><th>Difficulty</th><td>%u</td></tr>"
-		"<tr><th>Good results</th><td>%u / %u (%.1f %%)</td></tr>"
-		"<tr><th>Avg result time</th><td>%.1f sec</td></tr>"
-		"<tr><th>Pool-side hashes</th><td>%u</td></tr>"
+	"<tr><th>Currency</th><td>%s</td></tr>"
+	"<tr><th>Difficulty</th><td>%u</td></tr>"
+	"<tr><th>Good results</th><td>%u / %u (%.1f %%)</td></tr>"
+	"<tr><th>Avg result time</th><td>%.1f sec</td></tr>"
+	"<tr><th>Pool-side hashes</th><td>%u</td></tr>"
 	"</table>"
 	"<h4>Top 10 best results found</h4>"
 	"<table>"
-		"<tr><th style='width: 2em;'>1</th><td>%llu</td><th style='width: 2em;'>2</th><td>%llu</td></tr>"
-		"<tr><th>3</th><td>%llu</td><th>4</th><td>%llu</td></tr>"
-		"<tr><th>5</th><td>%llu</td><th>6</th><td>%llu</td></tr>"
-		"<tr><th>7</th><td>%llu</td><th>8</th><td>%llu</td></tr>"
-		"<tr><th>9</th><td>%llu</td><th>10</th><td>%llu</td></tr>"
+	"<tr><th style='width: 2em;'>1</th><td>%llu</td><th style='width: 2em;'>2</th><td>%llu</td></tr>"
+	"<tr><th>3</th><td>%llu</td><th>4</th><td>%llu</td></tr>"
+	"<tr><th>5</th><td>%llu</td><th>6</th><td>%llu</td></tr>"
+	"<tr><th>7</th><td>%llu</td><th>8</th><td>%llu</td></tr>"
+	"<tr><th>9</th><td>%llu</td><th>10</th><td>%llu</td></tr>"
 	"</table>"
 	"<h4>Error details</h4>"
 	"<table>"
-		"<tr><th colspan='2'>Error text</th></tr>"
-		"<tr><th style='width: 5em;'>Count</th><th>Last seen</th></tr>";
+	"<tr><th colspan='2'>Error text</th></tr>"
+	"<tr><th style='width: 5em;'>Count</th><th>Last seen</th></tr>";
 
-extern const char sHtmlResultTableRow [] =
+extern const char sHtmlResultTableRow[] =
 	"<tr><td colspan='2'>%s</td></tr><tr><td>%llu</td><td>%s</td></tr>";
 
 extern const char sHtmlResultBodyLow[] =
@@ -220,31 +220,30 @@ extern const char sJsonApiResultError[] =
 extern const char sJsonApiConnectionError[] =
 	"{\"last_seen\":%llu,\"text\":\"%s\"}";
 
-extern const char sJsonApiFormat [] =
-"{"
+extern const char sJsonApiFormat[] =
+	"{"
 	"\"version\":\"%s\","
 
 	"\"hashrate\":{"
-		"\"threads\":[%s],"
-		"\"total\":%s,"
-		"\"highest\":%s"
+	"\"threads\":[%s],"
+	"\"total\":%s,"
+	"\"highest\":%s"
 	"},"
 
 	"\"results\":{"
-		"\"diff_current\":%llu,"
-		"\"shares_good\":%llu,"
-		"\"shares_total\":%llu,"
-		"\"avg_time\":%.1f,"
-		"\"hashes_total\":%llu,"
-		"\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu],"
-		"\"error_log\":[%s]"
+	"\"diff_current\":%llu,"
+	"\"shares_good\":%llu,"
+	"\"shares_total\":%llu,"
+	"\"avg_time\":%.1f,"
+	"\"hashes_total\":%llu,"
+	"\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu],"
+	"\"error_log\":[%s]"
 	"},"
 
 	"\"connection\":{"
-		"\"pool\": \"%s\","
-		"\"uptime\":%llu,"
-		"\"ping\":%llu,"
-		"\"error_log\":[%s]"
+	"\"pool\": \"%s\","
+	"\"uptime\":%llu,"
+	"\"ping\":%llu,"
+	"\"error_log\":[%s]"
 	"}"
-"}";
-
+	"}";
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 5e3384a63484713228c9096eb0580829678b0fd7..10082a09fa2c97841acbe0a8272acf0613ebfed7 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -26,16 +26,15 @@
 
 #include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/jext.hpp"
-#include "xmrstak/misc/console.hpp"
 #include "xmrstak/misc/utility.hpp"
 
+#include <algorithm>
+#include <math.h>
+#include <numeric>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
 #include <vector>
-#include <numeric>
-#include <algorithm>
 
 #ifdef _WIN32
 #define strcasecmp _stricmp
@@ -44,18 +43,34 @@
 #include <cpuid.h>
 #endif
 
-
 using namespace rapidjson;
 
 /*
  * This enum needs to match index in oConfigValues, otherwise we will get a runtime error
  */
-enum configEnum {
-	aPoolList, sCurrency, bTlsSecureAlgo, iCallTimeout, iNetRetry, iGiveUpLimit, iVerboseLevel, bPrintMotd, iAutohashTime,
-	bDaemonMode, sOutputFile, iHttpdPort, sHttpLogin, sHttpPass, bPreferIpv4, bAesOverride, sUseSlowMem
+enum configEnum
+{
+	aPoolList,
+	sCurrency,
+	bTlsSecureAlgo,
+	iCallTimeout,
+	iNetRetry,
+	iGiveUpLimit,
+	iVerboseLevel,
+	bPrintMotd,
+	iAutohashTime,
+	bDaemonMode,
+	sOutputFile,
+	iHttpdPort,
+	sHttpLogin,
+	sHttpPass,
+	bPreferIpv4,
+	bAesOverride,
+	sUseSlowMem
 };
 
-struct configVal {
+struct configVal
+{
 	configEnum iName;
 	const char* sName;
 	Type iType;
@@ -64,68 +79,66 @@ struct configVal {
 // Same order as in configEnum, as per comment above
 // kNullType means any type
 configVal oConfigValues[] = {
-	{ aPoolList, "pool_list", kArrayType },
-	{ sCurrency, "currency", kStringType },
-	{ bTlsSecureAlgo, "tls_secure_algo", kTrueType },
-	{ iCallTimeout, "call_timeout", kNumberType },
-	{ iNetRetry, "retry_time", kNumberType },
-	{ iGiveUpLimit, "giveup_limit", kNumberType },
-	{ iVerboseLevel, "verbose_level", kNumberType },
-	{ bPrintMotd, "print_motd", kTrueType },
-	{ iAutohashTime, "h_print_time", kNumberType },
-	{ bDaemonMode, "daemon_mode", kTrueType },
-	{ sOutputFile, "output_file", kStringType },
-	{ iHttpdPort, "httpd_port", kNumberType },
-	{ sHttpLogin, "http_login", kStringType },
-	{ sHttpPass, "http_pass", kStringType },
-	{ bPreferIpv4, "prefer_ipv4", kTrueType },
-	{ bAesOverride, "aes_override", kNullType },
-	{ sUseSlowMem, "use_slow_memory", kStringType }
-};
-
-constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
+	{aPoolList, "pool_list", kArrayType},
+	{sCurrency, "currency", kStringType},
+	{bTlsSecureAlgo, "tls_secure_algo", kTrueType},
+	{iCallTimeout, "call_timeout", kNumberType},
+	{iNetRetry, "retry_time", kNumberType},
+	{iGiveUpLimit, "giveup_limit", kNumberType},
+	{iVerboseLevel, "verbose_level", kNumberType},
+	{bPrintMotd, "print_motd", kTrueType},
+	{iAutohashTime, "h_print_time", kNumberType},
+	{bDaemonMode, "daemon_mode", kTrueType},
+	{sOutputFile, "output_file", kStringType},
+	{iHttpdPort, "httpd_port", kNumberType},
+	{sHttpLogin, "http_login", kStringType},
+	{sHttpPass, "http_pass", kStringType},
+	{bPreferIpv4, "prefer_ipv4", kTrueType},
+	{bAesOverride, "aes_override", kNullType},
+	{sUseSlowMem, "use_slow_memory", kStringType}};
+
+constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0]));
 
 xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
-	{ "aeon7",                   {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555" },
-	{ "bbscoin",                 {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)}, nullptr },
-	{ "bittube",                 {POW(cryptonight_bittube2)},  {POW(cryptonight_gpu)}, "mining.bit.tube:13333" },
-	{ "cryptonight",             {POW(cryptonight)},           {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_bittube2",    {POW(cryptonight_bittube2)},  {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_masari",      {POW(cryptonight_masari)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_haven",       {POW(cryptonight_haven)},     {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_heavy",       {POW(cryptonight_heavy)},     {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_lite",        {POW(cryptonight_lite)},      {POW(cryptonight_aeon)},      nullptr },
-	{ "cryptonight_lite_v7",     {POW(cryptonight_aeon)},      {POW(cryptonight_aeon)},      nullptr },
-	{ "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)},      {POW(cryptonight_aeon)},      nullptr },
-	{ "cryptonight_r",           {POW(cryptonight_r)},         {POW(cryptonight_r)}, nullptr },
-	{ "cryptonight_superfast",   {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_turtle",      {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
-	{ "cryptonight_v7",          {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8",          {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr },
-	{ "cryptonight_v8_double",   {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8_half",     {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_v7_stellite", {POW(cryptonight_stellite)},  {POW(cryptonight_gpu)}, nullptr },
-	{ "cryptonight_gpu",         {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)},       "pool.ryo-currency.com:3333" },
-	{ "cryptonight_conceal",     {POW(cryptonight_conceal)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "freehaven",               {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "graft",                   {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr },
-	{ "haven",                   {POW(cryptonight_haven)},     {POW(cryptonight_gpu)}, nullptr },
-	{ "lethean",                 {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "masari",                  {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "monero",                  {POW(cryptonight_r)},         {POW(cryptonight_r)}, "pool.usxmrpool.com:3333" },
-	{ "qrl",             	     {POW(cryptonight_monero)},    {POW(cryptonight_gpu)}, nullptr },
-	{ "ryo",                     {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" },
-	{ "stellite",                {POW(cryptonight_v8_half)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "turtlecoin",              {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr },
-	{ "plenteum",			     {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
-	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_gpu)}, nullptr },
-	{ "xcash",                   {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr }
-};
-
-constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
+	{"aeon7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555"},
+	{"bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr},
+	{"bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333"},
+	{"cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr},
+	{"cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr},
+	{"cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr},
+	{"cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr},
+	{"cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr},
+	{"cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr},
+	{"cryptonight_v8_double", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr},
+	{"cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"},
+	{"cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr},
+	{"freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr},
+	{"graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr},
+	{"haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr},
+	{"lethean", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr},
+	{"masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr},
+	{"monero", {POW(cryptonight_r)}, {POW(cryptonight_r)}, "pool.usxmrpool.com:3333"},
+	{"qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr},
+	{"ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"},
+	{"stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr},
+	{"turtlecoin", {POW(cryptonight_turtle), 6u, POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr},
+	{"plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr},
+	{"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr},
+	{"xcash", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr}};
+
+constexpr size_t coin_algo_size = (sizeof(coins) / sizeof(coins[0]));
 
 inline bool checkType(Type have, Type want)
 {
@@ -275,7 +288,7 @@ const char* jconf::GetOutputFile()
 
 void jconf::cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
 {
-	memset(val, 0, sizeof(int32_t)*4);
+	memset(val, 0, sizeof(int32_t) * 4);
 
 #ifdef _WIN32
 	__cpuidex(val, eax, ecx);
@@ -326,7 +339,7 @@ std::string jconf::GetMiningCoin()
 void jconf::GetAlgoList(std::string& list)
 {
 	list.reserve(256);
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		list += "\t- ";
 		list += coins[i].coin_name;
@@ -338,7 +351,7 @@ bool jconf::IsOnAlgoList(std::string& needle)
 {
 	std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower);
 
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		if(needle == coins[i].coin_name)
 			return true;
@@ -350,7 +363,7 @@ const char* jconf::GetDefaultPool(const char* needle)
 {
 	const char* default_example = "pool.example.com:3333";
 
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		if(strcmp(needle, coins[i].coin_name) == 0)
 		{
@@ -366,22 +379,22 @@ const char* jconf::GetDefaultPool(const char* needle)
 
 bool jconf::parse_file(const char* sFilename, bool main_conf)
 {
-	FILE * pFile;
-	char * buffer;
+	FILE* pFile;
+	char* buffer;
 	size_t flen;
 
 	pFile = fopen(sFilename, "rb");
-	if (pFile == NULL)
+	if(pFile == NULL)
 	{
 		printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename);
 		return false;
 	}
 
-	fseek(pFile,0,SEEK_END);
+	fseek(pFile, 0, SEEK_END);
 	flen = ftell(pFile);
 	rewind(pFile);
 
-	if(flen >= 64*1024)
+	if(flen >= 64 * 1024)
 	{
 		fclose(pFile);
 		printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename);
@@ -396,7 +409,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf)
 	}
 
 	buffer = (char*)malloc(flen + 3);
-	if(fread(buffer+1, flen, 1, pFile) != 1)
+	if(fread(buffer + 1, flen, 1, pFile) != 1)
 	{
 		free(buffer);
 		fclose(pFile);
@@ -420,7 +433,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf)
 
 	Document& root = main_conf ? prv->jsonDoc : prv->jsonDocPools;
 
-	root.Parse<kParseCommentsFlag|kParseTrailingCommasFlag>(buffer, flen+2);
+	root.Parse<kParseCommentsFlag | kParseTrailingCommasFlag>(buffer, flen + 2);
 	free(buffer);
 
 	if(root.HasParseError())
@@ -514,11 +527,11 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 	std::vector<size_t> pool_weights;
 	pool_weights.reserve(pool_cnt);
 
-	const char* aPoolValues[] = { "pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight" };
-	Type poolValTypes[] = { kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType };
+	const char* aPoolValues[] = {"pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight"};
+	Type poolValTypes[] = {kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType};
 
-	constexpr size_t pvcnt = sizeof(aPoolValues)/sizeof(aPoolValues[0]);
-	for(uint32_t i=0; i < pool_cnt; i++)
+	constexpr size_t pvcnt = sizeof(aPoolValues) / sizeof(aPoolValues[0]);
+	for(uint32_t i = 0; i < pool_cnt; i++)
 	{
 		const Value& oThdConf = prv->configValues[aPoolList]->GetArray()[i];
 
@@ -528,7 +541,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 			return false;
 		}
 
-		for(uint32_t j=0; j < pvcnt; j++)
+		for(uint32_t j = 0; j < pvcnt; j++)
 		{
 			const Value* v;
 			if((v = GetObjectMember(oThdConf, aPoolValues[j])) == nullptr)
@@ -620,7 +633,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 		return false;
 	}
 
-	for(size_t i=0; i < coin_algo_size; i++)
+	for(size_t i = 0; i < coin_algo_size; i++)
 	{
 		if(ctmp == coins[i].coin_name)
 		{
diff --git a/xmrstak/jconf.hpp b/xmrstak/jconf.hpp
index 102b70f54f761ca6f0add0c4b24b8430d69d53ef..5597bf23ed68b623d2289cf15b211631178050ae 100644
--- a/xmrstak/jconf.hpp
+++ b/xmrstak/jconf.hpp
@@ -1,15 +1,15 @@
 #pragma once
 
-#include "xmrstak/misc/environment.hpp"
-#include "xmrstak/misc/coinDescription.hpp"
 #include "params.hpp"
+#include "xmrstak/misc/coinDescription.hpp"
+#include "xmrstak/misc/environment.hpp"
 
 #include <stdlib.h>
 #include <string>
 
 class jconf
 {
-public:
+  public:
 	static jconf* inst()
 	{
 		auto& env = xmrstak::environment::inst();
@@ -20,7 +20,8 @@ public:
 
 	bool parse_config(const char* sFilename, const char* sFilenamePools);
 
-	struct pool_cfg {
+	struct pool_cfg
+	{
 		const char* sPoolAddr;
 		const char* sWalletAddr;
 		const char* sRigId;
@@ -38,7 +39,8 @@ public:
 	uint64_t GetPoolCount();
 	bool GetPoolConfig(size_t id, pool_cfg& cfg);
 
-	enum slow_mem_cfg {
+	enum slow_mem_cfg
+	{
 		always_use,
 		no_mlck,
 		print_warning,
@@ -80,7 +82,7 @@ public:
 
 	slow_mem_cfg GetSlowMemSetting();
 
-private:
+  private:
 	jconf();
 
 	bool parse_file(const char* sFilename, bool main_conf);
diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp
index 65dee143c726e4f3f821cfcd37af35f9ca3bb229..b3b11922605bd880df6875041021140e057cf798 100644
--- a/xmrstak/misc/coinDescription.hpp
+++ b/xmrstak/misc/coinDescription.hpp
@@ -2,86 +2,88 @@
 
 #include "xmrstak/backend/cryptonight.hpp"
 
+#include <algorithm>
 #include <stdlib.h>
 #include <string>
 #include <vector>
-#include <algorithm>
 
 namespace xmrstak
 {
-	struct coinDescription
-	{
-		xmrstak_algo algo = {xmrstak_algo_id::invalid_algo};
-		uint8_t fork_version = 0u;
-		xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo};
+struct coinDescription
+{
+	xmrstak_algo algo = {xmrstak_algo_id::invalid_algo};
+	uint8_t fork_version = 0u;
+	xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo};
 
-		coinDescription() = default;
+	coinDescription() = default;
 
-		coinDescription(
-			const xmrstak_algo in_algo,
-			const uint8_t in_fork_version = 0,
-			xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo
-		) :
-			algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version)
-		{}
+	coinDescription(
+		const xmrstak_algo in_algo,
+		const uint8_t in_fork_version = 0,
+		xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo) :
+		algo(in_algo),
+		algo_root(in_algo_root),
+		fork_version(in_fork_version)
+	{
+	}
 
-		inline xmrstak_algo GetMiningAlgo() const { return algo; }
-		inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; }
-		inline uint8_t GetMiningForkVersion() const { return fork_version; }
-	};
+	inline xmrstak_algo GetMiningAlgo() const { return algo; }
+	inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; }
+	inline uint8_t GetMiningForkVersion() const { return fork_version; }
+};
 
-	struct coin_selection
-	{
-		const char* coin_name = nullptr;
-		/* [0] -> user pool
+struct coin_selection
+{
+	const char* coin_name = nullptr;
+	/* [0] -> user pool
 		 * [1] -> dev pool
 		 */
-		coinDescription pool_coin[2];
-		const char* default_pool = nullptr;
+	coinDescription pool_coin[2];
+	const char* default_pool = nullptr;
 
-		coin_selection() = default;
+	coin_selection() = default;
 
-		coin_selection(
-			const char* in_coin_name,
-			const coinDescription user_coinDescription,
-			const coinDescription dev_coinDescription,
-			const char* in_default_pool
-		) :
-			coin_name(in_coin_name), default_pool(in_default_pool)
-		{
-			pool_coin[0] = user_coinDescription;
-			pool_coin[1] = dev_coinDescription;
-		}
+	coin_selection(
+		const char* in_coin_name,
+		const coinDescription user_coinDescription,
+		const coinDescription dev_coinDescription,
+		const char* in_default_pool) :
+		coin_name(in_coin_name),
+		default_pool(in_default_pool)
+	{
+		pool_coin[0] = user_coinDescription;
+		pool_coin[1] = dev_coinDescription;
+	}
 
-		/** get coin description for the pool
+	/** get coin description for the pool
 		 *
 		 * @param poolId 0 select dev pool, else the user pool is selected
 		 */
-		inline coinDescription GetDescription(size_t poolId) const {
-			coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]);
-			return tmp;
-		}
+	inline coinDescription GetDescription(size_t poolId) const
+	{
+		coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]);
+		return tmp;
+	}
 
-		/** return all POW algorithm for the current selected currency
+	/** return all POW algorithm for the current selected currency
 		 *
 		 * @return required POW algorithms without duplicated entries
 		 */
-		inline std::vector<xmrstak_algo> GetAllAlgorithms()
-		{
-			std::vector<xmrstak_algo> allAlgos = {
-				GetDescription(0).GetMiningAlgo(),
-				GetDescription(0).GetMiningAlgoRoot(),
-				GetDescription(1).GetMiningAlgo(),
-				GetDescription(1).GetMiningAlgoRoot()
-			};
+	inline std::vector<xmrstak_algo> GetAllAlgorithms()
+	{
+		std::vector<xmrstak_algo> allAlgos = {
+			GetDescription(0).GetMiningAlgo(),
+			GetDescription(0).GetMiningAlgoRoot(),
+			GetDescription(1).GetMiningAlgo(),
+			GetDescription(1).GetMiningAlgoRoot()};
 
-			std::sort(allAlgos.begin(), allAlgos.end());
-			std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo);
-			auto last = std::unique(allAlgos.begin(), allAlgos.end());
-			// remove duplicated algorithms
-			allAlgos.erase(last, allAlgos.end());
+		std::sort(allAlgos.begin(), allAlgos.end());
+		std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo);
+		auto last = std::unique(allAlgos.begin(), allAlgos.end());
+		// remove duplicated algorithms
+		allAlgos.erase(last, allAlgos.end());
 
-			return allAlgos;
-		}
-	};
+		return allAlgos;
+	}
+};
 } // namespace xmrstak
diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp
index 3f79df44cb2aa282d371af430f4aa1e0ff16361d..ae81f62c5569240a7e2a4ea6428ee660ecaea9b2 100644
--- a/xmrstak/misc/configEditor.hpp
+++ b/xmrstak/misc/configEditor.hpp
@@ -1,10 +1,10 @@
 #pragma once
 
 #include <atomic>
-#include <string>
 #include <fstream>
-#include <streambuf>
 #include <regex>
+#include <streambuf>
+#include <string>
 
 #include "../version.hpp"
 
@@ -17,16 +17,15 @@ struct configEditor
 
 	configEditor()
 	{
-
 	}
 
-	static bool file_exist( const std::string filename)
+	static bool file_exist(const std::string filename)
 	{
 		std::ifstream fstream(filename);
 		return fstream.good();
 	}
 
-	void set( const std::string && content)
+	void set(const std::string&& content)
 	{
 		m_fileContent = content;
 	}
@@ -36,8 +35,7 @@ struct configEditor
 		std::ifstream fstream(filename);
 		m_fileContent = std::string(
 			(std::istreambuf_iterator<char>(fstream)),
-			std::istreambuf_iterator<char>()
-		);
+			std::istreambuf_iterator<char>());
 		return fstream.good();
 	}
 
@@ -70,7 +68,6 @@ struct configEditor
 	{
 		m_fileContent = std::regex_replace(m_fileContent, std::regex(search), substring);
 	}
-
 };
 
 } // namespace xmrstak
diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp
index c39237eab63462937e283f46e727faee0a978f21..529cc945374fe81865571d28add6a080c26a5123 100644
--- a/xmrstak/misc/console.cpp
+++ b/xmrstak/misc/console.cpp
@@ -23,11 +23,11 @@
 
 #include "xmrstak/misc/console.hpp"
 
-#include <time.h>
+#include <cstdlib>
+#include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
-#include <stdarg.h>
-#include <cstdlib>
+#include <time.h>
 
 #ifdef _WIN32
 #include <windows.h>
@@ -37,15 +37,15 @@ int get_key()
 	DWORD mode, rd;
 	HANDLE h;
 
-	if ((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL)
+	if((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL)
 		return -1;
 
-	GetConsoleMode( h, &mode );
-	SetConsoleMode( h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT) );
+	GetConsoleMode(h, &mode);
+	SetConsoleMode(h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT));
 
 	int c = 0;
-	ReadConsole( h, &c, 1, &rd, NULL );
-	SetConsoleMode( h, mode );
+	ReadConsole(h, &c, 1, &rd, NULL);
+	SetConsoleMode(h, mode);
 
 	return c;
 }
@@ -90,20 +90,20 @@ void reset_colour()
 }
 
 #else
+#include <stdio.h>
 #include <termios.h>
 #include <unistd.h>
-#include <stdio.h>
 
 int get_key()
 {
 	struct termios oldattr, newattr;
 	int ch;
-	tcgetattr( STDIN_FILENO, &oldattr );
+	tcgetattr(STDIN_FILENO, &oldattr);
 	newattr = oldattr;
-	newattr.c_lflag &= ~( ICANON | ECHO );
-	tcsetattr( STDIN_FILENO, TCSANOW, &newattr );
+	newattr.c_lflag &= ~(ICANON | ECHO);
+	tcsetattr(STDIN_FILENO, TCSANOW, &newattr);
 	ch = getchar();
-	tcsetattr( STDIN_FILENO, TCSANOW, &oldattr );
+	tcsetattr(STDIN_FILENO, TCSANOW, &oldattr);
 	return ch;
 }
 
@@ -182,17 +182,17 @@ void printer::print_msg(verbosity verbose, const char* fmt, ...)
 
 	va_list args;
 	va_start(args, fmt);
-	vsnprintf(buf+bpos, sizeof(buf)-bpos, fmt, args);
+	vsnprintf(buf + bpos, sizeof(buf) - bpos, fmt, args);
 	va_end(args);
 	bpos = strlen(buf);
 
-	if(bpos+2 >= sizeof(buf))
+	if(bpos + 2 >= sizeof(buf))
 		return;
 
 	buf[bpos] = '\n';
-	buf[bpos+1] = '\0';
+	buf[bpos + 1] = '\0';
 
-    print_str(buf);
+	print_str(buf);
 }
 
 void printer::print_str(const char* str)
diff --git a/xmrstak/misc/console.hpp b/xmrstak/misc/console.hpp
index 6df6597c6e63fcb230d0f39555d486c59b0d612a..3c27ee86bc8137e961bf64915a1baf32809a55f5 100644
--- a/xmrstak/misc/console.hpp
+++ b/xmrstak/misc/console.hpp
@@ -4,8 +4,17 @@
 
 #include <mutex>
 
-
-enum out_colours { K_RED, K_GREEN, K_BLUE, K_YELLOW, K_CYAN, K_MAGENTA, K_WHITE, K_NONE };
+enum out_colours
+{
+	K_RED,
+	K_GREEN,
+	K_BLUE,
+	K_YELLOW,
+	K_CYAN,
+	K_MAGENTA,
+	K_WHITE,
+	K_NONE
+};
 
 // Warning - on Linux get_key will detect control keys, but not on Windows.
 // We will only use it for alphanum keys anyway.
@@ -21,11 +30,20 @@ inline long long unsigned int int_port(size_t i)
 	return i;
 }
 
-enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LDEBUG = 10, LINF = 100};
+enum verbosity : size_t
+{
+	L0 = 0,
+	L1 = 1,
+	L2 = 2,
+	L3 = 3,
+	L4 = 4,
+	LDEBUG = 10,
+	LINF = 100
+};
 
 class printer
 {
-public:
+  public:
 	static inline printer* inst()
 	{
 		auto& env = xmrstak::environment::inst();
@@ -39,7 +57,7 @@ public:
 	void print_str(const char* str);
 	bool open_logfile(const char* file);
 
-private:
+  private:
 	printer();
 
 	std::mutex print_mutex;
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index 236efb22476410844f81ecfa623ab6f758fb1926..2173946fce13462b15225ea1aba9a6d1451e207d 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -21,31 +21,30 @@
   *
   */
 
-#include "xmrstak/jconf.hpp"
 #include "executor.hpp"
+#include "xmrstak/jconf.hpp"
 #include "xmrstak/net/jpsock.hpp"
 
 #include "telemetry.hpp"
-#include "xmrstak/backend/miner_work.hpp"
-#include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/backend/backendConnector.hpp"
+#include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/backend/miner_work.hpp"
 
+#include "xmrstak/donate-level.hpp"
+#include "xmrstak/http/webdesign.hpp"
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/console.hpp"
-#include "xmrstak/donate-level.hpp"
 #include "xmrstak/version.hpp"
-#include "xmrstak/http/webdesign.hpp"
 
-#include <thread>
-#include <string>
-#include <cmath>
 #include <algorithm>
-#include <functional>
 #include <assert.h>
+#include <cmath>
+#include <functional>
+#include <string>
+#include <thread>
 #include <time.h>
 
-
 #ifdef _WIN32
 #define strncasecmp _strnicmp
 #endif // _WIN32
@@ -63,7 +62,7 @@ void executor::push_timed_event(ex_event&& ev, size_t sec)
 void executor::ex_clock_thd()
 {
 	size_t tick = 0;
-	while (true)
+	while(true)
 	{
 		std::this_thread::sleep_for(std::chrono::milliseconds(size_t(iTickTime)));
 
@@ -76,7 +75,7 @@ void executor::ex_clock_thd()
 		// Service timed events
 		std::unique_lock<std::mutex> lck(timed_event_mutex);
 		std::list<timed_event>::iterator ev = lTimedEvents.begin();
-		while (ev != lTimedEvents.end())
+		while(ev != lTimedEvents.end())
 		{
 			ev->ticks_left--;
 			if(ev->ticks_left == 0)
@@ -96,7 +95,8 @@ bool executor::get_live_pools(std::vector<jpsock*>& eval_pools, bool is_dev)
 	size_t limit = jconf::inst()->GetGiveUpLimit();
 	size_t wait = jconf::inst()->GetNetRetry();
 
-	if(limit == 0 || is_dev) limit = (-1); //No limit = limit of 2^64-1
+	if(limit == 0 || is_dev)
+		limit = (-1); //No limit = limit of 2^64-1
 
 	size_t pool_count = 0;
 	size_t over_limit = 0;
@@ -330,7 +330,7 @@ void executor::on_sock_ready(size_t pool_id)
 	{
 		if(pool->have_call_error() && !pool->is_dev_pool())
 		{
-			std::string str = "Login error: " +  pool->get_call_error();
+			std::string str = "Login error: " + pool->get_call_error();
 			log_socket_error(pool, std::move(str));
 		}
 
@@ -369,7 +369,8 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob)
 	dat.pool_id = pool_id;
 
 	xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(oPoolJob.sJobID, oPoolJob.bWorkBlob,
-		oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), dat);
+												  oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight),
+		dat);
 
 	if(dat.pool_id != pool_id)
 	{
@@ -420,12 +421,11 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult)
 		//Ignore errors silently
 		if(pool->is_running() && pool->is_logged_in())
 			pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, backend_name,
-			backend_hashcount, total_hashcount, oResult.algorithm
-		);
+				backend_hashcount, total_hashcount, oResult.algorithm);
 		return;
 	}
 
-	if (!pool->is_running() || !pool->is_logged_in())
+	if(!pool->is_running() || !pool->is_logged_in())
 	{
 		log_result_error("[NETWORK ERROR]");
 		return;
@@ -433,8 +433,7 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult)
 
 	size_t t_start = get_timestamp_ms();
 	bool bResult = pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult,
-		backend_name, backend_hashcount, total_hashcount, oResult.algorithm
-	);
+		backend_name, backend_hashcount, total_hashcount, oResult.algorithm);
 	size_t t_len = get_timestamp_ms() - t_start;
 
 	if(t_len > 0xFFFF)
@@ -477,12 +476,14 @@ void disable_sigpipe()
 	memset(&sa, 0, sizeof(sa));
 	sa.sa_handler = SIG_IGN;
 	sa.sa_flags = 0;
-	if (sigaction(SIGPIPE, &sa, 0) == -1)
+	if(sigaction(SIGPIPE, &sa, 0) == -1)
 		printer::inst()->print_msg(L1, "ERROR: Call to sigaction failed!");
 }
 
 #else
-inline void disable_sigpipe() {}
+inline void disable_sigpipe()
+{
+}
 #endif
 
 void executor::ex_main()
@@ -496,7 +497,7 @@ void executor::ex_main()
 	// \todo collect all backend threads
 	pvThreads = xmrstak::BackendConnector::thread_starter(oWork);
 
-	if(pvThreads->size()==0)
+	if(pvThreads->size() == 0)
 	{
 		printer::inst()->print_msg(L1, "ERROR: No miner backend enabled.");
 		win_exit();
@@ -508,11 +509,11 @@ void executor::ex_main()
 	size_t pc = jconf::inst()->GetPoolCount();
 	bool dev_tls = true;
 	bool already_have_cli_pool = false;
-	size_t i=0;
+	size_t i = 0;
 	for(; i < pc; i++)
 	{
 		jconf::pool_cfg cfg;
- 		jconf::inst()->GetPoolConfig(i, cfg);
+		jconf::inst()->GetPoolConfig(i, cfg);
 #ifdef CONF_NO_TLS
 		if(cfg.tls)
 		{
@@ -520,7 +521,8 @@ void executor::ex_main()
 			win_exit();
 		}
 #endif
-		if(!cfg.tls) dev_tls = false;
+		if(!cfg.tls)
+			dev_tls = false;
 
 		if(!xmrstak::params::inst().poolURL.empty() && xmrstak::params::inst().poolURL == cfg.sPoolAddr)
 		{
@@ -532,10 +534,10 @@ void executor::ex_main()
 			const char* pwd = params.userSetPwd ? params.poolPasswd.c_str() : cfg.sPasswd;
 			bool nicehash = cfg.nicehash || params.nicehashMode;
 
-			pools.emplace_back(i+1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash);
+			pools.emplace_back(i + 1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash);
 		}
 		else
-			pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash);
+			pools.emplace_back(i + 1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash);
 	}
 
 	if(!xmrstak::params::inst().poolURL.empty() && !already_have_cli_pool)
@@ -547,7 +549,7 @@ void executor::ex_main()
 			win_exit();
 		}
 
-		pools.emplace_back(i+1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode);
+		pools.emplace_back(i + 1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode);
 	}
 
 	switch(jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo())
@@ -604,10 +606,10 @@ void executor::ex_main()
 		push_timed_event(ex_event(EV_HASHRATE_LOOP), jconf::inst()->GetAutohashTime());
 
 	size_t cnt = 0;
-	while (true)
+	while(true)
 	{
 		ev = oEventQ.pop();
-		switch (ev.iName)
+		switch(ev.iName)
 		{
 		case EV_SOCK_READY:
 			on_sock_ready(ev.iPoolId);
@@ -638,9 +640,9 @@ void executor::ex_main()
 		}
 
 		case EV_PERF_TICK:
-			for (i = 0; i < pvThreads->size(); i++)
+			for(i = 0; i < pvThreads->size(); i++)
 				telem->push_perf_value(i, pvThreads->at(i)->iHashCount.load(std::memory_order_relaxed),
-				pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed));
+					pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed));
 
 			if((cnt++ & 0xF) == 0) //Every 16 ticks
 			{
@@ -648,7 +650,7 @@ void executor::ex_main()
 				double fTelem;
 				bool normal = true;
 
-				for (i = 0; i < pvThreads->size(); i++)
+				for(i = 0; i < pvThreads->size(); i++)
 				{
 					fTelem = telem->calc_telemetry_data(10000, i);
 					if(std::isnormal(fTelem))
@@ -709,7 +711,7 @@ bool executor::motd_filter_console(std::string& motd)
 	if(motd.size() > motd_max_length)
 		return false;
 
-	motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr)->bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n');}), motd.end());
+	motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr) -> bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n'); }), motd.end());
 	return motd.size() > 0;
 }
 
@@ -721,7 +723,7 @@ bool executor::motd_filter_web(std::string& motd)
 	std::string tmp;
 	tmp.reserve(motd.size() + 128);
 
-	for(size_t i=0; i < motd.size(); i++)
+	for(size_t i = 0; i < motd.size(); i++)
 	{
 		char c = motd[i];
 		switch(c)
@@ -774,17 +776,15 @@ void executor::hashrate_report(std::string& out)
 	}
 
 	char num[32];
-	double fTotal[3] = { 0.0, 0.0, 0.0};
+	double fTotal[3] = {0.0, 0.0, 0.0};
 
-	for( uint32_t b = 0; b < 4u; ++b)
+	for(uint32_t b = 0; b < 4u; ++b)
 	{
 		std::vector<xmrstak::iBackend*> backEnds;
 		std::copy_if(pvThreads->begin(), pvThreads->end(), std::back_inserter(backEnds),
-			[&](xmrstak::iBackend* backend)
-			{
+			[&](xmrstak::iBackend* backend) {
 				return backend->backendType == b;
-			}
-		);
+			});
 
 		size_t nthd = backEnds.size();
 		if(nthd != 0)
@@ -801,8 +801,8 @@ void executor::hashrate_report(std::string& out)
 			else
 				out.append(1, '\n');
 
-			double fTotalCur[3] = { 0.0, 0.0, 0.0};
-			for (i = 0; i < nthd; i++)
+			double fTotalCur[3] = {0.0, 0.0, 0.0};
+			for(i = 0; i < nthd; i++)
 			{
 				double fHps[3];
 
@@ -883,12 +883,11 @@ void executor::result_report(std::string& out)
 	size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes;
 	size_t ln = vMineResults.size();
 
-	for(size_t i=1; i < ln; i++)
+	for(size_t i = 1; i < ln; i++)
 		iTotalRes += vMineResults[i].count;
 
 	out.append("RESULT REPORT\n");
-	out.append("Currency         : ").
-		append(jconf::inst()->GetMiningCoin()).append("\n");
+	out.append("Currency         : ").append(jconf::inst()->GetMiningCoin()).append("\n");
 	if(iTotalRes == 0)
 	{
 		out.append("You haven't found any results yet.\n");
@@ -904,8 +903,7 @@ void executor::result_report(std::string& out)
 	snprintf(num, sizeof(num), " (%.1f %%)\n", 100.0 * iGoodRes / iTotalRes);
 
 	out.append("Difficulty       : ").append(std::to_string(iPoolDiff)).append(1, '\n');
-	out.append("Good results     : ").append(std::to_string(iGoodRes)).append(" / ").
-		append(std::to_string(iTotalRes)).append(num);
+	out.append("Good results     : ").append(std::to_string(iGoodRes)).append(" / ").append(std::to_string(iTotalRes)).append(num);
 
 	if(iPoolCallTimes.size() != 0)
 	{
@@ -916,10 +914,10 @@ void executor::result_report(std::string& out)
 	out.append("Pool-side hashes : ").append(std::to_string(iPoolHashes)).append(2, '\n');
 	out.append("Top 10 best results found:\n");
 
-	for(size_t i=0; i < 10; i += 2)
+	for(size_t i = 0; i < 10; i += 2)
 	{
 		snprintf(num, sizeof(num), "| %2llu | %16llu | %2llu | %16llu |\n",
-			int_port(i), int_port(iTopDiff[i]), int_port(i+1), int_port(iTopDiff[i+1]));
+			int_port(i), int_port(iTopDiff[i]), int_port(i + 1), int_port(iTopDiff[i + 1]));
 		out.append(num);
 	}
 
@@ -927,7 +925,7 @@ void executor::result_report(std::string& out)
 	if(ln > 1)
 	{
 		out.append("| Count | Error text                       | Last seen           |\n");
-		for(size_t i=1; i < ln; i++)
+		for(size_t i = 1; i < ln; i++)
 		{
 			snprintf(num, sizeof(num), "| %5llu | %-32.32s | %s |\n", int_port(vMineResults[i].count),
 				vMineResults[i].msg.c_str(), time_format(date, sizeof(date), vMineResults[i].time));
@@ -958,11 +956,11 @@ void executor::connection_report(std::string& out)
 		out.append("Connected since : <not connected>\n");
 
 	size_t n_calls = iPoolCallTimes.size();
-	if (n_calls > 1)
+	if(n_calls > 1)
 	{
 		//Not-really-but-good-enough median
-		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end());
-		out.append("Pool ping time  : ").append(std::to_string(iPoolCallTimes[n_calls/2])).append(" ms\n");
+		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end());
+		out.append("Pool ping time  : ").append(std::to_string(iPoolCallTimes[n_calls / 2])).append(" ms\n");
 	}
 	else
 		out.append("Pool ping time  : (n/a)\n");
@@ -972,7 +970,7 @@ void executor::connection_report(std::string& out)
 	if(ln > 0)
 	{
 		out.append("| Date                | Error text                                             |\n");
-		for(size_t i=0; i < ln; i++)
+		for(size_t i = 0; i < ln; i++)
 		{
 			snprintf(num, sizeof(num), "| %s | %-54.54s |\n",
 				time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str());
@@ -1045,11 +1043,11 @@ void executor::http_hashrate_report(std::string& out)
 	snprintf(buffer, sizeof(buffer), sHtmlHashrateBodyHigh, (unsigned int)nthd + 3);
 	out.append(buffer);
 
-	double fTotal[3] = { 0.0, 0.0, 0.0};
+	double fTotal[3] = {0.0, 0.0, 0.0};
 	auto bTypePrev = static_cast<xmrstak::iBackend::BackendType>(0);
 	std::string name;
 	size_t j = 0;
-	for(size_t i=0; i < nthd; i++)
+	for(size_t i = 0; i < nthd; i++)
 	{
 		double fHps[3];
 		char csThreadTag[25];
@@ -1065,14 +1063,13 @@ void executor::http_hashrate_report(std::string& out)
 		}
 		snprintf(csThreadTag, sizeof(csThreadTag),
 			(99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? "[%s.%02u]:%02u" : "[%s.%u]:%u"),
-			name.c_str(), (unsigned int)(j), (unsigned int)i
-		);
+			name.c_str(), (unsigned int)(j), (unsigned int)i);
 
 		fHps[0] = telem->calc_telemetry_data(10000, i);
 		fHps[1] = telem->calc_telemetry_data(60000, i);
 		fHps[2] = telem->calc_telemetry_data(900000, i);
 
-		num_a[0] = num_b[0] = num_c[0] ='\0';
+		num_a[0] = num_b[0] = num_c[0] = '\0';
 		hps_format(fHps[0], num_a, sizeof(num_a));
 		hps_format(fHps[1], num_b, sizeof(num_b));
 		hps_format(fHps[2], num_c, sizeof(num_c));
@@ -1085,7 +1082,7 @@ void executor::http_hashrate_report(std::string& out)
 		out.append(buffer);
 	}
 
-	num_a[0] = num_b[0] = num_c[0] = num_d[0] ='\0';
+	num_a[0] = num_b[0] = num_c[0] = num_d[0] = '\0';
 	hps_format(fTotal[0], num_a, sizeof(num_a));
 	hps_format(fTotal[1], num_b, sizeof(num_b));
 	hps_format(fTotal[2], num_c, sizeof(num_c));
@@ -1102,13 +1099,13 @@ void executor::http_result_report(std::string& out)
 
 	out.reserve(4096);
 
-	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html,  "Result Report");
+	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html, "Result Report");
 	out.append(buffer);
 
 	size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes;
 	size_t ln = vMineResults.size();
 
-	for(size_t i=1; i < ln; i++)
+	for(size_t i = 1; i < ln; i++)
 		iTotalRes += vMineResults[i].count;
 
 	double fGoodResPrc = 0.0;
@@ -1119,8 +1116,7 @@ void executor::http_result_report(std::string& out)
 	if(iPoolCallTimes.size() > 0)
 	{
 		using namespace std::chrono;
-		fAvgResTime = ((double)duration_cast<seconds>(system_clock::now() - tPoolConnTime).count())
-			/ iPoolCallTimes.size();
+		fAvgResTime = ((double)duration_cast<seconds>(system_clock::now() - tPoolConnTime).count()) / iPoolCallTimes.size();
 	}
 
 	snprintf(buffer, sizeof(buffer), sHtmlResultBodyHigh,
@@ -1132,7 +1128,7 @@ void executor::http_result_report(std::string& out)
 
 	out.append(buffer);
 
-	for(size_t i=1; i < vMineResults.size(); i++)
+	for(size_t i = 1; i < vMineResults.size(); i++)
 	{
 		snprintf(buffer, sizeof(buffer), sHtmlResultTableRow, vMineResults[i].msg.c_str(),
 			int_port(vMineResults[i].count), time_format(date, sizeof(date), vMineResults[i].time));
@@ -1149,7 +1145,7 @@ void executor::http_connection_report(std::string& out)
 
 	out.reserve(4096);
 
-	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html,  "Connection Report");
+	snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html, "Connection Report");
 	out.append(buffer);
 
 	jpsock* pool = pick_pool_by_id(current_pool_id);
@@ -1157,16 +1153,16 @@ void executor::http_connection_report(std::string& out)
 		pool = pick_pool_by_id(last_usr_pool_id);
 
 	const char* cdate = "not connected";
-	if (pool != nullptr && pool->is_running() && pool->is_logged_in())
+	if(pool != nullptr && pool->is_running() && pool->is_logged_in())
 		cdate = time_format(date, sizeof(date), tPoolConnTime);
 
 	size_t n_calls = iPoolCallTimes.size();
 	unsigned int ping_time = 0;
-	if (n_calls > 1)
+	if(n_calls > 1)
 	{
 		//Not-really-but-good-enough median
-		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end());
-		ping_time = iPoolCallTimes[n_calls/2];
+		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end());
+		ping_time = iPoolCallTimes[n_calls / 2];
 	}
 
 	snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh,
@@ -1175,8 +1171,7 @@ void executor::http_connection_report(std::string& out)
 		cdate, ping_time);
 	out.append(buffer);
 
-
-	for(size_t i=0; i < vSocketLog.size(); i++)
+	for(size_t i = 0; i < vSocketLog.size(); i++)
 	{
 		snprintf(buffer, sizeof(buffer), sHtmlConnectionTableRow,
 			time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str());
@@ -1205,12 +1200,13 @@ void executor::http_json_report(std::string& out)
 	std::string hr_thds, res_error, cn_error;
 
 	size_t nthd = pvThreads->size();
-	double fTotal[3] = { 0.0, 0.0, 0.0};
+	double fTotal[3] = {0.0, 0.0, 0.0};
 	hr_thds.reserve(nthd * 32);
 
-	for(size_t i=0; i < nthd; i++)
+	for(size_t i = 0; i < nthd; i++)
 	{
-		if(i != 0) hr_thds.append(1, ',');
+		if(i != 0)
+			hr_thds.append(1, ',');
 
 		double fHps[3];
 		fHps[0] = telem->calc_telemetry_data(10000, i);
@@ -1238,7 +1234,7 @@ void executor::http_json_report(std::string& out)
 	size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes;
 	size_t ln = vMineResults.size();
 
-	for(size_t i=1; i < ln; i++)
+	for(size_t i = 1; i < ln; i++)
 		iTotalRes += vMineResults[i].count;
 
 	jpsock* pool = pick_pool_by_id(current_pool_id);
@@ -1258,10 +1254,11 @@ void executor::http_json_report(std::string& out)
 
 	char buffer[2048];
 	res_error.reserve((vMineResults.size() - 1) * 128);
-	for(size_t i=1; i < vMineResults.size(); i++)
+	for(size_t i = 1; i < vMineResults.size(); i++)
 	{
 		using namespace std::chrono;
-		if(i != 1) res_error.append(1, ',');
+		if(i != 1)
+			res_error.append(1, ',');
 
 		snprintf(buffer, sizeof(buffer), sJsonApiResultError, int_port(vMineResults[i].count),
 			int_port(duration_cast<seconds>(vMineResults[i].time.time_since_epoch()).count()),
@@ -1271,18 +1268,19 @@ void executor::http_json_report(std::string& out)
 
 	size_t n_calls = iPoolCallTimes.size();
 	size_t iPoolPing = 0;
-	if (n_calls > 1)
+	if(n_calls > 1)
 	{
 		//Not-really-but-good-enough median
-		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end());
-		iPoolPing = iPoolCallTimes[n_calls/2];
+		std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end());
+		iPoolPing = iPoolCallTimes[n_calls / 2];
 	}
 
 	cn_error.reserve(vSocketLog.size() * 256);
-	for(size_t i=0; i < vSocketLog.size(); i++)
+	for(size_t i = 0; i < vSocketLog.size(); i++)
 	{
 		using namespace std::chrono;
-		if(i != 0) cn_error.append(1, ',');
+		if(i != 0)
+			cn_error.append(1, ',');
 
 		snprintf(buffer, sizeof(buffer), sJsonApiConnectionError,
 			int_port(duration_cast<seconds>(vSocketLog[i].time.time_since_epoch()).count()),
@@ -1291,7 +1289,7 @@ void executor::http_json_report(std::string& out)
 	}
 
 	size_t bb_size = 2048 + hr_thds.size() + res_error.size() + cn_error.size();
-	std::unique_ptr<char[]> bigbuf( new char[ bb_size ] );
+	std::unique_ptr<char[]> bigbuf(new char[bb_size]);
 
 	int bb_len = snprintf(bigbuf.get(), bb_size, sJsonApiFormat,
 		get_version_str().c_str(), hr_thds.c_str(), hr_buffer, a,
@@ -1338,8 +1336,7 @@ void executor::get_http_report(ex_event_name ev_id, std::string& data)
 	std::lock_guard<std::mutex> lck(httpMutex);
 
 	assert(pHttpString == nullptr);
-	assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS
-		|| ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON);
+	assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS || ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON);
 
 	pHttpString = &data;
 	httpReady = std::promise<void>();
diff --git a/xmrstak/misc/executor.hpp b/xmrstak/misc/executor.hpp
index be5ee6c2f7b81f422e0c122a7b6a09b188b4530f..47359afc217b2d4da5852b652fa2e8aa7c67c6e8 100644
--- a/xmrstak/misc/executor.hpp
+++ b/xmrstak/misc/executor.hpp
@@ -1,18 +1,18 @@
 #pragma once
 
-#include "thdq.hpp"
 #include "telemetry.hpp"
+#include "thdq.hpp"
 #include "xmrstak/backend/iBackend.hpp"
+#include "xmrstak/donate-level.hpp"
 #include "xmrstak/misc/environment.hpp"
 #include "xmrstak/net/msgstruct.hpp"
-#include "xmrstak/donate-level.hpp"
 
-#include <atomic>
 #include <array>
+#include <atomic>
+#include <chrono>
+#include <future>
 #include <list>
 #include <vector>
-#include <future>
-#include <chrono>
 
 class jpsock;
 
@@ -27,7 +27,7 @@ class minethd;
 
 class executor
 {
-public:
+  public:
 	static executor* inst()
 	{
 		auto& env = xmrstak::environment::inst();
@@ -43,13 +43,15 @@ public:
 	inline void push_event(ex_event&& ev) { oEventQ.push(std::move(ev)); }
 	void push_timed_event(ex_event&& ev, size_t sec);
 
-private:
+  private:
 	struct timed_event
 	{
 		ex_event event;
 		size_t ticks_left;
 
-		timed_event(ex_event&& ev, size_t ticks) : event(std::move(ev)), ticks_left(ticks) {}
+		timed_event(ex_event&& ev, size_t ticks) :
+			event(std::move(ev)),
+			ticks_left(ticks) {}
 	};
 
 	inline void set_timestamp() { dev_timestamp = get_timestamp(); };
@@ -119,7 +121,8 @@ private:
 		std::chrono::system_clock::time_point time;
 		std::string msg;
 
-		sck_error_log(std::string&& err) : msg(std::move(err))
+		sck_error_log(std::string&& err) :
+			msg(std::move(err))
 		{
 			time = std::chrono::system_clock::now();
 		}
@@ -134,12 +137,16 @@ private:
 		std::string msg;
 		size_t count;
 
-		result_tally() : msg("[OK]"), count(0)
+		result_tally() :
+			msg("[OK]"),
+			count(0)
 		{
 			time = std::chrono::system_clock::now();
 		}
 
-		result_tally(std::string&& err) : msg(std::move(err)), count(1)
+		result_tally(std::string&& err) :
+			msg(std::move(err)),
+			count(1)
 		{
 			time = std::chrono::system_clock::now();
 		}
@@ -161,7 +168,7 @@ private:
 	std::vector<result_tally> vMineResults;
 
 	//More result statistics
-	std::array<size_t, 10> iTopDiff { { } }; //Initialize to zero
+	std::array<size_t, 10> iTopDiff{{}}; //Initialize to zero
 
 	std::chrono::system_clock::time_point tPoolConnTime;
 	size_t iPoolHashes = 0;
@@ -195,4 +202,3 @@ private:
 
 	inline size_t sec_to_ticks(size_t sec) { return sec * (1000 / iTickTime); }
 };
-
diff --git a/xmrstak/misc/home_dir.hpp b/xmrstak/misc/home_dir.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..836c7cc4edfc78d5df8ad8fbd23291d6ed50feb7
--- /dev/null
+++ b/xmrstak/misc/home_dir.hpp
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <string>
+
+#ifdef _WIN32
+#include <WinSock2.h>
+// this comment keeps clang-format from reordering the includes
+#include <Shlobj.h>
+
+namespace
+{
+inline std::string get_home() // returns the folder reported for CSIDL_LOCAL_APPDATA, or "." if the query fails
+{
+	char path[MAX_PATH + 1];
+	// ask the shell for the folder "appdata\local" (CSIDL_LOCAL_APPDATA); the result is written into path
+	if(SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE))
+	{
+		return path;
+	}
+	else
+		return "."; // lookup failed: fall back to the current directory
+}
+} // namespace
+
+#else
+#include <cstdlib>
+#include <pwd.h>
+#include <unistd.h>
+
+namespace
+{
+inline std::string get_home() // returns $HOME, else the passwd home of the current user, else "."
+{
+	const char* home = getenv("HOME");
+	// getpwuid() returns nullptr when the uid has no passwd entry; never dereference it unchecked
+	const passwd* pw = (home == nullptr) ? getpwuid(getuid()) : nullptr;
+	if(pw != nullptr)
+		home = pw->pw_dir;
+	return (home != nullptr) ? home : ".";
+}
+} // namespace
+
+#endif // _WIN32
diff --git a/xmrstak/misc/jext.hpp b/xmrstak/misc/jext.hpp
index 9936fa81329169a979f18c5cd99bf2a2f87c3382..421508989276de2542c4739da7e29fa3a748716c 100644
--- a/xmrstak/misc/jext.hpp
+++ b/xmrstak/misc/jext.hpp
@@ -9,7 +9,7 @@ using namespace rapidjson;
 inline const Value* GetObjectMember(const Value& obj, const char* key)
 {
 	Value::ConstMemberIterator itr = obj.FindMember(key);
-	if (itr != obj.MemberEnd())
+	if(itr != obj.MemberEnd())
 		return &itr->value;
 	else
 		return nullptr;
@@ -48,8 +48,8 @@ inline const Value* GetObjectMember(const Value& obj, const char* key)
 
 #elif defined(__NetBSD__)
 
-#include <sys/types.h>
 #include <machine/bswap.h>
+#include <sys/types.h>
 #if defined(__BSWAP_RENAME) && !defined(__bswap_32)
 #define bswap_32(x) bswap32(x)
 #define bswap_64(x) bswap64(x)
diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp
index 47442df09873f2f5c41ffee90c7b023a5d11451e..16ecaa6f67cbd11c042daa3fea33e6ad8da7cd12 100644
--- a/xmrstak/misc/telemetry.cpp
+++ b/xmrstak/misc/telemetry.cpp
@@ -24,9 +24,9 @@
 #include "telemetry.hpp"
 #include "xmrstak/net/msgstruct.hpp"
 
+#include <chrono>
 #include <cmath>
 #include <cstring>
-#include <chrono>
 
 namespace xmrstak
 {
@@ -38,7 +38,7 @@ telemetry::telemetry(size_t iThd)
 	iBucketTop = new uint32_t[iThd];
 	mtx = new std::mutex[iThd];
 
-	for (size_t i = 0; i < iThd; i++)
+	for(size_t i = 0; i < iThd; i++)
 	{
 		ppHashCounts[i] = new uint64_t[iBucketSize];
 		ppTimestamps[i] = new uint64_t[iBucketSize];
@@ -51,7 +51,6 @@ telemetry::telemetry(size_t iThd)
 double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 {
 
-
 	uint64_t iEarliestHashCnt = 0;
 	uint64_t iEarliestStamp = 0;
 	uint64_t iLatestStamp = 0;
@@ -62,20 +61,20 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 	uint64_t iTimeNow = get_timestamp_ms();
 
 	//Start at 1, buckettop points to next empty
-	for (size_t i = 1; i < iBucketSize; i++)
+	for(size_t i = 1; i < iBucketSize; i++)
 	{
 		size_t idx = (iBucketTop[iThread] - i) & iBucketMask; //overflow expected here
 
-		if (ppTimestamps[iThread][idx] == 0)
+		if(ppTimestamps[iThread][idx] == 0)
 			break; //That means we don't have the data yet
 
-		if (iLatestStamp == 0)
+		if(iLatestStamp == 0)
 		{
 			iLatestStamp = ppTimestamps[iThread][idx];
 			iLatestHashCnt = ppHashCounts[iThread][idx];
 		}
 
-		if (iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec)
+		if(iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec)
 		{
 			bHaveFullSet = true;
 			break; //We are out of the requested time period
@@ -86,11 +85,11 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 	}
 	lk.unlock();
 
-	if (!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0)
+	if(!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0)
 		return nan("");
 
 	//Don't think that can happen, but just in case
-	if (iLatestStamp - iEarliestStamp == 0)
+	if(iLatestStamp - iEarliestStamp == 0)
 		return nan("");
 
 	double fHashes, fTime;
diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp
index 580565de2e1de25b53f7580129bc2304c1e5f43e..2ab2a9e5fe5e409cd4079cf283830e772b3ed639 100644
--- a/xmrstak/misc/telemetry.hpp
+++ b/xmrstak/misc/telemetry.hpp
@@ -9,12 +9,12 @@ namespace xmrstak
 
 class telemetry
 {
-public:
+  public:
 	telemetry(size_t iThd);
 	void push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp);
 	double calc_telemetry_data(size_t iLastMillisec, size_t iThread);
 
-private:
+  private:
 	std::mutex* mtx;
 	constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations
 	constexpr static size_t iBucketMask = iBucketSize - 1;
diff --git a/xmrstak/misc/thdq.hpp b/xmrstak/misc/thdq.hpp
index 7a4a5cfe4974b6eff4b506175bf6e1964adae7f9..2eef30bcf6f8683bfbc687e44d30d686a3577d2c 100644
--- a/xmrstak/misc/thdq.hpp
+++ b/xmrstak/misc/thdq.hpp
@@ -1,31 +1,37 @@
 #pragma once
-
+
+#include <condition_variable>
+#include <mutex>
 #include <queue>
 #include <thread>
-#include <mutex>
-#include <condition_variable>
-
+
 template <typename T>
 class thdq
 {
-public:
+  public:
 	T pop()
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
-		while (queue_.empty()) { cond_.wait(mlock); }
+		while(queue_.empty())
+		{
+			cond_.wait(mlock);
+		}
 		auto item = std::move(queue_.front());
 		queue_.pop();
 		return item;
 	}
-
+
 	void pop(T& item)
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
-		while (queue_.empty()) { cond_.wait(mlock); }
+		while(queue_.empty())
+		{
+			cond_.wait(mlock);
+		}
 		item = queue_.front();
 		queue_.pop();
 	}
-
+
 	void push(const T& item)
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
@@ -33,7 +39,7 @@ public:
 		mlock.unlock();
 		cond_.notify_one();
 	}
-
+
 	void push(T&& item)
 	{
 		std::unique_lock<std::mutex> mlock(mutex_);
@@ -41,9 +47,9 @@ public:
 		mlock.unlock();
 		cond_.notify_one();
 	}
-
-private:
+
+  private:
 	std::queue<T> queue_;
 	std::mutex mutex_;
 	std::condition_variable cond_;
-};
+};
diff --git a/xmrstak/misc/uac.cpp b/xmrstak/misc/uac.cpp
index 9f940933caf09345e223486aef00e61384b84f18..c7da90bd8d439af63292972fd52480d8ee97e527 100644
--- a/xmrstak/misc/uac.cpp
+++ b/xmrstak/misc/uac.cpp
@@ -9,24 +9,24 @@ BOOL IsElevated()
 {
 	BOOL fRet = FALSE;
 	HANDLE hToken = NULL;
-	if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken))
+	if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken))
 	{
 		TOKEN_ELEVATION Elevation;
 		DWORD cbSize = sizeof(TOKEN_ELEVATION);
-		if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
+		if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize))
 			fRet = Elevation.TokenIsElevated;
 	}
-	if (hToken)
+	if(hToken)
 		CloseHandle(hToken);
 	return fRet;
 }
 
 BOOL SelfElevate(const std::string& my_path, const std::string& params)
 {
-	if (IsElevated())
+	if(IsElevated())
 		return FALSE;
 
-	SHELLEXECUTEINFO shExecInfo = { 0 };
+	SHELLEXECUTEINFO shExecInfo = {0};
 	shExecInfo.cbSize = sizeof(SHELLEXECUTEINFO);
 	shExecInfo.fMask = SEE_MASK_NOCLOSEPROCESS;
 	shExecInfo.hwnd = NULL;
@@ -37,7 +37,7 @@ BOOL SelfElevate(const std::string& my_path, const std::string& params)
 	shExecInfo.nShow = SW_SHOW;
 	shExecInfo.hInstApp = NULL;
 
-	if (!ShellExecuteEx(&shExecInfo))
+	if(!ShellExecuteEx(&shExecInfo))
 		return FALSE;
 
 	// Loiter in the background to make scripting easier
@@ -65,13 +65,13 @@ VOID RequestElevation()
 
 BOOL IsWindows10OrNewer()
 {
-    OSVERSIONINFOEX osvi = { 0 };
-    osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
-    osvi.dwMajorVersion = 10;
-    osvi.dwMinorVersion = 0;
-    DWORDLONG dwlConditionMask = 0;
-    VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
-    VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL);
-    return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask);
+	OSVERSIONINFOEX osvi = {0};
+	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+	osvi.dwMajorVersion = 10;
+	osvi.dwMinorVersion = 0;
+	DWORDLONG dwlConditionMask = 0;
+	VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL);
+	VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL);
+	return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask);
 }
 #endif
diff --git a/xmrstak/misc/utility.cpp b/xmrstak/misc/utility.cpp
index 5177d14c23121f13e4069e4ace67acf6b1a39784..bf665bda3a83f77d18b95cf915005a902c088f2b 100644
--- a/xmrstak/misc/utility.cpp
+++ b/xmrstak/misc/utility.cpp
@@ -1,21 +1,15 @@
-#include <string>
 #include <algorithm>
-
+#include <string>
 
 namespace xmrstak
 {
-	bool strcmp_i(const std::string& str1, const std::string& str2)
-	{
-		if(str1.size() != str2.size())
-			return false;
-		else
-		return (str1.empty() | str2.empty()) ?
-				false :
-				std::equal(str1.begin(), str1.end(),str2.begin(),
-					[](char c1, char c2)
-					{
-						return ::tolower(c1) == ::tolower(c2);
-					}
-				);
-	}
+bool strcmp_i(const std::string& str1, const std::string& str2)
+{
+	if(str1.size() != str2.size())
+		return false;
+	else
+		return (str1.empty() | str2.empty()) ? false : std::equal(str1.begin(), str1.end(), str2.begin(), [](char c1, char c2) {
+			return ::tolower(c1) == ::tolower(c2);
+		});
+}
 } // namespace xmrstak
diff --git a/xmrstak/misc/utility.hpp b/xmrstak/misc/utility.hpp
index 8f2e99fb8819ac06a872d3c7591e524effa5ab6a..0eb08993dfe10396f5a839acd8e52d401e7c58c4 100644
--- a/xmrstak/misc/utility.hpp
+++ b/xmrstak/misc/utility.hpp
@@ -4,9 +4,9 @@
 
 namespace xmrstak
 {
-	/** case insensitive string compare
+/** case insensitive string compare
 	 *
 	 * @return true if both strings are equal, else false
 	 */
-	bool strcmp_i(const std::string& str1, const std::string& str2);
+bool strcmp_i(const std::string& str1, const std::string& str2);
 } // namespace xmrstak
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 786b18b4f89c3b9d17e7ec5b8282dbe97c180414..f9522962f5a8dcff90c24a5db451f6539232a063 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -21,17 +21,17 @@
   *
   */
 
-#include <stdarg.h>
-#include <assert.h>
 #include <algorithm>
+#include <assert.h>
 #include <chrono>
+#include <stdarg.h>
 
 #include "jpsock.hpp"
-#include "socks.hpp"
 #include "socket.hpp"
+#include "socks.hpp"
 
-#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/misc/executor.hpp"
 #include "xmrstak/misc/jext.hpp"
 #include "xmrstak/version.hpp"
 
@@ -45,7 +45,9 @@ struct jpsock::call_rsp
 	std::string sCallErr;
 	uint64_t iMessageId;
 
-	call_rsp(Value* val) : pCallData(val), iMessageId(0)
+	call_rsp(Value* val) :
+		pCallData(val),
+		iMessageId(0)
 	{
 		bHaveResponse = false;
 		iCallId = 0;
@@ -70,7 +72,7 @@ typedef GenericDocument<UTF8<>, MemoryPoolAllocator<>, MemoryPoolAllocator<>> Me
 
 struct jpsock::opaque_private
 {
-	Value  oCallValue;
+	Value oCallValue;
 
 	MemoryPoolAllocator<> callAllocator;
 	MemoryPoolAllocator<> recvAllocator;
@@ -91,12 +93,24 @@ struct jpsock::opaque_private
 struct jpsock::opq_json_val
 {
 	const Value* val;
-	opq_json_val(const Value* val) : val(val) {}
+	opq_json_val(const Value* val) :
+		val(val) {}
 };
 
 jpsock::jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash) :
-	net_addr(sAddr), usr_login(sLogin), usr_rigid(sRigId), usr_pass(sPassword), tls_fp(tls_fp), pool_id(id), pool_weight(pool_weight), pool(dev_pool), nicehash(nicehash),
-	connect_time(0), connect_attempts(0), disconnect_time(0), quiet_close(false)
+	net_addr(sAddr),
+	usr_login(sLogin),
+	usr_rigid(sRigId),
+	usr_pass(sPassword),
+	tls_fp(tls_fp),
+	pool_id(id),
+	pool_weight(pool_weight),
+	pool(dev_pool),
+	nicehash(nicehash),
+	connect_time(0),
+	connect_attempts(0),
+	disconnect_time(0),
+	quiet_close(false)
 {
 	sock_init();
 
@@ -245,7 +259,7 @@ bool jpsock::jpsock_thd_main()
 
 	char buf[iSockBufferSize];
 	size_t datalen = 0;
-	while (true)
+	while(true)
 	{
 		int ret = sck->recv(buf + datalen, sizeof(buf) - datalen);
 
@@ -254,7 +268,7 @@ bool jpsock::jpsock_thd_main()
 
 		datalen += ret;
 
-		if (datalen >= sizeof(buf))
+		if(datalen >= sizeof(buf))
 		{
 			sck->close(false);
 			return set_socket_error("RECEIVE error: data overflow");
@@ -262,12 +276,12 @@ bool jpsock::jpsock_thd_main()
 
 		char* lnend;
 		char* lnstart = buf;
-		while ((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr)
+		while((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr)
 		{
 			lnend++;
 			int lnlen = lnend - lnstart;
 
-			if (!process_line(lnstart, lnlen))
+			if(!process_line(lnstart, lnlen))
 			{
 				sck->close(false);
 				return false;
@@ -278,7 +292,7 @@ bool jpsock::jpsock_thd_main()
 		}
 
 		//Got leftover data? Move it to the front
-		if (datalen > 0 && buf != lnstart)
+		if(datalen > 0 && buf != lnstart)
 			memmove(buf, lnstart, datalen);
 	}
 }
@@ -291,18 +305,18 @@ bool jpsock::process_line(char* line, size_t len)
 	++iMessageCnt;
 
 	/*NULL terminate the line instead of '\n', parsing will add some more NULLs*/
-	line[len-1] = '\0';
+	line[len - 1] = '\0';
 
 	//printf("RECV: %s\n", line);
 
-	if (prv->jsonDoc.ParseInsitu(line).HasParseError())
+	if(prv->jsonDoc.ParseInsitu(line).HasParseError())
 		return set_socket_error("PARSE error: Invalid JSON");
 
-	if (!prv->jsonDoc.IsObject())
+	if(!prv->jsonDoc.IsObject())
 		return set_socket_error("PARSE error: Invalid root");
 
 	const Value* mt;
-	if (prv->jsonDoc.HasMember("method"))
+	if(prv->jsonDoc.HasMember("method"))
 	{
 		mt = GetObjectMember(prv->jsonDoc, "method");
 
@@ -329,7 +343,7 @@ bool jpsock::process_line(char* line, size_t len)
 	{
 		uint64_t iCallId;
 		mt = GetObjectMember(prv->jsonDoc, "id");
-		if (mt == nullptr || !mt->IsUint64())
+		if(mt == nullptr || !mt->IsUint64())
 			return set_socket_error("PARSE error: Protocol error 3");
 
 		iCallId = mt->GetUint64();
@@ -338,10 +352,10 @@ bool jpsock::process_line(char* line, size_t len)
 
 		const char* sError = nullptr;
 		size_t iErrorLen = 0;
-		if (mt == nullptr || mt->IsNull())
+		if(mt == nullptr || mt->IsNull())
 		{
 			/* If there was no error we need a result */
-			if ((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr)
+			if((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr)
 				return set_socket_error("PARSE error: Protocol error 7");
 		}
 		else
@@ -359,7 +373,7 @@ bool jpsock::process_line(char* line, size_t len)
 		}
 
 		std::unique_lock<std::mutex> mlock(call_mutex);
-		if (prv->oCallRsp.pCallData == nullptr)
+		if(prv->oCallRsp.pCallData == nullptr)
 		{
 			/*Server sent us a call reply without us making a call*/
 			mlock.unlock();
@@ -400,7 +414,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 
 	mlock.unlock();
 
-	if (!params->val->IsObject())
+	if(!params->val->IsObject())
 		return set_socket_error("PARSE error: Job error 1");
 
 	const Value *blob, *jobid, *target, *motd, *blk_height;
@@ -410,7 +424,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 	motd = GetObjectMember(*params->val, "motd");
 	blk_height = GetObjectMember(*params->val, "height");
 
-	if (jobid == nullptr || blob == nullptr || target == nullptr ||
+	if(jobid == nullptr || blob == nullptr || target == nullptr ||
 		!jobid->IsString() || !blob->IsString() || !target->IsString())
 	{
 		return set_socket_error("PARSE error: Job error 2");
@@ -421,7 +435,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 		std::unique_lock<std::mutex> lck(motd_mutex);
 		if(motd->GetStringLength() > 0)
 		{
-			pool_motd.resize(motd->GetStringLength()/2 + 1);
+			pool_motd.resize(motd->GetStringLength() / 2 + 1);
 			if(!hex2bin(motd->GetString(), motd->GetStringLength(), (unsigned char*)&pool_motd.front()))
 				pool_motd.clear();
 		}
@@ -429,7 +443,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 			pool_motd.clear();
 	}
 
-	if (jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >=
+	if(jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >=
 		return set_socket_error("PARSE error: Job error 3");
 
 	pool_job oPoolJob;
@@ -437,10 +451,10 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 	const uint32_t iWorkLen = blob->GetStringLength() / 2;
 	oPoolJob.iWorkLen = iWorkLen;
 
-	if (iWorkLen > sizeof(pool_job::bWorkBlob))
+	if(iWorkLen > sizeof(pool_job::bWorkBlob))
 		return set_socket_error("PARSE error: Invalid job length. Are you sure you are mining the correct coin?");
 
-	if (!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob))
+	if(!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob))
 		return set_socket_error("PARSE error: Job error 4");
 
 	// lock reading of oCurrentJob
@@ -479,7 +493,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message
 		return set_socket_error("PARSE error: Job error 5");
 
 	iJobDiff = t64_to_diff(oPoolJob.iTarget);
-	
+
 	if(blk_height != nullptr && blk_height->IsUint64())
 		oPoolJob.iBlockHeight = bswap_64(blk_height->GetUint64());
 
@@ -589,10 +603,10 @@ bool jpsock::cmd_login()
 	uint64_t messageId = 0;
 
 	/*Normal error conditions (failed login etc..) will end here*/
-	if (!cmd_ret_wait(cmd_buffer, oResult, messageId))
+	if(!cmd_ret_wait(cmd_buffer, oResult, messageId))
 		return false;
 
-	if (!oResult.val->IsObject())
+	if(!oResult.val->IsObject())
 	{
 		set_socket_error("PARSE error: Login protocol error 1");
 		disconnect();
@@ -603,14 +617,14 @@ bool jpsock::cmd_login()
 	const Value* job = GetObjectMember(*oResult.val, "job");
 	const Value* ext = GetObjectMember(*oResult.val, "extensions");
 
-	if (id == nullptr || job == nullptr || !id->IsString())
+	if(id == nullptr || job == nullptr || !id->IsString())
 	{
 		set_socket_error("PARSE error: Login protocol error 2");
 		disconnect();
 		return false;
 	}
 
-	if (id->GetStringLength() >= sizeof(sMinerId))
+	if(id->GetStringLength() >= sizeof(sMinerId))
 	{
 		set_socket_error("PARSE error: Login protocol error 3");
 		disconnect();
@@ -622,7 +636,7 @@ bool jpsock::cmd_login()
 
 	if(ext != nullptr && ext->IsArray())
 	{
-		for(size_t i=0; i < ext->Size(); i++)
+		for(size_t i = 0; i < ext->Size(); i++)
 		{
 			const Value& jextname = ext->GetArray()[i];
 
@@ -693,7 +707,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	sResult[64] = '\0';
 
 	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n",
-		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations,sMemory, sMemAlignBytes);
+		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations, sMemory, sMemAlignBytes);
 
 	uint64_t messageId = 0;
 	opq_json_val oResult(nullptr);
@@ -732,13 +746,13 @@ bool jpsock::get_pool_motd(std::string& strin)
 	return false;
 }
 
-inline unsigned char hf_hex2bin(char c, bool &err)
+inline unsigned char hf_hex2bin(char c, bool& err)
 {
-	if (c >= '0' && c <= '9')
+	if(c >= '0' && c <= '9')
 		return c - '0';
-	else if (c >= 'a' && c <= 'f')
+	else if(c >= 'a' && c <= 'f')
 		return c - 'a' + 0xA;
-	else if (c >= 'A' && c <= 'F')
+	else if(c >= 'A' && c <= 'F')
 		return c - 'A' + 0xA;
 
 	err = true;
@@ -748,17 +762,18 @@ inline unsigned char hf_hex2bin(char c, bool &err)
 bool jpsock::hex2bin(const char* in, unsigned int len, unsigned char* out)
 {
 	bool error = false;
-	for (unsigned int i = 0; i < len; i += 2)
+	for(unsigned int i = 0; i < len; i += 2)
 	{
 		out[i / 2] = (hf_hex2bin(in[i], error) << 4) | hf_hex2bin(in[i + 1], error);
-		if (error) return false;
+		if(error)
+			return false;
 	}
 	return true;
 }
 
 inline char hf_bin2hex(unsigned char c)
 {
-	if (c <= 0x9)
+	if(c <= 0x9)
 		return '0' + c;
 	else
 		return 'a' - 0xA + c;
@@ -766,7 +781,7 @@ inline char hf_bin2hex(unsigned char c)
 
 void jpsock::bin2hex(const unsigned char* in, unsigned int len, char* out)
 {
-	for (unsigned int i = 0; i < len; i++)
+	for(unsigned int i = 0; i < len; i++)
 	{
 		out[i * 2] = hf_bin2hex((in[i] & 0xF0) >> 4);
 		out[i * 2 + 1] = hf_bin2hex(in[i] & 0x0F);
diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp
index 94976481326fa0c8ba99daaebf48ae6d57fb70b9..4ad6ebbbc38716e3d35ff83f8f996c156bc5ea79 100644
--- a/xmrstak/net/jpsock.hpp
+++ b/xmrstak/net/jpsock.hpp
@@ -1,15 +1,14 @@
 #pragma once
 
-#include "xmrstak/backend/iBackend.hpp"
 #include "msgstruct.hpp"
+#include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/jconf.hpp"
 
-#include <mutex>
 #include <atomic>
 #include <condition_variable>
-#include <thread>
+#include <mutex>
 #include <string>
-
+#include <thread>
 
 /* Our pool can have two kinds of errors:
 	- Parsing or connection error
@@ -27,7 +26,7 @@ class base_socket;
 
 class jpsock
 {
-public:
+  public:
 	jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash);
 	~jpsock();
 
@@ -55,7 +54,12 @@ public:
 	inline bool is_logged_in() { return bLoggedIn; }
 	inline bool is_dev_pool() { return pool; }
 	inline size_t get_pool_id() { return pool_id; }
-	inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; }
+	inline bool get_disconnects(size_t& att, size_t& time)
+	{
+		att = connect_attempts;
+		time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0;
+		return pool && usr_login[0];
+	}
 	inline const char* get_pool_addr() { return net_addr.c_str(); }
 	inline const char* get_tls_fp() { return tls_fp.c_str(); }
 	inline const char* get_rigid() { return usr_rigid.c_str(); }
@@ -77,7 +81,7 @@ public:
 	bool set_socket_error_strerr(const char* a);
 	bool set_socket_error_strerr(const char* a, int res);
 
-private:
+  private:
 	std::string net_addr;
 	std::string usr_login;
 	std::string usr_rigid;
@@ -142,4 +146,3 @@ private:
 	uint64_t iMessageCnt = 0;
 	uint64_t iLastMessageId = 0;
 };
-
diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp
index 33980bf425842fbe4d88da864f84c17396390a93..3cfce3c6f1b79cde0491fbbd5427189269bc5eec 100644
--- a/xmrstak/net/msgstruct.hpp
+++ b/xmrstak/net/msgstruct.hpp
@@ -2,25 +2,29 @@
 
 #include "xmrstak/backend/cryptonight.hpp"
 
-#include <string>
-#include <string.h>
 #include <assert.h>
+#include <string.h>
+#include <string>
 
 // Structures that we use to pass info between threads constructors are here just to make
 // the stack allocation take up less space, heap is a shared resource that needs locks too of course
 
 struct pool_job
 {
-	char		sJobID[64];
-	uint8_t		bWorkBlob[128];
-	uint64_t	iTarget;
-	uint32_t	iWorkLen;
-	uint32_t	iSavedNonce;
-	uint64_t	iBlockHeight = uint64_t(-1);
-
-	pool_job() : iWorkLen(0), iSavedNonce(0) {}
+	char sJobID[64];
+	uint8_t bWorkBlob[128];
+	uint64_t iTarget;
+	uint32_t iWorkLen;
+	uint32_t iSavedNonce;
+	uint64_t iBlockHeight = uint64_t(-1);
+
+	pool_job() :
+		iWorkLen(0),
+		iSavedNonce(0) {}
 	pool_job(const char* sJobID, uint64_t iTarget, const uint8_t* bWorkBlob, uint32_t iWorkLen) :
-		iTarget(iTarget), iWorkLen(iWorkLen), iSavedNonce(0)
+		iTarget(iTarget),
+		iWorkLen(iWorkLen),
+		iSavedNonce(0)
 	{
 		assert(iWorkLen <= sizeof(pool_job::bWorkBlob));
 		memcpy(this->sJobID, sJobID, sizeof(pool_job::sJobID));
@@ -30,15 +34,17 @@ struct pool_job
 
 struct job_result
 {
-	uint8_t		bResult[32];
-	char		sJobID[64];
-	uint32_t	iNonce;
-	uint32_t	iThreadId;
+	uint8_t bResult[32];
+	char sJobID[64];
+	uint32_t iNonce;
+	uint32_t iThreadId;
 	xmrstak_algo algorithm = {invalid_algo};
 
 	job_result() {}
 	job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, const xmrstak_algo& algo) :
-		iNonce(iNonce), iThreadId(iThreadId), algorithm(algo)
+		iNonce(iNonce),
+		iThreadId(iThreadId),
+		algorithm(algo)
 	{
 		memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID));
 		memcpy(this->bResult, bResult, sizeof(job_result::bResult));
@@ -51,8 +57,12 @@ struct sock_err
 	bool silent;
 
 	sock_err() {}
-	sock_err(std::string&& err, bool silent) : sSocketError(std::move(err)), silent(silent) { }
-	sock_err(sock_err&& from) : sSocketError(std::move(from.sSocketError)), silent(from.silent) {}
+	sock_err(std::string&& err, bool silent) :
+		sSocketError(std::move(err)),
+		silent(silent) {}
+	sock_err(sock_err&& from) :
+		sSocketError(std::move(from.sSocketError)),
+		silent(from.silent) {}
 
 	sock_err& operator=(sock_err&& from)
 	{
@@ -62,7 +72,7 @@ struct sock_err
 		return *this;
 	}
 
-	~sock_err() { }
+	~sock_err() {}
 
 	sock_err(sock_err const&) = delete;
 	sock_err& operator=(sock_err const&) = delete;
@@ -73,13 +83,30 @@ struct gpu_res_err
 {
 	size_t idx; // GPU index
 	const char* error_str;
-	gpu_res_err(const char* error_str, size_t idx) : error_str(error_str), idx(idx) {}
+	gpu_res_err(const char* error_str, size_t idx) :
+		idx(idx), // members are initialized in declaration order: idx, then error_str
+		error_str(error_str) {}
 };
 
-enum ex_event_name { EV_INVALID_VAL, EV_SOCK_READY, EV_SOCK_ERROR, EV_GPU_RES_ERROR,
-	EV_POOL_HAVE_JOB, EV_MINER_HAVE_RESULT, EV_PERF_TICK, EV_EVAL_POOL_CHOICE,
-	EV_USR_HASHRATE, EV_USR_RESULTS, EV_USR_CONNSTAT, EV_HASHRATE_LOOP,
-	EV_HTML_HASHRATE, EV_HTML_RESULTS, EV_HTML_CONNSTAT, EV_HTML_JSON };
+enum ex_event_name
+{
+	EV_INVALID_VAL,
+	EV_SOCK_READY,
+	EV_SOCK_ERROR,
+	EV_GPU_RES_ERROR,
+	EV_POOL_HAVE_JOB,
+	EV_MINER_HAVE_RESULT,
+	EV_PERF_TICK,
+	EV_EVAL_POOL_CHOICE,
+	EV_USR_HASHRATE,
+	EV_USR_RESULTS,
+	EV_USR_CONNSTAT,
+	EV_HASHRATE_LOOP,
+	EV_HTML_HASHRATE,
+	EV_HTML_RESULTS,
+	EV_HTML_CONNSTAT,
+	EV_HTML_JSON
+};
 
 /*
    This is how I learned to stop worrying and love c++11 =).
@@ -96,20 +123,37 @@ struct ex_event
 	ex_event_name iName;
 	size_t iPoolId;
 
-	union
-	{
+	union {
 		pool_job oPoolJob;
 		job_result oJobResult;
 		sock_err oSocketError;
 		gpu_res_err oGpuError;
 	};
 
-	ex_event() { iName = EV_INVALID_VAL; iPoolId = 0;}
-	ex_event(const char* gpu_err, size_t gpu_idx, size_t id) : iName(EV_GPU_RES_ERROR), iPoolId(id), oGpuError(gpu_err, gpu_idx) {}
-	ex_event(std::string&& err, bool silent, size_t id) : iName(EV_SOCK_ERROR), iPoolId(id), oSocketError(std::move(err), silent) { }
-	ex_event(job_result dat, size_t id) : iName(EV_MINER_HAVE_RESULT), iPoolId(id), oJobResult(dat) {}
-	ex_event(pool_job dat, size_t id) : iName(EV_POOL_HAVE_JOB), iPoolId(id), oPoolJob(dat) {}
-	ex_event(ex_event_name ev, size_t id = 0) : iName(ev), iPoolId(id) {}
+	ex_event()
+	{
+		iName = EV_INVALID_VAL;
+		iPoolId = 0;
+	}
+	ex_event(const char* gpu_err, size_t gpu_idx, size_t id) :
+		iName(EV_GPU_RES_ERROR),
+		iPoolId(id),
+		oGpuError(gpu_err, gpu_idx) {}
+	ex_event(std::string&& err, bool silent, size_t id) :
+		iName(EV_SOCK_ERROR),
+		iPoolId(id),
+		oSocketError(std::move(err), silent) {}
+	ex_event(job_result dat, size_t id) :
+		iName(EV_MINER_HAVE_RESULT),
+		iPoolId(id),
+		oJobResult(dat) {}
+	ex_event(pool_job dat, size_t id) :
+		iName(EV_POOL_HAVE_JOB),
+		iPoolId(id),
+		oPoolJob(dat) {}
+	ex_event(ex_event_name ev, size_t id = 0) :
+		iName(ev),
+		iPoolId(id) {}
 
 	// Delete the copy operators to make sure we are moving only what is needed
 	ex_event(ex_event const&) = delete;
@@ -123,7 +167,7 @@ struct ex_event
 		switch(iName)
 		{
 		case EV_SOCK_ERROR:
-			new (&oSocketError) sock_err(std::move(from.oSocketError));
+			new(&oSocketError) sock_err(std::move(from.oSocketError));
 			break;
 		case EV_MINER_HAVE_RESULT:
 			oJobResult = from.oJobResult;
@@ -151,7 +195,7 @@ struct ex_event
 		switch(iName)
 		{
 		case EV_SOCK_ERROR:
-			new (&oSocketError) sock_err();
+			new(&oSocketError) sock_err();
 			oSocketError = std::move(from.oSocketError);
 			break;
 		case EV_MINER_HAVE_RESULT:
diff --git a/xmrstak/net/socket.cpp b/xmrstak/net/socket.cpp
index 6fcb454cd7ec09682e54b4c059b6c6aaa9ee007c..6a6abac152a43683dd875ae4ce7dcdb71ad4de5c 100644
--- a/xmrstak/net/socket.cpp
+++ b/xmrstak/net/socket.cpp
@@ -28,16 +28,17 @@
 #include "xmrstak/misc/executor.hpp"
 
 #ifndef CONF_NO_TLS
-#include <openssl/ssl.h>
 #include <openssl/err.h>
 #include <openssl/opensslconf.h>
+#include <openssl/ssl.h>
 
 #ifndef OPENSSL_THREADS
 #error OpenSSL was compiled without thread support
 #endif
 #endif
 
-plain_socket::plain_socket(jpsock* err_callback) : pCallback(err_callback)
+plain_socket::plain_socket(jpsock* err_callback) :
+	pCallback(err_callback)
 {
 	hSocket = INVALID_SOCKET;
 	pSockAddr = nullptr;
@@ -50,58 +51,58 @@ bool plain_socket::set_hostname(const char* sAddr)
 
 	sock_closed = false;
 	size_t ln = strlen(sAddr);
-	if (ln >= sizeof(sAddrMb))
+	if(ln >= sizeof(sAddrMb))
 		return pCallback->set_socket_error("CONNECT error: Pool address overflow.");
 
 	memcpy(sAddrMb, sAddr, ln);
 	sAddrMb[ln] = '\0';
 
-	if ((sTmp = strstr(sAddrMb, "//")) != nullptr)
+	if((sTmp = strstr(sAddrMb, "//")) != nullptr)
 	{
 		sTmp += 2;
 		memmove(sAddrMb, sTmp, strlen(sTmp) + 1);
 	}
 
-	if ((sPort = strchr(sAddrMb, ':')) == nullptr)
+	if((sPort = strchr(sAddrMb, ':')) == nullptr)
 		return pCallback->set_socket_error("CONNECT error: Pool port number not specified, please use format <hostname>:<port>.");
 
 	sPort[0] = '\0';
 	sPort++;
 
-	addrinfo hints = { 0 };
+	addrinfo hints = {0};
 	hints.ai_family = AF_UNSPEC;
 	hints.ai_socktype = SOCK_STREAM;
 	hints.ai_protocol = IPPROTO_TCP;
 
 	pAddrRoot = nullptr;
 	int err;
-	if ((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0)
+	if((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0)
 		return pCallback->set_socket_error_strerr("CONNECT error: GetAddrInfo: ", err);
 
-	addrinfo *ptr = pAddrRoot;
+	addrinfo* ptr = pAddrRoot;
 	std::vector<addrinfo*> ipv4;
 	std::vector<addrinfo*> ipv6;
 
-	while (ptr != nullptr)
+	while(ptr != nullptr)
 	{
-		if (ptr->ai_family == AF_INET)
+		if(ptr->ai_family == AF_INET)
 			ipv4.push_back(ptr);
-		if (ptr->ai_family == AF_INET6)
+		if(ptr->ai_family == AF_INET6)
 			ipv6.push_back(ptr);
 		ptr = ptr->ai_next;
 	}
 
-	if (ipv4.empty() && ipv6.empty())
+	if(ipv4.empty() && ipv6.empty())
 	{
 		freeaddrinfo(pAddrRoot);
 		pAddrRoot = nullptr;
 		return pCallback->set_socket_error("CONNECT error: I found some DNS records but no IPv4 or IPv6 addresses.");
 	}
-	else if (!ipv4.empty() && ipv6.empty())
+	else if(!ipv4.empty() && ipv6.empty())
 		pSockAddr = ipv4[rand() % ipv4.size()];
-	else if (ipv4.empty() && !ipv6.empty())
+	else if(ipv4.empty() && !ipv6.empty())
 		pSockAddr = ipv6[rand() % ipv6.size()];
-	else if (!ipv4.empty() && !ipv6.empty())
+	else if(!ipv4.empty() && !ipv6.empty())
 	{
 		if(jconf::inst()->PreferIpv4())
 			pSockAddr = ipv4[rand() % ipv4.size()];
@@ -111,7 +112,7 @@ bool plain_socket::set_hostname(const char* sAddr)
 
 	hSocket = socket(pSockAddr->ai_family, pSockAddr->ai_socktype, pSockAddr->ai_protocol);
 
-	if (hSocket == INVALID_SOCKET)
+	if(hSocket == INVALID_SOCKET)
 	{
 		freeaddrinfo(pAddrRoot);
 		pAddrRoot = nullptr;
@@ -120,7 +121,7 @@ bool plain_socket::set_hostname(const char* sAddr)
 
 	int flag = 1;
 	/* If it fails, it fails, we won't loose too much sleep over it */
-	setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
+	setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int));
 
 	return true;
 }
@@ -133,7 +134,7 @@ bool plain_socket::connect()
 	freeaddrinfo(pAddrRoot);
 	pAddrRoot = nullptr;
 
-	if (ret != 0)
+	if(ret != 0)
 		return pCallback->set_socket_error_strerr("CONNECT error: ");
 	else
 		return true;
@@ -158,10 +159,10 @@ bool plain_socket::send(const char* buf)
 {
 	size_t pos = 0;
 	size_t slen = strlen(buf);
-	while (pos != slen)
+	while(pos != slen)
 	{
 		int ret = ::send(hSocket, buf + pos, slen - pos, 0);
-		if (ret == SOCKET_ERROR)
+		if(ret == SOCKET_ERROR)
 		{
 			pCallback->set_socket_error_strerr("SEND error: ");
 			return false;
@@ -184,7 +185,8 @@ void plain_socket::close(bool free)
 }
 
 #ifndef CONF_NO_TLS
-tls_socket::tls_socket(jpsock* err_callback) : pCallback(err_callback)
+tls_socket::tls_socket(jpsock* err_callback) :
+	pCallback(err_callback)
 {
 }
 
@@ -193,7 +195,7 @@ void tls_socket::print_error()
 	BIO* err_bio = BIO_new(BIO_s_mem());
 	ERR_print_errors(err_bio);
 
-	char *buf = nullptr;
+	char* buf = nullptr;
 	size_t len = BIO_get_mem_data(err_bio, &buf);
 
 	if(buf == nullptr)
@@ -247,7 +249,7 @@ bool tls_socket::set_hostname(const char* sAddr)
 
 	int flag = 1;
 	/* If it fails, it fails, we won't loose too much sleep over it */
-	setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
+	setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int));
 
 	if(BIO_set_conn_hostname(bio, sAddr) != 1)
 	{
@@ -327,7 +329,7 @@ bool tls_socket::connect()
 	BIO_flush(b64);
 
 	const char* conf_md = pCallback->get_tls_fp();
-	char *b64_md = nullptr;
+	char* b64_md = nullptr;
 	size_t b64_len = BIO_get_mem_data(bmem, &b64_md);
 
 	if(strlen(conf_md) == 0)
@@ -393,4 +395,3 @@ void tls_socket::close(bool free)
 	}
 }
 #endif
-
diff --git a/xmrstak/net/socket.hpp b/xmrstak/net/socket.hpp
index b09142d565a8080d3030eba1951f1309c8611d30..88b665adfabdad0191e2c4a0c0735eb672355fe7 100644
--- a/xmrstak/net/socket.hpp
+++ b/xmrstak/net/socket.hpp
@@ -1,26 +1,26 @@
 #pragma once
 
-#include <atomic>
 #include "socks.hpp"
+#include <atomic>
 
 class jpsock;
 
 class base_socket
 {
-public:
+  public:
 	virtual bool set_hostname(const char* sAddr) = 0;
 	virtual bool connect() = 0;
 	virtual int recv(char* buf, unsigned int len) = 0;
 	virtual bool send(const char* buf) = 0;
 	virtual void close(bool free) = 0;
 
-protected:
+  protected:
 	std::atomic<bool> sock_closed;
 };
 
 class plain_socket : public base_socket
 {
-public:
+  public:
 	plain_socket(jpsock* err_callback);
 
 	bool set_hostname(const char* sAddr);
@@ -29,10 +29,10 @@ public:
 	bool send(const char* buf);
 	void close(bool free);
 
-private:
+  private:
 	jpsock* pCallback;
-	addrinfo *pSockAddr;
-	addrinfo *pAddrRoot;
+	addrinfo* pSockAddr;
+	addrinfo* pAddrRoot;
 	SOCKET hSocket;
 };
 
@@ -42,7 +42,7 @@ typedef struct ssl_st SSL;
 
 class tls_socket : public base_socket
 {
-public:
+  public:
 	tls_socket(jpsock* err_callback);
 
 	bool set_hostname(const char* sAddr);
@@ -51,7 +51,7 @@ public:
 	bool send(const char* buf);
 	void close(bool free);
 
-private:
+  private:
 	void init_ctx();
 	void print_error();
 
diff --git a/xmrstak/net/socks.hpp b/xmrstak/net/socks.hpp
index 86749e52773dc557d16c5e283d9c9e5c33dab63e..600e4d2765011b1adb166eb2630cbc046358cb9e 100644
--- a/xmrstak/net/socks.hpp
+++ b/xmrstak/net/socks.hpp
@@ -2,18 +2,19 @@
 
 #ifdef _WIN32
 #ifndef _WIN32_WINNT
-#define _WIN32_WINNT 0x0601  /* Windows 7 */
+#define _WIN32_WINNT 0x0601 /* Windows 7 */
 #endif
+
 #include <winsock2.h>
 #include <ws2tcpip.h>
+// this comment disables clang-format include reordering for windows.h
 #include <windows.h>
 
-
 inline void sock_init()
 {
 	static bool bWSAInit = false;
 
-	if (!bWSAInit)
+	if(!bWSAInit)
 	{
 		WSADATA wsaData;
 		WSAStartup(MAKEWORD(2, 2), &wsaData);
@@ -56,20 +57,20 @@ inline const char* sock_gai_strerror(int err, char* buf, size_t len)
 #else
 
 /* Assume that any non-Windows platform uses POSIX-style sockets instead. */
-#include <sys/socket.h>
 #include <arpa/inet.h>
-#include <netdb.h>  /* Needed for getaddrinfo() and freeaddrinfo() */
-#include <unistd.h> /* Needed for close() */
 #include <errno.h>
-#include <string.h>
+#include <netdb.h>		/* Needed for getaddrinfo() and freeaddrinfo() */
 #include <netinet/in.h> /* Needed for IPPROTO_TCP */
 #include <netinet/tcp.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h> /* Needed for close() */
 
 inline void sock_init() {}
 typedef int SOCKET;
 
-#define INVALID_SOCKET  (-1)
-#define SOCKET_ERROR    (-1)
+#define INVALID_SOCKET (-1)
+#define SOCKET_ERROR (-1)
 
 inline void sock_close(SOCKET s)
 {
diff --git a/xmrstak/params.hpp b/xmrstak/params.hpp
index 16089aed7e524979e03ac031e272bb1a19f279c1..a00e51b0a0a8ad24d42aef20be8689b4195c5a13 100644
--- a/xmrstak/params.hpp
+++ b/xmrstak/params.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "xmrstak/misc/environment.hpp"
+#include "xmrstak/misc/home_dir.hpp"
 
 #include <string>
 
@@ -45,6 +46,7 @@ struct params
 	std::string configFile;
 	std::string configFilePools;
 	std::string configFileAMD;
+	std::string rootAMDCacheDir;
 	std::string configFileNVIDIA;
 	std::string configFileCPU;
 
@@ -68,10 +70,11 @@ struct params
 		configFile("config.txt"),
 		configFilePools("pools.txt"),
 		configFileAMD("amd.txt"),
+		rootAMDCacheDir(get_home() + "/.openclcache/"),
 		configFileCPU("cpu.txt"),
 		configFileNVIDIA("nvidia.txt")
-	{}
-
+	{
+	}
 };
 
 } // namespace xmrstak
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 412fc221335944e0a11c2c1414ca0b6642d7eae4..f42e2a0bf0bfe68dc3c3d90f98ef635af6fef03a 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -2,7 +2,9 @@
 
 //! git will put "#define GIT_ARCHIVE 1" on the next line inside archives. $Format:%n#define GIT_ARCHIVE 1$
 #if defined(GIT_ARCHIVE) && !defined(GIT_COMMIT_HASH)
+// clang-format off
 #define GIT_COMMIT_HASH $Format:%h$
+// clang-format on
 #endif
 
 #ifndef GIT_COMMIT_HASH
@@ -18,7 +20,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.10.1"
+#define XMR_STAK_VERSION "2.10.4"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"
@@ -35,10 +37,10 @@
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
 
-#define VERSION_LONG  XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/"
+#define VERSION_LONG XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/"
 #define VERSION_SHORT XMR_STAK_NAME " " XMR_STAK_VERSION " " XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH)
 #define VERSION_HTML "v" XMR_STAK_VERSION "-" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH)
 
-const char ver_long[]  = VERSION_LONG;
+const char ver_long[] = VERSION_LONG;
 const char ver_short[] = VERSION_SHORT;
 const char ver_html[] = VERSION_HTML;
diff --git a/xmrstak/version.hpp b/xmrstak/version.hpp
index cdf82f30d0c2efe619c83e1a9f2ae8daf2d7a398..85905f01cd769ba801803197e4772033434e61f6 100644
--- a/xmrstak/version.hpp
+++ b/xmrstak/version.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
+#include "donate-level.hpp"
 #include <inttypes.h>
 #include <string>
-#include "donate-level.hpp"
 
 extern const char ver_long[];
 extern const char ver_short[];
@@ -10,7 +10,7 @@ extern const char ver_html[];
 
 inline std::string get_version_str()
 {
-	return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000)) ;
+	return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000));
 }
 
 inline std::string get_version_str_short()