diff --git a/CMakeLists.txt b/CMakeLists.txt
index a642b385d0b8aed1f77101d6d749e9082dfd9691..b714ee0ceb3bc3e39de9e7851e1231c16ef53a2b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,6 +100,11 @@ if(CUDA_ENABLE)
                 list(APPEND DEFAULT_CUDA_ARCH "70")
             endif()
         endif()
+        # add Turing support for CUDA >= 10.0
+        if(NOT CUDA_VERSION VERSION_LESS 10.0)
+            list(APPEND DEFAULT_CUDA_ARCH "75")
+        endif()
+
         set(CUDA_ARCH "${DEFAULT_CUDA_ARCH}" CACHE STRING "Set GPU architecture (semicolon separated list, e.g. '-DCUDA_ARCH=20;35;60')")
 
         # generate comma separated list with architectures
@@ -186,7 +191,10 @@ if(CUDA_ENABLE)
             endif()
 
             if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND
-                (CUDA_VERSION VERSION_EQUAL 9.0 OR CUDA_VERSION VERSION_EQUAL 9.1 OR CUDA_VERSION VERSION_EQUAL 9.2)
+                (CUDA_VERSION VERSION_EQUAL 9.0 OR
+                CUDA_VERSION VERSION_EQUAL 9.1 OR
+                CUDA_VERSION VERSION_EQUAL 9.2 OR
+                CUDA_VERSION VERSION_EQUAL 10.0)
             )
                 # workaround find_package(CUDA) is using the wrong path to the CXX host compiler
                 # overwrite the CUDA host compiler variable with the used CXX MSVC
@@ -435,6 +443,11 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     endif()
 endif()
 
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+    set(CMAKE_CXX_FLAGS "-Wl,-z,noexecstack ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-Wl,-z,noexecstack ${CMAKE_C_FLAGS}")
+endif()
+
 # activate static libgcc and libstdc++ linking
 if(CMAKE_LINK_STATIC)
     set(BUILD_SHARED_LIBRARIES OFF)
@@ -445,6 +458,27 @@ if(CMAKE_LINK_STATIC)
     endif()
 endif()
 
+if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    # asm optimized monero v8 code
+    enable_language(ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM)
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm"
+    )
+else()
+    # asm optimized monero v8 code
+    enable_language(ASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP)
+    set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
+    add_library(xmr-stak-asm
+        STATIC
+        "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
+    )
+endif()
+
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
 # compile C files
 file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c")
 
@@ -456,7 +490,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99)
 if(MICROHTTPD_ENABLE)
     target_link_libraries(xmr-stak-c ${MHTD})
 endif()
-target_link_libraries(xmr-stak-c ${LIBS})
+target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm)
 
 # compile generic backend files
 file(GLOB BACKEND_CPP
@@ -472,7 +506,7 @@ add_library(xmr-stak-backend
     STATIC
     ${BACKEND_CPP}
 )
-target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
+target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)
 
 # compile CUDA backend
 if(CUDA_FOUND)
@@ -499,7 +533,7 @@ if(CUDA_FOUND)
         )
     endif()
     target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES})
-    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile AMD backend
@@ -512,7 +546,7 @@ if(OpenCL_FOUND)
         ${OPENCLSRCFILES}
     )
     target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} )
-    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend)
+    target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm)
 endif()
 
 # compile final binary
@@ -528,7 +562,7 @@ endif()
 set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
 set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend)
+target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
 
 ################################################################################
 # Install
diff --git a/README.md b/README.md
index 887bc5cf30996e0f0616b246a3cf6ada707b7328..046a930e136541792a7288c66e6eeb4ffc910218 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 ###### fireice-uk's and psychocrypt's
 # XMR-Stak - Cryptonight All-in-One Mining Software
 
-XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus and can be used to mine the crypto currencys Monero, Aeon and many more Cryptonight coins.
+XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins.
 
 ## HTML reports
 <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-hashrate.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-results.png" width="260"> <img src="https://gist.githubusercontent.com/fireice-uk/2da301131ac01695ff79539a27b81d68/raw/4c09cdeee86f94df2e9dd86b927e64aded6184f5/xmr-stak-cpu-connection.png" width="260">
@@ -28,7 +28,7 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV
 - supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON)
 - easy to use
   - guided start (no need to edit a config file for the first start)
-  - auto configuration for each backend
+  - auto-configuration for each backend
 - open source software (GPLv3)
 - TLS support
 - [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton)
@@ -45,10 +45,14 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [Haven](https://havenprotocol.com)
 - [Intense](https://intensecoin.com)
 - [Masari](https://getmasari.org)
-- [Ryo](https://ryo-currency.com)
+- [QRL](https://theqrl.org)
+- **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
 - [TurtleCoin](https://turtlecoin.lol)
 
-If your prefered coin is not listed, you can chose one of the following algorithms:
+Ryo currency is a way for us to implement the ideas that we were unable to in
+Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
+
+If your prefered coin is not listed, you can choose one of the following algorithms:
 
 - 1MiB scratchpad memory
     - cryptonight_lite
@@ -59,11 +63,12 @@ If your prefered coin is not listed, you can chose one of the following algorith
     - cryptonight_masari
     - cryptonight_v7
     - cryptonight_v7_stellite
+    - cryptonight_v8
 - 4MiB scratchpad memory
     - cryptonight_haven
     - cryptonight_heavy
 
-Please note, this list is not complete, and is not an endorsement.
+Please note, this list is not complete and is not an endorsement.
 
 ## Download
 
@@ -71,7 +76,7 @@ You can find the latest releases and precompiled binaries on GitHub under [Relea
 
 ## Default Developer Donation
 
-By default the miner will donate 2% of the hashpower (2 minute in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
+By default, the miner will donate 2% of the hashpower (2 minutes in 100 minutes) to my pool. If you want to change that, edit [donate-level.hpp](xmrstak/donate-level.hpp) before you build the binaries.
 
 If you want to donate directly to support further development, here is my wallet
 
diff --git a/doc/FAQ.md b/doc/FAQ.md
index 2fccdd8a42014c5557201e5f8fb0a8c338e2a768..f744e3d24676535694b34947cd5c53798a417089 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -9,8 +9,8 @@
 * [Virus Protection Alert](#virus-protection-alert)
 * [Change Currency to Mine](#change-currency-to-mine)
 * [How can I mine Monero](#how-can-i-mine-monero)
-* [Why is Monero named monero7](#why-is-monero-named-monero7)
 * [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed)
+* [Internal compiler error: Killed (program cc1plus)](#internal-compiler-error)
 
 ## "Obtaining SeLockMemoryPrivilege failed."
 
@@ -44,20 +44,35 @@ Download and install this [runtime package](https://go.microsoft.com/fwlink/?Lin
 
 ## Error: MEMORY ALLOC FAILED: mmap failed
 
-On Linux you will need to configure large page support and increase your ulimit -l.
+On Linux you will need to configure large page support and increase your memlock limit (`ulimit -l`).
 
-To set large page support, add the following lines to `/etc/sysctl.conf` (`/etc/sysctl.d/xmr-stak.conf` for [Arch Linux](https://www.archlinux.org/news/deprecation-of-etcsysctlconf/) and its derivatives):
+Never put settings directly into `/etc/sysctl.conf` or `/etc/security/limits.conf` as those are system defaults and can be replaced in upgrades, and custom settings in that file are deprecated in all distros since at least wheezy/trusty (has been illegal in RedHat based distros for longer than that), and will be even more deprecated with systemd (it no longer even reads sysctl.conf, ONLY sysctl.d files, for example - there is a link to the old `/etc/sysctl.conf` for backward compatibility but that can go away at any time).  Also adding to `/etc/rc.local` is extra incorrect, systemd does not even use that file anymore (once the sysvinit compatibility layer is gone, rc.local will no longer work).
+
+To check current settings, run `/sbin/sysctl vm.nr_hugepages ; ulimit -l` as whatever user you will run `xmr-stak` as (example shows bad/low sample defaults):
+
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+    vm.nr_hugepages = 0
+    16
+
+To set large page support, add the following lines to `/etc/sysctl.d/60-hugepages.conf`:
 
     vm.nr_hugepages=128
 
-To increase the ulimit, add following lines to `/etc/security/limits.conf`:
+You WILL need to run `sudo sysctl --system` for these settings to take effect on your system (or reboot).  In some cases (many threads, very large CPU, etc) you may need more than 128 (try 256 if there are still complaints from thread inits)
 
-    * soft memlock 262144
-    * hard memlock 262144
+To increase the memlock (ulimit -l), add following lines to `/etc/security/limits.d/60-memlock.conf`:
+
+    *    - memlock 262144
+    root - memlock 262144
 
 You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).
+Recheck after completing these steps to validate:
+
+    $ /sbin/sysctl vm.nr_hugepages ; ulimit -l
+    vm.nr_hugepages = 128
+    262144
 
-You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.
+You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.  Also running as root does not properly get around the `ulimit -l` being large enough (and limits `*` does not apply to `root` either, it must be specified explicitly).
 
 ## Illegal Instruction
 
@@ -72,18 +87,18 @@ If your antivirus software flags **xmr-stak**, it will likely move it to its qua
 
 If the miner is compiled for Monero and Aeon than you can change
  - the value `currency` in the config *or*
- - start the miner with the [command line option](usage.md) `--currency monero7` or `--currency aeon7`
+ - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon7`
  - run `xmr-stak --help` to see all supported currencies and algorithms
 
 ## How can I mine Monero
 
-Set the value `currency` in `pools.txt` to `monero7`.
-
-## Why is Monero named monero7
-
-To avoid configuration conflicts after the hard fork of Monero to the new POW with our old naming schema where all cryptonight currencies was selected by choosing `monero` as currency we decided to switch to the name `monero7`.
+Set the value `currency` in `pools.txt` to `monero`.
 
 ## Which currency must be chosen if my fork coin is not listed
 
 If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm.
 Select one of these generic coin algorithms.
+
+## Internal compiler error
+
+Seeing `g++: internal compiler error: Killed (program cc1plus)` is probably related to not enough RAM to compile. 1 Gb RAM should be enough (it is on clean Ubuntu 16.04).
diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md
index 072402ff7609422469bbd5a4cc10070e89a1bc39..79e036ef7bdf9975d7f43638db00cd1a82e444b0 100644
--- a/doc/compile_Linux.md
+++ b/doc/compile_Linux.md
@@ -4,7 +4,7 @@
 
 ### AMD APP SDK 3.0 (only needed to use AMD GPUs)
 
-- download and install the latest version from https://www.dropbox.com/sh/mpg882ekirnsfa7/AADWz5X-TgVdsmWt0QwMgTWLa/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2?dl=0
+- download and install the latest version from http://debian.nullivex.com/amd/AMD-APP-SDKInstaller-v3.0.130.136-GA-linux64.tar.bz2 (see https://github.com/fireice-uk/xmr-stak/issues/1511#issuecomment-385120692)
   (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/228059)
 
 ### Cuda 8.0+ (only needed to use NVIDIA GPUs)
@@ -105,6 +105,8 @@ In that case you can force CUDA to use an older compiler in the following way:
 cmake -DCUDA_HOST_COMPILER=/usr/bin/gcc-5 ..
 ```
 
+- You need 1 Gb RAM to compile (a bit less might be enough, 512 Mb isn't). 
+
 ### To do a generic and static build for a system without gcc 5.1+
 ```
     cmake -DCMAKE_LINK_STATIC=ON -DXMR-STAK_COMPILE=generic .
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index 802d5c5abd77514db38a2ba7217ec08d08a30f03..add5fbfd05f3531161d1ac3f9fca2fd554d6ce1a 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -32,8 +32,8 @@
 
 ### AMD APP SDK 3.0 (only needed for AMD GPUs)
 
-- Download and install the latest version from https://www.dropbox.com/s/gq8vqhelq0m6gj4/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe
-  (do not wonder why it is a link to a dropbox but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855)
+- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe
+  (do not wonder why it is a link to a netdna-cdn.com but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855)
 
 ### Dependencies OpenSSL/Hwloc and Microhttpd
 - For CUDA 8*:
diff --git a/doc/tuning.md b/doc/tuning.md
index 6bf036e9f316ef9833b8cb76bfe8d3ffe4709e25..2673d68d9765dcefe99bdfd460a8290cb5819c80 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -9,6 +9,7 @@
 * [AMD Backend](#amd-backend)
   * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
   * [Add more GPUs](#add-more-gpus)
+  * [Two Threads per GPU](two-threads-per-gpu)
   * [disable comp_mode](#disable-comp_mode)
   * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern)
   * [Increase Memory Pool](#increase-memory-pool)
@@ -55,10 +56,10 @@ To add a new GPU you need to add a new config set to `gpu_threads_conf`.
 "gpu_threads_conf" :
 [
     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
-      "affine_to_cpu" : false, "sync_mode" : 3,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
     },
     { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
-      "affine_to_cpu" : false, "sync_mode" : 3,
+      "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1,
     },
 ],
 ```
@@ -82,11 +83,37 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to
 ```
 "gpu_threads_conf" :
 [
-    { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-      "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+    {
+      "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
     },
-    { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-      "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+    {
+      "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
+],
+
+"platform_index" : 0,
+```
+
+### Two Threads per GPU
+
+Some GPUs like AMD Vega can mine faster if two threads are using the same GPU.
+Use the auto generated config as base and repeat the config entry for a GPU.
+If the attribute `index` is used twice than two threads will use one GPU.
+Take care that the required memory usage on the GPU will also double.
+Therefore adjust your intensity by hand.
+
+```
+"gpu_threads_conf" :
+[
+    {
+      "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
+    },
+    {
+      "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false,
+      "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true
     },
 ],
 
diff --git a/doc/usage.md b/doc/usage.md
index 886c1b319b8f85851b9600dfbea1583685cb2699..a371f0e67bdbee9bd4d7247b4e9dfebfff8cf062 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -5,6 +5,7 @@
 * [Usage on Windows](#usage-on-windows)
 * [Usage on Linux](#usage-on-linux)
 * [Command Line Options](#command-line-options)
+* [Use different backends](#use-different-backends)
 * [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton)
 
 ## Configurations
@@ -34,6 +35,33 @@ Note: If the pool is ignoring the option `rig_id` in `pools.txt` to name your wo
 The miner allow to overwrite some of the settings via command line options.
 Run `xmr-stak --help` to show all available command line options.
 
+## Use Different Backends
+
+On linux and OSX please add `./` before the binary name `xmr-stak`.
+
+### CPU Only:
+```
+xmr-stak --noAMD --noNVIDIA
+```
+
+### NVIDIA/AMD Only:
+
+The miner will automatically detect if CUDA (for NVIDIA GPUs) or OpenCL (for AMD GPUs) is available.
+
+```
+xmr-stak --noCPU
+```
+**CUDA** is currently not supported. I am currently try to get some performance out it.
+
+### NVIDIA via OpenCL
+
+It is possible to use the OpenCl backend which is originally created for AMD GPUs with NVIDIA GPus.
+Some NVIDIA GPUs can reach better performance with this backend.
+
+```
+xmr-stak --openCLVendor NVIDIA --noNVIDIA
+```
+
 ## Docker image usage
 
 You can run the Docker image the following way:
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 87721ac8f606a6267c20dc1993554a3958893b79..7c7aff788ad60b52024a1db0f5919b4e6b981ecb 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -17,6 +17,7 @@
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/picosha2/picosha2.hpp"
 #include "xmrstak/params.hpp"
+#include "xmrstak/version.hpp"
 
 #include <stdio.h>
 #include <string.h>
@@ -375,6 +376,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
+	std::vector<char> openCLDriverVer(1024);
+	if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx );
+		return ERR_OCL_API;
+	}
+
 	xmrstak_algo miner_algo[2] = {
 		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(),
 		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()
@@ -388,11 +396,31 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		int threadMemMask = cn_select_mask(miner_algo[ii]);
 		int hashIterations = cn_select_iter(miner_algo[ii]);
 
-		char options[512];
-		snprintf(options, sizeof(options),
-			"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d  -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
-		hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
-			int_port(hashMemSize), int(miner_algo[ii]));
+		size_t mem_chunk_exp = 1u << ctx->memChunk;
+		size_t strided_index = ctx->stridedIndex;
+		/* Adjust the config settings to a valid combination
+		 * this is required if the dev pool is mining monero
+		 * but the user tuned there settings for another currency
+		 */
+		if(miner_algo[ii] == cryptonight_monero_v8)
+		{
+			if(ctx->memChunk < 2)
+				mem_chunk_exp = 1u << 2;
+			if(strided_index == 1)
+				strided_index = 0;
+		}
+
+		std::string options;
+		options += " -DITERATIONS=" + std::to_string(hashIterations);
+		options += " -DMASK=" + std::to_string(threadMemMask);
+		options += " -DWORKSIZE=" + std::to_string(ctx->workSize);
+		options += " -DSTRIDED_INDEX=" + std::to_string(strided_index);
+		options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp);
+		options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u);
+		options += " -DMEMORY=" + std::to_string(hashMemSize);
+		options += " -DALGO=" + std::to_string(miner_algo[ii]);
+		options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
+
 		/* create a hash for the compile time cache
 		 * used data:
 		 *   - source code
@@ -402,6 +430,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		std::string src_str(source_code);
 		src_str += options;
 		src_str += devNameVec.data();
+		src_str += get_version_str();
+		src_str += openCLDriverVer.data();
+
 		std::string hash_hex_str;
 		picosha2::hash256_hex_string(src_str, hash_hex_str);
 
@@ -418,7 +449,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				return ERR_OCL_API;
 			}
 
-			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options, NULL, NULL);
+			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL);
 			if(ret != CL_SUCCESS)
 			{
 				size_t len;
@@ -594,27 +625,6 @@ const char* const attributeNames[] = {
 
 #define NELEMS(x)  (sizeof(x) / sizeof((x)[0]))
 
-void PrintDeviceInfo(cl_device_id device)
-{
-	char queryBuffer[1024];
-	int queryInt;
-	cl_int clError;
-	clError = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_NAME: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_VENDOR: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DRIVER_VERSION: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(queryBuffer), &queryBuffer, NULL);
-	printf("    CL_DEVICE_VERSION: %s\n", queryBuffer);
-	queryBuffer[0] = '\0';
-	clError = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &queryInt, NULL);
-	printf("    CL_DEVICE_MAX_COMPUTE_UNITS: %d\n", queryInt);
-}
-
 uint32_t getNumPlatforms()
 {
 	cl_uint num_platforms = 0;
@@ -885,6 +895,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	//char* source_code = LoadTextFile(sSourcePath);
 
+	const char *fastIntMathV2CL =
+			#include "./opencl/fast_int_math_v2.cl"
+	;
 	const char *cryptonightCL =
 			#include "./opencl/cryptonight.cl"
 	;
@@ -905,6 +918,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	;
 
 	std::string source_code(cryptonightCL);
+	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
@@ -914,13 +928,20 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	// create a directory  for the OpenCL compile cache
 	create_directory(get_home() + "/.openclcache");
 
+	// check if cryptonight_monero_v8 is selected for the user or dev pool
+	bool useCryptonight_v8 =
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
 		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
 		{
 			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
 			ctx[i].rawIntensity = reduced_intensity;
-			const std::string backendName = xmrstak::params::inst().openCLVendor;
 			printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
 		}
 
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 5ab80b82a93d0f0c9fffee5b0846d2c8ecab0653..63c5029d7081349d512bdfef325a5898f0ad5065 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -27,6 +27,7 @@ struct GpuContext
 	size_t workSize;
 	int stridedIndex;
 	int memChunk;
+	int unroll = 0;
 	bool isNVIDIA = false;
 	int compMode;
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 002472d3aa7e59f092619bfe1532c510670cd07a..9c9bcd08e91fee9d3ac3c24fd86db2b84513c41a 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -35,8 +35,8 @@ R"===(
 inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2)
 {
 	uint2 result;
-	result.s0 =  (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2));
-	result.s1 =  (uint) (((((long)src0.s1) << 32) | (long)src1.s1) >> (src2));
+	result.s0 =  (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2));
+	result.s1 =  (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2));
 	return result;
 }
 #endif
@@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
 }
 #endif
 
+//#include "opencl/fast_int_math_v2.cl"
+XMRSTAK_INCLUDE_FAST_INT_MATH_V2
 //#include "opencl/wolf-aes.cl"
 XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
@@ -416,6 +418,9 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
+)==="
+R"===(
+
 #define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)
 
 #if(STRIDED_INDEX==0)
@@ -482,9 +487,14 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 		State[10] = input[10];
 
 		((uint *)State)[9] &= 0x00FFFFFFU;
-		((uint *)State)[9] |= ((get_global_id(0)) & 0xFF) << 24;
+		((uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24;
 		((uint *)State)[10] &= 0xFF000000U;
-		((uint *)State)[10] |= ((get_global_id(0) >> 8));
+		/* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
+		 * handle get_global_id and get_global_offset as signed long long int and add
+		 * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
+		 * (even if it is correct casted to unsigned on the host)
+		 */
+		((uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
 
 		for(int i = 11; i < 25; ++i) State[i] = 0x00UL;
 
@@ -551,7 +561,15 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
-		
+
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+#	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
+#	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
+#else
+#	define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
+#endif
+
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
@@ -560,9 +578,29 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 )
 {
-	ulong a[2], b[2];
+	ulong a[2];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+	ulong b[4];
+	uint4 b_x[2];
+// NVIDIA
+#	ifdef __NV_CL_C_VERSION
+	__local uint16 scratchpad_line_buf[WORKSIZE];
+ 	__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
+#	endif
+#else
+	ulong b[2];
+	uint4 b_x[1];
+#endif
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
+// cryptonight_monero_v8
+#if(ALGO==11)
+	__local uint RCP[256];
+	uint2 division_result;
+	uint sqrt_result;
+#endif
 	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
@@ -572,6 +610,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		AES1[i] = rotate(tmp, 8U);
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
+// cryptonight_monero_v8
+#if(ALGO==11)
+		RCP[i] = RCP_C[i];
+#endif
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
@@ -579,7 +621,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
     uint2 tweak1_2;
 #endif
-	uint4 b_x;
+
 #if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
@@ -599,13 +641,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		a[1] = states[1] ^ states[5];
 		b[1] = states[3] ^ states[7];
 
-		b_x = ((uint4 *)b)[0];
+		b_x[0] = ((uint4 *)b)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+		a[1] = states[1] ^ states[5];
+		b[2] = states[8] ^ states[10];
+		b[3] = states[9] ^ states[11];
+		b_x[1] = ((uint4 *)b)[1];
+		division_result = as_uint2(states[12]);
+		sqrt_result = as_uint2(states[13]).s0;
+#endif
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 		tweak1_2 = as_uint2(input[4]);
 		tweak1_2.s0 >>= 24;
 		tweak1_2.s0 |= tweak1_2.s1 << 8;
-		tweak1_2.s1 = get_global_id(0);
+		tweak1_2.s1 = (uint)get_global_id(0);
 		tweak1_2 ^= as_uint2(states[24]);
 #endif
 	}
@@ -617,39 +669,102 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	if(gIdx < Threads)
 #endif
 	{
-		ulong idx0 = a[0];
+		ulong idx0 = a[0] & MASK;
 
-		#pragma unroll 8
+		#pragma unroll CN_UNROLL
 		for(int i = 0; i < ITERATIONS; ++i)
 		{
 			ulong c[2];
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+			ulong idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
+#endif
 
-			((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
+			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
 // cryptonight_bittube2
 #if(ALGO == 10)
 			((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #else
 			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #endif
-			b_x ^= ((uint4 *)c)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
+
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 			uint table = 0x75310U;
+			b_x[0] ^= ((uint4 *)c)[0];
 // cryptonight_stellite
 #	if(ALGO == 7)
-			uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2);
+			uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2);
+#	else
+			uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2);
+#	endif
+			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
+			SCRATCHPAD_CHUNK(0) = b_x[0];
+			idx0 = c[0] & MASK;
+// cryptonight_monero_v8
+#elif(ALGO==11)
+			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
+#	ifdef __NV_CL_C_VERSION
+			// flush shuffled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+ 			idx0 = c[0] & MASK;
+ 			idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #	else
-			uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
+			idx0 = c[0] & MASK;
 #	endif
-			b_x.s2 ^= ((table >> index) & 0x30U) << 24;
+#else
+			b_x[0] ^= ((uint4 *)c)[0];
+			SCRATCHPAD_CHUNK(0) = b_x[0];
+			idx0 = c[0] & MASK;
 #endif
-			Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
-
 			uint4 tmp;
-			tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
-
-			a[1] += c[0] * as_ulong2(tmp).s0;
-			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+			tmp = SCRATCHPAD_CHUNK(0);
+// cryptonight_monero_v8
+#if(ALGO==11)
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			tmp.s0 ^= division_result.s0;
+			tmp.s1 ^= division_result.s1 ^ sqrt_result;
+ 			// Most and least significant bits in the divisor are set to 1
+			// to make sure we don't divide by a small or even number,
+			// so there are no shortcuts for such cases
+			const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
+ 			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+			// We drop the highest bit to fit both quotient and remainder in 32 bits
+			division_result = fast_div_v2(RCP, c[1], d);
+ 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
+#endif
+			ulong2 result_mul;
+			result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0);
+			result_mul.s1 = c[0] * as_ulong2(tmp).s0;
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul;
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				result_mul ^= chunk2;
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
+			a[1] += result_mul.s1;
+			a[0] += result_mul.s0;
 
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
@@ -658,44 +773,55 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	if(ALGO == 6 || ALGO == 10)
 			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
 #	else
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
 #	endif
 
 #else
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 #endif
 
 			((uint4 *)a)[0] ^= tmp;
-			idx0 = a[0];
 
-			b_x = ((uint4 *)c)[0];
+// cryptonight_monero_v8
+#if (ALGO == 11)
+#	if defined(__NV_CL_C_VERSION)
+			// flush shuffled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+#	endif
+			b_x[1] = b_x[0];
+#endif
+			b_x[0] = ((uint4 *)c)[0];
+			idx0 = a[0] & MASK;
 
 // cryptonight_heavy || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 10)
-			long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
-			int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = n / (d | 0x5);
-			*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
-			idx0 = d ^ q;
-#endif
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = (d ^ q) & MASK;
 // cryptonight_haven
-#if (ALGO == 9)
-			long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
-			int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+#elif (ALGO == 9)
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = n / (d | 0x5);
-			*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
-			idx0 = (~d) ^ q;
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = ((~d) ^ q) & MASK;
 #endif
+
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
+)==="
+R"===(
+
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {
@@ -918,7 +1044,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
@@ -994,7 +1120,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
@@ -1072,7 +1198,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
@@ -1095,7 +1221,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 		#pragma unroll 4
 		for(uint i = 0; i < 4; ++i)
 		{
-			ulong H[8], M[8];
+			volatile ulong H[8], M[8];
 
 			if(i < 3)
 			{
@@ -1133,7 +1259,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
 		{
 			ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
-				output[outIdx] = BranchBuf[idx] + get_global_offset(0);
+				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
 		}
 	}
 }
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
new file mode 100644
index 0000000000000000000000000000000000000000..607806b7ad492d29fad7cfd3348b2a8218d17d1a
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -0,0 +1,108 @@
+R"===(
+/*
+ * @author SChernykh
+ */
+static const __constant uint RCP_C[256] =
+{
+	0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu,
+	0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au,
+	0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu,
+	0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u,
+	0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u,
+	0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u,
+	0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u,
+	0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u,
+	0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u,
+	0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u,
+	0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u,
+	0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u,
+	0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au,
+	0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u,
+	0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u,
+	0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu,
+	0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u,
+	0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u,
+	0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u,
+	0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u,
+	0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu,
+	0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u,
+	0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u,
+	0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u,
+	0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u,
+	0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu,
+	0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau,
+	0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u,
+	0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u,
+	0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu,
+	0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u,
+	0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u,
+};
+
+inline uint get_reciprocal(const __local uchar *RCP, uint a)
+{
+	const uint index1 = (a & 0x7F000000U) >> 21;
+	const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768;
+
+	const uint r1 = *(const __local uint*)(RCP + index1);
+
+	uint r2_0 = *(const __local uint*)(RCP + index1 + 4);
+	if (index2 > 0) r2_0 >>= 16;
+	const int r2 = r2_0 & 0xFFFFU;
+
+	const uint r = r1 - (uint)(mul24(r2, index2) >> 6);
+
+	const ulong lo0 = (ulong)(r) * a;
+	ulong lo = lo0 + ((ulong)(a) << 32);
+
+	a >>= 1;
+	const bool b = (a >= lo) || (lo >= lo0);
+	lo = a - lo;
+
+	const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo;
+	return as_uint2(k).s1 + (b ? r : 0);
+}
+
+inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b)
+{
+	const uint r = get_reciprocal((const __local uchar *)RCP, b);
+	const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a;
+
+	ulong q;
+	((uint*)&q)[0] = as_uint2(k).s1;;
+	((uint*)&q)[1] = (k < a) ? 1 : 0;
+
+	const long tmp = a - q * b;
+	const bool overshoot = (tmp < 0);
+	const bool undershoot = (tmp >= b);
+
+	return (uint2)(
+		as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U),
+		as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U)
+	);
+}
+
+inline uint fast_sqrt_v2(const ulong n1)
+{
+	float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23));
+
+	float x1 = native_rsqrt(x);
+	x = native_sqrt(x);
+
+	// The following line does x1 *= 4294967296.0f;
+	x1 = as_float(as_uint(x1) + (32U << 23));
+
+	const uint x0 = as_uint(x) - (158U << 23);
+	const long delta0 = n1 - (((long)(x0) * x0) << 18);
+	const float delta = convert_float_rte(as_int2(delta0).s1) * x1;
+
+	uint result = (x0 << 10) + convert_int_rte(delta);
+	const uint s = result >> 1;
+	const uint b = result & 1;
+
+	const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1;
+	if ((long)(x2 + b) > 0) --result;
+	if ((long)(x2 + 0x100000000UL + s) < 0) ++result;
+
+	return result;
+}
+)==="
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index d6acec97196a7af57a440d9ccb478372290b12bd..c5b331c872746cae4cb218eb934b532c1cb458dc 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -127,6 +127,24 @@ private:
 				minFreeMem = 512u * byteToMiB;
 			}
 
+			// check if cryptonight_monero_v8 is selected for the user or dev pool
+			bool useCryptonight_v8 =
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+			// set strided index to default
+			ctx.stridedIndex = 1;
+
+			// nvidia performance is very bad if the scratchpad is not contiguous
+			if(ctx.isNVIDIA)
+				ctx.stridedIndex = 0;
+
+			// use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2`
+			if(useCryptonight_v8)
+				ctx.stridedIndex = 2;
+
 			// increase all intensity limits by two for aeon
 			if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 				maxThreads *= 2u;
@@ -153,8 +171,8 @@ private:
 				// set 8 threads per block (this is a good value for the most gpus)
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-					"    \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n"
-					"    \"comp_mode\" : true\n" +
+					"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+					"    \"unroll\" : 8, \"comp_mode\" : true\n" +
 					"  },\n";
 			}
 			else
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 28855f070d8d5f7035ce419a46c938b7b35c56d5..421e0ed4bfeb3e084b8b16f94a2ab2000e8f7a65 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * GPU configuration. You should play around with intensity and worksize as the fastest settings will vary.
  * index         - GPU index number usually starts from 0
@@ -9,17 +10,20 @@ R"===(
  *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
  *                     required: intensity must be a multiple of worksize
  *                 1 or true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ *                             (for cryptonight_v8 and monero it is equal to strided_index = 0)
  *                 0 or false = use a contiguous block of memory per thread
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
  *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * unroll        - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two.
  * comp_mode     - Compatibility enable/disable the automatic guard around compute kernel which allows
  *                 to use a intensity which is not the multiple of the worksize.
  *                 If you set false and the intensity is not multiple of the worksize the miner can crash:
  *                 in this case set the intensity to a multiple of the worksize or activate comp_mode.
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+ *    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
  * ],
  * If you do not wish to mine with your AMD GPU(s) then use:
  * "gpu_threads_conf" :
@@ -34,5 +38,4 @@ GPUCONFIG
  * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
  */
 "platform_index" : PLATFORMINDEX,
-
 )==="
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0f39ff2b99d9cce4b3869a3917585ce86f94811f..152f8add43eed3f9028797db72653d45c1f8680b 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	stridedIndex = GetObjectMember(oThdConf, "strided_index");
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
+	unroll = GetObjectMember(oThdConf, "unroll");
 	compMode = GetObjectMember(oThdConf, "comp_mode");
 
 	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
-		stridedIndex == nullptr || compMode == nullptr)
+		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -142,13 +143,20 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
+	if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 )
+	{
+		printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
+		return false;
+	}
+
 	cfg.memChunk = (int)memChunk->GetInt64();
 
-	if(!idx->IsUint64() || cfg.memChunk > 18 )
+	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || (int)unroll->GetInt64() == 0)
 	{
-		printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
+		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and not zero");
 		return false;
 	}
+	cfg.unroll = (int)unroll->GetInt64();
 
 	if(!compMode->IsBool())
 		return false;
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 580b69fe70df23721d9d975ea54bd8ef03c238e6..b852c5940ef6c05d24afe3940f4e920f195d8836 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -28,6 +28,7 @@ public:
 		long long cpu_aff;
 		int stridedIndex;
 		int memChunk;
+		int unroll;
 		bool compMode;
 	};
 
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index f7b47249eeb0bb64aa5d7429404ef8948d7a6271..5e70f25a649f02e1058e21683bf191b0f02a4ecd 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -99,6 +99,7 @@ bool minethd::init_gpus()
 		vGpuData[i].stridedIndex = cfg.stridedIndex;
 		vGpuData[i].memChunk = cfg.memChunk;
 		vGpuData[i].compMode = cfg.compMode;
+		vGpuData[i].unroll = cfg.unroll;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -173,6 +174,11 @@ void minethd::work_main()
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
 
+	if(cpu_ctx == nullptr)
+	{
+		printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory, miner will be stopped.");
+		win_exit(1);
+	}
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
 	cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
@@ -235,7 +241,7 @@ void minethd::work_main()
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
-			
+
 
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
@@ -252,7 +258,7 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = results[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 3142117c509b98c3858a82e136d93f229ea627b0..32e66ec870cb43f244557545e8930a28ea36eb35 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -24,14 +24,14 @@ public:
 	static bool init_gpus();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg);
 
 	void work_main();
 
 	uint64_t iJobNo;
-	
+
 	miner_work oWork;
 
 	std::promise<void> order_fix;
diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp
index 525413fd5754816f121d963a2b7172202564e3b1..92bb015063eb46b3b1bdc494611a69e5d8ca9e60 100644
--- a/xmrstak/backend/backendConnector.cpp
+++ b/xmrstak/backend/backendConnector.cpp
@@ -63,10 +63,35 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 #ifndef CONF_NO_CUDA
 	if(params::inst().useNVIDIA)
 	{
-		plugin nvidiaplugin("NVIDIA", "xmrstak_cuda_backend");
-		std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-		pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
-		if(nvidiaThreads->size() == 0)
+		plugin nvidiaplugin;
+		std::vector<iBackend*>* nvidiaThreads;
+		std::vector<std::string> libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"};
+		size_t numWorkers = 0u;
+
+		for( const auto & name : libNames)
+		{
+			printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str());
+			nvidiaplugin.load("NVIDIA", name);
+			std::vector<iBackend*>* nvidiaThreads = nvidiaplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
+			if(nvidiaThreads != nullptr)
+			{
+				pvThreads->insert(std::end(*pvThreads), std::begin(*nvidiaThreads), std::end(*nvidiaThreads));
+				numWorkers = nvidiaThreads->size();
+				delete nvidiaThreads;
+			}
+			else
+			{
+				// remove the plugin if we have found no GPUs
+				nvidiaplugin.unload();
+			}
+			// we found at leat one working GPU
+			if(numWorkers != 0)
+			{
+				printer::inst()->print_msg(L0, "NVIDIA: use library '%s'", name.c_str());
+				break;
+			}
+		}
+		if(numWorkers == 0)
 			printer::inst()->print_msg(L0, "WARNING: backend NVIDIA disabled.");
 	}
 #endif
@@ -75,10 +100,17 @@ std::vector<iBackend*>* BackendConnector::thread_starter(miner_work& pWork)
 	if(params::inst().useAMD)
 	{
 		const std::string backendName = xmrstak::params::inst().openCLVendor;
-		plugin amdplugin(backendName, "xmrstak_opencl_backend");
+		plugin amdplugin;
+		amdplugin.load(backendName, "xmrstak_opencl_backend");
 		std::vector<iBackend*>* amdThreads = amdplugin.startBackend(static_cast<uint32_t>(pvThreads->size()), pWork, environment::inst());
-		pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
-		if(amdThreads->size() == 0)
+		size_t numWorkers = 0u;
+		if(amdThreads != nullptr)
+		{
+			pvThreads->insert(std::end(*pvThreads), std::begin(*amdThreads), std::end(*amdThreads));
+			numWorkers = amdThreads->size();
+			delete amdThreads;
+		}
+		if(numWorkers == 0)
 			printer::inst()->print_msg(L0, "WARNING: backend %s (OpenCL) disabled.", backendName.c_str());
 	}
 #endif
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 57dbef05358c301d90af5504f072ff23cd2035f5..e7f3e914800b77d9601f0b968ee741e76f4eca60 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -7,6 +7,7 @@
 #include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/params.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include <string>
 
 #ifdef _WIN32
@@ -20,14 +21,6 @@ namespace xmrstak
 {
 namespace cpu
 {
-// Mask bits between h and l and return the value
-// This enables us to put in values exactly like in the manual
-// For example EBX[31:22] is get_masked(cpu_info[1], 31, 22)
-inline int32_t get_masked(int32_t val, int32_t h, int32_t l)
-{
-	val &= (0x7FFFFFFF >> (31-(h-l))) << l;
-	return val >> l;
-}
 
 class autoAdjust
 {
@@ -58,7 +51,7 @@ public:
 			if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048))
 				printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size);
 
-			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true,  \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
 			printer::inst()->print_msg(L0, "Autoconf FAILED. Create config for a single thread. Please try to add new ones until the hashrate slows down.");
 		}
 		else
@@ -82,7 +75,7 @@ public:
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
@@ -143,7 +136,8 @@ private:
 			L3KB_size = get_masked(cpu_info[3], 31, 18) * 512;
 
 			::jconf::cpuid(1, 0, cpu_info);
-			if(get_masked(cpu_info[0], 11, 8) < 0x17) //0x17h is Zen
+
+			if(getModel().family < 0x17) //0x17h is Zen
 				old_amd = true;
 
 			return true;
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 01d2280d8f580fd7950bd2973b3743fd7ebb3f3a..b61582588130801013f4a191659da7a669a139f3 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ public:
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
@@ -78,7 +78,8 @@ public:
 		catch(const std::runtime_error& err)
 		{
 			// \todo add fallback to default auto adjust
-			conf += std::string("    { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
+			conf += std::string("    { \"low_power_mode\" : false");
+			conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : false },\n");
 			printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what());
 		}
 
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 2fc9a47ec5a92f3c90d3ee89cc0c9aedc4326958..1a64860e4f3825731595cdc133b1d53ae7fb8111 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * Thread configuration for each thread. Make sure it matches the number above.
  * low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true,
@@ -7,10 +8,16 @@ R"===(
  *                  the maximum performance. When set to a number N greater than 1, this mode will increase the
  *                  cache usage and single thread performance by N times.
  *
- * no_prefetch -    Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
+ * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * affine_to_cpu -  This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * asm            - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, off, intel_avx, amd_avx]
+ *                    - auto: xmr-stak will automatically detect the asm type (default)
+ *                    - off: disable the usage of optimized assembler
+ *                    - intel_avx: supports Intel cpus with avx instructions e.g. Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx
+ *                    - amd_avx: supports AMD cpus with avx instructions e.g. AMD Ryzen 1xxx and 2xxx series
+ *
+ * affine_to_cpu  - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
  *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting
  *                  even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
  *                  physical core CPU you should select cpu numbers 0-3.
@@ -21,8 +28,8 @@ R"===(
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
  * [
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
  * ],
  * If you do not wish to mine with your CPU(s) then use:
  * "cpu_threads_conf" :
@@ -33,5 +40,4 @@ R"===(
 [
 CPUCONFIG
 ],
-
 )==="
diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5959b75cc7aa753a4028115ec41539fe4b328b69
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.cpp
@@ -0,0 +1,79 @@
+
+#include "xmrstak/backend/cpu/cpuType.hpp"
+
+#include <cstring>
+#include <inttypes.h>
+#include <cstdio>
+
+#ifdef _WIN32
+#define strcasecmp _stricmp
+#include <intrin.h>
+#else
+#include <cpuid.h>
+#endif
+
+namespace xmrstak
+{
+namespace cpu
+{
+	void cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
+	{
+		std::memset(val, 0, sizeof(int32_t)*4);
+
+	#ifdef _WIN32
+		__cpuidex(val, eax, ecx);
+	#else
+		__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
+	#endif
+	}
+
+	int32_t get_masked(int32_t val, int32_t h, int32_t l)
+	{
+		val &= (0x7FFFFFFF >> (31-(h-l))) << l;
+		return val >> l;
+	}
+
+	bool has_feature(int32_t val, int32_t bit)
+	{
+		int32_t mask = 1 << bit;
+		return (val & mask) != 0u;
+		
+	}
+	
+	Model getModel()
+	{
+		int32_t cpu_info[4];
+		char cpustr[13] = {0};
+
+		cpuid(0, 0, cpu_info);
+		std::memcpy(cpustr, &cpu_info[1], 4);
+		std::memcpy(cpustr+4, &cpu_info[3], 4);
+		std::memcpy(cpustr+8, &cpu_info[2], 4);
+
+		Model result;
+
+		cpuid(1, 0, cpu_info);
+		
+		result.family = get_masked(cpu_info[0], 12, 8);
+		result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4;
+		result.type_name = cpustr;
+
+		// feature bits https://en.wikipedia.org/wiki/CPUID
+		// sse2
+		result.sse2 = has_feature(cpu_info[3], 26);
+		// aes-ni
+		result.aes = has_feature(cpu_info[2], 25);
+		// avx
+		result.avx = has_feature(cpu_info[2], 28);	
+
+		if(strcmp(cpustr, "AuthenticAMD") == 0)
+		{
+			if(result.family == 0xF)
+				result.family += get_masked(cpu_info[0], 28, 20);
+		}
+
+		return result;
+	}
+
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f6bfaf51b663f5ff0e22134463d51c9e2dea70c
--- /dev/null
+++ b/xmrstak/backend/cpu/cpuType.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <string>
+#include <cstdint>
+
+
+namespace xmrstak
+{
+namespace cpu
+{
+	struct Model
+	{
+		uint32_t family = 0u;
+		uint32_t model = 0u;
+		bool aes = false;
+		bool sse2 = false;
+		bool avx = false;
+		std::string type_name = "unknown";
+	};
+
+	Model getModel();
+
+	/** Mask bits between h and l and return the value
+	 *
+	 * This enables us to put in values exactly like in the manual
+	 * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22)
+	 */
+	int32_t get_masked(int32_t val, int32_t h, int32_t l);
+
+	
+} // namespace cpu
+} // namespace xmrstak
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
new file mode 100644
index 0000000000000000000000000000000000000000..79adab671e1bbb0f0be1178764282817267049fb
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_linux.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+ALIGN 16
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movq xmm0, rdx
+	xor rdx, [r11+r13]
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movq r9, xmm10
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp
+
+div_fix_1_sandybridge:
+	dec	rbx
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+	dec	rdx
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+	movq	r8, xmm3
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+	psrldq	xmm3, 8
+	movq	r11, xmm3
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ad8f1823309c2cc2f77391e87de6e6de8b36de84
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_double_main_loop_sandybridge_win64.inc
@@ -0,0 +1,410 @@
+	mov	rax, rsp
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]
+	mov DWORD PTR [rsp+276], 24448
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]
+	mov	r9, rdx
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx
+	xor	r10, QWORD PTR [rcx]
+	mov	r14d, 524288
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rsi, QWORD PTR [rdx+224]
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]
+	movd	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movd	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movd xmm14, rax
+	punpcklqdq xmm14, xmm14
+
+	mov eax, 1023
+	shl rax, 52
+	movd xmm12, rax
+	punpcklqdq xmm12, xmm12
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0
+	movd	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movd	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp], r13
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movd	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0
+	movd	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movd	xmm8, rax
+	and	ecx, 2097136
+	punpcklqdq xmm8, xmm0
+	movd	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]
+	punpcklqdq xmm5, xmm0
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]
+
+	ALIGN 64
+main_loop_double_sandybridge:
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movd	xmm0, r11
+	movd	xmm2, r10
+	punpcklqdq xmm2, xmm0
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movd	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movd	xmm0, rbp
+	movd	xmm11, rdi
+	punpcklqdq xmm11, xmm0
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movd	rcx, xmm10
+	and	ecx, 2097136
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movd	rdx, xmm5
+	shl	rdx, 32
+	movd	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx
+	mov	rax, r10
+	mul	r11
+	mov r11d, r8d
+	xor r11d, 48
+	movd xmm0, rdx
+	xor rdx, [r11+r13]
+	movd xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]
+	lea	r13, QWORD PTR [rsi+rcx]
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movd	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movd	r11, xmm0
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movd	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10
+	paddq	xmm5, xmm3
+	movd	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movd	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax
+	js	div_fix_1_sandybridge
+div_fix_1_ret_sandybridge:
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0
+	movd	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge
+sqrt_fix_1_ret_sandybridge:
+
+	movd r9, xmm10
+	psrldq	xmm1, 8
+	movd	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9
+	movd xmm0, rax
+	movd xmm3, rdx
+	punpcklqdq xmm3, xmm0
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]
+	movdqa	xmm3, xmm7
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp
+
+div_fix_1_sandybridge:
+	dec	rbx
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:
+	dec	rdx
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:
+	movd	r8, xmm3
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0
+	movd xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:
+	psrldq	xmm3, 8
+	movd	r11, xmm3
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movd xmm0, r8
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
new file mode 100644
index 0000000000000000000000000000000000000000..c0a3d0b41498ee419deca860ea1c307407a5d0ab
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S
@@ -0,0 +1,37 @@
+#define ALIGN .align
+.intel_syntax noprefix
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
+.section .text
+#endif
+.global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm)
+.global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm)
+.global FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm)
+
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm):
+	sub rsp, 48
+	mov rcx, rdi
+	#include "cryptonight_v8_main_loop_ivybridge_linux.inc"
+	add rsp, 48
+	ret 0
+
+ALIGN 8
+FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm):
+	sub rsp, 48
+	mov rcx, rdi
+	#include "cryptonight_v8_main_loop_ryzen_linux.inc"
+	add rsp, 48
+	ret 0
+
+ALIGN 16
+FN_PREFIX(cryptonight_v8_double_mainloop_sandybridge_asm):
+	sub rsp, 48
+	mov rcx, rdi
+	mov rdx, rsi
+	#include "cryptonight_v8_double_main_loop_sandybridge_linux.inc"
+	add rsp, 48
+	ret 0
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
new file mode 100644
index 0000000000000000000000000000000000000000..1f3d2e15cebb3268ce566f62921f40ce9eafafc7
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm
@@ -0,0 +1,25 @@
+_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE
+PUBLIC cryptonight_v8_mainloop_ivybridge_asm
+PUBLIC cryptonight_v8_mainloop_ryzen_asm
+PUBLIC cryptonight_v8_double_mainloop_sandybridge_asm
+
+ALIGN 8
+cryptonight_v8_mainloop_ivybridge_asm PROC
+	INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc
+	ret 0
+cryptonight_v8_mainloop_ivybridge_asm ENDP
+
+ALIGN 8
+cryptonight_v8_mainloop_ryzen_asm PROC
+	INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc
+	ret 0
+cryptonight_v8_mainloop_ryzen_asm ENDP
+
+ALIGN 8
+cryptonight_v8_double_mainloop_sandybridge_asm PROC
+	INCLUDE cryptonight_v8_double_main_loop_sandybridge_win64.inc
+	ret 0
+cryptonight_v8_double_mainloop_sandybridge_asm ENDP
+
+_TEXT_CNV8_MAINLOOP ENDS
+END
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
new file mode 100644
index 0000000000000000000000000000000000000000..cbe43b0d3c22c1b47b2d0a09592ba50b44289c09
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc
@@ -0,0 +1,187 @@
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movq	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movq	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movq	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movq xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movq	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+
+	ALIGN 8
+main_loop_ivybridge:
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movq	 xmm0, r11
+	movq	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movq	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movq	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movq	 rax, xmm0
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movq	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+	movq	 rdx, xmm3
+	test	 edx, 524287
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp
+	movq xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movq xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+	dec	 rdx
+	mov r13d, -1022
+	shl r13, 32
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movq	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
new file mode 100755
index 0000000000000000000000000000000000000000..8d49c5db7f45734496eede2495784e2a090d4f74
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc
@@ -0,0 +1,187 @@
+	mov	 QWORD PTR [rsp+24], rbx
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647
+	xor	 r8, QWORD PTR [rcx]
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movd	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]
+	mov	 rbx, QWORD PTR [rcx+224]
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movd	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136
+	movd	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax
+
+	mov ax, 1023
+	shl rax, 52
+	movd xmm8, rax
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0
+	movd	 xmm0, rcx
+	punpcklqdq xmm5, xmm0
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]
+
+	ALIGN 8
+main_loop_ivybridge:
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movd	 xmm0, r11
+	movd	 xmm7, r8
+	punpcklqdq xmm7, xmm0
+	aesenc	 xmm6, xmm7
+	movd	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movd	 rcx, xmm3
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d
+	movd	 rax, xmm0
+	div	 r9
+	xorps xmm3, xmm3
+	mov	 eax, eax
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]
+	mov r15, rdx
+	mov	 rax, r9
+	shr	 rax, 12
+	movd	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+	movd	 rdx, xmm3
+	test	 edx, 524287
+	je	 sqrt_fixup_ivybridge
+	psrlq	 xmm3, 19
+	psubq	 xmm3, XMMWORD PTR [rsp+16]
+sqrt_fixup_ivybridge_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp
+	movd xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136
+	movd xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]
+	mov	 rbx, QWORD PTR [rsp+160]
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp
+
+sqrt_fixup_ivybridge:
+	dec	 rdx
+	mov r13d, -1022
+	shl r13, 32
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0
+	movd	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
new file mode 100644
index 0000000000000000000000000000000000000000..cd8b43477ce279f4232ea59fc5683c6378f4ff51
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc
@@ -0,0 +1,179 @@
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movq xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 8
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movq	xmm0, r11
+	movq	xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movq	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movq	rax, xmm0
+
+	div	r9
+	movq xmm0, rax
+	movq xmm1, rdx
+	punpckldq xmm0, xmm1
+	movq r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movq	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+	movq r9, xmm2
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
new file mode 100755
index 0000000000000000000000000000000000000000..d103cc2eee2f20f13131c903812ad5a851940b22
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc
@@ -0,0 +1,179 @@
+	mov	QWORD PTR [rsp+16], rbx
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]
+	mov DWORD PTR [rsp+4], 24448
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movd	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]
+	mov	rbx, QWORD PTR [rcx+224]
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movd	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movd	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8
+	mov ax, 1023
+	shl rax, 52
+	movd xmm7, rax
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0
+	movd	xmm0, rcx
+	punpcklqdq xmm4, xmm0
+
+	ALIGN 8
+main_loop_ryzen:
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movd	xmm0, r11
+	movd	xmm6, r8
+	punpcklqdq xmm6, xmm0
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi
+	movd	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136
+	movdqa	XMMWORD PTR [rdx], xmm0
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movd	rax, xmm0
+
+	div	r9
+	movd xmm0, rax
+	movd xmm1, rdx
+	punpckldq xmm0, xmm1
+	movd r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movd	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_ryzen
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14
+	movd xmm1, rax
+	movd xmm0, rdx
+	punpcklqdq xmm0, xmm1
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]
+	mov	rbx, QWORD PTR [r11+56]
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp
+
+sqrt_fixup_ryzen:
+	movd r9, xmm2
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 9f70bcfa709d2a8be889c30f3766a6a812114d82..2b17417644fdb00f99624a83ff5cbd8ad37cf1f0 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -19,6 +19,8 @@
 #include "xmrstak/backend/cryptonight.hpp"
 #include <memory.h>
 #include <stdio.h>
+#include <cfenv>
+#include <utility>
 
 #ifdef __GNUC__
 #include <x86intrin.h>
@@ -151,15 +153,15 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
 
 inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
 {
-    __m128i tmp0 = x0;
-    x0 = _mm_xor_si128(x0, x1);
-    x1 = _mm_xor_si128(x1, x2);
-    x2 = _mm_xor_si128(x2, x3);
-    x3 = _mm_xor_si128(x3, x4);
-    x4 = _mm_xor_si128(x4, x5);
-    x5 = _mm_xor_si128(x5, x6);
-    x6 = _mm_xor_si128(x6, x7);
-    x7 = _mm_xor_si128(x7, tmp0);
+	__m128i tmp0 = x0;
+	x0 = _mm_xor_si128(x0, x1);
+	x1 = _mm_xor_si128(x1, x2);
+	x2 = _mm_xor_si128(x2, x3);
+	x3 = _mm_xor_si128(x3, x4);
+	x4 = _mm_xor_si128(x4, x5);
+	x5 = _mm_xor_si128(x5, x6);
+	x6 = _mm_xor_si128(x6, x7);
+	x7 = _mm_xor_si128(x7, tmp0);
 }
 
 template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
@@ -422,6 +424,29 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 	_mm_store_si128(output + 11, xout7);
 }
 
+inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0)
+{
+	__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
+	x = _mm_sqrt_sd(_mm_setzero_pd(), x);
+	uint64_t r = static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
+
+	const uint64_t s = r >> 20;
+	r >>= 19;
+
+	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+
+#ifdef __INTEL_COMPILER
+	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned __int64*)&x2), r, 0, (unsigned __int64*)&r);
+#elif defined(_MSC_VER) || (__GNUC__ >= 7)
+	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
+#else
+	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
+	// Fallback to simpler code
+	if (x2 < n0) ++r;
+#endif
+	return r;
+}
+
 inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 {
 	alignas(16) uint32_t k[4];
@@ -467,712 +492,511 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 
 }
 
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
+/** optimal type for sqrt
+ *
+ * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected.
+ *
+ * @tparam N number of hashes per thread
+ */
+template<size_t N>
+struct GetOptimalSqrtType
 {
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
-
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
-	{
-		memset(output, 0, 32);
-		return;
-	}
-
-	keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
+	using type = __m128i;
+};
 
-	uint64_t monero_const;
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-	{
-		monero_const  =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35);
-		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx0->hash_state) + 24);
-	}
-
-	// Optim - 99% time boundary
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
-
-	uint8_t* l0 = ctx0->long_state;
-	uint64_t* h0 = (uint64_t*)ctx0->hash_state;
-
-	uint64_t al0 = h0[0] ^ h0[4];
-	uint64_t ah0 = h0[1] ^ h0[5];
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-
-	uint64_t idx0 = h0[0] ^ h0[4];
-
-	// Optim - 90% time boundary
-	for(size_t i = 0; i < ITERATIONS; i++)
-	{
-		__m128i cx;
-		cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
+template<>
+struct GetOptimalSqrtType<1u>
+{
+	using type = uint64_t;
+};
+template<size_t N>
+using GetOptimalSqrtType_t = typename GetOptimalSqrtType<N>::type;
+
+/** assign a value and convert if necessary
+ *
+ * @param output output type
+ * @param input value which is assigned to output
+ * @{
+ */
+inline void assign(__m128i& output, const uint64_t input)
+{
+	output = _mm_cvtsi64_si128(input);
+}
 
-		if (ALGO == cryptonight_bittube2)
-		{
-			cx = aes_round_bittube2(cx, _mm_set_epi64x(ah0, al0));
-		} 
-		else
-		{
-			if(SOFT_AES)
-				cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
-			else
-				cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
-		}
+inline void assign(uint64_t& output, const uint64_t input)
+{
+	output = input;
+}
 
-		if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-			cryptonight_monero_tweak<ALGO>((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-		else
-			_mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+inline void assign(uint64_t& output, const __m128i& input)
+{
+	output = _mm_cvtsi128_si64(input);
+}
+/** @} */
 
-		idx0 = _mm_cvtsi128_si64(cx);
+inline void set_float_rounding_mode()
+{
+#ifdef _MSC_VER
+	_control87(RC_DOWN, MCW_RC);
+#else
+	std::fesetround(FE_DOWNWARD);
+#endif
+}
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
-		bx0 = cx;
+#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
 
-		uint64_t hi, lo, cl, ch;
-		cl = ((uint64_t*)&l0[idx0 & MASK])[0];
-		ch = ((uint64_t*)&l0[idx0 & MASK])[1];
+#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		hi ^= ((uint64_t*)&chunk2)[0]; \
+		lo ^= ((uint64_t*)&chunk2)[1]; \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
 
-		lo = _umul128(idx0, cl, &hi);
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		uint64_t sqrt_result_tmp; \
+		assign(sqrt_result_tmp, sqrt_result); \
+		/* Use division and square root results from the _previous_ iteration to hide the latency */ \
+		const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
+		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \
+		const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \
+		/* Most and least significant bits in the divisor are set to 1 \
+		 * to make sure we don't divide by a small or even number, \
+		 * so there are no shortcuts for such cases \
+		 * \
+		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \
+		 * We drop the highest bit to fit both quotient and remainder in 32 bits \
+		 */  \
+		/* Compiler will optimize it to a single div instruction */ \
+		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
+		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+		/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
+		assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \
+	}
 
-		al0 += hi;
-		((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-		al0 ^= cl;
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0);
-		ah0 += lo;
+#define CN_INIT_SINGLE \
+	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \
+	{ \
+		memset(output, 0, 32 * N); \
+		return; \
+	}
 
-		if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) {
-			if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
-				((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const ^ ((uint64_t*)&l0[idx0 & MASK])[0];
-			else
-				((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const;
-		}
-		else
-			((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
-		ah0 ^= ch;
+#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \
+	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
+	uint64_t monero_const; \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{ \
+		monero_const =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len * n + 35); \
+		monero_const ^=  *(reinterpret_cast<const uint64_t*>(ctx[n]->hash_state) + 24); \
+	} \
+	/* Optim - 99% time boundary */ \
+	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \
+	\
+	__m128i ax0; \
+	uint64_t idx0; \
+	__m128i bx0; \
+	uint8_t* l0 = ctx[n]->long_state; \
+	/* BEGIN cryptonight_monero_v8 variables */ \
+	__m128i bx1; \
+	__m128i division_result_xmm; \
+	GetOptimalSqrtType_t<N> sqrt_result; \
+	/* END cryptonight_monero_v8 variables */ \
+	{ \
+		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
+		idx0 = h0[0] ^ h0[4]; \
+		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
+		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
+		if(ALGO == cryptonight_monero_v8) \
+		{ \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
+			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
+			assign(sqrt_result, h0[13]); \
+			set_float_rounding_mode(); \
+		} \
+	} \
+	__m128i *ptr0
 
-		idx0 = al0;
+#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \
+	__m128i cx; \
+	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+	cx = _mm_load_si128(ptr0); \
+	if (ALGO == cryptonight_bittube2) \
+	{ \
+		cx = aes_round_bittube2(cx, ax0); \
+	} \
+	else \
+	{ \
+		if(SOFT_AES) \
+			cx = soft_aesenc(cx, ax0); \
+		else \
+			cx = _mm_aesenc_si128(cx, ax0); \
+	} \
+	CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1)
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
-		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
+#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \
+	else \
+		_mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \
+	idx0 = _mm_cvtsi128_si64(cx); \
+	\
+	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+	if(PREFETCH) \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
+	if(ALGO != cryptonight_monero_v8) \
+		bx0 = cx
+
+#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
+	uint64_t lo, cl, ch; \
+	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
+	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
+	cl = ((uint64_t*)ptr0)[0]; \
+	ch = ((uint64_t*)ptr0)[1]; \
+	CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \
+	{ \
+		uint64_t hi; \
+		lo = _umul128(idx0, cl, &hi); \
+		CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \
+		ah0 += lo; \
+		al0 += hi; \
+	} \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		bx1 = bx0; \
+		bx0 = cx; \
+	} \
+	((uint64_t*)ptr0)[0] = al0; \
+	if(PREFETCH) \
+		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
 
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = d ^ q;
-		}
-		else if(ALGO == cryptonight_haven)
-		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
+#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
+	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
+	{ \
+		if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \
+		else \
+			((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \
+	} \
+	else \
+		((uint64_t*)ptr0)[1] = ah0; \
+	al0 ^= cl; \
+	ah0 ^= ch; \
+	ax0 = _mm_set_epi64x(ah0, al0); \
+	idx0 = al0;
 
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = (~d) ^ q;
-		}
+#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
+	{ \
+		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+		int64_t u  = ((int64_t*)ptr0)[0]; \
+		int32_t d  = ((int32_t*)ptr0)[2]; \
+		int64_t q = u / (d | 0x5); \
+		\
+		((int64_t*)ptr0)[0] = u ^ q; \
+		idx0 = d ^ q; \
+	} \
+	else if(ALGO == cryptonight_haven) \
+	{ \
+		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
+		int64_t u  = ((int64_t*)ptr0)[0]; \
+		int32_t d  = ((int32_t*)ptr0)[2]; \
+		int64_t q = u / (d | 0x5); \
+		\
+		((int64_t*)ptr0)[0] = u ^ q; \
+		idx0 = (~d) ^ q; \
 	}
 
-	// Optim - 90% time boundary
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
-
-	// Optim - 99% time boundary
+#define CN_FINALIZE(n) \
+	/* Optim - 90% time boundary */ \
+	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \
+	/* Optim - 99% time boundary */ \
+	keccakf((uint64_t*)ctx[n]->hash_state, 24); \
+	extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n)
 
-	keccakf((uint64_t*)ctx0->hash_state, 24);
-	extra_hashes[ctx0->hash_state[0] & 3](ctx0->hash_state, 200, (char*)output);
-}
+//! defer the evaluation of an macro
+#ifndef _MSC_VER
+#	define CN_DEFER(...) __VA_ARGS__
+#else
+#	define CN_EMPTY(...)
+#	define CN_DEFER(...) __VA_ARGS__ CN_EMPTY()
+#endif
 
-// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
-// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
-// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+//! execute the macro f with the passed arguments
+#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__)
+
+/** add append n to all arguments and keeps n as first argument
+ *
+ * @param n number which is appended to the arguments (expect the first argument n)
+ *
+ * @code{.cpp}
+ * CN_ENUM_2(1, foo, bar)
+ * // is transformed to
+ * 1, foo1, bar1
+ * @endcode
+ */
+#define CN_ENUM_0(n, ...) n
+#define CN_ENUM_1(n, x1) n, x1 ## n
+#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n
+#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n
+#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n
+#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n
+#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n
+#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n
+#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n
+#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n
+#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
+#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
+#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
+#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n
+#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n
+#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n
+
+/** repeat a macro call multiple times
+ *
+ * @param n number of arguments followed after f
+ * @param f name of the macro which should be executed
+ * @param ... n parameter which name will get appended by a unique number
+ *
+ * @code{.cpp}
+ * REPEAT_2(2, f, foo, bar)
+ * // is transformed to
+ * f(0, foo0, bar); f(1, foo1, bar1)
+ * @endcode
+ */
+#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__))
+#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__))
+#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__))
+#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__))
+#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__))
+
+template< size_t N>
+struct Cryptonight_hash;
+
+template< >
+struct Cryptonight_hash<1>
 {
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 1;
 
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		memset(output, 0, 64);
-		return;
-	}
-
-	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
-	keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
-
-	uint64_t monero_const_0, monero_const_1;
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-	{
-		monero_const_0  =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35);
-		monero_const_0 ^=  *(reinterpret_cast<const uint64_t*>(ctx[0]->hash_state) + 24);
-		monero_const_1  =  *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + len + 35);
-		monero_const_1 ^=  *(reinterpret_cast<const uint64_t*>(ctx[1]->hash_state) + 24);
-	}
-
-	// Optim - 99% time boundary
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
-	cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
+		CN_INIT_SINGLE;
+		REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
-	uint64_t axl0 = h0[0] ^ h0[4];
-	uint64_t axh0 = h0[1] ^ h0[5];
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	uint64_t axl1 = h1[0] ^ h1[4];
-	uint64_t axh1 = h1[1] ^ h1[5];
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-	uint64_t idx0 = h0[0] ^ h0[4];
-	uint64_t idx1 = h1[0] ^ h1[4];
-
-	// Optim - 90% time boundary
-	for (size_t i = 0; i < ITERATIONS; i++)
-	{
-		__m128i cx;
-		cx = _mm_load_si128((__m128i *)&l0[idx0 & MASK]);
-
-		if (ALGO == cryptonight_bittube2)
-		{
-			cx = aes_round_bittube2(cx, _mm_set_epi64x(axh0, axl0));
-		} 
-		else
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			if(SOFT_AES)
-				cx = soft_aesenc(cx, _mm_set_epi64x(axh0, axl0));
-			else
-				cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0));
+			REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
 
-		if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-			cryptonight_monero_tweak<ALGO>((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-		else
-			_mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+		REPEAT_1(0, CN_FINALIZE);
+	}
+};
 
-		idx0 = _mm_cvtsi128_si64(cx);
-		bx0 = cx;
+template< >
+struct Cryptonight_hash<2>
+{
+	static constexpr size_t N = 2;
 
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-		cx = _mm_load_si128((__m128i *)&l1[idx1 & MASK]);
+		CN_INIT_SINGLE;
+		REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
-		if (ALGO == cryptonight_bittube2)
-		{
-			cx = aes_round_bittube2(cx, _mm_set_epi64x(axh1, axl1));
-		} 
-		else
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			if(SOFT_AES)
-				cx = soft_aesenc(cx, _mm_set_epi64x(axh1, axl1));
-			else
-				cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1));
+			REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
 
-		if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
-			cryptonight_monero_tweak<ALGO>((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
-		else
-			_mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
-
-		idx1 = _mm_cvtsi128_si64(cx);
-		bx1 = cx;
-
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
-
-		uint64_t hi, lo, cl, ch;
-		cl = ((uint64_t*)&l0[idx0 & MASK])[0];
-		ch = ((uint64_t*)&l0[idx0 & MASK])[1];
-
-		lo = _umul128(idx0, cl, &hi);
+		REPEAT_2(0, CN_FINALIZE);
+	}
+};
 
-		axl0 += hi;
-		axh0 += lo;
-		((uint64_t*)&l0[idx0 & MASK])[0] = axl0;
+template< >
+struct Cryptonight_hash<3>
+{
+	static constexpr size_t N = 3;
 
-		if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) {
-			if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
-				((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0 ^ ((uint64_t*)&l0[idx0 & MASK])[0];
-			else
-				((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0;
-		} else
-			((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-		axh0 ^= ch;
-		axl0 ^= cl;
-		idx0 = axl0;
+		CN_INIT_SINGLE;
+		REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
-
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = d ^ q;
+			REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
-		else if(ALGO == cryptonight_haven)
-		{
-			int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-			int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-			int64_t q = n / (d | 0x5);
-
-			((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-			idx0 = (~d) ^ q;
-		}
-
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
-
-		cl = ((uint64_t*)&l1[idx1 & MASK])[0];
-		ch = ((uint64_t*)&l1[idx1 & MASK])[1];
 
-		lo = _umul128(idx1, cl, &hi);
+		REPEAT_3(0, CN_FINALIZE);
+	}
+};
 
-		axl1 += hi;
-		axh1 += lo;
-		((uint64_t*)&l1[idx1 & MASK])[0] = axl1;
+template< >
+struct Cryptonight_hash<4>
+{
+	static constexpr size_t N = 4;
 
-		if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) {
-			if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
-				((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1 ^ ((uint64_t*)&l1[idx1 & MASK])[0];
-			else
-				((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1;
-		} else
-			((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-		axh1 ^= ch;
-		axl1 ^= cl;
-		idx1 = axl1;
+		CN_INIT_SINGLE;
+		REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			int64_t n  = ((int64_t*)&l1[idx1 & MASK])[0];
-			int32_t d  = ((int32_t*)&l1[idx1 & MASK])[2];
-			int64_t q = n / (d | 0x5);
-
-			((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-			idx1 = d ^ q;
+			REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
-		else if(ALGO == cryptonight_haven)
-		{
-			int64_t n  = ((int64_t*)&l1[idx1 & MASK])[0];
-			int32_t d  = ((int32_t*)&l1[idx1 & MASK])[2];
-			int64_t q = n / (d | 0x5);
 
-			((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-			idx1 = (~d) ^ q;
-		}
-
-		if(PREFETCH)
-			_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
+		REPEAT_4(0, CN_FINALIZE);
 	}
+};
 
-	// Optim - 90% time boundary
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
-	cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
-
-	// Optim - 99% time boundary
-
-	keccakf((uint64_t*)ctx[0]->hash_state, 24);
-	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
-	keccakf((uint64_t*)ctx[1]->hash_state, 24);
-	extra_hashes[ctx[1]->hash_state[0] & 3](ctx[1]->hash_state, 200, (char*)output + 32);
-}
+template< >
+struct Cryptonight_hash<5>
+{
+	static constexpr size_t N = 5;
 
-#define CN_STEP1(a, b, c, l, ptr, idx)				\
-	ptr = (__m128i *)&l[idx & MASK];			\
-	if(PREFETCH)						\
-		_mm_prefetch((const char*)ptr, _MM_HINT_T0);	\
-	c = _mm_load_si128(ptr);
-
-#define CN_STEP2(a, b, c, l, ptr, idx)				\
-	if (ALGO == cryptonight_bittube2)	\
-	{	\
-		c = aes_round_bittube2(c, a);	\
-	}	\
-	else	\
-	{	\
-		if(SOFT_AES)					\
-			c = soft_aesenc(c, a);			\
-		else						\
-			c = _mm_aesenc_si128(c, a);		\
-	} 							\
-	b = _mm_xor_si128(b, c);				\
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-		cryptonight_monero_tweak<ALGO>((uint64_t*)ptr, b); \
-	else \
-		_mm_store_si128(ptr, b);\
-
-#define CN_STEP3(a, b, c, l, ptr, idx)				\
-	idx = _mm_cvtsi128_si64(c);				\
-	ptr = (__m128i *)&l[idx & MASK];			\
-	if(PREFETCH)						\
-		_mm_prefetch((const char*)ptr, _MM_HINT_T0);	\
-	b = _mm_load_si128(ptr);
-
-#define CN_STEP4(a, b, c, l, mc, ptr, idx)				\
-	lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi);		\
-	a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi));		\
-	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
-	{ \
-		_mm_store_si128(ptr, _mm_xor_si128(a, mc)); \
-		if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \
-			((uint64_t*)ptr)[1] ^= ((uint64_t*)ptr)[0];\
-	} \
-	else \
-		_mm_store_si128(ptr, a);\
-	a = _mm_xor_si128(a, b); \
-	idx = _mm_cvtsi128_si64(a);	\
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \
-	{ \
-		int64_t n  = ((int64_t*)&l[idx & MASK])[0]; \
-		int32_t d  = ((int32_t*)&l[idx & MASK])[2]; \
-		int64_t q = n / (d | 0x5); \
-		((int64_t*)&l[idx & MASK])[0] = n ^ q; \
-		idx = d ^ q; \
-	} \
-	else if(ALGO == cryptonight_haven) \
-	{ \
-		int64_t n  = ((int64_t*)&l[idx & MASK])[0]; \
-		int32_t d  = ((int32_t*)&l[idx & MASK])[2]; \
-		int64_t q = n / (d | 0x5); \
-		((int64_t*)&l[idx & MASK])[0] = n ^ q; \
-		idx = (~d) ^ q; \
-	}
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-#define CONST_INIT(ctx, n) \
-	__m128i mc##n = _mm_set_epi64x(*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + n * len + 35) ^ \
-	*(reinterpret_cast<const uint64_t*>((ctx)->hash_state) + 24), 0);
+		CN_INIT_SINGLE;
+		REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm);
 
-// This lovelier creation will do 3 cn hashes at a time.
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
-{
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+		// Optim - 90% time boundary
+		for(size_t i = 0; i < ITERATIONS; i++)
+		{
+			REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
+			REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm);
+			REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
+		}
 
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
-	{
-		memset(output, 0, 32 * 3);
-		return;
+		REPEAT_5(0, CN_FINALIZE);
 	}
+};
 
-	for (size_t i = 0; i < 3; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1);
 
-	CONST_INIT(ctx[0], 0);
-	CONST_INIT(ctx[1], 1);
-	CONST_INIT(ctx[2], 2);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-
-	uint64_t idx0, idx1, idx2;
-	idx0 = _mm_cvtsi128_si64(ax0);
-	idx1 = _mm_cvtsi128_si64(ax1);
-	idx2 = _mm_cvtsi128_si64(ax2);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
-	{
-		uint64_t hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-
-		CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-
-		CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
-	}
 
-	for (size_t i = 0; i < 3; i++)
-	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
-	}
-}
+template< size_t N, size_t asm_version>
+struct Cryptonight_hash_asm;
 
-// This even lovelier creation will do 4 cn hashes at a time.
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+template<size_t asm_version>
+struct Cryptonight_hash_asm<1, asm_version>
 {
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 1;
 
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		memset(output, 0, 32 * 4);
-		return;
-	}
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	for (size_t i = 0; i < 4; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+		cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
 
-	CONST_INIT(ctx[0], 0);
-	CONST_INIT(ctx[1], 1);
-	CONST_INIT(ctx[2], 2);
-	CONST_INIT(ctx[3], 3);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-	uint8_t* l3 = ctx[3]->long_state;
-	uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
-	__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-	__m128i cx3 = _mm_set_epi64x(0, 0);
-
-	uint64_t idx0, idx1, idx2, idx3;
-	idx0 = _mm_cvtsi128_si64(ax0);
-	idx1 = _mm_cvtsi128_si64(ax1);
-	idx2 = _mm_cvtsi128_si64(ax2);
-	idx3 = _mm_cvtsi128_si64(ax3);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
-	{
-		uint64_t hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2, *ptr3;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
-
-		CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
-
-		CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
-	}
+		if(asm_version == 0)
+			cryptonight_v8_mainloop_ivybridge_asm(ctx[0]);
+		else if(asm_version == 1)
+			cryptonight_v8_mainloop_ryzen_asm(ctx[0]);
 
-	for (size_t i = 0; i < 4; i++)
-	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+		keccakf((uint64_t*)ctx[0]->hash_state, 24);
+		extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
 	}
-}
+};
 
-// This most lovely creation will do 5 cn hashes at a time.
-template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
-void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+// double hash only for intel
+template< >
+struct Cryptonight_hash_asm<2, 0>
 {
-	constexpr size_t MASK = cn_select_mask<ALGO>();
-	constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
-	constexpr size_t MEM = cn_select_memory<ALGO>();
+	static constexpr size_t N = 2;
 
-	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43)
+	template<xmrstak_algo ALGO>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
 	{
-		memset(output, 0, 32 * 5);
-		return;
-	}
+		constexpr size_t MEM = cn_select_memory<ALGO>();
 
-	for (size_t i = 0; i < 5; i++)
-	{
-		keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
-		cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
-	}
+		for(size_t i = 0; i < N; ++i)
+		{
+			keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
+			/* Optim - 99% time boundary */
+			cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+		}
 
-	CONST_INIT(ctx[0], 0);
-	CONST_INIT(ctx[1], 1);
-	CONST_INIT(ctx[2], 2);
-	CONST_INIT(ctx[3], 3);
-	CONST_INIT(ctx[4], 4);
-
-	uint8_t* l0 = ctx[0]->long_state;
-	uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
-	uint8_t* l1 = ctx[1]->long_state;
-	uint64_t* h1 = (uint64_t*)ctx[1]->hash_state;
-	uint8_t* l2 = ctx[2]->long_state;
-	uint64_t* h2 = (uint64_t*)ctx[2]->hash_state;
-	uint8_t* l3 = ctx[3]->long_state;
-	uint64_t* h3 = (uint64_t*)ctx[3]->hash_state;
-	uint8_t* l4 = ctx[4]->long_state;
-	uint64_t* h4 = (uint64_t*)ctx[4]->hash_state;
-
-	__m128i ax0 = _mm_set_epi64x(h0[1] ^ h0[5], h0[0] ^ h0[4]);
-	__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-	__m128i ax1 = _mm_set_epi64x(h1[1] ^ h1[5], h1[0] ^ h1[4]);
-	__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-	__m128i ax2 = _mm_set_epi64x(h2[1] ^ h2[5], h2[0] ^ h2[4]);
-	__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-	__m128i ax3 = _mm_set_epi64x(h3[1] ^ h3[5], h3[0] ^ h3[4]);
-	__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-	__m128i ax4 = _mm_set_epi64x(h4[1] ^ h4[5], h4[0] ^ h4[4]);
-	__m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
-	__m128i cx0 = _mm_set_epi64x(0, 0);
-	__m128i cx1 = _mm_set_epi64x(0, 0);
-	__m128i cx2 = _mm_set_epi64x(0, 0);
-	__m128i cx3 = _mm_set_epi64x(0, 0);
-	__m128i cx4 = _mm_set_epi64x(0, 0);
-
-	uint64_t idx0, idx1, idx2, idx3, idx4;
-	idx0 = _mm_cvtsi128_si64(ax0);
-	idx1 = _mm_cvtsi128_si64(ax1);
-	idx2 = _mm_cvtsi128_si64(ax2);
-	idx3 = _mm_cvtsi128_si64(ax3);
-	idx4 = _mm_cvtsi128_si64(ax4);
-
-	for (size_t i = 0; i < ITERATIONS/2; i++)
-	{
-		uint64_t hi, lo;
-		__m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
-
-		// EVEN ROUND
-		CN_STEP1(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP1(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP2(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP2(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP3(ax0, bx0, cx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
-		CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
-
-		CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
-		CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4);
-
-		// ODD ROUND
-		CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP1(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP1(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP1(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP1(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP2(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP2(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP2(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP2(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP2(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP3(ax0, cx0, bx0, l0, ptr0, idx0);
-		CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
-		CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
-		CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
-		CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
-
-		CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
-		CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
-		CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
-		CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
-		CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4);
-	}
+		cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
 
-	for (size_t i = 0; i < 5; i++)
-	{
-		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
-		keccakf((uint64_t*)ctx[i]->hash_state, 24);
-		extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		for(size_t i = 0; i < N; ++i)
+		{
+			/* Optim - 90% time boundary */
+			cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+			/* Optim - 99% time boundary */
+			keccakf((uint64_t*)ctx[i]->hash_state, 24);
+			extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
+		}
 	}
-}
+};
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index a478c9b2a6a3c3a16d7ac628061336a252aaf955..a7e4696a8f836458394ab72258bd5fda3a243486 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -216,6 +216,8 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		ptr->long_state = (uint8_t*)_mm_malloc(hashMemSize, hashMemSize);
 		ptr->ctx_info[0] = 0;
 		ptr->ctx_info[1] = 0;
+		if(ptr->long_state == NULL)
+			printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str());
 		return ptr;
 	}
 
@@ -243,25 +245,25 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 		return ptr;
 	}
 #else
-
+//http://man7.org/linux/man-pages/man2/mmap.2.html
 #if defined(__APPLE__)
-	ptr->long_state  = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state  = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
 #elif defined(__FreeBSD__)
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0);
 #elif defined(__OpenBSD__)
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
 		MAP_PRIVATE | MAP_ANON, -1, 0);
 #else
-	ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
-		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0);
+	ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
 #endif
 
 	if (ptr->long_state == MAP_FAILED)
 	{
 		_mm_free(ptr);
-		msg->warning = "mmap failed";
+		msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'";
 		return NULL;
 	}
 
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 49da7ae2d2b575421bd227af5663b49f2799c924..a14be1732b9fca69f9ab2d96941102339c320635 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -108,12 +108,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *mode, *no_prefetch, *aff;
+	const Value *mode, *no_prefetch, *aff, *asm_version;
 	mode = GetObjectMember(oThdConf, "low_power_mode");
 	no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	asm_version = GetObjectMember(oThdConf, "asm");
 
-	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
+	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr || asm_version == nullptr)
 		return false;
 
 	if(!mode->IsBool() && !mode->IsNumber())
@@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.iCpuAff = -1;
 
+	if(!asm_version->IsString())
+		return false;
+	cfg.asm_version_str = asm_version->GetString();
+
 	return true;
 }
 
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index be855036eceaeb32f1e7fb8094dd1b244fb29cc7..4ec9165d59ec11b0a3ad5ed929d08218e95ca7ed 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -24,6 +24,7 @@ public:
 	struct thd_cfg {
 		int iMultiway;
 		bool bNoPrefetch;
+		std::string asm_version_str;
 		long long iCpuAff;
 	};
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 2e7169ef7c54c852894e52e75d75c6b6d4a08aed..a9f18d1b0e0c4fbd20b6bf60f5bdd274f205606b 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -27,6 +27,7 @@
 #include "xmrstak/backend/iBackend.hpp"
 #include "xmrstak/backend/globalStates.hpp"
 #include "xmrstak/misc/configEditor.hpp"
+#include "xmrstak/backend/cpu/cpuType.hpp"
 #include "xmrstak/params.hpp"
 #include "jconf.hpp"
 
@@ -104,7 +105,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 #endif
 }
 
-minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version)
 {
 	this->backendType = iBackend::CPU;
 	oWork = pWork;
@@ -113,6 +114,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	iJobNo = 0;
 	bNoPrefetch = no_prefetch;
 	this->affinity = affinity;
+	asm_version_str = asm_version;
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -224,6 +226,7 @@ bool minethd::self_test()
 	{
 		if ((ctx[i] = minethd_alloc_ctx()) == nullptr)
 		{
+			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
 			for (int j = 0; j < i; j++)
 				cryptonight_free_ctx(ctx[j]);
 			return false;
@@ -232,92 +235,174 @@ bool minethd::self_test()
 
 	bool bResult = true;
 
-	if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight)
-	{
-		unsigned char out[32 * MAX_N];
-		cn_hash_fun hashf;
-		cn_hash_fun_multi hashf_multi;
-
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf("This is a test", 14, out, ctx[0]);
-		bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
-		hashf("This is a test", 14, out, ctx[0]);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
-
-		hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
-		bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-				"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
-		hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
-		hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
-		bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
-				"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
-
-		hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
-
-		hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
-
-		hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
-		hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
-		bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
-				"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_aeon)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_ipbc)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_stellite)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_masari)
-	{
-	}
-	else if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_bittube2)
+	unsigned char out[32 * MAX_N];
+	cn_hash_fun hashf;
+	cn_hash_fun hashf_multi;
+
+	xmrstak_algo algo = xmrstak_algo::invalid_algo;
+
+	for(int algo_idx = 0; algo_idx < 2; ++algo_idx)
 	{
-		unsigned char out[32 * MAX_N];
-		cn_hash_fun hashf;
+		if(algo_idx == 0)
+			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo();
+		else
+			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+
+		if(algo == cryptonight)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf("This is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+			hashf("This is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
+
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+			hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
+			hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
+					"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
+
+			hashf_multi = func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
+
+			hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
+
+			hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
+			hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
+					"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
+		}
+		else if(algo == cryptonight_lite)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite);
+			bResult = bResult &&  memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0;
+		}
+		else if(algo == cryptonight_monero)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
+		}
+		else if(algo == cryptonight_monero_v8)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult &= memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0;
+		}
+		else if(algo == cryptonight_aeon)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0;
+		}
+		else if(algo == cryptonight_ipbc)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0;
+		}
+		else if(algo == cryptonight_stellite)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0;
+		}
+		else if(algo == cryptonight_masari)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0;
+		}
+		else if(algo == cryptonight_heavy)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0;
+		}
+		else if(algo == cryptonight_haven)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult &&  memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0;
+		}
+		else if(algo == cryptonight_bittube2)
+		{
+			unsigned char out[32 * MAX_N];
+			cn_hash_fun hashf;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
 
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2);
+			hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
 
-		hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx[0]);
-		bResult = memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0;
+			hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx);
+			bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
 
-		hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx[0]);
-		bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0;
+			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
+			bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
+		}
 
-		hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx[0]);
-		bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
+		if(!bResult)
+			printer::inst()->print_msg(L0,
+				"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
 	}
+
 	for (int i = 0; i < MAX_N; i++)
 		cryptonight_free_ctx(ctx[i]);
 
-	if(!bResult)
-		printer::inst()->print_msg(L0,
-			"Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
-
 	return bResult;
 }
 
@@ -359,15 +444,41 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		else
 			printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
 
-		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
+		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str);
 		pvThreads.push_back(thd);
 	}
 
 	return pvThreads;
 }
 
-minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+/** get the supported asm name
+ *
+ * @return asm type based on the number of hashes per thread the internal
+ *             evaluated cpu type
+ */
+static std::string getAsmName(const uint32_t num_hashes)
 {
+	std::string asm_type = "off";
+	if(num_hashes != 0)
+	{
+		auto cpu_model = getModel();
+
+		if(cpu_model.avx && cpu_model.aes)
+		{
+			if(cpu_model.type_name.find("Intel") != std::string::npos)
+				asm_type = "intel_avx";
+			else if(cpu_model.type_name.find("AMD") != std::string::npos && num_hashes == 1)
+				asm_type = "amd_avx";
+		}
+	}
+	return asm_type;
+}
+
+template<size_t N>
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
+{
+	static_assert(N >= 1, "number of threads must be >= 1" );
+
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
@@ -405,388 +516,119 @@ minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmr
 	case cryptonight_bittube2:
 		algv = 9;
 		break;
+	case cryptonight_monero_v8:
+		algv = 10;
+		break;
 	default:
 		algv = 2;
 		break;
 	}
 
 	static const cn_hash_fun func_table[] = {
-		cryptonight_hash<cryptonight_monero, false, false>,
-		cryptonight_hash<cryptonight_monero, true, false>,
-		cryptonight_hash<cryptonight_monero, false, true>,
-		cryptonight_hash<cryptonight_monero, true, true>,
-		cryptonight_hash<cryptonight_lite, false, false>,
-		cryptonight_hash<cryptonight_lite, true, false>,
-		cryptonight_hash<cryptonight_lite, false, true>,
-		cryptonight_hash<cryptonight_lite, true, true>,
-		cryptonight_hash<cryptonight, false, false>,
-		cryptonight_hash<cryptonight, true, false>,
-		cryptonight_hash<cryptonight, false, true>,
-		cryptonight_hash<cryptonight, true, true>,
-		cryptonight_hash<cryptonight_heavy, false, false>,
-		cryptonight_hash<cryptonight_heavy, true, false>,
-		cryptonight_hash<cryptonight_heavy, false, true>,
-		cryptonight_hash<cryptonight_heavy, true, true>,
-		cryptonight_hash<cryptonight_aeon, false, false>,
-		cryptonight_hash<cryptonight_aeon, true, false>,
-		cryptonight_hash<cryptonight_aeon, false, true>,
-		cryptonight_hash<cryptonight_aeon, true, true>,
-		cryptonight_hash<cryptonight_ipbc, false, false>,
-		cryptonight_hash<cryptonight_ipbc, true, false>,
-		cryptonight_hash<cryptonight_ipbc, false, true>,
-		cryptonight_hash<cryptonight_ipbc, true, true>,
-		cryptonight_hash<cryptonight_stellite, false, false>,
-		cryptonight_hash<cryptonight_stellite, true, false>,
-		cryptonight_hash<cryptonight_stellite, false, true>,
-		cryptonight_hash<cryptonight_stellite, true, true>,
-		cryptonight_hash<cryptonight_masari, false, false>,
-		cryptonight_hash<cryptonight_masari, true, false>,
-		cryptonight_hash<cryptonight_masari, false, true>,
-		cryptonight_hash<cryptonight_masari, true, true>,
-		cryptonight_hash<cryptonight_haven, false, false>,
-		cryptonight_hash<cryptonight_haven, true, false>,
-		cryptonight_hash<cryptonight_haven, false, true>,
-		cryptonight_hash<cryptonight_haven, true, true>,
-		cryptonight_hash<cryptonight_bittube2, false, false>,
-		cryptonight_hash<cryptonight_bittube2, true, false>,
-		cryptonight_hash<cryptonight_bittube2, false, true>,
-		cryptonight_hash<cryptonight_bittube2, true, true>
+		Cryptonight_hash<N>::template hash<cryptonight_monero, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_lite, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_lite, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_heavy, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_aeon, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_ipbc, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_stellite, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_masari, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_masari, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_haven, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_haven, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, true>
 	};
 
 	std::bitset<2> digit;
 	digit.set(0, !bHaveAes);
 	digit.set(1, !bNoPrefetch);
 
-	return func_table[ algv << 2 | digit.to_ulong() ];
-}
-
-void minethd::work_main()
-{
-	if(affinity >= 0) //-1 means no affinity
-		bindMemoryToNUMANode(affinity);
-
-	order_fix.set_value();
-	std::unique_lock<std::mutex> lck(thd_aff_set);
-	lck.release();
-	std::this_thread::yield();
-
-	cryptonight_ctx* ctx;
-	uint64_t iCount = 0;
-	uint64_t* piHashVal;
-	uint32_t* piNonce;
-	job_result result;
-
-	// start with root algorithm and switch later if fork version is reached
-	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
-	ctx = minethd_alloc_ctx();
+	auto selected_function = func_table[ algv << 2 | digit.to_ulong() ];
 
-	piHashVal = (uint64_t*)(result.bResult + 24);
-	piNonce = (uint32_t*)(oWork.bWorkBlob + 39);
-	result.iThreadId = iThreadNo;
 
-	uint8_t version = 0;
-	size_t lastPoolId = 0;
-
-	while (bQuit == 0)
+	// check for asm optimized version for cryptonight_v8
+	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes)
 	{
-		if (oWork.bStall)
-		{
-			/* We are stalled here because the executor didn't find a job for us yet,
-			 * either because of network latency, or a socket problem. Since we are
-			 * raison d'etre of this software it us sensible to just wait until we have something
-			 */
-
-			while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
-				std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-			globalStates::inst().consume_work(oWork, iJobNo);
-			continue;
-		}
-
-		size_t nonce_ctr = 0;
-		constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2
-
-		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
-		memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID));
-
-		if(oWork.bNiceHash)
-			result.iNonce = *piNonce;
-
-		uint8_t new_version = oWork.getVersion();
-		if(new_version != version || oWork.iPoolId != lastPoolId)
-		{
-			coinDescription coinDesc = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(oWork.iPoolId);
-			if(new_version >= coinDesc.GetMiningForkVersion())
-			{
-				miner_algo = coinDesc.GetMiningAlgo();
-				hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
-			}
-			else
-			{
-				miner_algo = coinDesc.GetMiningAlgoRoot();
-				hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
-			}
-			result.algorithm = miner_algo;
-			lastPoolId = oWork.iPoolId;
-			version = new_version;
-		}
+		std::string selected_asm = asm_version_str;
+		if(selected_asm == "auto")
+				selected_asm = cpu::getAsmName(N);
 
-		while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
+		if(selected_asm != "off")
 		{
-			if ((iCount++ & 0xF) == 0) //Store stats every 16 hashes
+			if(selected_asm == "intel_avx")
 			{
-				uint64_t iStamp = get_timestamp_ms();
-				iHashCount.store(iCount, std::memory_order_relaxed);
-				iTimestamp.store(iStamp, std::memory_order_relaxed);
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				if(N == 1)
+					selected_function = Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_monero_v8>;
+				else if(N == 2)
+					selected_function = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_monero_v8>;
 			}
-
-			if((nonce_ctr++ & (nonce_chunk-1)) == 0)
+			// supports only 1 thread per hash
+			if(N == 1 && selected_asm == "amd_avx")
 			{
-				globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk);
-				// check if the job is still valid, there is a small posibility that the job is switched
-				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
-					break;
+				// AMD Ryzen (1xxx and 2xxx series)
+				selected_function = Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_monero_v8>;
 			}
-
-			*piNonce = result.iNonce;
-
-			hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
-
-			if (*piHashVal < oWork.iTarget)
-				executor::inst()->push_event(ex_event(result, oWork.iPoolId));
-			result.iNonce++;
-
-			std::this_thread::yield();
+			if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx"))
+				printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str());
+			else if(selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type
+				printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
 		}
-
-		globalStates::inst().consume_work(oWork, iJobNo);
 	}
-
-	cryptonight_free_ctx(ctx);
+	
+	return selected_function;
 }
 
-minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
 {
-	// We have two independent flag bits in the functions
-	// therefore we will build a binary digit and select the
-	// function as a two digit binary
-
-	uint8_t algv;
-	switch(algo)
-	{
-	case cryptonight:
-		algv = 2;
-		break;
-	case cryptonight_lite:
-		algv = 1;
-		break;
-	case cryptonight_monero:
-		algv = 0;
-		break;
-	case cryptonight_heavy:
-		algv = 3;
-		break;
-	case cryptonight_aeon:
-		algv = 4;
-		break;
-	case cryptonight_ipbc:
-		algv = 5;
-		break;
-	case cryptonight_stellite:
-		algv = 6;
-		break;
-	case cryptonight_masari:
-		algv = 7;
-		break;
-	case cryptonight_haven:
-		algv = 8;
-		break;
-	case cryptonight_bittube2:
-		algv = 9;
-		break;
-	default:
-		algv = 2;
-		break;
-	}
-
-	static const cn_hash_fun_multi func_table[] = {
-		cryptonight_double_hash<cryptonight_monero, false, false>,
-		cryptonight_double_hash<cryptonight_monero, true, false>,
-		cryptonight_double_hash<cryptonight_monero, false, true>,
-		cryptonight_double_hash<cryptonight_monero, true, true>,
-		cryptonight_triple_hash<cryptonight_monero, false, false>,
-		cryptonight_triple_hash<cryptonight_monero, true, false>,
-		cryptonight_triple_hash<cryptonight_monero, false, true>,
-		cryptonight_triple_hash<cryptonight_monero, true, true>,
-		cryptonight_quad_hash<cryptonight_monero, false, false>,
-		cryptonight_quad_hash<cryptonight_monero, true, false>,
-		cryptonight_quad_hash<cryptonight_monero, false, true>,
-		cryptonight_quad_hash<cryptonight_monero, true, true>,
-		cryptonight_penta_hash<cryptonight_monero, false, false>,
-		cryptonight_penta_hash<cryptonight_monero, true, false>,
-		cryptonight_penta_hash<cryptonight_monero, false, true>,
-		cryptonight_penta_hash<cryptonight_monero, true, true>,
-
-		cryptonight_double_hash<cryptonight_lite, false, false>,
-		cryptonight_double_hash<cryptonight_lite, true, false>,
-		cryptonight_double_hash<cryptonight_lite, false, true>,
-		cryptonight_double_hash<cryptonight_lite, true, true>,
-		cryptonight_triple_hash<cryptonight_lite, false, false>,
-		cryptonight_triple_hash<cryptonight_lite, true, false>,
-		cryptonight_triple_hash<cryptonight_lite, false, true>,
-		cryptonight_triple_hash<cryptonight_lite, true, true>,
-		cryptonight_quad_hash<cryptonight_lite, false, false>,
-		cryptonight_quad_hash<cryptonight_lite, true, false>,
-		cryptonight_quad_hash<cryptonight_lite, false, true>,
-		cryptonight_quad_hash<cryptonight_lite, true, true>,
-		cryptonight_penta_hash<cryptonight_lite, false, false>,
-		cryptonight_penta_hash<cryptonight_lite, true, false>,
-		cryptonight_penta_hash<cryptonight_lite, false, true>,
-		cryptonight_penta_hash<cryptonight_lite, true, true>,
-
-		cryptonight_double_hash<cryptonight, false, false>,
-		cryptonight_double_hash<cryptonight, true, false>,
-		cryptonight_double_hash<cryptonight, false, true>,
-		cryptonight_double_hash<cryptonight, true, true>,
-		cryptonight_triple_hash<cryptonight, false, false>,
-		cryptonight_triple_hash<cryptonight, true, false>,
-		cryptonight_triple_hash<cryptonight, false, true>,
-		cryptonight_triple_hash<cryptonight, true, true>,
-		cryptonight_quad_hash<cryptonight, false, false>,
-		cryptonight_quad_hash<cryptonight, true, false>,
-		cryptonight_quad_hash<cryptonight, false, true>,
-		cryptonight_quad_hash<cryptonight, true, true>,
-		cryptonight_penta_hash<cryptonight, false, false>,
-		cryptonight_penta_hash<cryptonight, true, false>,
-		cryptonight_penta_hash<cryptonight, false, true>,
-		cryptonight_penta_hash<cryptonight, true, true>,
-
-		cryptonight_double_hash<cryptonight_heavy, false, false>,
-		cryptonight_double_hash<cryptonight_heavy, true, false>,
-		cryptonight_double_hash<cryptonight_heavy, false, true>,
-		cryptonight_double_hash<cryptonight_heavy, true, true>,
-		cryptonight_triple_hash<cryptonight_heavy, false, false>,
-		cryptonight_triple_hash<cryptonight_heavy, true, false>,
-		cryptonight_triple_hash<cryptonight_heavy, false, true>,
-		cryptonight_triple_hash<cryptonight_heavy, true, true>,
-		cryptonight_quad_hash<cryptonight_heavy, false, false>,
-		cryptonight_quad_hash<cryptonight_heavy, true, false>,
-		cryptonight_quad_hash<cryptonight_heavy, false, true>,
-		cryptonight_quad_hash<cryptonight_heavy, true, true>,
-		cryptonight_penta_hash<cryptonight_heavy, false, false>,
-		cryptonight_penta_hash<cryptonight_heavy, true, false>,
-		cryptonight_penta_hash<cryptonight_heavy, false, true>,
-		cryptonight_penta_hash<cryptonight_heavy, true, true>,
-
-		cryptonight_double_hash<cryptonight_aeon, false, false>,
-		cryptonight_double_hash<cryptonight_aeon, true, false>,
-		cryptonight_double_hash<cryptonight_aeon, false, true>,
-		cryptonight_double_hash<cryptonight_aeon, true, true>,
-		cryptonight_triple_hash<cryptonight_aeon, false, false>,
-		cryptonight_triple_hash<cryptonight_aeon, true, false>,
-		cryptonight_triple_hash<cryptonight_aeon, false, true>,
-		cryptonight_triple_hash<cryptonight_aeon, true, true>,
-		cryptonight_quad_hash<cryptonight_aeon, false, false>,
-		cryptonight_quad_hash<cryptonight_aeon, true, false>,
-		cryptonight_quad_hash<cryptonight_aeon, false, true>,
-		cryptonight_quad_hash<cryptonight_aeon, true, true>,
-		cryptonight_penta_hash<cryptonight_aeon, false, false>,
-		cryptonight_penta_hash<cryptonight_aeon, true, false>,
-		cryptonight_penta_hash<cryptonight_aeon, false, true>,
-		cryptonight_penta_hash<cryptonight_aeon, true, true>,
-
-		cryptonight_double_hash<cryptonight_ipbc, false, false>,
-		cryptonight_double_hash<cryptonight_ipbc, true, false>,
-		cryptonight_double_hash<cryptonight_ipbc, false, true>,
-		cryptonight_double_hash<cryptonight_ipbc, true, true>,
-		cryptonight_triple_hash<cryptonight_ipbc, false, false>,
-		cryptonight_triple_hash<cryptonight_ipbc, true, false>,
-		cryptonight_triple_hash<cryptonight_ipbc, false, true>,
-		cryptonight_triple_hash<cryptonight_ipbc, true, true>,
-		cryptonight_quad_hash<cryptonight_ipbc, false, false>,
-		cryptonight_quad_hash<cryptonight_ipbc, true, false>,
-		cryptonight_quad_hash<cryptonight_ipbc, false, true>,
-		cryptonight_quad_hash<cryptonight_ipbc, true, true>,
-		cryptonight_penta_hash<cryptonight_ipbc, false, false>,
-		cryptonight_penta_hash<cryptonight_ipbc, true, false>,
-		cryptonight_penta_hash<cryptonight_ipbc, false, true>,
-		cryptonight_penta_hash<cryptonight_ipbc, true, true>,
-
-		cryptonight_double_hash<cryptonight_stellite, false, false>,
-		cryptonight_double_hash<cryptonight_stellite, true, false>,
-		cryptonight_double_hash<cryptonight_stellite, false, true>,
-		cryptonight_double_hash<cryptonight_stellite, true, true>,
-		cryptonight_triple_hash<cryptonight_stellite, false, false>,
-		cryptonight_triple_hash<cryptonight_stellite, true, false>,
-		cryptonight_triple_hash<cryptonight_stellite, false, true>,
-		cryptonight_triple_hash<cryptonight_stellite, true, true>,
-		cryptonight_quad_hash<cryptonight_stellite, false, false>,
-		cryptonight_quad_hash<cryptonight_stellite, true, false>,
-		cryptonight_quad_hash<cryptonight_stellite, false, true>,
-		cryptonight_quad_hash<cryptonight_stellite, true, true>,
-		cryptonight_penta_hash<cryptonight_stellite, false, false>,
-		cryptonight_penta_hash<cryptonight_stellite, true, false>,
-		cryptonight_penta_hash<cryptonight_stellite, false, true>,
-		cryptonight_penta_hash<cryptonight_stellite, true, true>,
-
-		cryptonight_double_hash<cryptonight_masari, false, false>,
-		cryptonight_double_hash<cryptonight_masari, true, false>,
-		cryptonight_double_hash<cryptonight_masari, false, true>,
-		cryptonight_double_hash<cryptonight_masari, true, true>,
-		cryptonight_triple_hash<cryptonight_masari, false, false>,
-		cryptonight_triple_hash<cryptonight_masari, true, false>,
-		cryptonight_triple_hash<cryptonight_masari, false, true>,
-		cryptonight_triple_hash<cryptonight_masari, true, true>,
-		cryptonight_quad_hash<cryptonight_masari, false, false>,
-		cryptonight_quad_hash<cryptonight_masari, true, false>,
-		cryptonight_quad_hash<cryptonight_masari, false, true>,
-		cryptonight_quad_hash<cryptonight_masari, true, true>,
-		cryptonight_penta_hash<cryptonight_masari, false, false>,
-		cryptonight_penta_hash<cryptonight_masari, true, false>,
-		cryptonight_penta_hash<cryptonight_masari, false, true>,
-		cryptonight_penta_hash<cryptonight_masari, true, true>,
-		
-		cryptonight_double_hash<cryptonight_haven, false, false>,
-		cryptonight_double_hash<cryptonight_haven, true, false>,
-		cryptonight_double_hash<cryptonight_haven, false, true>,
-		cryptonight_double_hash<cryptonight_haven, true, true>,
-		cryptonight_triple_hash<cryptonight_haven, false, false>,
-		cryptonight_triple_hash<cryptonight_haven, true, false>,
-		cryptonight_triple_hash<cryptonight_haven, false, true>,
-		cryptonight_triple_hash<cryptonight_haven, true, true>,
-		cryptonight_quad_hash<cryptonight_haven, false, false>,
-		cryptonight_quad_hash<cryptonight_haven, true, false>,
-		cryptonight_quad_hash<cryptonight_haven, false, true>,
-		cryptonight_quad_hash<cryptonight_haven, true, true>,
-		cryptonight_penta_hash<cryptonight_haven, false, false>,
-		cryptonight_penta_hash<cryptonight_haven, true, false>,
-		cryptonight_penta_hash<cryptonight_haven, false, true>,
-		cryptonight_penta_hash<cryptonight_haven, true, true>,
-
-		cryptonight_double_hash<cryptonight_bittube2, false, false>,
-		cryptonight_double_hash<cryptonight_bittube2, true, false>,
-		cryptonight_double_hash<cryptonight_bittube2, false, true>,
-		cryptonight_double_hash<cryptonight_bittube2, true, true>,
-		cryptonight_triple_hash<cryptonight_bittube2, false, false>,
-		cryptonight_triple_hash<cryptonight_bittube2, true, false>,
-		cryptonight_triple_hash<cryptonight_bittube2, false, true>,
-		cryptonight_triple_hash<cryptonight_bittube2, true, true>,
-		cryptonight_quad_hash<cryptonight_bittube2, false, false>,
-		cryptonight_quad_hash<cryptonight_bittube2, true, false>,
-		cryptonight_quad_hash<cryptonight_bittube2, false, true>,
-		cryptonight_quad_hash<cryptonight_bittube2, true, true>,
-		cryptonight_penta_hash<cryptonight_bittube2, false, false>,
-		cryptonight_penta_hash<cryptonight_bittube2, true, false>,
-		cryptonight_penta_hash<cryptonight_bittube2, false, true>,
-		cryptonight_penta_hash<cryptonight_bittube2, true, true>
-	};
-
-	std::bitset<2> digit;
-	digit.set(0, !bHaveAes);
-	digit.set(1, !bNoPrefetch);
+	return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo);
+}
 
-	return func_table[algv << 4 | (N-2) << 2 | digit.to_ulong()];
+void minethd::work_main()
+{
+	multiway_work_main<1u>();
 }
 
 void minethd::double_work_main()
@@ -843,6 +685,13 @@ void minethd::multiway_work_main()
 	for (size_t i = 0; i < N; i++)
 	{
 		ctx[i] = minethd_alloc_ctx();
+		if(ctx[i] == nullptr)
+		{
+			printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory.");
+			for (int j = 0; j < i; j++)
+				cryptonight_free_ctx(ctx[j]);
+			win_exit(1);
+		}
 		piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24);
 		piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr;
 	}
@@ -854,7 +703,7 @@ void minethd::multiway_work_main()
 
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun_multi hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
@@ -889,12 +738,12 @@ void minethd::multiway_work_main()
 			if(new_version >= coinDesc.GetMiningForkVersion())
 			{
 				miner_algo = coinDesc.GetMiningAlgo();
-				hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			else
 			{
 				miner_algo = coinDesc.GetMiningAlgoRoot();
-				hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			lastPoolId = oWork.iPoolId;
 			version = new_version;
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 2d40ce3144a381d1cadcca493fb6d9ef1f5aafa0..eb77749f6236735ab0f5132016cf6c0b29181542 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -22,7 +22,7 @@ public:
 	static std::vector<iBackend*> thread_starter(uint32_t threadOffset, miner_work& pWork);
 	static bool self_test();
 
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
 	static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id);
@@ -30,10 +30,11 @@ public:
 	static cryptonight_ctx* minethd_alloc_ctx();
 
 private:
-	typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**);
-	static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
 
-	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
+	template<size_t N>
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off");
+
+	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
 	template<uint32_t N>
 	void multiway_work_main();
@@ -59,6 +60,7 @@ private:
 
 	bool bQuit;
 	bool bNoPrefetch;
+	std::string asm_version_str = "off";
 };
 
 } // namespace cpu
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index b6f656138024e78e6877a8d86a839f39e31f9aaa..6b1afa928d772cdf6edc7cb4eda007d18e17dd7b 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -16,6 +16,7 @@ enum xmrstak_algo
 	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+	cryptonight_monero_v8 = 11
 };
 
 // define aeon settings
@@ -45,6 +46,9 @@ inline constexpr size_t cn_select_memory<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_monero>() { return CRYPTONIGHT_MEMORY; }
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_monero_v8>() { return CRYPTONIGHT_MEMORY; }
+
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MEMORY; }
 
@@ -72,6 +76,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
 		return CRYPTONIGHT_MEMORY;
@@ -100,6 +105,9 @@ inline constexpr uint32_t cn_select_mask<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_monero>() { return CRYPTONIGHT_MASK; }
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_monero_v8>() { return CRYPTONIGHT_MASK; }
+
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MASK; }
 
@@ -127,6 +135,7 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
 		return CRYPTONIGHT_MASK;
@@ -155,6 +164,9 @@ inline constexpr uint32_t cn_select_iter<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_monero>() { return CRYPTONIGHT_ITER; }
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_monero_v8>() { return CRYPTONIGHT_ITER; }
+
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_ITER; }
 
@@ -182,6 +194,7 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight:
 		return CRYPTONIGHT_ITER;
 	case cryptonight_ipbc:
diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp
index 3bd7d0eea18a4e2289106f77796b685341945a15..4eeed3c4b832c74ecaba9a7266d363dc098e531c 100644
--- a/xmrstak/backend/globalStates.cpp
+++ b/xmrstak/backend/globalStates.cpp
@@ -39,7 +39,7 @@ void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId)
 
 	threadWork = oGlobalWork;
 	currentJobId = iGlobalJobNo.load(std::memory_order_relaxed);
-	
+
 	jobLock.UnLock();
 }
 
@@ -51,7 +51,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat)
 	 * To avoid duplicated shared this must be done before the nonce is exchanged.
 	 */
 	iGlobalJobNo++;
-	
+
 	size_t xid = dat.pool_id;
 	dat.pool_id = pool_id;
 	pool_id = xid;
@@ -62,7 +62,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat)
 	 */
 	dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_relaxed);
 	oGlobalWork = pWork;
-	
+
 	jobLock.UnLock();
 }
 
diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp
index 12468093c9fa043ce9aa43337aa378ceca889444..2755e03d2f66fbc9668f1b1e2a0d0f3d165d6e91 100644
--- a/xmrstak/backend/nvidia/autoAdjust.hpp
+++ b/xmrstak/backend/nvidia/autoAdjust.hpp
@@ -96,6 +96,7 @@ private:
 					"    \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" +
 					"    \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" :  " + std::to_string(ctx.device_bsleep) + ",\n" +
 					"    \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" +
+					"    \"mem_mode\" : 1,\n" +
 					"  },\n";
 			}
 		}
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 2aa68dc46254d85c8916aad55f4a7397756e5bf7..8a5982b57cb05316e222390ec311ce84a93fcbe4 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * GPU configuration. You should play around with threads and blocks as the fastest settings will vary.
  * index         - GPU index number usually starts from 0.
@@ -15,6 +16,9 @@ R"===(
  *                 1 = cudaDeviceScheduleSpin - create a high load on one cpu thread per gpu
  *                 2 = cudaDeviceScheduleYield
  *                 3 = cudaDeviceScheduleBlockingSync (default)
+ * mem_mode      - select the memory access pattern (this option has only a meaning for cryptonight_v8 and monero)
+ *                 0 = 64bit memory loads
+ *                 1 = 256bit memory loads   
  *
  * On the first run the miner will look at your system and suggest a basic configuration that will work,
  * you can try to tweak it from there to get the best performance.
@@ -23,7 +27,7 @@ R"===(
  * "gpu_threads_conf" :
  * [
  *     { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" :  0,
- *       "affine_to_cpu" : false, "sync_mode" : 3,
+ *       "affine_to_cpu" : false, "sync_mode" : 3, "mem_mode" : 1
  *     },
  * ],
  * If you do not wish to mine with your nVidia GPU(s) then use:
@@ -35,5 +39,4 @@ R"===(
 [
 GPUCONFIG
 ],
-
 )==="
diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp
index c9d4f194c0097d747e16dddb0cfac33731bb6f36..6c443343b91310d9698d9466f241b16018c884e4 100644
--- a/xmrstak/backend/nvidia/jconf.cpp
+++ b/xmrstak/backend/nvidia/jconf.cpp
@@ -123,7 +123,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode;
+	const Value *gid, *blocks, *threads, *bfactor, *bsleep, *aff, *syncMode, *memMode;
 	gid = GetObjectMember(oThdConf, "index");
 	blocks = GetObjectMember(oThdConf, "blocks");
 	threads = GetObjectMember(oThdConf, "threads");
@@ -131,9 +131,11 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 	bsleep = GetObjectMember(oThdConf, "bsleep");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	syncMode = GetObjectMember(oThdConf, "sync_mode");
+	memMode = GetObjectMember(oThdConf, "mem_mode");
 
 	if(gid == nullptr || blocks == nullptr || threads == nullptr ||
-		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr)
+		bfactor == nullptr || bsleep == nullptr || aff == nullptr || syncMode == nullptr ||
+		memMode == nullptr)
 	{
 		return false;
 	}
@@ -158,16 +160,25 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg)
 
 	if(!syncMode->IsNumber() || syncMode->GetInt() < 0 || syncMode->GetInt() > 3)
 	{
-		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or no number. ( range: 0 <= sync_mode < 4.)");
+		printer::inst()->print_msg(L0, "Error NVIDIA: sync_mode out of range or not a number. ( range: 0 <= sync_mode < 4.)");
 		return false;
 	}
+
+	if(!memMode->IsNumber() || memMode->GetInt() < 0 || memMode->GetInt() > 1)
+	{
+		printer::inst()->print_msg(L0, "Error NVIDIA: mem_mode out of range or not a number. (range: 0 or 1)");
+		return false;
+	}
+
+
 	cfg.id = gid->GetInt();
 	cfg.blocks = blocks->GetInt();
 	cfg.threads = threads->GetInt();
 	cfg.bfactor = bfactor->GetInt();
 	cfg.bsleep = bsleep->GetInt();
 	cfg.syncMode = syncMode->GetInt();
-
+	cfg.memMode = memMode->GetInt();
+	
 	if(aff->IsNumber())
 		cfg.cpu_aff = aff->GetInt();
 	else
diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp
index b4ebaa03549c003ae82b1d7e96e524070f8d47d6..40b72f880898485058c2775b11afdc9c0dfaff1d 100644
--- a/xmrstak/backend/nvidia/jconf.hpp
+++ b/xmrstak/backend/nvidia/jconf.hpp
@@ -29,6 +29,7 @@ public:
 		bool bNoPrefetch;
 		int32_t cpu_aff;
 		int syncMode;
+		int memMode;
 
 		long long iCpuAff;
 	};
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 88a1acc324e1484f0bc1e038a186e6dffc8948ef..6460628de75b17b625418e8244414cce9752a0ad 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -78,6 +78,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 	ctx.device_bfactor = (int)cfg.bfactor;
 	ctx.device_bsleep = (int)cfg.bsleep;
 	ctx.syncMode = cfg.syncMode;
+	ctx.memMode = cfg.memMode;
 	this->affinity = cfg.cpu_aff;
 
 	std::future<void> numa_guard = numa_promise.get_future();
@@ -144,6 +145,8 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 {
 	std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
 
+	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+
 	if(!configEditor::file_exist(params::inst().configFileNVIDIA))
 	{
 		autoAdjust adjust;
@@ -162,6 +165,10 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 		std::cout<<"WARNING: NVIDIA no device found"<<std::endl;
 		return pvThreads;
 	}
+	else
+	{
+		std::cout<<"NVIDIA: found "<< deviceCount <<" potential device's"<<std::endl;
+	}
 
 	size_t i, n = jconf::inst()->GetGPUThreadCount();
 	pvThreads->reserve(n);
@@ -275,7 +282,7 @@ void minethd::work_main()
 			if((round_ctr++ & 0xF) == 0)
 			{
 				globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16);
-				// check if the job is still valid, there is a small posibility that the job is switched
+				// check if the job is still valid, there is a small possibility that the job is switched
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
@@ -300,7 +307,7 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = foundNonce[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, cpu_ctx);
+				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else
diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp
index d4ae03864a038201d15c1dc05cae4398b2fd405a..389356842b325269150b2f6d399fe91d7cc1a2ce 100644
--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -28,7 +28,7 @@ public:
 	static bool self_test();
 
 private:
-	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
+	typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**);
 
 	minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
 	void start_mining();
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index d588641b4849864dfa653c44652963c557bccb10..8fda8d401323a185479bc5f2756cac3c5ca71af9 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -16,6 +16,7 @@ typedef struct {
 	int device_bfactor;
 	int device_bsleep;
 	int syncMode;
+	bool memMode;
 
 	uint32_t *d_input;
 	uint32_t inputlen;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 6c6475150995cc8612842debcedadcfc852dc9a1..7742e740e5f3d0ad8dfe9951dd4b88b19f627a99 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -5,8 +5,11 @@
 #include <string.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <bitset>
 
 #include "xmrstak/jconf.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+
 
 #ifdef _WIN32
 #include <windows.h>
@@ -67,9 +70,9 @@ typedef uint64_t IndexType;
 typedef int IndexType;
 #endif
 
-__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi )
+__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi )
 {
-	*product_hi = __umul64hi( multiplier, multiplicand );
+	product_hi = __umul64hi( multiplier, multiplicand );
 	return (multiplier * multiplicand );
 }
 
@@ -194,11 +197,289 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
 #endif
 }
 
+template<size_t group_n>
+__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2)
+{
+	uint64_t tmp;
+	((uint32_t*)&tmp)[0] = shuffle<group_n>(ptr, sub, val, src);
+	((uint32_t*)&tmp)[1] = shuffle<group_n>(ptr, sub, val, src2);
+	return tmp;
+}
+
+struct u64 : public uint2
+{
+
+	__forceinline__ __device__ u64(){}
+
+	__forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1)
+	{
+		uint2::x = x0;
+		uint2::y = x1;
+	}
+
+	__forceinline__ __device__ operator uint64_t() const
+	{
+		return *((uint64_t*)this);
+	}
+
+	__forceinline__ __device__ u64( const uint64_t x0)
+	{
+		((uint64_t*)&this->x)[0] = x0;
+	}
+
+	__forceinline__ __device__ u64 operator^=(const u64& other)
+	{
+		uint2::x ^= other.x;
+		uint2::y ^= other.y;
+
+		return *this;
+	}
+
+	__forceinline__ __device__ u64 operator+(const u64& other) const
+	{
+		u64 tmp;
+		((uint64_t*)&tmp.x)[0] = ((uint64_t*)&(this->x))[0] + ((uint64_t*)&(other.x))[0];
+
+		return tmp;
+	}
+
+	__forceinline__ __device__ u64 operator+=(const uint64_t& other)
+	{
+		return ((uint64_t*)&this->x)[0] += other;
+	}
+
+	__forceinline__ __device__ void print(int i) const
+	{
+		if(i<2)
+			printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]);
+	}
+};
+
+/** cryptonight with two threads per hash
+ *
+ * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory
+ *                   else if `1` 256bit operations will be used
+ */
+template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO, uint32_t MEM_MODE>
+#ifdef XMR_STAK_THREADS
+__launch_bounds__( XMR_STAK_THREADS * 2 )
+#endif
+__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+		uint32_t startNonce, uint32_t * __restrict__ d_input )
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	cn_aes_gpu_init( sharedMemory );
+
+#if( __CUDA_ARCH__ < 300 )
+	extern __shared__ uint64_t externShared[];
+	// 8 x 64bit values
+	volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8);
+    volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8)  + (threadIdx.x & 0xFFFFFFFE);
+#else
+	extern __shared__ uint64_t chunkMem[];
+    volatile uint32_t* sPtr = NULL;
+	// 8 x 64bit values
+	volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8);
+
+#endif
+
+	__syncthreads( );
+
+	const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = tid >> 1;
+	const uint32_t sub = tid & 1;
+
+	if ( thread >= threads )
+		return;
+
+	uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY];
+
+	uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub];
+	uint64_t bx0;
+	uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
+
+	uint64_t* ptr0;
+
+	uint64_t bx1;
+	uint32_t sqrt_result;
+	uint64_t division_result;
+	if(ALGO == cryptonight_monero_v8)
+	{
+		bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub];
+		bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub];
+
+		division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
+		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
+	}
+	else
+		 bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub];
+
+	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+	const int start = partidx * batchsize;
+	const int end = start + batchsize;
+
+	for(int i = start; i < end; ++i)
+	{
+		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
+
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				myChunks[x + sub] = ptr0[ x + sub ];
+			}
+		}
+		else
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
+
+		uint32_t idx1 = (idx0 & 0x30) >> 3;
+
+		const u64 cx = myChunks[ idx1 + sub ];
+		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
+
+		u64 cx_aes = ax0 ^ u64(
+			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
+			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+		);
+
+		if(ALGO == cryptonight_monero_v8)
+		{
+
+			const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ];
+			const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+			const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+			__syncwarp();
+#else
+			__syncthreads( );
+#endif
+			myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
+			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
+			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+		}
+
+		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				ptr0[ x + sub ] = myChunks[x + sub];
+			}
+		}
+		else
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
+
+		idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0);
+		idx1 = (idx0 & 0x30) >> 3;
+		ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0];
+
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				myChunks[x + sub] = ptr0[ x + sub ];
+			}
+		}
+		else
+			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
+
+		if(ALGO != cryptonight_monero_v8)
+			bx0 = cx_aes;
+
+		uint64_t cx_mul;
+		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
+		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
+
+		if(ALGO == cryptonight_monero_v8 && sub == 1)
+		{
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			((uint32_t*)&division_result)[1] ^= sqrt_result;
+
+			((uint64_t*)myChunks)[ idx1 ] ^= division_result;
+
+			const uint32_t dd = (static_cast<uint32_t>(cx_mul) + (sqrt_result << 1)) | 0x80000001UL;
+			division_result = fast_div_v2(cx_aes, dd);
+
+			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			sqrt_result = fast_sqrt_v2(cx_mul + division_result);
+		}
+#if (__CUDACC_VER_MAJOR__ >= 9)
+				__syncwarp();
+#else
+				__syncthreads( );
+#endif
+		uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ];
+
+		{
+			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
+			// sub 0 -> hi, sub 1 -> lo
+			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
+			if(ALGO == cryptonight_monero_v8)
+			{
+				const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
+				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				res ^= ((uint64_t*)&chunk2)[0];
+				const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+				__syncwarp();
+#else
+				__syncthreads( );
+#endif
+				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
+				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
+				myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+			}
+			ax0 += res;
+		}
+		if(ALGO == cryptonight_monero_v8)
+		{
+			bx1 = bx0;
+			bx0 = cx_aes;
+		}
+		myChunks[ idx1 + sub ] = ax0;
+		if(MEM_MODE == 0)
+		{
+			#pragma unroll 4
+			for(int x = 0; x < 8; x += 2)
+			{
+				ptr0[ x + sub ] = myChunks[x + sub];
+			}
+		}
+		else
+			((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub];
+		ax0 ^= c;
+		idx0 = shuffle<2>(sPtr, sub, static_cast<uint32_t>(ax0), 0);
+	}
+
+	if ( bfactor > 0 )
+	{
+		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
+		if(ALGO == cryptonight_monero_v8)
+		{
+			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
+			((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
+
+			if(sub == 1)
+			{
+				// must be valid only for `sub == 1`
+				((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0] = division_result;
+				(d_ctx_b + thread * 12 + 4 * 2 + 2)[0] = sqrt_result;
+			}
+		}
+		else
+			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
+	}
+}
+
 template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
 #endif
-__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
 		uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
 	__shared__ uint32_t sharedMemory[1024];
@@ -448,11 +729,12 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 	MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 );
 }
 
-template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO>
+template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, uint32_t MEM_MODE>
 void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 {
 	dim3 grid( ctx->device_blocks );
 	dim3 block( ctx->device_threads );
+	dim3 block2( ctx->device_threads << 2 );
 	dim3 block4( ctx->device_threads << 2 );
 	dim3 block8( ctx->device_threads << 3 );
 
@@ -483,25 +765,53 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	for ( int i = 0; i < partcount; i++ )
 	{
-        CUDA_CHECK_MSG_KERNEL(
-			ctx->device_id,
-			"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
-			cryptonight_core_gpu_phase2<ITERATIONS,MEMORY,MASK,ALGO><<<
-				grid,
-				block4,
-				block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
-			>>>(
-				ctx->device_blocks*ctx->device_threads,
-				ctx->device_bfactor,
-				i,
-				ctx->d_long_state,
-				ctx->d_ctx_a,
-				ctx->d_ctx_b,
-				ctx->d_ctx_state,
-				nonce,
-				ctx->d_input
-			)
-	    );
+		if(ALGO == cryptonight_monero_v8)
+		{
+			// two threads per block
+			CUDA_CHECK_MSG_KERNEL(
+				ctx->device_id,
+				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+				cryptonight_core_gpu_phase2_double<ITERATIONS,MEMORY,MASK,ALGO, MEM_MODE><<<
+					grid,
+					block2,
+					sizeof(uint64_t) * block2.x * 8 +
+						// shuffle memory for fermi gpus
+						block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+				>>>(
+					ctx->device_blocks*ctx->device_threads,
+					ctx->device_bfactor,
+					i,
+					ctx->d_long_state,
+					ctx->d_ctx_a,
+					ctx->d_ctx_b,
+					ctx->d_ctx_state,
+					nonce,
+					ctx->d_input
+				)
+			);
+		}
+		else
+		{
+			CUDA_CHECK_MSG_KERNEL(
+				ctx->device_id,
+				"\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**",
+				cryptonight_core_gpu_phase2_quad<ITERATIONS,MEMORY,MASK,ALGO><<<
+					grid,
+					block4,
+					block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+				>>>(
+					ctx->device_blocks*ctx->device_threads,
+					ctx->device_bfactor,
+					i,
+					ctx->d_long_state,
+					ctx->d_ctx_a,
+					ctx->d_ctx_b,
+					ctx->d_ctx_state,
+					nonce,
+					ctx->d_input
+				)
+			);
+		}
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
 	}
@@ -529,46 +839,48 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
+	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
+	
+	if(miner_algo == invalid_algo) return;
+	
+	static const cuda_hash_fn func_table[] = {
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight, 1>,
 
-	if(miner_algo == cryptonight_monero)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_heavy)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_lite)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_aeon)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_ipbc)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_stellite)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_masari)
-	{
-		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_haven)
-	{
-	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven>(ctx, startNonce);
-	}
-	else if(miner_algo == cryptonight_bittube2)
-	{
-	  cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2>(ctx, startNonce);
-	}
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_lite, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_aeon, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_LITE_ITER, CRYPTONIGHT_LITE_MASK, CRYPTONIGHT_LITE_MEMORY/4, cryptonight_ipbc, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_stellite, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_MASARI_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_masari, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_haven, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_bittube2, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 1>
+	};
+
+	std::bitset<1> digit;
+	digit.set(0, ctx->memMode == 1);
 
+	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
+	selected_function(ctx, startNonce);
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 3b049ace83e7ce705357764d5ac63dd8f97c2eea..433e175ddb5496429c7367c367886fd896244a06 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+	if(ALGO == cryptonight_monero_v8)
+	{
+		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
+		// bx1
+		XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b );
+		memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 );
+		// division_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 );
+		// sqrt_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 );
+	}
+	else
+		memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
 
 	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
@@ -271,13 +283,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		break;
 
 	};
-	const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1];
 
-	/* Disable L1 cache for GPUs before Volta.
-	 * L1 speed is increased and latency reduced with Volta.
-	 */
-	if(gpuArch < 70)
-		CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
+	// prefer shared memory over L1 cache
+	CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
 
 	size_t hashMemSize = std::max(
 		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
@@ -298,6 +306,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
+	else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+			cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	{
+		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
+		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
+	}
 	else
 		ctx->d_ctx_state2 = ctx->d_ctx_state;
 
@@ -340,6 +354,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	if(miner_algo == cryptonight_monero_v8)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
@@ -396,7 +415,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 	}
 
 	CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
-	CUDA_CHECK(ctx->device_id, cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ));
+	CUDA_CHECK_MSG(
+		ctx->device_id,
+		"\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**",
+		cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost )
+	);
 
 	/* There is only a 32bit limit for the counter on the device side
 	 * therefore this value can be greater than 10, in that case limit rescount
@@ -450,19 +473,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 	if(version < CUDART_VERSION)
 	{
-		printf("Driver does not support CUDA %d.%d API! Update your nVidia driver!\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
+		printf("WARNING: Driver supports CUDA %d.%d but this was compiled for CUDA %d.%d API! Update your nVidia driver or compile with older CUDA!\n",
+			version / 1000, (version % 1000 / 10),
+			CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10);
 		return 1;
 	}
 
 	int GPU_N;
 	if(cuda_get_devicecount(&GPU_N) == 0)
 	{
+		printf("WARNING: CUDA claims zero devices?\n");
 		return 1;
 	}
 
 	if(ctx->device_id >= GPU_N)
 	{
-		printf("Invalid device ID!\n");
+		printf("WARNING: Invalid device ID '%i'!\n", ctx->device_id);
 		return 1;
 	}
 
@@ -483,6 +509,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 	ctx->name = std::string(props.name);
 
+	printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... ",
+		version / 1000, (version % 1000 / 10),
+		CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10,
+		ctx->device_id, gpuArch, ctx->device_name);
+
 	std::vector<int> arch;
 #define XMRSTAK_PP_TOSTRING1(str) #str
 #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str)
@@ -496,13 +527,14 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	while ( ss >> tmpArch )
 		arch.push_back( tmpArch );
 
+	#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n"
 	if(gpuArch >= 20 && gpuArch < 30)
 	{
 		// compiled binary must support sm_20 for fermi
 		std::vector<int>::iterator it = std::find(arch.begin(), arch.end(), 20);
 		if(it == arch.end())
 		{
-			printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch);
+			printf(MSG_CUDA_NO_ARCH);
 			return 5;
 		}
 	}
@@ -520,7 +552,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 				minSupportedArch = arch[i];
 		if(minSupportedArch < 30 || gpuArch < minSupportedArch)
 		{
-			printf("WARNING: NVIDIA GPU %d: miner not compiled for CUDA architecture %d.\n", ctx->device_id, gpuArch);
+			printf(MSG_CUDA_NO_ARCH);
 			return 5;
 		}
 	}
@@ -529,8 +561,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	if(ctx->device_blocks == -1)
 	{
 		/* good values based of my experience
-		 *	 - 3 * SMX count >=sm_30
-		 *   - 2 * SMX count for <sm_30
+		 *   - 3 * SMX count for >=sm_30
+		 *   - 2 * SMX count for  <sm_30
 		 */
 		ctx->device_blocks = props.multiProcessorCount *
 			( props.major < 3 ? 2 : 3 );
@@ -582,18 +614,19 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 
 		int* tmp;
 		cudaError_t err;
+		#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n"
 		// a device must be selected to get the right memory usage later on
 		err = cudaSetDevice(ctx->device_id);
 		if(err != cudaSuccess)
 		{
-			printf("WARNING: NVIDIA GPU %d: cannot be selected.\n", ctx->device_id);
+			printf(MSG_CUDA_FUNC_FAIL, "cudaSetDevice");
 			return 2;
 		}
 		// trigger that a context on the gpu will be allocated
 		err = cudaMalloc(&tmp, 256);
 		if(err != cudaSuccess)
 		{
-			printf("WARNING: NVIDIA GPU %d: context cannot be created.\n", ctx->device_id);
+			printf(MSG_CUDA_FUNC_FAIL, "cudaMalloc");
 			return 3;
 		}
 
@@ -626,9 +659,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		size_t usedMem = totalMemory - freeMemory;
 		if(usedMem >= maxMemUsage)
 		{
-			printf("WARNING: NVIDIA GPU %d: already %s MiB memory in use, skip GPU.\n",
-				ctx->device_id,
-				std::to_string(usedMem/byteToMiB).c_str());
+			printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str());
 			return 4;
 		}
 		else
@@ -660,7 +691,27 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			ctx->device_threads = 64;
 		}
 
+		// check if cryptonight_monero_v8 is selected for the user pool
+		bool useCryptonight_v8 =
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
+		if(useCryptonight_v8 && gpuArch >= 50)
+		{
+			// 4 based on my test maybe it must be adjusted later
+			size_t threads = 4;
+			// 8 is chosen by checking the occupancy calculator
+			size_t blockOptimal = 8 * ctx->device_mpcount;
+
+			if(blockOptimal * threads * hashMemSize < limitedMemory)
+			{
+				ctx->device_threads = threads;
+				ctx->device_blocks = blockOptimal;
+			}
+		}
 	}
+	printf("device init succeeded\n");
 
 	return 0;
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..796b7adda3b8e236a8e67e2e450f2cdbf2d3e512
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <stdint.h>
+
+__device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
+{
+	const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23));
+	const float a_lo = __uint2float_rn(a & 0xFF);
+
+	float r;
+	asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi));
+	const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23));
+
+	const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f));
+	return (__float_as_uint(r) << 9) - __float2int_rn(h * r_scaled);
+}
+
+__device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
+{
+	const uint32_t r = get_reciprocal(b);
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
+
+	uint32_t q[2];
+	q[0] = ((uint32_t*)&k)[1];
+
+	int64_t tmp = a - (uint64_t)(q[0]) * b;
+	((int32_t*)(&tmp))[1] -= (k < a) ? b : 0;
+
+	const bool overshoot = ((int32_t*)(&tmp))[1] < 0;
+	const bool undershoot = tmp >= b;
+
+	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
+	q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U);
+
+	return *((uint64_t*)(q));
+}
+
+__device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
+{
+	float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23));
+	float x1;
+	asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x));
+	asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x));
+
+	// The following line does x1 *= 4294967296.0f;
+	x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23));
+
+	const uint32_t x0 = __float_as_uint(x) - (158U << 23);
+	const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18);
+	const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1;
+
+	uint32_t result = (x0 << 10) + __float2int_rn(delta);
+	const uint32_t s = result >> 1;
+	const uint32_t b = result & 1;
+
+	const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1;
+	const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0;
+	const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0;
+	result += (overshoot+undershoot);
+	return result;
+}
diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp
index 1811af22490fe5b1da1160994e81054393165b0e..5c7dfe16a38376c2700bea15991ca9ed522b8f10 100644
--- a/xmrstak/backend/plugin.hpp
+++ b/xmrstak/backend/plugin.hpp
@@ -27,8 +27,11 @@ namespace xmrstak
 struct plugin
 {
 
-	plugin(const std::string backendName, const std::string libName) : fn_startBackend(nullptr), m_backendName(backendName)
+	plugin() = default;
+
+	void load(const std::string backendName, const std::string libName)
 	{
+		m_backendName = backendName;
 #ifdef WIN32
 		libBackend = LoadLibrary(TEXT((libName + ".dll").c_str()));
 		if(!libBackend)
@@ -81,32 +84,36 @@ struct plugin
 		if(fn_startBackend == nullptr)
 		{
 			std::vector<iBackend*>* pvThreads = new std::vector<iBackend*>();
-			std::cerr << "WARNING: " << m_backendName << " Backend disabled"<< std::endl;
 			return pvThreads;
 		}
 
 		return fn_startBackend(threadOffset, pWork, env);
 	}
 
+	void unload()
+	{
+		if(libBackend)
+		{
+#ifdef WIN32
+			FreeLibrary(libBackend);
+#else
+			dlclose(libBackend);
+#endif
+		}
+		fn_startBackend = nullptr;
+	}
+
 	std::string m_backendName;
 
 	typedef std::vector<iBackend*>* (*startBackend_t)(uint32_t threadOffset, miner_work& pWork, environment& env);
 
-	startBackend_t fn_startBackend;
+	startBackend_t fn_startBackend = nullptr;
 
 #ifdef WIN32
 	HINSTANCE libBackend;
 #else
-	void *libBackend;
-#endif
-
-/* \todo add unload to destructor and change usage of plugin that libs kept open until the miner ends
-#ifdef WIN32
-	FreeLibrary(libBackend);
-#else
-	dlclose(libBackend);
+	void *libBackend = nullptr;
 #endif
- * */
 };
 
 } // namespace xmrstak
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index ae39d2505e1b96fc2bb088bfb5d6567f2c64b31b..40fb9d94859e03910de2ec642119703dc56ada9b 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -749,6 +749,7 @@ int main(int argc, char *argv[])
 
 	if (!BackendConnector::self_test())
 	{
+		printer::inst()->print_msg(L0, "Self test not passed!");
 		win_exit();
 		return 1;
 	}
@@ -782,11 +783,24 @@ int main(int argc, char *argv[])
 	char buffer[64];
 	snprintf(buffer, sizeof(buffer), "\nConfigurable dev donation level is set to %.1f%%\n\n", fDevDonationLevel * 100.0);
 	printer::inst()->print_str(buffer);
+	printer::inst()->print_str("-------------------------------------------------------------------\n");
 	printer::inst()->print_str("You can use following keys to display reports:\n");
 	printer::inst()->print_str("'h' - hashrate\n");
 	printer::inst()->print_str("'r' - results\n");
 	printer::inst()->print_str("'c' - connection\n");
 	printer::inst()->print_str("-------------------------------------------------------------------\n");
+	printer::inst()->print_str("Upcoming xmr-stak-gui is sponsored by:\n");
+	printer::inst()->print_str("   #####   ______               ____\n");
+	printer::inst()->print_str(" ##     ## | ___ \\             /  _ \\\n");
+	printer::inst()->print_str("#    _    #| |_/ /_   _   ___  | / \\/ _   _  _ _  _ _  ___  _ __    ___  _   _\n");
+	printer::inst()->print_str("#   |_|   #|    /| | | | / _ \\ | |   | | | || '_|| '_|/ _ \\| '_ \\  / __|| | | |\n");
+	printer::inst()->print_str("#         #| |\\ \\| |_| || (_) || \\_/\\| |_| || |  | | |  __/| | | || (__ | |_| |\n");
+	printer::inst()->print_str(" ##     ## \\_| \\_|\\__, | \\___/ \\____/ \\__,_||_|  |_|  \\___||_| |_| \\___| \\__, |\n");
+	printer::inst()->print_str("   #####           __/ |                                                  __/ |\n");
+	printer::inst()->print_str("                  |___/   https://ryo-currency.com                       |___/\n\n");
+	printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n");
+	printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n");
+	printer::inst()->print_str("-------------------------------------------------------------------\n");
 	printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str());
 
 	if(params::inst().benchmark_block_version >= 0)
diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl
index 14330a829f483a6262742eaae95c4f5939b84c1c..73ae054c2346661360022271b64e4ec9b2f48feb 100644
--- a/xmrstak/config.tpl
+++ b/xmrstak/config.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * Network timeouts.
  * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make
@@ -58,40 +59,48 @@ R"===(
  * Large pages need a properly set up OS. It can be difficult if you are not used to systems administration,
  * but the performance results are worth the trouble - you will get around 20% boost. Slow memory mode is
  * meant as a backup, you won't get stellar results there. If you are running into trouble, especially
- * on Windows, please read the common issues in the README.
- *
- * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.
- * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN
- *
- * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.
- * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.
- * 3. Expand Security Settings, and then expand Local Policies.
- * 4. Select the User Rights Assignment folder.
- * 5. The policies will be displayed in the details pane.
- * 6. In the pane, double-click Lock pages in memory.
- * 7. In the Local Security Setting â€“ Lock pages in memory dialog box, click Add User or Group.
- * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on
- * 9. Reboot for change to take effect.
- *
- * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need
- * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.
- *
- * On Linux you will need to configure large page support "sudo sysctl -w vm.nr_hugepages=128" and increase your
- * ulimit -l. To do do this you need to add following lines to /etc/security/limits.conf - "* soft memlock 262144"
- * and "* hard memlock 262144". You can also do it Windows-style and simply run-as-root, but this is NOT
- * recommended for security reasons.
- *
- * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a
- * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between
- * locked and unlocked memory. If that is your setup see option "no_mlck".
- */
-
-/*
+ * on Windows, please read the common issues in the README and FAQ.
+ *
+ * By default we will try to allocate large pages. This means you need to "Run As Administrator" on Windows.---WINDOWS
+ * You need to edit your system's group policies to enable locking large pages. Here are the steps from MSDN---WINDOWS
+ *---WINDOWS
+ * 1. On the Start menu, click Run. In the Open box, type gpedit.msc.---WINDOWS
+ * 2. On the Local Group Policy Editor console, expand Computer Configuration, and then expand Windows Settings.---WINDOWS
+ * 3. Expand Security Settings, and then expand Local Policies.---WINDOWS
+ * 4. Select the User Rights Assignment folder.---WINDOWS
+ * 5. The policies will be displayed in the details pane.---WINDOWS
+ * 6. In the pane, double-click Lock pages in memory.---WINDOWS
+ * 7. In the Local Security Setting â€“ Lock pages in memory dialog box, click Add User or Group.---WINDOWS
+ * 8. In the Select Users, Service Accounts, or Groups dialog box, add an account that you will run the miner on---WINDOWS
+ * 9. Reboot for change to take effect.---WINDOWS
+ *---WINDOWS
+ * Windows also tends to fragment memory a lot. If you are running on a system with 4-8GB of RAM you might need---WINDOWS
+ * to switch off all the auto-start applications and reboot to have a large enough chunk of contiguous memory.---WINDOWS
+ * On Linux you will need to configure large page support and increase your memlock limit (ulimit -l).---LINUX
+ *---LINUX
+ * To set large page support, add the following to "/etc/sysctl.d/60-hugepages.conf":---LINUX
+ *     vm.nr_hugepages=128---LINUX
+ * You WILL need to run "sudo sysctl --system" for these settings to take effect on your system (or reboot).---LINUX
+ *  In some cases (many threads, very large CPU, etc) you may need more than 128---LINUX
+ *   (try 256 if there are still complaints from thread inits)---LINUX
+ *---LINUX
+ * To increase the memlock (ulimit -l), add following lines to /etc/security/limits.d/60-memlock.conf:---LINUX
+ *     *    - memlock 262144---LINUX
+ *     root - memlock 262144---LINUX
+ * You WILL need to log out and log back in for these settings to take effect on your user (no need to reboot, just relogin in your session).---LINUX
+ *---LINUX
+ * Check with "/sbin/sysctl vm.nr_hugepages ; ulimit -l" to validate---LINUX
+ *---LINUX
+ * Memory locking means that the kernel can't swap out the page to disk - something that is unlikely to happen on a---LINUX
+ * command line system that isn't starved of memory. I haven't observed any difference on a CLI Linux system between---LINUX
+ * locked and unlocked memory. If that is your setup see option "no_mlck".---LINUX
+ *
+ *
  * use_slow_memory defines our behaviour with regards to large pages. There are three possible options here:
  * always  - Don't even try to use large pages. Always use slow memory.
  * warn    - We will try to use large pages, but fall back to slow memory if that fails.
- * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.
- *           It will never use slow memory, but it won't attempt to mlock
+ * no_mlck - This option is only relevant on Linux, where we can use large pages without locking memory.---LINUX
+ *           It will never use slow memory, but it won't attempt to mlock---LINUX
  * never   - If we fail to allocate large pages we will print an error and exit.
  */
 "use_slow_memory" : "warn",
@@ -149,6 +158,4 @@ R"===(
  *               This setting will only be needed in 2020's. No need to worry about it now.
  */
 "prefer_ipv4" : true,
-
 )==="
-
diff --git a/xmrstak/cpputil/read_write_lock.h b/xmrstak/cpputil/read_write_lock.h
index 9139dfd22883e93b8e3e65ce5978dab34eb74471..51f42a2e65695a236d0b14c71e78cc1b6b9adeb2 100644
--- a/xmrstak/cpputil/read_write_lock.h
+++ b/xmrstak/cpputil/read_write_lock.h
@@ -61,7 +61,7 @@ class RWLock
 		std::unique_lock<std::mutex> lck(mtx_);
 		if (status_ == -1) {
 			status_ = 0;
-		} 
+		}
 		else
 		{
 			status_ -= 1;
@@ -72,7 +72,7 @@ class RWLock
 			{
 				write_cv_.notify_one();
 			}
-		} 
+		}
 		else
 		{
 			read_cv_.notify_all();
diff --git a/xmrstak/donate-level.hpp b/xmrstak/donate-level.hpp
index 71b79628ad5a4e79948e7c8bfbac237c116a7935..0f851f8f8f17001389a36e9679b3f24538892356 100644
--- a/xmrstak/donate-level.hpp
+++ b/xmrstak/donate-level.hpp
@@ -1,18 +1,19 @@
 #pragma once
 
 /*
- * Dev donation.
- * Percentage of your hashing power that you want to donate to the developer, can be 0.0 if you don't want to do that.
- * Example of how it works for the default setting of 2.0:
- * You miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minute.
- * Switching is instant, and only happens after a successful connection, so you never loose any hashes.
+ * DEV DONATION SETTING
+ * This setting is a percentage of your hashing power that the miner donates to the developers of this app.
+ * It can be 0.0 if you don't want to help the developers. The default setting of 2.0 means that
+ * the miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2.0 minutes.
+ * Switching pools is instant and it only happens after a successful connection, so you don't lose any hash time.
  *
- * If you plan on changing this setting to 0.0 please consider making a one off donation to our wallets:
+ * If you plan on changing this setting to 0.0, please consider making a one time donation to our wallets:
  * fireice-uk:
  * 4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk
  * psychocrypt:
  * 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU
  *
+ * Thank you for your support.
  */
 
 constexpr double fDevDonationLevel = 2.0 / 100.0;
diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp
index dba7d7cdbcd3ae51fd153ff4ca3b20cb256cb73f..ed9abc2bc261c06ddbc891131989a45ef8f0b9f3 100644
--- a/xmrstak/http/httpd.cpp
+++ b/xmrstak/http/httpd.cpp
@@ -71,17 +71,17 @@ int httpd::req_handler(void * cls,
 		if (username == NULL)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
-			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, MHD_NO);
+			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO);
 			MHD_destroy_response(rsp);
 			return ret;
 		}
 		free(username);
 
-		ret = MHD_digest_auth_check(connection, sHttpAuthRelam, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
+		ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300);
 		if (ret == MHD_INVALID_NONCE || ret == MHD_NO)
 		{
 			rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT);
-			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRelam, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
+			ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO);
 			MHD_destroy_response(rsp);
 			return ret;
 		}
diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index d6ee66e8dc9c011cc3354abfb4ea395de1c1ff0b..93e2175193ae9382a22bfd833495f023d6d67b10 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -113,7 +113,7 @@ extern const char sHtmlCssFile [] =
 
 size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1;
 
-extern const char sHttpAuthRelam[] = "XMR-Stak-Miner";
+extern const char sHttpAuthRealm[] = "XMR-Stak-Miner";
 extern const char sHttpAuthOpaque[] = "6c071f0df539e234cadbcd79164af7a594e23ab42bccb834df796aead6ce96e4";
 
 extern const char sHtmlAccessDenied[] =
diff --git a/xmrstak/http/webdesign.hpp b/xmrstak/http/webdesign.hpp
index 48adfea98c39f6f3aaccff89371fc6f00afbce9e..bcbe5ae1da2c4a96666fe40b77edd652703acb37 100644
--- a/xmrstak/http/webdesign.hpp
+++ b/xmrstak/http/webdesign.hpp
@@ -7,7 +7,7 @@ extern size_t sHtmlCssSize;
 extern const char sHtmlAccessDenied[];
 extern size_t sHtmlAccessDeniedSize;
 
-extern const char sHttpAuthRelam[];
+extern const char sHttpAuthRealm[];
 extern const char sHttpAuthOpaque[];
 
 extern const char sHtmlCommonHeader[];
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 354388849c0ee916a77e2cea99fcef220691be1a..b608c0028984491b62c0ee3766201846235e5e5c 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -87,27 +87,29 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
 xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
-	{ "aeon7",               {cryptonight_aeon, cryptonight_lite, 7u},     {cryptonight_aeon, cryptonight_lite, 7u},     "mine.aeon-pool.com:5555" },
-	{ "bbscoin",             {cryptonight_monero, cryptonight, 3u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "bittube",             {cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
-	{ "cryptonight",         {cryptonight_monero, cryptonight, 255u},      {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "cryptonight_bittube2",{cryptonight_bittube2, cryptonight_bittube2, 0}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},
-	{ "cryptonight_masari",  {cryptonight_monero, cryptonight_masari, 255u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr },
-	{ "cryptonight_haven",   {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "cryptonight_heavy",   {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "cryptonight_lite",    {cryptonight_aeon, cryptonight_lite, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr },
-	{ "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr },
-	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr },
-	{ "cryptonight_v7",      {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr },
-	{ "graft",               {cryptonight_monero, cryptonight, 8u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "haven",               {cryptonight_haven, cryptonight_heavy, 3u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "intense",             {cryptonight_monero, cryptonight, 4u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "masari",              {cryptonight_masari, cryptonight_monero, 7u},   {cryptonight_monero, cryptonight_monero, 0u},nullptr },
-	{ "monero7",             {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" },
-	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "stellite",            {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
-	{ "turtlecoin",          {cryptonight_lite, cryptonight_aeon, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr }
+	{ "aeon7",               {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     "mine.aeon-pool.com:5555" },
+	{ "bbscoin",             {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
+	{ "bittube",             {cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"},
+	{ "cryptonight",         {cryptonight_monero_v8, cryptonight, 255u},          {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u},     {cryptonight_heavy, cryptonight_heavy, 0u},nullptr},
+	{ "cryptonight_masari",  {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
+	{ "cryptonight_haven",   {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "cryptonight_heavy",   {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "cryptonight_lite",    {cryptonight_aeon, cryptonight_lite, 255u},          {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
+	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
+	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u},      {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
+	{ "cryptonight_v7",      {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_v8",      {cryptonight_monero, cryptonight_monero_v8, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "graft",               {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "haven",               {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "intense",             {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "masari",              {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
+	{ "monero",              {cryptonight_monero_v8, cryptonight_monero, 8u},     {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
+	{ "qrl",             	 {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "stellite",            {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "turtlecoin",          {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }
 };
 
 constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
@@ -323,13 +325,6 @@ bool jconf::IsOnAlgoList(std::string& needle)
 {
 	std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower);
 
-	if(needle == "monero")
-	{
-		printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n"
-			"Please use 'monero7' (support auto switch to new POW) if you want to mine Monero, \nor name the coin that you want to mine.");
-		return false;
-	}
-
 	for(size_t i=0; i < coin_algo_size; i++)
 	{
 		if(needle == coins[i].coin_name)
@@ -614,13 +609,6 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools)
 
 	for(size_t i=0; i < coin_algo_size; i++)
 	{
-		if(ctmp == "monero")
-		{
-			printer::inst()->print_msg(L0, "You entered Monero as coin name. Monero will hard-fork the PoW.\nThis means it will stop being compatible with other cryptonight coins.\n"
-				"Please use monero7 (support auto switch to new POW) if you want to mine Monero, or name the coin that you want to mine.");
-			return false;
-		}
-
 		if(ctmp == coins[i].coin_name)
 		{
 			currentCoin = coins[i];
diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp
index d95ea6b7222cdfc43788039b5982599463f17d62..3f79df44cb2aa282d371af430f4aa1e0ff16361d 100644
--- a/xmrstak/misc/configEditor.hpp
+++ b/xmrstak/misc/configEditor.hpp
@@ -6,6 +6,7 @@
 #include <streambuf>
 #include <regex>
 
+#include "../version.hpp"
 
 namespace xmrstak
 {
@@ -42,6 +43,24 @@ struct configEditor
 
 	void write(const std::string filename)
 	{
+		// endmarks: for filtering full lines inside the template string
+		// Platform marks are done globally here
+		// "---WINDOWS" endmark keeps lines when compiled for Windows
+		// "---LINUX"   endmark keeps lines when compiled for Linux (and anything not-windows)
+#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__WINDOWS__)
+		// windows:
+		//   completely drop lines with endmark-linux
+		replace(".*---LINUX\n", "");
+		//   strip off windows endmarks, keep the lines
+		replace("---WINDOWS\n", "\n");
+#else
+		// not-windows:
+		//   completely drop lines with endmark-windows
+		replace(".*---WINDOWS\n", "");
+		//   strip off linux endmarks, keep the lines
+		replace("---LINUX\n", "\n");
+#endif
+		replace("XMRSTAK_VERSION", get_version_str());
 		std::ofstream out(filename);
 		out << m_fileContent;
 		out.close();
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index 11d0f6df05b96f07bf181a015e3cf2678c7effb6..4fcce1f97bb17f78e14d686aa5ce5db28713ea84 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -560,7 +560,7 @@ void executor::ex_main()
 		else
 			pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true);
 		break;
-
+	case cryptonight_monero_v8:
 	case cryptonight_monero:
 		if(dev_tls)
 			pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false);
@@ -1255,7 +1255,7 @@ void executor::http_json_report(std::string& out)
 		if(i != 0) cn_error.append(1, ',');
 
 		snprintf(buffer, sizeof(buffer), sJsonApiConnectionError,
-			int_port(duration_cast<seconds>(vMineResults[i].time.time_since_epoch()).count()),
+			int_port(duration_cast<seconds>(vSocketLog[i].time.time_since_epoch()).count()),
 			vSocketLog[i].msg.c_str());
 		cn_error.append(buffer);
 	}
diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp
index 5642f6b66779bfec21c105fd84fce853c00546f2..197da8ecaac4398bce23df354579e16afa8500a4 100644
--- a/xmrstak/misc/telemetry.cpp
+++ b/xmrstak/misc/telemetry.cpp
@@ -49,6 +49,7 @@ telemetry::telemetry(size_t iThd)
 
 double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 {
+	std::unique_lock<std::mutex> lk(mtx);
 	uint64_t iTimeNow = get_timestamp_ms();
 
 	uint64_t iEarliestHashCnt = 0;
@@ -98,6 +99,7 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread)
 
 void telemetry::push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp)
 {
+	std::unique_lock<std::mutex> lk(mtx);
 	size_t iTop = iBucketTop[iThd];
 	ppHashCounts[iThd][iTop] = iHashCount;
 	ppTimestamps[iThd][iTop] = iTimestamp;
diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp
index 309fd6d06055f446737238565117005ea7d721ce..1813c00e6a07d41917ffe621d348a36be0ab8706 100644
--- a/xmrstak/misc/telemetry.hpp
+++ b/xmrstak/misc/telemetry.hpp
@@ -2,6 +2,7 @@
 
 #include <cstdint>
 #include <cstring>
+#include <mutex>
 
 namespace xmrstak
 {
@@ -14,6 +15,7 @@ public:
 	double calc_telemetry_data(size_t iLastMillisec, size_t iThread);
 
 private:
+	mutable std::mutex mtx;
 	constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations
 	constexpr static size_t iBucketMask = iBucketSize - 1;
 	uint32_t* iBucketTop;
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 9fce9b7e59cffd066cd51c0ce411ee766dc9dd14..d20ba082f0b72aa02bafac45abdb4a18718783a7 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -685,6 +685,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 		case cryptonight_monero:
 			algo_name = "cryptonight_v7";
 			break;
+		case cryptonight_monero_v8:
+			algo_name = "cryptonight_v8";
+			break;
 		case cryptonight_aeon:
 			algo_name = "cryptonight_lite_v7";
 			break;
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 6960d63bb1e5b2a2085f0d7a1be130d137922162..e86e2a537aaae54004efdd2a6e14c630d128965a 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -1,4 +1,5 @@
-R"===(
+R"===(// generated by XMRSTAK_VERSION
+
 /*
  * pool_address    - Pool address should be in the form "pool.supportxmr.com:3333". Only stratum pools are supported.
  * wallet_address  - Your wallet, or pool login.
@@ -22,12 +23,13 @@ POOLCONF],
  *
  *    aeon7 (use this for Aeon's new PoW)
  *    bbscoin (automatic switch with block version 3 to cryptonight_v7)
- *    bittube (uses cyrptonight_bittube2 algorithm)
+ *    bittube (uses cryptonight_bittube2 algorithm)
  *    graft
  *    haven (automatic switch with block version 3 to cryptonight_haven)
  *    intense
  *    masari
- *    monero7 (use this for Monero's new PoW)
+ *    monero (use this to support Monero's Oct 2018 fork)
+ *    qrl - Quantum Resistant Ledger
  *    ryo
  *    turtlecoin
  *
@@ -40,13 +42,12 @@ POOLCONF],
  *    # 2MiB scratchpad memory
  *    cryptonight
  *    cryptonight_v7
+ *    cryptonight_v8
  *    # 4MiB scratchpad memory
- *    cyrptonight_bittube2
+ *    cryptonight_bittube2
  *    cryptonight_haven
  *    cryptonight_heavy
  */
 
 "currency" : "CURRENCY",
-
 )==="
-
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index cd8bb52f57d0573d6c0af2035d634fae66471ff4..80c25c6b8042ecfe312d8ffa821907d5813bc980 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.4.7"
+#define XMR_STAK_VERSION "2.5.0"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"