diff --git a/CMakeLists.txt b/CMakeLists.txt
index a642b385d..067bbd0a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE)
 endif()
 target_link_libraries(xmr-stak-c ${LIBS})
 
+enable_language(ASM)
+# asm optimized monero v8 code
+add_library(xmr-stak-asm
+    STATIC
+    "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
+)
+set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)
+
+
 # compile generic backend files
 file(GLOB BACKEND_CPP
     "xmrstak/*.cpp"
@@ -472,7 +481,7 @@ add_library(xmr-stak-backend
     STATIC
     ${BACKEND_CPP}
 )
-target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
+target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)
 
 # compile CUDA backend
 if(CUDA_FOUND)
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 87721ac8f..15171a1a0 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -390,9 +390,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 
 		char options[512];
 		snprintf(options, sizeof(options),
-			"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d  -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
+			"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d  -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d -DCN_UNROLL=%d",
 		hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
-			int_port(hashMemSize), int(miner_algo[ii]));
+			int_port(hashMemSize), int(miner_algo[ii]), ctx->unroll);
 		/* create a hash for the compile time cache
 		 * used data:
 		 *   - source code
@@ -885,6 +885,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	//char* source_code = LoadTextFile(sSourcePath);
 
+	const char *fastIntMathV2CL =
+			#include "./opencl/fast_int_math_v2.cl"
+	;
 	const char *cryptonightCL =
 			#include "./opencl/cryptonight.cl"
 	;
@@ -905,6 +908,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	;
 
 	std::string source_code(cryptonightCL);
+	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
@@ -914,16 +918,37 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	// create a directory  for the OpenCL compile cache
 	create_directory(get_home() + "/.openclcache");
 
+	// check if cryptonight_monero_v8 is selected for the user or dev pool
+	bool useCryptonight_v8 =
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		const std::string backendName = xmrstak::params::inst().openCLVendor;
 		if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
 		{
 			size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
 			ctx[i].rawIntensity = reduced_intensity;
-			const std::string backendName = xmrstak::params::inst().openCLVendor;
 			printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
 		}
 
+		if(useCryptonight_v8)
+		{
+			if(ctx[i].stridedIndex == 1)
+			{
+				printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
+				return ERR_STUPID_PARAMS;
+			}
+			if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2)
+			{
+				printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
+				return ERR_STUPID_PARAMS;
+			}
+		}
+
 		if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{
 			return ret;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 5ab80b82a..63c5029d7 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -27,6 +27,7 @@ struct GpuContext
 	size_t workSize;
 	int stridedIndex;
 	int memChunk;
+	int unroll = 0;
 	bool isNVIDIA = false;
 	int compMode;
 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 002472d3a..28d56a463 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
 }
 #endif
 
+//#include "opencl/fast_int_math_v2.cl"
+XMRSTAK_INCLUDE_FAST_INT_MATH_V2
 //#include "opencl/wolf-aes.cl"
 XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
@@ -416,6 +418,9 @@ void AESExpandKey256(uint *keybuf)
 	}
 }
 
+)==="
+R"===(
+
 #define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)
 
 #if(STRIDED_INDEX==0)
@@ -551,7 +556,15 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
-		
+
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+#	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
+#	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
+#else
+#	define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
+#endif
+	
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
@@ -560,9 +573,29 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 )
 {
-	ulong a[2], b[2];
+	ulong a[2];
+
+// cryptonight_monero_v8
+#if(ALGO==11)		
+	ulong b[4];
+	uint4 b_x[2];
+// NVIDIA
+#	ifdef __NV_CL_C_VERSION
+	__local uint16 scratchpad_line_buf[WORKSIZE];
+ 	__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
+#	endif
+#else
+	ulong b[2];
+	uint4 b_x[1];
+#endif
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 
+// cryptonight_monero_v8
+#if(ALGO==11)
+	__local uint RCP[256];
+	uint2 division_result;
+	uint sqrt_result;
+#endif
 	const ulong gIdx = getIdx();
 
 	for(int i = get_local_id(0); i < 256; i += WORKSIZE)
@@ -572,6 +605,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		AES1[i] = rotate(tmp, 8U);
 		AES2[i] = rotate(tmp, 16U);
 		AES3[i] = rotate(tmp, 24U);
+// cryptonight_monero_v8
+#if(ALGO==11)
+		RCP[i] = RCP_C[i];
+#endif
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
@@ -579,7 +616,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
     uint2 tweak1_2;
 #endif
-	uint4 b_x;
+
 #if(COMP_MODE==1)
 	// do not use early return here
 	if(gIdx < Threads)
@@ -599,7 +636,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		a[1] = states[1] ^ states[5];
 		b[1] = states[3] ^ states[7];
 
-		b_x = ((uint4 *)b)[0];
+		b_x[0] = ((uint4 *)b)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+		a[1] = states[1] ^ states[5];
+		b[2] = states[8] ^ states[10];
+		b[3] = states[9] ^ states[11];
+		b_x[1] = ((uint4 *)b)[1];
+		division_result = as_uint2(states[12]);
+		sqrt_result = as_uint2(states[13]).s0;
+#endif
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 		tweak1_2 = as_uint2(input[4]);
@@ -617,37 +664,96 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	if(gIdx < Threads)
 #endif
 	{
-		ulong idx0 = a[0];
+		ulong idx0 = a[0] & MASK;
 
-		#pragma unroll 8
+		#pragma unroll CN_UNROLL
 		for(int i = 0; i < ITERATIONS; ++i)
 		{
 			ulong c[2];
+// cryptonight_monero_v8 && NVIDIA
+#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+			ulong idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
+#endif
 
-			((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
+			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
 // cryptonight_bittube2
 #if(ALGO == 10)
 			((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #else
 			((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #endif
-			b_x ^= ((uint4 *)c)[0];
+
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
+
 // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 			uint table = 0x75310U;
+			b_x[0] ^= ((uint4 *)c)[0];
 // cryptonight_stellite
 #	if(ALGO == 7)
-			uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2);
+			uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2);
+#	else
+			uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2);
+#	endif
+			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
+			SCRATCHPAD_CHUNK(0) = b_x[0];
+			idx0 = c[0] & MASK;
+// cryptonight_monero_v8
+#elif(ALGO==11)
+			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
+#	ifdef __NV_CL_C_VERSION
+			// flush shuffeled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+ 			idx0 = c[0] & MASK;
+ 			idxS = idx0 & 0x30;
+ 			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #	else
-			uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
+			idx0 = c[0] & MASK;
 #	endif
-			b_x.s2 ^= ((table >> index) & 0x30U) << 24;
+#else
+			b_x[0] ^= ((uint4 *)c)[0];
+			SCRATCHPAD_CHUNK(0) = b_x[0];
+			idx0 = c[0] & MASK;
 #endif
-			Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;
-
 			uint4 tmp;
-			tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
-
+			tmp = SCRATCHPAD_CHUNK(0);
+// cryptonight_monero_v8
+#if(ALGO==11)
+			// Use division and square root results from the _previous_ iteration to hide the latency
+			tmp.s0 ^= division_result.s0;
+			tmp.s1 ^= division_result.s1 ^ sqrt_result;
+ 			// Most and least significant bits in the divisor are set to 1
+			// to make sure we don't divide by a small or even number,
+			// so there are no shortcuts for such cases
+			const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
+ 			// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+			// We drop the highest bit to fit both quotient and remainder in 32 bits
+			division_result = fast_div_v2(RCP, c[1], d);
+ 			// Use division_result as an input for the square root to prevent parallel implementation in hardware
+			sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
+#endif
+// cryptonight_monero_v8
+#if(ALGO==11)
+			{
+				ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
+				ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+				ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+				SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+				SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+				SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+			}
+#endif
 			a[1] += c[0] * as_ulong2(tmp).s0;
 			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
 
@@ -658,44 +764,55 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #	if(ALGO == 6 || ALGO == 10)
 			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
 #	else
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 			((uint2 *)&(a[1]))[0] ^= tweak1_2;
 #	endif
 
 #else
-			Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
 #endif
 
 			((uint4 *)a)[0] ^= tmp;
-			idx0 = a[0];
 
-			b_x = ((uint4 *)c)[0];
+// cryptonight_monero_v8
+#if (ALGO == 11)
+#	if defined(__NV_CL_C_VERSION)
+			// flush shuffeled data
+			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
+#	endif
+			b_x[1] = b_x[0];
+#endif
+			b_x[0] = ((uint4 *)c)[0];
+			idx0 = a[0] & MASK;
 
 // cryptonight_heavy || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 10)
-			long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
-			int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = n / (d | 0x5);
-			*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
-			idx0 = d ^ q;
-#endif
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = (d ^ q) & MASK;
 // cryptonight_haven
-#if (ALGO == 9)
-			long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
-			int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
+#elif (ALGO == 9)
+			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
+			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = n / (d | 0x5);
-			*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
-			idx0 = (~d) ^ q;
+			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
+			idx0 = ((~d) ^ q) & MASK;
 #endif
+		
 		}
 	}
 	mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
+)==="
+R"===(
+
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
 __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index d6acec971..c5b331c87 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -127,6 +127,24 @@ class autoAdjust
 				minFreeMem = 512u * byteToMiB;
 			}
 
+			// check if cryptonight_monero_v8 is selected for the user or dev pool
+			bool useCryptonight_v8 =
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
+				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+
+			// set strided index to default
+			ctx.stridedIndex = 1;
+
+			// nvidia performance is very bad if the scratchpad is not contiguous
+			if(ctx.isNVIDIA)
+				ctx.stridedIndex = 0;
+
+			// use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2`
+			if(useCryptonight_v8)
+				ctx.stridedIndex = 2;
+
 			// increase all intensity limits by two for aeon
 			if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
 				maxThreads *= 2u;
@@ -153,8 +171,8 @@ class autoAdjust
 				// set 8 threads per block (this is a good value for the most gpus)
 				conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
 					"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
-					"    \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n"
-					"    \"comp_mode\" : true\n" +
+					"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
+					"    \"unroll\" : 8, \"comp_mode\" : true\n" +
 					"  },\n";
 			}
 			else
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 28855f070..0101b7e2f 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -13,13 +13,15 @@ R"===(
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
  *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * unroll        - allow to control how often the POW main loop is unrolled; valid range [0;128]
  * comp_mode     - Compatibility enable/disable the automatic guard around compute kernel which allows
  *                 to use a intensity which is not the multiple of the worksize.
  *                 If you set false and the intensity is not multiple of the worksize the miner can crash:
  *                 in this case set the intensity to a multiple of the worksize or activate comp_mode.
  * "gpu_threads_conf" :
  * [
- *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true },
+ *	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, 
+ *    "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
  * ],
  * If you do not wish to mine with your AMD GPU(s) then use:
  * "gpu_threads_conf" :
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 0f39ff2b9..36ae9fd43 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
 	stridedIndex = GetObjectMember(oThdConf, "strided_index");
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
+	unroll = GetObjectMember(oThdConf, "unroll");
 	compMode = GetObjectMember(oThdConf, "comp_mode");
 
 	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
-		stridedIndex == nullptr || compMode == nullptr)
+		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -150,6 +151,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 		return false;
 	}
 
+	if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 )
+	{
+		printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128");
+		return false;
+	}
+	cfg.unroll = (int)unroll->GetInt64();
+
 	if(!compMode->IsBool())
 		return false;
 
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index 580b69fe7..b852c5940 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -28,6 +28,7 @@ class jconf
 		long long cpu_aff;
 		int stridedIndex;
 		int memChunk;
+		int unroll;
 		bool compMode;
 	};
 
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index d6051ffcd..5ac246335 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -99,6 +99,7 @@ bool minethd::init_gpus()
 		vGpuData[i].stridedIndex = cfg.stridedIndex;
 		vGpuData[i].memChunk = cfg.memChunk;
 		vGpuData[i].compMode = cfg.compMode;
+		vGpuData[i].unroll = cfg.unroll;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 57dbef053..8588fea8c 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -82,7 +82,7 @@ class autoAdjust
 
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string(double_mode ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(aff_id);
 				conf += std::string(" },\n");
 
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index 01d2280d8..a73de8618 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -70,7 +70,7 @@ class autoAdjust
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
 				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
-				conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : ");
+				conf += std::string(", \"no_prefetch\" : true,  \"asm\" : \"auto\", \"affine_to_cpu\" : ");
 				conf += std::to_string(id & 0x7FFFFFF);
 				conf += std::string(" },\n");
 			}
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index 2fc9a47ec..bfffc851e 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -7,10 +7,15 @@ R"===(
  *                  the maximum performance. When set to a number N greater than 1, this mode will increase the
  *                  cache usage and single thread performance by N times.
  *
- * no_prefetch -    Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
+ * no_prefetch    - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make
  *                  things slower.
  *
- * affine_to_cpu -  This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
+ * asm            - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, intel, ryzen]
+ *                    - auto: used the default implementation (no assembler version)
+ *                    - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+ *                    - ryzen: AMD Ryzen (1xxx and 2xxx series)
+ *
+ * affine_to_cpu  - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading
  *                  systems it is better to assign threads to physical cores. On Windows this usually means selecting
  *                  even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4
  *                  physical core CPU you should select cpu numbers 0-3.
@@ -21,8 +26,8 @@ R"===(
  * A filled out configuration should look like this:
  * "cpu_threads_conf" :
  * [
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
- *      { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 },
+ *      { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 },
  * ],
  * If you do not wish to mine with your CPU(s) then use:
  * "cpu_threads_conf" :
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 89c508990..0ab47e390 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -19,6 +19,7 @@
 #include "xmrstak/backend/cryptonight.hpp"
 #include <memory.h>
 #include <stdio.h>
+#include <cfenv>
 
 #ifdef __GNUC__
 #include <x86intrin.h>
@@ -422,6 +423,27 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 	_mm_store_si128(output + 11, xout7);
 }
 
+inline __m128i int_sqrt33_1_double_precision(const uint64_t n0)
+{
+	__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
+	x = _mm_sqrt_sd(_mm_setzero_pd(), x);
+	uint64_t r = static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
+
+	const uint64_t s = r >> 20;
+	r >>= 19;
+
+	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+
+#if defined _MSC_VER || (__GNUC__ >= 7)
+	_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
+#else
+	// GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence
+ 	// Fallback to simpler code
+ 	if (x2 < n0) ++r;
+#endif
+	return _mm_cvtsi64_si128(r);
+}
+
 inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key)
 {
 	alignas(16) uint32_t k[4];
@@ -467,6 +489,51 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 
 }
 
+inline void set_float_rounding_mode()
+{
+#ifdef _MSC_VER
+	_control87(RC_DOWN, MCW_RC);
+#else
+	std::fesetround(FE_DOWNWARD);
+#endif
+}
+
+#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \
+	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	}
+
+#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		const uint64_t sqrt_result = static_cast<uint64_t>(_mm_cvtsi128_si64(sqrt_result_xmm)); \
+		/* Use division and square root results from the _previous_ iteration to hide the latency */ \
+		const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \
+		cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \
+		const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \
+		/* Most and least significant bits in the divisor are set to 1 \
+		 * to make sure we don't divide by a small or even number, \
+		 * so there are no shortcuts for such cases \
+		 * \
+		 * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \
+		 * We drop the highest bit to fit both quotient and remainder in 32 bits \
+		 */  \
+		/* Compiler will optimize it to a single div instruction */ \
+		const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+		const uint64_t division_result = static_cast<uint32_t>(cx_s / d) + ((cx_s % d) << 32); \
+		division_result_xmm = _mm_cvtsi64_si128(static_cast<int64_t>(division_result)); \
+		/* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \
+		sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \
+	}
+
 #define CN_INIT_SINGLE \
 	if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \
 	{ \
@@ -474,7 +541,7 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 		return; \
 	}
 
-#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \
+#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \
 	keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \
 	uint64_t monero_const; \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -489,16 +556,27 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 	uint64_t idx0; \
 	__m128i bx0; \
 	uint8_t* l0 = ctx[n]->long_state; \
+	/* BEGIN cryptonight_monero_v8 variables */ \
+	__m128i bx1; \
+	__m128i division_result_xmm; \
+	__m128i sqrt_result_xmm; \
+	/* END cryptonight_monero_v8 variables */ \
 	{ \
 		uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \
 		idx0 = h0[0] ^ h0[4]; \
 		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
 		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
+		if(ALGO == cryptonight_monero_v8) \
+		{ \
+			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
+			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
+			sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \
+			set_float_rounding_mode(); \
+		} \
 	} \
 	__m128i *ptr0
 
-
-#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
+#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \
 	__m128i cx; \
 	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 	cx = _mm_load_si128(ptr0); \
@@ -512,7 +590,8 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 			cx = soft_aesenc(cx, ax0); \
 		else \
 			cx = _mm_aesenc_si128(cx, ax0); \
-	}
+	} \
+	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1)
 
 #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -524,15 +603,22 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
-	bx0 = cx; \
+	if(ALGO != cryptonight_monero_v8) \
+		bx0 = cx
 
-#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
+#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \
 	uint64_t lo, cl, ch; \
 	uint64_t al0 = _mm_cvtsi128_si64(ax0); \
 	uint64_t ah0 = ((uint64_t*)&ax0)[1]; \
 	cl = ((uint64_t*)ptr0)[0]; \
 	ch = ((uint64_t*)ptr0)[1]; \
-	\
+	CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \
+	CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \
+	if(ALGO == cryptonight_monero_v8) \
+	{ \
+		bx1 = bx0; \
+		bx0 = cx; \
+	} \
 	{ \
 		uint64_t hi; \
 		lo = _umul128(idx0, cl, &hi); \
@@ -542,7 +628,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 	((uint64_t*)ptr0)[0] = al0; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0)
-	
 
 #define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \
 	if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \
@@ -622,6 +707,9 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 #define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n
 #define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n
 #define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n
+#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n
+#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n
+#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n
 
 /** repeat a macro call multiple times
  *
@@ -657,15 +745,14 @@ struct Cryptonight_hash<1>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-
-			REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -687,14 +774,14 @@ struct Cryptonight_hash<2>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -716,14 +803,14 @@ struct Cryptonight_hash<3>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -745,14 +832,14 @@ struct Cryptonight_hash<4>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -774,14 +861,14 @@ struct Cryptonight_hash<5>
 		constexpr size_t MEM = cn_select_memory<ALGO>();
 
 		CN_INIT_SINGLE;
-		REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0);
+		REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm);
 
 		// Optim - 90% time boundary
 		for(size_t i = 0; i < ITERATIONS; i++)
 		{
-			REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
+			REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1);
 			REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx);
-			REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
+			REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm);
 			REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0);
 			REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0);
 		}
@@ -789,3 +876,24 @@ struct Cryptonight_hash<5>
 		REPEAT_5(0, CN_FINALIZE);
 	}
 };
+
+extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0);
+extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0);
+
+template<xmrstak_algo ALGO, int asm_version>
+void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+{
+	constexpr size_t MEM = cn_select_memory<ALGO>();
+
+	keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+	cn_explode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+
+	if (asm_version == 1)
+		cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]);
+	else
+		cryptonigh_v8_mainloop_ryzen_asm(ctx[0]);
+
+	cn_implode_scratchpad<MEM, false, false, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+	keccakf((uint64_t*)ctx[0]->hash_state, 24);
+	extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output);
+}
diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp
index 49da7ae2d..1f9501c40 100644
--- a/xmrstak/backend/cpu/jconf.cpp
+++ b/xmrstak/backend/cpu/jconf.cpp
@@ -108,10 +108,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *mode, *no_prefetch, *aff;
+	const Value *mode, *no_prefetch, *aff, *asm_version;
 	mode = GetObjectMember(oThdConf, "low_power_mode");
 	no_prefetch = GetObjectMember(oThdConf, "no_prefetch");
 	aff = GetObjectMember(oThdConf, "affine_to_cpu");
+	asm_version = GetObjectMember(oThdConf, "asm");
 
 	if(mode == nullptr || no_prefetch == nullptr || aff == nullptr)
 		return false;
@@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.iCpuAff = -1;
 
+	if(!asm_version->IsString())
+		return false;
+	cfg.asm_version_str = asm_version->GetString();
+
 	return true;
 }
 
diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp
index be855036e..4ec9165d5 100644
--- a/xmrstak/backend/cpu/jconf.hpp
+++ b/xmrstak/backend/cpu/jconf.hpp
@@ -24,6 +24,7 @@ class jconf
 	struct thd_cfg {
 		int iMultiway;
 		bool bNoPrefetch;
+		std::string asm_version_str;
 		long long iCpuAff;
 	};
 
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 93ce218a3..f07c71481 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
 #endif
 }
 
-minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity)
+minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version)
 {
 	this->backendType = iBackend::CPU;
 	oWork = pWork;
@@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch,
 	iJobNo = 0;
 	bNoPrefetch = no_prefetch;
 	this->affinity = affinity;
+	asm_version_str = asm_version;
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -305,6 +306,16 @@ bool minethd::self_test()
 			hashf("This is a test This is a test This is a test", 44, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0;
 		}
+		else if(algo == cryptonight_monero_v8)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0;
+		}
 		else if(algo == cryptonight_aeon)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon);
@@ -431,7 +442,7 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 		else
 			printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway);
 
-		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff);
+		minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str);
 		pvThreads.push_back(thd);
 	}
 
@@ -439,9 +450,31 @@ std::vector<iBackend*> minethd::thread_starter(uint32_t threadOffset, miner_work
 }
 
 template<size_t N>
-minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
+minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str)
 {
 	static_assert(N >= 1, "number of threads must be >= 1" );
+	
+	// check for asm optimized version for cryptonight_v8
+	if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes)
+	{
+		if(asm_version_str != "auto")
+		{
+			if(asm_version_str == "intel")
+			{
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 1>;
+			}
+			if(asm_version_str == "ryzen")
+			{
+				// AMD Ryzen (1xxx and 2xxx series)
+				return cryptonight_hash_v2_asm<cryptonight_monero_v8, 2>;
+			}
+			else
+			{
+				printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str());
+			}
+		}
+	}
 	// We have two independent flag bits in the functions
 	// therefore we will build a binary digit and select the
 	// function as a two digit binary
@@ -479,6 +512,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_bittube2:
 		algv = 9;
 		break;
+	case cryptonight_monero_v8:
+		algv = 10;
+		break;
 	default:
 		algv = 2;
 		break;
@@ -533,7 +569,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_bittube2, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, true>
+		Cryptonight_hash<N>::template hash<cryptonight_bittube2, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, true>
 	};
 
 	std::bitset<2> digit;
@@ -618,7 +659,7 @@ void minethd::multiway_work_main()
 
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
-	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+	cn_hash_fun hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
@@ -653,12 +694,12 @@ void minethd::multiway_work_main()
 			if(new_version >= coinDesc.GetMiningForkVersion())
 			{
 				miner_algo = coinDesc.GetMiningAlgo();
-				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			else
 			{
 				miner_algo = coinDesc.GetMiningAlgoRoot();
-				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo);
+				hash_fun_multi = func_multi_selector<N>(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str);
 			}
 			lastPoolId = oWork.iPoolId;
 			version = new_version;
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 26478542c..53ff93c15 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -32,9 +32,9 @@ class minethd : public iBackend
 private:
 
 	template<size_t N>
-	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
+	static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto");
 
-	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
+	minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version);
 
 	template<uint32_t N>
 	void multiway_work_main();
@@ -60,6 +60,7 @@ class minethd : public iBackend
 
 	bool bQuit;
 	bool bNoPrefetch;
+	std::string asm_version_str = "auto";
 };
 
 } // namespace cpu
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index b6f656138..6b1afa928 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -16,6 +16,7 @@ enum xmrstak_algo
 	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+	cryptonight_monero_v8 = 11
 };
 
 // define aeon settings
@@ -45,6 +46,9 @@ inline constexpr size_t cn_select_memory<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_monero>() { return CRYPTONIGHT_MEMORY; }
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_monero_v8>() { return CRYPTONIGHT_MEMORY; }
+
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MEMORY; }
 
@@ -72,6 +76,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
 		return CRYPTONIGHT_MEMORY;
@@ -100,6 +105,9 @@ inline constexpr uint32_t cn_select_mask<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_monero>() { return CRYPTONIGHT_MASK; }
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_monero_v8>() { return CRYPTONIGHT_MASK; }
+
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_MASK; }
 
@@ -127,6 +135,7 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
 		return CRYPTONIGHT_MASK;
@@ -155,6 +164,9 @@ inline constexpr uint32_t cn_select_iter<cryptonight_lite>() { return CRYPTONIGH
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_monero>() { return CRYPTONIGHT_ITER; }
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_monero_v8>() { return CRYPTONIGHT_ITER; }
+
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_heavy>() { return CRYPTONIGHT_HEAVY_ITER; }
 
@@ -182,6 +194,7 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 	{
 	case cryptonight_stellite:
 	case cryptonight_monero:
+	case cryptonight_monero_v8:
 	case cryptonight:
 		return CRYPTONIGHT_ITER;
 	case cryptonight_ipbc:
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 6c6475150..1273f89e9 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -194,6 +194,31 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_
 #endif
 }
 
+template<size_t group_n>
+__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2)
+{
+	uint64_t tmp;
+	((uint32_t*)&tmp)[0] = shuffle<group_n>(ptr, sub, val, src);
+	((uint32_t*)&tmp)[1] = shuffle<group_n>(ptr, sub, val, src2);
+	return tmp;
+}
+
+__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0)
+{
+	uint64_t x = (n0 >> 12) + (1023ULL << 52);
+	const double xx = sqrt( *reinterpret_cast<double*>(&x) );
+	uint64_t r = *reinterpret_cast<const uint64_t*>(&xx);
+
+	const uint64_t s = r >> 20;
+	r >>= 19;
+
+	uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+
+ 	if (x2 < n0) ++r;
+
+	return r;
+}
+
 template<size_t ITERATIONS, uint32_t MEMORY, uint32_t MASK, xmrstak_algo ALGO>
 #ifdef XMR_STAK_THREADS
 __launch_bounds__( XMR_STAK_THREADS * 4 )
@@ -250,7 +275,19 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 			idx0 = *(d_ctx_b + threads * 4 + thread);
 		}
 	}
-	d[1] = (d_ctx_b + thread * 4)[sub];
+
+	uint32_t bx1, division_result, sqrt_result;
+	if(ALGO == cryptonight_monero_v8)
+	{
+		d[1] = (d_ctx_b + thread * 12)[sub];
+		bx1 = (d_ctx_b + thread * 12 + 4)[sub];
+
+		// must be valid only for `sub < 2`
+		division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
+		sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2];
+	}
+	else
+		d[1] = (d_ctx_b + thread * 4)[sub];
 
 	#pragma unroll 2
 	for ( i = start; i < end; ++i )
@@ -259,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 		for ( int x = 0; x < 2; ++x )
 		{
 			j = ( ( idx0 & MASK ) >> 2 ) + sub;
-
+			
 			if(ALGO == cryptonight_bittube2)
 			{
 				uint32_t k[4];
@@ -290,6 +327,57 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 					}
 				}
 			}
+			else if(ALGO == cryptonight_monero_v8)
+			{
+
+				const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
+				uint4 chunk0{};
+				chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0);
+				chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0);
+				chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0);
+				chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0);
+
+				const uint32_t x_0 = ((uint32_t*)&chunk0)[sub];
+				const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4];
+				const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4];
+				const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4];
+				d[x] = a ^
+					t_fn0( x_0 & 0xff ) ^
+					t_fn1( (x_1 >> 8) & 0xff ) ^
+					t_fn2( (x_2 >> 16) & 0xff ) ^
+					t_fn3( ( x_3 >> 24 ) );
+
+				uint4 value;
+				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
+				if(sub == 1)
+					((uint64_t*)&value)[0] = tmp10;
+				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
+				if(sub == 1)
+					((uint64_t*)&value)[1] = tmp20;
+				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
+				if(sub == 2)
+					((uint64_t*)&value)[0] = tmp11;
+				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
+				if(sub == 2)
+					((uint64_t*)&value)[1] = tmp21;
+				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
+				if(sub == 3)
+					((uint64_t*)&value)[0] = tmp12;
+				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
+				if(sub == 3)
+					((uint64_t*)&value)[1] = tmp22;
+
+				if(sub > 0)
+				{
+					uint4 store{};
+					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
+					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
+
+					const int dest = sub + 1;
+					const int dest2 = dest == 4 ? 1 : dest;
+					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
+				}
+			}
 			else
 			{
 				const uint32_t x_0 = loadGlobal32<uint32_t>( long_state + j );
@@ -302,7 +390,6 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 					t_fn2( (x_2 >> 16) & 0xff ) ^
 					t_fn3( ( x_3 >> 24 ) );
 			}
-
 			//XOR_BLOCKS_DST(c, b, &long_state[j]);
 			t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
 
@@ -331,10 +418,82 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 			uint32_t yy[2];
 			*( (uint64_t*) yy ) = loadGlobal64<uint64_t>( ( (uint64_t *) long_state )+( j >> 1 ) );
+
+			if(ALGO == cryptonight_monero_v8 )
+			{
+				const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1);
+
+				// Use division and square root results from the _previous_ iteration to hide the latency
+				const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
+
+
+				const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
+				const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32);
+
+				if(sub < 2)
+					*((uint64_t*)yy) ^= cl_rhs;
+
+
+				const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL;
+
+				// Most and least significant bits in the divisor are set to 1
+				// to make sure we don't divide by a small or even number,
+				// so there are no shortcuts for such cases
+				//
+				// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
+				// We drop the highest bit to fit both quotient and remainder in 32 bits
+
+				// Compiler will optimize it to a single div instruction
+				const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
+
+
+				const uint64_t division_result_tmp = static_cast<uint32_t>(cx1 / dd) + ((cx1 % dd) << 32);
+
+				division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
+								
+				// Use division_result as an input for the square root to prevent parallel implementation in hardware
+				const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp);
+				sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2];
+			}
+
 			uint32_t zz[2];
 			zz[0] = shuffle<4>(sPtr,sub, yy[0], 0);
 			zz[1] = shuffle<4>(sPtr,sub, yy[1], 0);
-
+			// Shuffle the other 3x16 byte chunks in the current 64-byte cache line
+			if(ALGO == cryptonight_monero_v8)
+			{
+				uint4 value;
+				const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1);
+				if(sub == 1)
+					((uint64_t*)&value)[0] = tmp10;
+				const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3);
+				if(sub == 1)
+					((uint64_t*)&value)[1] = tmp20;
+				const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1);
+				if(sub == 2)
+					((uint64_t*)&value)[0] = tmp11;
+				const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3);
+				if(sub == 2)
+					((uint64_t*)&value)[1] = tmp21;
+				const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1);
+				if(sub == 3)
+					((uint64_t*)&value)[0] = tmp12;
+				const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3);
+				if(sub == 3)
+					((uint64_t*)&value)[1] = tmp22;
+				if(sub > 0)
+				{
+					const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) );
+					uint4 store{};
+					((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0];
+					((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1];
+
+					const int dest = sub + 1;
+					const int dest2 = dest == 4 ? 1 : dest;
+					*( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store;
+				}
+			}
+			
 			t1[1] = shuffle<4>(sPtr,sub, d[x], 1);
 			#pragma unroll
 			for ( k = 0; k < 2; k++ )
@@ -384,13 +543,31 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
 
 				idx0 = (~d) ^ q;
 			}
+			if(ALGO == cryptonight_monero_v8)
+			{
+				bx1 = d[(x + 1) % 2];
+			}
 		}
 	}
 
 	if ( bfactor > 0 )
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
-		(d_ctx_b + thread * 4)[sub] = d[1];
+		if(ALGO == cryptonight_monero_v8)
+		{
+			(d_ctx_b + thread * 12)[sub] = d[1];
+			(d_ctx_b + thread * 12 + 4)[sub] = bx1;
+
+			if(sub < 2)
+			{
+				// must be valid only for `sub < 2`
+				(d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result;
+				(d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result;
+			}
+		}
+		else
+			(d_ctx_b + thread * 4)[sub] = d[1];
+			
 		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
@@ -534,6 +711,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero>(ctx, startNonce);
 	}
+	else if(miner_algo == cryptonight_monero_v8)
+	{
+		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8>(ctx, startNonce);
+	}
 	else if(miner_algo == cryptonight_heavy)
 	{
 		cryptonight_core_gpu_hash<CRYPTONIGHT_HEAVY_ITER, CRYPTONIGHT_HEAVY_MASK, CRYPTONIGHT_HEAVY_MEMORY/4, cryptonight_heavy>(ctx, startNonce);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index b455f55ca..1ea54ddba 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
+	if(ALGO == cryptonight_monero_v8)
+	{
+		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
+		// bx1
+		XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b );
+		memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 );
+		// division_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 );
+		// sqrt_result
+		memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 );
+	}
+	else
+		memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
 
 	memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 );
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
@@ -298,6 +310,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
+	else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+			cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	{
+		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
+		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
+	}
 	else
 		ctx->d_ctx_state2 = ctx->d_ctx_state;
 
@@ -340,6 +358,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	if(miner_algo == cryptonight_monero_v8)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index b6580ea9a..609b55f72 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -99,12 +99,13 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u},   {cryptonight_aeon, cryptonight_lite, 7u},     nullptr },
 	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr },
 	{ "cryptonight_v7",      {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
+	{ "cryptonight_v8",      {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr },
 	{ "graft",               {cryptonight_monero, cryptonight, 8u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "haven",               {cryptonight_haven, cryptonight_heavy, 3u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
 	{ "intense",             {cryptonight_monero, cryptonight, 4u},        {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "masari",              {cryptonight_masari, cryptonight_monero, 7u},   {cryptonight_monero, cryptonight_monero, 0u},nullptr },
-	{ "monero7",             {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" },
+	{ "monero8",             {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
 	{ "qrl",             	 {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
 	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},   {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
 	{ "stellite",            {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr },
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index 11d0f6df0..02ac8b7f5 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -560,7 +560,7 @@ void executor::ex_main()
 		else
 			pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true);
 		break;
-
+	case cryptonight_monero_v8:
 	case cryptonight_monero:
 		if(dev_tls)
 			pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false);
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 9fce9b7e5..d20ba082f 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -685,6 +685,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 		case cryptonight_monero:
 			algo_name = "cryptonight_v7";
 			break;
+		case cryptonight_monero_v8:
+			algo_name = "cryptonight_v8";
+			break;
 		case cryptonight_aeon:
 			algo_name = "cryptonight_lite_v7";
 			break;