diff --git a/CMakeLists.txt b/CMakeLists.txt index a642b385d..067bbd0a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE) endif() target_link_libraries(xmr-stak-c ${LIBS}) +enable_language(ASM) +# asm optimized monero v8 code +add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S" +) +set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C) + + # compile generic backend files file(GLOB BACKEND_CPP "xmrstak/*.cpp" @@ -472,7 +481,7 @@ add_library(xmr-stak-backend STATIC ${BACKEND_CPP} ) -target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS}) +target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm) # compile CUDA backend if(CUDA_FOUND) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 87721ac8f..15171a1a0 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -390,9 +390,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char options[512]; snprintf(options, sizeof(options), - "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d", + "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d -DCN_UNROLL=%d", hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0, - int_port(hashMemSize), int(miner_algo[ii])); + int_port(hashMemSize), int(miner_algo[ii]), ctx->unroll); /* create a hash for the compile time cache * used data: * - source code @@ -885,6 +885,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) //char* source_code = LoadTextFile(sSourcePath); + const char *fastIntMathV2CL = + #include "./opencl/fast_int_math_v2.cl" + ; const char *cryptonightCL = #include "./opencl/cryptonight.cl" ; @@ -905,6 +908,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) ; std::string source_code(cryptonightCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); @@ -914,16 +918,37 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + for(int i = 0; i < num_gpus; ++i) { + const std::string backendName = xmrstak::params::inst().openCLVendor; if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; - const std::string backendName = xmrstak::params::inst().openCLVendor; printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); } + if(useCryptonight_v8) + { + if(ctx[i].stridedIndex == 1) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + } + if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 5ab80b82a..63c5029d7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -27,6 +27,7 @@ struct GpuContext size_t workSize; int stridedIndex; int memChunk; + int unroll = 0; bool isNVIDIA = false; int compMode; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 002472d3a..28d56a463 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) } #endif +//#include "opencl/fast_int_math_v2.cl" +XMRSTAK_INCLUDE_FAST_INT_MATH_V2 //#include "opencl/wolf-aes.cl" XMRSTAK_INCLUDE_WOLF_AES //#include "opencl/wolf-skein.cl" @@ -416,6 +418,9 @@ void AESExpandKey256(uint *keybuf) } } +)===" +R"===( + #define MEM_CHUNK (1<> 4))))) +#else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)]) +#endif + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 @@ -560,9 +573,29 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ) { - ulong a[2], b[2]; + ulong a[2]; + +// cryptonight_monero_v8 +#if(ALGO==11) + ulong b[4]; + uint4 b_x[2]; +// NVIDIA +# ifdef __NV_CL_C_VERSION + __local uint16 scratchpad_line_buf[WORKSIZE]; + __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0); +# endif +#else + ulong b[2]; + uint4 b_x[1]; +#endif __local uint AES0[256], AES1[256], AES2[256], AES3[256]; +// cryptonight_monero_v8 +#if(ALGO==11) + __local uint RCP[256]; + uint2 division_result; + uint sqrt_result; +#endif const ulong gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) @@ -572,6 +605,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES1[i] = rotate(tmp, 8U); AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); +// cryptonight_monero_v8 +#if(ALGO==11) + RCP[i] = RCP_C[i]; +#endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -579,7 +616,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint2 tweak1_2; #endif - uint4 b_x; + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -599,7 +636,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] = states[1] ^ states[5]; b[1] = states[3] ^ states[7]; - b_x = ((uint4 *)b)[0]; + b_x[0] = ((uint4 *)b)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + a[1] = states[1] ^ states[5]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; + b_x[1] = ((uint4 *)b)[1]; + division_result = as_uint2(states[12]); + sqrt_result = as_uint2(states[13]).s0; +#endif // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) tweak1_2 = as_uint2(input[4]); @@ -617,37 +664,96 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0]; + ulong idx0 = a[0] & MASK; - #pragma unroll 8 + #pragma unroll CN_UNROLL for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; +// cryptonight_monero_v8 && NVIDIA +#if(ALGO==11 && defined(__NV_CL_C_VERSION)) + ulong idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; +#endif - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; + ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif - b_x ^= ((uint4 *)c)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif + // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint table = 0x75310U; + b_x[0] ^= ((uint4 *)c)[0]; // cryptonight_stellite # if(ALGO == 7) - uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2); + uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); +# else + uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); +# endif + b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; + SCRATCHPAD_CHUNK(0) = b_x[0]; + idx0 = c[0] & MASK; +// cryptonight_monero_v8 +#elif(ALGO==11) + SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; +# ifdef __NV_CL_C_VERSION + // flush shuffeled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; + idx0 = c[0] & MASK; + idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; # else - uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2); + idx0 = c[0] & MASK; # endif - b_x.s2 ^= ((table >> index) & 0x30U) << 24; +#else + b_x[0] ^= ((uint4 *)c)[0]; + SCRATCHPAD_CHUNK(0) = b_x[0]; + idx0 = c[0] & MASK; #endif - Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x; - uint4 tmp; - tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; - + tmp = SCRATCHPAD_CHUNK(0); +// cryptonight_monero_v8 +#if(ALGO==11) + // Use division and square root results from the _previous_ iteration to hide the latency + tmp.s0 ^= division_result.s0; + tmp.s1 ^= division_result.s1 ^ sqrt_result; + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL; + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + division_result = fast_div_v2(RCP, c[1], d); + // Use division_result as an input for the square root to prevent parallel implementation in hardware + sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); +#endif +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); @@ -658,44 +764,55 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # if(ALGO == 6 || ALGO == 10) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; # else ((uint2 *)&(a[1]))[0] ^= tweak1_2; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= tweak1_2; # endif #else - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; #endif ((uint4 *)a)[0] ^= tmp; - idx0 = a[0]; - b_x = ((uint4 *)c)[0]; +// cryptonight_monero_v8 +#if (ALGO == 11) +# if defined(__NV_CL_C_VERSION) + // flush shuffeled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; +# endif + b_x[1] = b_x[0]; +#endif + b_x[0] = ((uint4 *)c)[0]; + idx0 = a[0] & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = d ^ q; -#endif + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = (d ^ q) & MASK; // cryptonight_haven -#if (ALGO == 9) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; +#elif (ALGO == 9) + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = (~d) ^ q; + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = ((~d) ^ q) & MASK; #endif + } } mem_fence(CLK_GLOBAL_MEM_FENCE); } +)===" +R"===( + __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) { diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index d6acec971..c5b331c87 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -127,6 +127,24 @@ class autoAdjust minFreeMem = 512u * byteToMiB; } + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + + // set strided index to default + ctx.stridedIndex = 1; + + // nvidia performance is very bad if the scratchpad is not contiguous + if(ctx.isNVIDIA) + ctx.stridedIndex = 0; + + // use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2` + if(useCryptonight_v8) + ctx.stridedIndex = 2; + // increase all intensity limits by two for aeon if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; @@ -153,8 +171,8 @@ class autoAdjust // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n" - " \"comp_mode\" : true\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" + " \"unroll\" : 8, \"comp_mode\" : true\n" + " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 28855f070..0101b7e2f 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -13,13 +13,15 @@ R"===( * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) + * unroll - allow to control how often the POW main loop is unrolled; valid range [0;128] * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows * to use a intensity which is not the multiple of the worksize. * If you set false and the intensity is not multiple of the worksize the miner can crash: * in this case set the intensity to a multiple of the worksize or activate comp_mode. * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 0f39ff2b9..36ae9fd43 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); stridedIndex = GetObjectMember(oThdConf, "strided_index"); memChunk = GetObjectMember(oThdConf, "mem_chunk"); + unroll = GetObjectMember(oThdConf, "unroll"); compMode = GetObjectMember(oThdConf, "comp_mode"); if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr || - stridedIndex == nullptr || compMode == nullptr) + stridedIndex == nullptr || unroll == nullptr || compMode == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -150,6 +151,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return false; } + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 ) + { + printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128"); + return false; + } + cfg.unroll = (int)unroll->GetInt64(); + if(!compMode->IsBool()) return false; diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index 580b69fe7..b852c5940 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -28,6 +28,7 @@ class jconf long long cpu_aff; int stridedIndex; int memChunk; + int unroll; bool compMode; }; diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index d6051ffcd..5ac246335 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -99,6 +99,7 @@ bool minethd::init_gpus() vGpuData[i].stridedIndex = cfg.stridedIndex; vGpuData[i].memChunk = cfg.memChunk; vGpuData[i].compMode = cfg.compMode; + vGpuData[i].unroll = cfg.unroll; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index 57dbef053..8588fea8c 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -82,7 +82,7 @@ class autoAdjust conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 01d2280d8..a73de8618 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ class autoAdjust { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"auto\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 2fc9a47ec..bfffc851e 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -7,10 +7,15 @@ R"===( * the maximum performance. When set to a number N greater than 1, this mode will increase the * cache usage and single thread performance by N times. * - * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make + * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. * - * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading + * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [auto, intel, ryzen] + * - auto: used the default implementation (no assembler version) + * - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + * - ryzen: AMD Ryzen (1xxx and 2xxx series) + * + * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading * systems it is better to assign threads to physical cores. On Windows this usually means selecting * even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 * physical core CPU you should select cpu numbers 0-3. @@ -21,8 +26,8 @@ R"===( * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "auto", "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 89c508990..0ab47e390 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -19,6 +19,7 @@ #include "xmrstak/backend/cryptonight.hpp" #include #include +#include #ifdef __GNUC__ #include @@ -422,6 +423,27 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } +inline __m128i int_sqrt33_1_double_precision(const uint64_t n0) +{ + __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); + x = _mm_sqrt_sd(_mm_setzero_pd(), x); + uint64_t r = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + +#if defined _MSC_VER || (__GNUC__ >= 7) + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +#else + // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence + // Fallback to simpler code + if (x2 < n0) ++r; +#endif + return _mm_cvtsi64_si128(r); +} + inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) { alignas(16) uint32_t k[4]; @@ -467,6 +489,51 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } +inline void set_float_rounding_mode() +{ +#ifdef _MSC_VER + _control87(RC_DOWN, MCW_RC); +#else + std::fesetround(FE_DOWNWARD); +#endif +} + +#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } + +#define CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl) \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t sqrt_result = static_cast(_mm_cvtsi128_si64(sqrt_result_xmm)); \ + /* Use division and square root results from the _previous_ iteration to hide the latency */ \ + const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result << 32); \ + const uint32_t d = (cx_64 + (sqrt_result << 1)) | 0x80000001UL; \ + /* Most and least significant bits in the divisor are set to 1 \ + * to make sure we don't divide by a small or even number, \ + * so there are no shortcuts for such cases \ + * \ + * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ + * We drop the highest bit to fit both quotient and remainder in 32 bits \ + */ \ + /* Compiler will optimize it to a single div instruction */ \ + const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ + division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ + /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ + sqrt_result_xmm = int_sqrt33_1_double_precision(cx_64 + division_result); \ + } + #define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ { \ @@ -474,7 +541,7 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -489,16 +556,27 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) uint64_t idx0; \ __m128i bx0; \ uint8_t* l0 = ctx[n]->long_state; \ + /* BEGIN cryptonight_monero_v8 variables */ \ + __m128i bx1; \ + __m128i division_result_xmm; \ + __m128i sqrt_result_xmm; \ + /* END cryptonight_monero_v8 variables */ \ { \ uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ idx0 = h0[0] ^ h0[4]; \ ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ + sqrt_result_xmm = _mm_cvtsi64_si128(h0[13]); \ + set_float_rounding_mode(); \ + } \ } \ __m128i *ptr0 - -#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ +#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ __m128i cx; \ ptr0 = (__m128i *)&l0[idx0 & MASK]; \ cx = _mm_load_si128(ptr0); \ @@ -512,7 +590,8 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) cx = soft_aesenc(cx, ax0); \ else \ cx = _mm_aesenc_si128(cx, ax0); \ - } + } \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -524,15 +603,22 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - bx0 = cx; \ + if(ALGO != cryptonight_monero_v8) \ + bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ - \ + CN_MONERO_V8_DIV(n, cx, sqrt_result_xmm, division_result_xmm, cl); \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ @@ -542,7 +628,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ((uint64_t*)ptr0)[0] = al0; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0) - #define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -622,6 +707,9 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) #define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n #define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n #define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n +#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n +#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n +#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n /** repeat a macro call multiple times * @@ -657,15 +745,14 @@ struct Cryptonight_hash<1> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - - REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -687,14 +774,14 @@ struct Cryptonight_hash<2> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -716,14 +803,14 @@ struct Cryptonight_hash<3> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -745,14 +832,14 @@ struct Cryptonight_hash<4> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -774,14 +861,14 @@ struct Cryptonight_hash<5> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result_xmm, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result_xmm, division_result_xmm); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -789,3 +876,24 @@ struct Cryptonight_hash<5> REPEAT_5(0, CN_FINALIZE); } }; + +extern "C" void cryptonigh_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonigh_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); + +template +void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MEM = cn_select_memory(); + + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + + if (asm_version == 1) + cryptonigh_v8_mainloop_ivybridge_asm(ctx[0]); + else + cryptonigh_v8_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); +} diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 49da7ae2d..1f9501c40 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -108,10 +108,11 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *mode, *no_prefetch, *aff; + const Value *mode, *no_prefetch, *aff, *asm_version; mode = GetObjectMember(oThdConf, "low_power_mode"); no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + asm_version = GetObjectMember(oThdConf, "asm"); if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) return false; @@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) else cfg.iCpuAff = -1; + if(!asm_version->IsString()) + return false; + cfg.asm_version_str = asm_version->GetString(); + return true; } diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index be855036e..4ec9165d5 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -24,6 +24,7 @@ class jconf struct thd_cfg { int iMultiway; bool bNoPrefetch; + std::string asm_version_str; long long iCpuAff; }; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 93ce218a3..f07c71481 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id #endif } -minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity) +minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version) { this->backendType = iBackend::CPU; oWork = pWork; @@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, iJobNo = 0; bNoPrefetch = no_prefetch; this->affinity = affinity; + asm_version_str = asm_version; std::unique_lock lck(thd_aff_set); std::future order_guard = order_fix.get_future(); @@ -305,6 +306,16 @@ bool minethd::self_test() hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } + else if(algo == cryptonight_monero_v8) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + } else if(algo == cryptonight_aeon) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); @@ -431,7 +442,7 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work else printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway); - minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff); + minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str); pvThreads.push_back(thd); } @@ -439,9 +450,31 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work } template -minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); + + // check for asm optimized version for cryptonight_v8 + if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes) + { + if(asm_version_str != "auto") + { + if(asm_version_str == "intel") + { + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + return cryptonight_hash_v2_asm; + } + if(asm_version_str == "ryzen") + { + // AMD Ryzen (1xxx and 2xxx series) + return cryptonight_hash_v2_asm; + } + else + { + printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); + } + } + } // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -479,6 +512,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_bittube2: algv = 9; break; + case cryptonight_monero_v8: + algv = 10; + break; default: algv = 2; break; @@ -533,7 +569,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; @@ -618,7 +659,7 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); uint8_t version = 0; size_t lastPoolId = 0; @@ -653,12 +694,12 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } lastPoolId = oWork.iPoolId; version = new_version; diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 26478542c..53ff93c15 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -32,9 +32,9 @@ class minethd : public iBackend private: template - static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); + static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "auto"); - minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); template void multiway_work_main(); @@ -60,6 +60,7 @@ class minethd : public iBackend bool bQuit; bool bNoPrefetch; + std::string asm_version_str = "auto"; }; } // namespace cpu diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index b6f656138..6b1afa928 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -16,6 +16,7 @@ enum xmrstak_algo cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + cryptonight_monero_v8 = 11 }; // define aeon settings @@ -45,6 +46,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIGH template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +template<> +inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } + template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } @@ -72,6 +76,7 @@ inline size_t cn_select_memory(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MEMORY; @@ -100,6 +105,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } +template<> +inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } + template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } @@ -127,6 +135,7 @@ inline size_t cn_select_mask(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MASK; @@ -155,6 +164,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } +template<> +inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } @@ -182,6 +194,7 @@ inline size_t cn_select_iter(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight: return CRYPTONIGHT_ITER; case cryptonight_ipbc: diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 6c6475150..1273f89e9 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -194,6 +194,31 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_ #endif } +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2) +{ + uint64_t tmp; + ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, val, src); + ((uint32_t*)&tmp)[1] = shuffle(ptr, sub, val, src2); + return tmp; +} + +__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0) +{ + uint64_t x = (n0 >> 12) + (1023ULL << 52); + const double xx = sqrt( *reinterpret_cast(&x) ); + uint64_t r = *reinterpret_cast(&xx); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + + if (x2 < n0) ++r; + + return r; +} + template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) @@ -250,7 +275,19 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti idx0 = *(d_ctx_b + threads * 4 + thread); } } - d[1] = (d_ctx_b + thread * 4)[sub]; + + uint32_t bx1, division_result, sqrt_result; + if(ALGO == cryptonight_monero_v8) + { + d[1] = (d_ctx_b + thread * 12)[sub]; + bx1 = (d_ctx_b + thread * 12 + 4)[sub]; + + // must be valid only for `sub < 2` + division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2]; + sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2]; + } + else + d[1] = (d_ctx_b + thread * 4)[sub]; #pragma unroll 2 for ( i = start; i < end; ++i ) @@ -259,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti for ( int x = 0; x < 2; ++x ) { j = ( ( idx0 & MASK ) >> 2 ) + sub; - + if(ALGO == cryptonight_bittube2) { uint32_t k[4]; @@ -290,6 +327,57 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti } } } + else if(ALGO == cryptonight_monero_v8) + { + + const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); + uint4 chunk0{}; + chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0); + chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0); + chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0); + chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0); + + const uint32_t x_0 = ((uint32_t*)&chunk0)[sub]; + const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4]; + const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4]; + const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4]; + d[x] = a ^ + t_fn0( x_0 & 0xff ) ^ + t_fn1( (x_1 >> 8) & 0xff ) ^ + t_fn2( (x_2 >> 16) & 0xff ) ^ + t_fn3( ( x_3 >> 24 ) ); + + uint4 value; + const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); + if(sub == 1) + ((uint64_t*)&value)[0] = tmp10; + const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); + if(sub == 1) + ((uint64_t*)&value)[1] = tmp20; + const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); + if(sub == 2) + ((uint64_t*)&value)[0] = tmp11; + const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); + if(sub == 2) + ((uint64_t*)&value)[1] = tmp21; + const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); + if(sub == 3) + ((uint64_t*)&value)[0] = tmp12; + const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); + if(sub == 3) + ((uint64_t*)&value)[1] = tmp22; + + if(sub > 0) + { + uint4 store{}; + ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; + ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; + + const int dest = sub + 1; + const int dest2 = dest == 4 ? 1 : dest; + *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; + } + } else { const uint32_t x_0 = loadGlobal32( long_state + j ); @@ -302,7 +390,6 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti t_fn2( (x_2 >> 16) & 0xff ) ^ t_fn3( ( x_3 >> 24 ) ); } - //XOR_BLOCKS_DST(c, b, &long_state[j]); t1[0] = shuffle<4>(sPtr,sub, d[x], 0); @@ -331,10 +418,82 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti uint32_t yy[2]; *( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state )+( j >> 1 ) ); + + if(ALGO == cryptonight_monero_v8 ) + { + const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1); + + // Use division and square root results from the _previous_ iteration to hide the latency + const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1); + + + const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1); + const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32); + + if(sub < 2) + *((uint64_t*)yy) ^= cl_rhs; + + + const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL; + + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + // + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + + // Compiler will optimize it to a single div instruction + const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3); + + + const uint64_t division_result_tmp = static_cast(cx1 / dd) + ((cx1 % dd) << 32); + + division_result = ((uint32_t*)&division_result_tmp)[sub % 2]; + + // Use division_result as an input for the square root to prevent parallel implementation in hardware + const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp); + sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2]; + } + uint32_t zz[2]; zz[0] = shuffle<4>(sPtr,sub, yy[0], 0); zz[1] = shuffle<4>(sPtr,sub, yy[1], 0); - + // Shuffle the other 3x16 byte chunks in the current 64-byte cache line + if(ALGO == cryptonight_monero_v8) + { + uint4 value; + const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); + if(sub == 1) + ((uint64_t*)&value)[0] = tmp10; + const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); + if(sub == 1) + ((uint64_t*)&value)[1] = tmp20; + const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); + if(sub == 2) + ((uint64_t*)&value)[0] = tmp11; + const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); + if(sub == 2) + ((uint64_t*)&value)[1] = tmp21; + const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); + if(sub == 3) + ((uint64_t*)&value)[0] = tmp12; + const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); + if(sub == 3) + ((uint64_t*)&value)[1] = tmp22; + if(sub > 0) + { + const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); + uint4 store{}; + ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; + ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; + + const int dest = sub + 1; + const int dest2 = dest == 4 ? 1 : dest; + *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; + } + } + t1[1] = shuffle<4>(sPtr,sub, d[x], 1); #pragma unroll for ( k = 0; k < 2; k++ ) @@ -384,13 +543,31 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti idx0 = (~d) ^ q; } + if(ALGO == cryptonight_monero_v8) + { + bx1 = d[(x + 1) % 2]; + } } } if ( bfactor > 0 ) { (d_ctx_a + thread * 4)[sub] = a; - (d_ctx_b + thread * 4)[sub] = d[1]; + if(ALGO == cryptonight_monero_v8) + { + (d_ctx_b + thread * 12)[sub] = d[1]; + (d_ctx_b + thread * 12 + 4)[sub] = bx1; + + if(sub < 2) + { + // must be valid only for `sub < 2` + (d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result; + (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result; + } + } + else + (d_ctx_b + thread * 4)[sub] = d[1]; + if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2) if(sub&1) *(d_ctx_b + threads * 4 + thread) = idx0; @@ -534,6 +711,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t { cryptonight_core_gpu_hash(ctx, startNonce); } + else if(miner_algo == cryptonight_monero_v8) + { + cryptonight_core_gpu_hash(ctx, startNonce); + } else if(miner_algo == cryptonight_heavy) { cryptonight_core_gpu_hash(ctx, startNonce); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index b455f55ca..1ea54ddba 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); - memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + if(ALGO == cryptonight_monero_v8) + { + memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); + // bx1 + XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); + memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 ); + // division_result + memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 ); + // sqrt_result + memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + } + else + memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); @@ -298,6 +310,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } + else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || + cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) + { + // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) + ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; + } else ctx->d_ctx_state2 = ctx->d_ctx_state; @@ -340,6 +358,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + if(miner_algo == cryptonight_monero_v8) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index b6580ea9a..609b55f72 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -99,12 +99,13 @@ xmrstak::coin_selection coins[] = { { "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr }, { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr }, { "cryptonight_v7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, + { "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr }, { "graft", {cryptonight_monero, cryptonight, 8u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "haven", {cryptonight_haven, cryptonight_heavy, 3u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "intense", {cryptonight_monero, cryptonight, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "masari", {cryptonight_masari, cryptonight_monero, 7u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, - { "monero7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" }, + { "monero8", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" }, { "qrl", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "stellite", {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 11d0f6df0..02ac8b7f5 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -560,7 +560,7 @@ void executor::ex_main() else pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true); break; - + case cryptonight_monero_v8: case cryptonight_monero: if(dev_tls) pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false); diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 9fce9b7e5..d20ba082f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -685,6 +685,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes case cryptonight_monero: algo_name = "cryptonight_v7"; break; + case cryptonight_monero_v8: + algo_name = "cryptonight_v8"; + break; case cryptonight_aeon: algo_name = "cryptonight_lite_v7"; break;