Skip to content

Commit

Permalink
testing Monero POW v2 fireice-uk#1831
Browse files Browse the repository at this point in the history
  • Loading branch information
Spudz76 committed Sep 16, 2018
1 parent 0f1a827 commit 3e538db
Show file tree
Hide file tree
Showing 23 changed files with 651 additions and 87 deletions.
11 changes: 10 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,15 @@ if(MICROHTTPD_ENABLE)
endif()
target_link_libraries(xmr-stak-c ${LIBS})

enable_language(ASM)
# asm optimized monero v8 code
add_library(xmr-stak-asm
STATIC
"xmrstak/backend/cpu/crypto/asm/cryptonigh_v8_main_loop.S"
)
set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C)


# compile generic backend files
file(GLOB BACKEND_CPP
"xmrstak/*.cpp"
Expand All @@ -472,7 +481,7 @@ add_library(xmr-stak-backend
STATIC
${BACKEND_CPP}
)
target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS})
target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm)

# compile CUDA backend
if(CUDA_FOUND)
Expand Down
31 changes: 28 additions & 3 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,9 +390,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_

char options[512];
snprintf(options, sizeof(options),
"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
"-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d -DCN_UNROLL=%d",
hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<<ctx->memChunk), ctx->compMode ? 1 : 0,
int_port(hashMemSize), int(miner_algo[ii]));
int_port(hashMemSize), int(miner_algo[ii]), ctx->unroll);
/* create a hash for the compile time cache
* used data:
* - source code
Expand Down Expand Up @@ -885,6 +885,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)

//char* source_code = LoadTextFile(sSourcePath);

const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;
const char *cryptonightCL =
#include "./opencl/cryptonight.cl"
;
Expand All @@ -905,6 +908,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
;

std::string source_code(cryptonightCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
Expand All @@ -914,16 +918,37 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
// create a directory for the OpenCL compile cache
create_directory(get_home() + "/.openclcache");

// check if cryptonight_monero_v8 is selected for the user or dev pool
bool useCryptonight_v8 =
::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;

for(int i = 0; i < num_gpus; ++i)
{
const std::string backendName = xmrstak::params::inst().openCLVendor;
if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
{
size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
ctx[i].rawIntensity = reduced_intensity;
const std::string backendName = xmrstak::params::inst().openCLVendor;
printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity));
}

if(useCryptonight_v8)
{
if(ctx[i].stridedIndex == 1)
{
printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
return ERR_STUPID_PARAMS;
}
if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2)
{
printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx);
return ERR_STUPID_PARAMS;
}
}

if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
{
return ret;
Expand Down
1 change: 1 addition & 0 deletions xmrstak/backend/amd/amd_gpu/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct GpuContext
size_t workSize;
int stridedIndex;
int memChunk;
int unroll = 0;
bool isNVIDIA = false;
int compMode;

Expand Down
177 changes: 147 additions & 30 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
}
#endif

//#include "opencl/fast_int_math_v2.cl"
XMRSTAK_INCLUDE_FAST_INT_MATH_V2
//#include "opencl/wolf-aes.cl"
XMRSTAK_INCLUDE_WOLF_AES
//#include "opencl/wolf-skein.cl"
Expand Down Expand Up @@ -416,6 +418,9 @@ void AESExpandKey256(uint *keybuf)
}
}

)==="
R"===(

#define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)

#if(STRIDED_INDEX==0)
Expand Down Expand Up @@ -551,7 +556,15 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}


// cryptonight_monero_v8 && NVIDIA
#if(ALGO==11 && defined(__NV_CL_C_VERSION))
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
#else
# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)])
#endif

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads
// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
Expand All @@ -560,9 +573,29 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#endif
)
{
ulong a[2], b[2];
ulong a[2];

// cryptonight_monero_v8
#if(ALGO==11)
ulong b[4];
uint4 b_x[2];
// NVIDIA
# ifdef __NV_CL_C_VERSION
__local uint16 scratchpad_line_buf[WORKSIZE];
__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
# endif
#else
ulong b[2];
uint4 b_x[1];
#endif
__local uint AES0[256], AES1[256], AES2[256], AES3[256];

// cryptonight_monero_v8
#if(ALGO==11)
__local uint RCP[256];
uint2 division_result;
uint sqrt_result;
#endif
const ulong gIdx = getIdx();

for(int i = get_local_id(0); i < 256; i += WORKSIZE)
Expand All @@ -572,14 +605,18 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
AES1[i] = rotate(tmp, 8U);
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
// cryptonight_monero_v8
#if(ALGO==11)
RCP[i] = RCP_C[i];
#endif
}

barrier(CLK_LOCAL_MEM_FENCE);
// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
uint2 tweak1_2;
#endif
uint4 b_x;

#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
Expand All @@ -599,7 +636,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];

b_x = ((uint4 *)b)[0];
b_x[0] = ((uint4 *)b)[0];

// cryptonight_monero_v8
#if(ALGO==11)
a[1] = states[1] ^ states[5];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
b_x[1] = ((uint4 *)b)[1];
division_result = as_uint2(states[12]);
sqrt_result = as_uint2(states[13]).s0;
#endif
// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
tweak1_2 = as_uint2(input[4]);
Expand All @@ -617,37 +664,96 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
if(gIdx < Threads)
#endif
{
ulong idx0 = a[0];
ulong idx0 = a[0] & MASK;

#pragma unroll 8
#pragma unroll CN_UNROLL
for(int i = 0; i < ITERATIONS; ++i)
{
ulong c[2];
// cryptonight_monero_v8 && NVIDIA
#if(ALGO==11 && defined(__NV_CL_C_VERSION))
ulong idxS = idx0 & 0x30;
*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
#endif

((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
// cryptonight_bittube2
#if(ALGO == 10)
((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
#else
((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
#endif
b_x ^= ((uint4 *)c)[0];

// cryptonight_monero_v8
#if(ALGO==11)
{
ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
#endif

// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
uint table = 0x75310U;
b_x[0] ^= ((uint4 *)c)[0];
// cryptonight_stellite
# if(ALGO == 7)
uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2);
uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2);
# else
uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2);
# endif
b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
SCRATCHPAD_CHUNK(0) = b_x[0];
idx0 = c[0] & MASK;
// cryptonight_monero_v8
#elif(ALGO==11)
SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
# ifdef __NV_CL_C_VERSION
// flush shuffeled data
SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
idx0 = c[0] & MASK;
idxS = idx0 & 0x30;
*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
# else
uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2);
idx0 = c[0] & MASK;
# endif
b_x.s2 ^= ((table >> index) & 0x30U) << 24;
#else
b_x[0] ^= ((uint4 *)c)[0];
SCRATCHPAD_CHUNK(0) = b_x[0];
idx0 = c[0] & MASK;
#endif
Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x;

uint4 tmp;
tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];

tmp = SCRATCHPAD_CHUNK(0);
// cryptonight_monero_v8
#if(ALGO==11)
// Use division and square root results from the _previous_ iteration to hide the latency
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
// Most and least significant bits in the divisor are set to 1
// to make sure we don't divide by a small or even number,
// so there are no shortcuts for such cases
const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL;
// Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4
// We drop the highest bit to fit both quotient and remainder in 32 bits
division_result = fast_div_v2(RCP, c[1], d);
// Use division_result as an input for the square root to prevent parallel implementation in hardware
sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result));
#endif
// cryptonight_monero_v8
#if(ALGO==11)
{
ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
#endif
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);

Expand All @@ -658,44 +764,55 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
# if(ALGO == 6 || ALGO == 10)
uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
# else
((uint2 *)&(a[1]))[0] ^= tweak1_2;
Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
((uint2 *)&(a[1]))[0] ^= tweak1_2;
# endif

#else
Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
#endif

((uint4 *)a)[0] ^= tmp;
idx0 = a[0];

b_x = ((uint4 *)c)[0];
// cryptonight_monero_v8
#if (ALGO == 11)
# if defined(__NV_CL_C_VERSION)
// flush shuffeled data
SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
# endif
b_x[1] = b_x[0];
#endif
b_x[0] = ((uint4 *)c)[0];
idx0 = a[0] & MASK;

// cryptonight_heavy || cryptonight_bittube2
#if (ALGO == 4 || ALGO == 10)
long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
idx0 = d ^ q;
#endif
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
idx0 = (d ^ q) & MASK;
// cryptonight_haven
#if (ALGO == 9)
long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2];
#elif (ALGO == 9)
long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q;
idx0 = (~d) ^ q;
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
idx0 = ((~d) ^ q) & MASK;
#endif

}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}

)==="
R"===(

__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
{
Expand Down
Loading

0 comments on commit 3e538db

Please sign in to comment.