NVIDIA: optimize v8
- fix: shared memory for fast div was always allocated, even for algorithms that do not use it
- optimize the fast div algorithm
- store `division_result` (64-bit) per thread instead of shuffling it around and storing it as 32-bit
psychocrypt committed Sep 19, 2018
1 parent 9d6c718 commit 2943f5c
Showing 2 changed files with 20 additions and 18 deletions.
xmrstak/backend/nvidia/nvcc_code/cuda_core.cu (16 additions, 14 deletions)
@@ -215,10 +215,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
     __shared__ uint32_t sharedMemory[1024];

     cn_aes_gpu_init( sharedMemory );
-    __shared__ uint32_t RCP[256];
-    for (int i = threadIdx.x; i < 256; i += blockDim.x)
+    uint32_t* RCP;
+    if(ALGO == cryptonight_monero_v8)
     {
-        RCP[i] = RCP_C[i];
+        __shared__ uint32_t RCP_shared[256];
+        for (int i = threadIdx.x; i < 256; i += blockDim.x)
+        {
+            RCP_shared[i] = RCP_C[i];
+        }
+        RCP = RCP_shared;
     }
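The hunk above is the fix from the first commit-message bullet: the 1 KiB `RCP` table is now declared inside the `if(ALGO == cryptonight_monero_v8)` branch, so kernel instantiations for other algorithms no longer reserve it. A minimal standalone sketch of the pattern, assuming (as in this kernel) that `ALGO` is a compile-time template argument; all other names here are hypothetical:

```cuda
#include <cstdint>

// Because ALGO is a compile-time template argument, the dead branch --
// including its __shared__ array -- is removed entirely for other algorithm
// instantiations, so they no longer pay the shared-memory cost per block.
template<int ALGO>
__global__ void guarded_shared_sketch(const uint32_t* table_gmem, uint32_t* out)
{
    uint32_t* table = nullptr;
    if(ALGO == 1) // stands in for cryptonight_monero_v8
    {
        __shared__ uint32_t table_shared[256];
        // cooperative copy: each thread fills a strided subset of the table
        for(int i = threadIdx.x; i < 256; i += blockDim.x)
            table_shared[i] = table_gmem[i];
        __syncthreads(); // the real kernel synchronizes before first use
        table = table_shared;
    }
    out[threadIdx.x] = (table != nullptr) ? table[threadIdx.x & 255] : 0u;
}
```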


@@ -268,14 +273,15 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
         }
     }

-    uint32_t bx1, division_result, sqrt_result;
+    uint32_t bx1, sqrt_result;
+    uint64_t division_result;
     if(ALGO == cryptonight_monero_v8)
     {
         d[1] = (d_ctx_b + thread * 12)[sub];
         bx1 = (d_ctx_b + thread * 12 + 4)[sub];

         // must be valid only for `sub < 2`
-        division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2];
+        division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0];
         sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0];
     }
     else
@@ -415,21 +421,17 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
         {
             // Use division and square root results from the _previous_ iteration to hide the latency
             const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1);
-
-            uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1);
-            ((uint32_t*)&division_result_64)[1] ^= sqrt_result;
+            ((uint32_t*)&division_result)[1] ^= sqrt_result;

             if(sub < 2)
-                *((uint64_t*)yy) ^= division_result_64;
+                *((uint64_t*)yy) ^= division_result;

             const uint32_t dd = (static_cast<uint32_t>(cx0) + (sqrt_result << 1)) | 0x80000001UL;
             const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3);
-            const uint64_t division_result_tmp = fast_div_v2(RCP, cx1, dd);
-
-            division_result = ((uint32_t*)&division_result_tmp)[sub % 2];
-
+            division_result = fast_div_v2(RCP, cx1, dd);

             // Use division_result as an input for the square root to prevent parallel implementation in hardware
-            sqrt_result = fast_sqrt_v2(cx0 + division_result_tmp);
+            sqrt_result = fast_sqrt_v2(cx0 + division_result);
         }

         uint32_t zz[2];
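Together with the widened declaration (`uint64_t division_result` per thread), this hunk removes a `shuffle64` round trip from the hot loop: previously each 32-bit half of the division result lived in a different lane and had to be gathered back before every use. A hypothetical before/after sketch of the access pattern, assuming `shuffle64` wraps `__shfl_sync` roughly like this (names illustrative, not from the commit):

```cuda
#include <cstdint>

// Old scheme (sketch): each lane of a pair holds one 32-bit half, so using
// the value costs two warp shuffles to reassemble it every iteration.
__device__ uint64_t old_scheme(uint32_t half, uint32_t sqrt_result)
{
    uint64_t v;
    ((uint32_t*)&v)[0] = __shfl_sync(0xFFFFFFFFu, half, 0, 4); // low half from lane 0
    ((uint32_t*)&v)[1] = __shfl_sync(0xFFFFFFFFu, half, 1, 4); // high half from lane 1
    ((uint32_t*)&v)[1] ^= sqrt_result;
    return v;
}

// New scheme (sketch): every thread keeps the full 64-bit value in two
// registers, so the shuffles vanish; the cost is one extra register per thread.
__device__ uint64_t new_scheme(uint64_t division_result, uint32_t sqrt_result)
{
    ((uint32_t*)&division_result)[1] ^= sqrt_result;
    return division_result;
}
```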
xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp (4 additions, 4 deletions)
@@ -71,11 +71,11 @@ __device__ __forceinline__ uint64_t fast_div_v2(const uint32_t *RCP, uint64_t a,
     q[1] = (k < a) ? 1 : 0;

     const int64_t tmp = a - *((uint64_t*)(q)) * b;
-    const bool overshoot = (tmp < 0);
-    const bool undershoot = (tmp >= b);
+    const uint32_t overshoot = (tmp < 0) ? 1u : 0U;
+    const uint32_t undershoot = (tmp >= b) ? 1u : 0U;

-    q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
-    q[1] = (uint32_t)(tmp) + (overshoot ? b : 0U) - (undershoot ? b : 0U);
+    q[0] += undershoot - overshoot;
+    q[1] = (uint32_t)(tmp) + (overshoot == 1 ? b : 0U) - (undershoot ? b : 0U);

     return *((uint64_t*)(q));
 }
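This change implements the "optimize the fast div algorithm" bullet: the quotient estimate derived from the reciprocal table can be off by one in either direction, and widening the `bool` flags to 0/1 `uint32_t` values lets the quotient correction compile to a single integer add rather than two predicated selects. A host-side model of just this correction step, with a hypothetical `approx_q` standing in for the table-based estimate and the quotient simplified to 32 bits:

```cuda
#include <cstdint>
#include <cstdio>

// Given an approximate quotient that may be off by one in either direction,
// fix quotient and remainder without branches, mirroring the tail of
// fast_div_v2 above.
static void correct_divmod(uint64_t a, uint32_t b, uint32_t approx_q,
                           uint32_t* q_out, uint32_t* r_out)
{
    const int64_t tmp = (int64_t)(a - (uint64_t)approx_q * b);
    const uint32_t overshoot  = (tmp < 0) ? 1u : 0u;  // estimate one too high
    const uint32_t undershoot = (tmp >= b) ? 1u : 0u; // estimate one too low
    *q_out = approx_q + undershoot - overshoot;       // single add, no selects
    *r_out = (uint32_t)tmp + (overshoot ? b : 0u) - (undershoot ? b : 0u);
}

int main()
{
    uint32_t q, r;
    correct_divmod(1000, 7, 143, &q, &r); // true result: 142 rem 6; estimate overshoots
    printf("q=%u r=%u\n", q, r);          // prints q=142 r=6
}
```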
