Skip to content

Commit

Permalink
version bump, small speedup for vega in kernel mtp_nvidia4
Browse files Browse the repository at this point in the history
  • Loading branch information
djm34 committed Feb 13, 2019
1 parent 8d8eb15 commit 55ca9de
Show file tree
Hide file tree
Showing 11 changed files with 6,211 additions and 21 deletions.
4 changes: 2 additions & 2 deletions configure.ac
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
m4_define([v_maj], [0])
m4_define([v_min], [0])
m4_define([v_mic], [7])
m4_define([v_min], [1])
m4_define([v_mic], [0])
m4_define([v_rev], [djm34])
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
m4_ifdef([v_rev], [m4_define([v_ver], [v_maj.v_min.v_mic-v_rev])], [m4_define([v_ver], [v_maj.v_min.v_mic])])
Expand Down
110 changes: 101 additions & 9 deletions kernel/mtp.cl
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,37 @@ static inline uint64_t eorswap64(uint64_t u, uint64_t v)
G(r,7, 3,4,9,14); \
}

#define H(r,i,a,b,c,d) \
{ \
v[a] += v[b] + (m[blake2b_sigma[r][2*i+0]]); \
v[d] = eorswap64(v[d] , v[a]); \
v[c] += v[d]; \
v[b] = ROTR64X(v[b] ^ v[c], 24); \
v[a] += v[b] + (m[blake2b_sigma[r][2*i+1]]); \
v[d] = ROTR64X(v[d] ^ v[a], 16); \
v[c] += v[d]; \
}

#define ROUNDF \
{ \
G(11,0, 0,4,8,12); \
G(11,1, 1,5,9,13); \
G(11,2, 2,6,10,14); \
G(11,3, 3,7,11,15); \
if(!last){\
G(11,4, 0,5,10,15); \
G(11,5, 1,6,11,12); \
G(11,6, 2,7,8,13); \
G(11,7, 3,4,9,14); \
}else{\
H(11,4, 0,5,10,15); \
H(11,5, 1,6,11,12); \
H(11,6, 2,7,8,13); \
H(11,7, 3,4,9,14); \
}\
}


ROUND(0);
ROUND(1);
ROUND(2);
Expand All @@ -633,6 +664,22 @@ static inline uint64_t eorswap64(uint64_t u, uint64_t v)
ROUND(10);
ROUND(11);

// ROUNDF;
/*
ROUND0;
ROUND1;
ROUND2;
ROUND3;
ROUND4;
ROUND5;
ROUND6;
ROUND7;
ROUND8;
ROUND9;
ROUND10;
ROUND11;
*/

for (int i = 0; i < 8; ++i)
hzcash[i] ^= v[i] ^ v[i + 8];

Expand Down Expand Up @@ -716,7 +763,7 @@ __global uint4 * Elements)
}

__attribute__((reqd_work_group_size(TPB_MTP, 1, 1)))
__kernel void mtp_yloop(__global unsigned int* pData, __global const ulong2 * __restrict__ DBlock, __global const ulong2 * __restrict__ DBlock2,
__kernel void mtp_yloop(__global unsigned int* pData, __global const ulong8 * __restrict__ DBlock, __global const ulong8 * __restrict__ DBlock2,
__global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint pTarget)
/*
__attribute__((reqd_work_group_size(TPB_MTP, 1, 1)))
Expand All @@ -730,8 +777,7 @@ __global uint8 * GYlocal, __global uint32_t * __restrict__ SmallestNonce, uin
uint32_t event_thread = get_global_id(0) - get_global_offset(0); //thread / ThreadNumber;

uint32_t NonceIterator = get_global_id(0);
uint64_t m[16];
uint64_t v[16];


// ulong2 FarReg[8];
uint32_t farIndex;
Expand Down Expand Up @@ -768,7 +814,7 @@ __global uint8 * GYlocal, __global uint32_t * __restrict__ SmallestNonce, uin
// uint32_t localIndex;
init_blocks = false;
unmatch_block = 0;
ulong8 DataTmp;
uint16 DataTmp;
#pragma unroll 1
for (int j = 1; j <= mtp_L; j++)
{
Expand All @@ -782,7 +828,7 @@ __global uint8 * GYlocal, __global uint32_t * __restrict__ SmallestNonce, uin


for (int i = 0; i<8; i++)
(( ulong*)&DataTmp)[i] = lblakeFinal[i];
(( uint2*)&DataTmp)[i] = as_uint2(lblakeFinal[i]);

// uint8 part;

Expand All @@ -794,12 +840,12 @@ __global uint8 * GYlocal, __global uint32_t * __restrict__ SmallestNonce, uin
len += last ? 32 : 128;


__global const ulong2 * __restrict__ farP = (farIndex<half_memcost)? &DBlock[farIndex * 64 + 8 * i ]
: &DBlock2[(farIndex - half_memcost) * 64 + 8 * i];
__global const ulong8 * __restrict__ farP = (farIndex<half_memcost)? &DBlock[farIndex * 16 + 2 * i ]
: &DBlock2[(farIndex - half_memcost) * 16 + 2 * i];

#pragma unroll
for (int t = 0; t<8; t++)
(( ulong2*)DataChunk)[t] = (last) ? (ulong2)(0, 0) : farP[t];
for (int t = 0; t<2; t++)
(( ulong8*)DataChunk)[t] = (last) ? (ulong8)(0, 0,0,0,0,0,0,0) : farP[t];

// (( uint16*)DataChunk)[0].lo = YLocal;

Expand Down Expand Up @@ -1226,3 +1272,49 @@ __kernel void mtp_fc(uint32_t threads, __global uint4 * __restrict__ DBlock, _

}


/*
__host__ void mtp_i_cpu(int thr_id, uint32_t *block_header) {
cudaSetDevice(device_map[thr_id]);
cudaError_t err = cudaMemcpy(Header[thr_id], block_header, 8 * sizeof(uint32_t), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
printf("%s\n", cudaGetErrorName(err));
cudaDeviceReset();
exit(1);
}
uint32_t tpb = 32;
dim3 grid(4);
dim3 block(tpb);
mtp_i<0> << <grid, block >> >(HBlock[thr_id], Header[thr_id]);
cudaDeviceSynchronize();
mtp_i<1> << <grid, block >> >(HBlock[thr_id], Header[thr_id]);
cudaDeviceSynchronize();
mtp_i<2> << <grid, block >> >(HBlock[thr_id], Header[thr_id]);
cudaDeviceSynchronize();
mtp_i<3> << <grid, block >> >(HBlock[thr_id], Header[thr_id]);
cudaDeviceSynchronize();
tpb = 256;
dim3 grid2(1048576 * 4 / tpb);
dim3 block2(tpb);
mtp_fc << <grid2, block2 >> >(1048576 * 4, HBlock[thr_id], buffer_a[thr_id]);
cudaDeviceSynchronize();
}
__host__
void mtp_fill_1b(int thr_id, uint64_t *Block, uint32_t block_nr)
{
uint4 *Blockptr = &HBlock[thr_id][block_nr * 64];
cudaError_t err = cudaMemcpy(Blockptr, Block, 256 * sizeof(uint32_t), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
printf("%s\n", cudaGetErrorName(err));
cudaDeviceReset();
exit(1);
}
}
*/
Loading

0 comments on commit 55ca9de

Please sign in to comment.