Skip to content

Commit

Permalink
Split hw (re)config between unpack/math threads
Browse files Browse the repository at this point in the history
  • Loading branch information
rdjogoTT committed Sep 20, 2024
1 parent d7a12ee commit f4e986d
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 19 deletions.
8 changes: 2 additions & 6 deletions common/inc/cunpack_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,7 @@ namespace ckernel::unpacker
((uint)unpA_dst_format_masked == (uint)DataFormat::Int32) ||
((uint)unpB_dst_format_masked == (uint)DataFormat::Int32);

constexpr uint alu_format_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG1_SrcB_MASK |
ALU_FORMAT_SPEC_REG0_SrcAUnsigned_MASK | ALU_FORMAT_SPEC_REG0_SrcBUnsigned_MASK;
alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcA = unpA_dst_format_masked;
alu_payload.f.ALU_FORMAT_SPEC_REG1_SrcB = row_pool ? ((uint) DataFormat::Float16 | (exp_width<<2)) : unpB_dst_format_masked;
constexpr uint alu_format_mask = ALU_FORMAT_SPEC_REG0_SrcAUnsigned_MASK | ALU_FORMAT_SPEC_REG0_SrcBUnsigned_MASK;

if ((uint)unpA_src_format == (uint)DataFormat::UInt8) {
alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcAUnsigned = 1;
Expand All @@ -257,10 +254,9 @@ namespace ckernel::unpacker
// NOTE: This assumes these config fields are adjacent and in same register!!
static_assert(ALU_ACC_CTRL_Fp32_enabled_ADDR32 == ALU_FORMAT_SPEC_REG0_SrcA_ADDR32);
static_assert(ALU_ACC_CTRL_Fp32_enabled_ADDR32 == ALU_ACC_CTRL_SFPU_Fp32_enabled_ADDR32);
constexpr uint alu_dest_format_mask = ALU_ACC_CTRL_INT8_math_enabled_MASK | ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK | ALU_ACC_CTRL_Fp32_enabled_MASK;
constexpr uint alu_dest_format_mask = ALU_ACC_CTRL_SFPU_Fp32_enabled_MASK | ALU_ACC_CTRL_Fp32_enabled_MASK;
alu_payload.f.ALU_ACC_CTRL_Fp32_enabled = fp32_dest_acc_en;
alu_payload.f.ALU_ACC_CTRL_SFPU_Fp32_enabled = fp32_dest_acc_en;
alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = int8_math_enabled;
constexpr uint alu_stoch_rnd_mask = ALU_ROUNDING_MODE_Fpu_srnd_en_MASK | ALU_ROUNDING_MODE_Gasket_srnd_en_MASK | ALU_ROUNDING_MODE_Packer_srnd_en_MASK;
alu_payload.f.ALU_ROUNDING_MODE_Fpu_srnd_en = fpu_srnd_en;
alu_payload.f.ALU_ROUNDING_MODE_Gasket_srnd_en = pack_srnd_en;
Expand Down
29 changes: 26 additions & 3 deletions llk_lib/llk_math_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@

using namespace ckernel::math;

template <bool untilize_en>
inline void _llk_math_hw_configure() {
template <bool untilize_en = false, bool skip_inputs = false>
inline void _llk_math_hw_configure_(const std::uint32_t srca_data_format, const std::uint32_t srcb_data_format) {
//Untilize mode needs dest read access with a stride of 16
//Following bits are needed for enabling stride of 16
cfg_reg_rmw_tensix<DEST_ACCESS_CFG_remap_addrs_RMW>(untilize_en);
Expand All @@ -24,6 +24,14 @@ inline void _llk_math_hw_configure() {
// Legacy mode for ZEROACC
cfg_reg_rmw_tensix<DEST_ACCESS_CFG_zeroacc_absolute_tile_mode_RMW>(1);

if constexpr (skip_inputs == false){
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::MATH);
uint int8_math_enabled = ((uint)(srca_data_format & 0xF) == (uint)DataFormat::Int8) ||
((uint)(srcb_data_format & 0xF) == (uint)DataFormat::Int8) ||
((uint)srca_data_format == (uint)DataFormat::Int32) ||
((uint)srcb_data_format == (uint)DataFormat::Int32);
cfg_reg_rmw_tensix<ALU_ACC_CTRL_INT8_math_enabled_RMW>(int8_math_enabled);
}
}

template <DstSync Dst>
Expand Down Expand Up @@ -122,12 +130,27 @@ inline void _llk_math_debug_dump_seek_(std::uint8_t offset) {
debug_dump_seek(offset);
}

//Following functions not needed for blackhole since ALU format is inferred
// Following functions do not need to program ALU_FORMAT_SPEC_REG0_SrcA/ALU_FORMAT_SPEC_REG1_SrcB
// for blackhole since ALU format is inferred
inline void _llk_math_reconfig_data_format_srca_(const std::uint32_t srca_data_format) {
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::MATH);
uint int8_math_enabled = ((uint)(srca_data_format & 0xF) == (uint)DataFormat::Int8) ||
((uint)srca_data_format == (uint)DataFormat::Int32);
cfg_reg_rmw_tensix<ALU_ACC_CTRL_INT8_math_enabled_RMW>(int8_math_enabled);
}

inline void _llk_math_reconfig_data_format_srcb_(const std::uint32_t srcb_data_format) {
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::MATH);
uint int8_math_enabled = ((uint)(srcb_data_format & 0xF) == (uint)DataFormat::Int8) ||
((uint)srcb_data_format == (uint)DataFormat::Int32);
cfg_reg_rmw_tensix<ALU_ACC_CTRL_INT8_math_enabled_RMW>(int8_math_enabled);
}

inline void _llk_math_reconfig_data_format_(const std::uint32_t srca_data_format, const std::uint32_t srcb_data_format) {
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::MATH);
uint int8_math_enabled = ((uint)(srca_data_format & 0xF) == (uint)DataFormat::Int8) ||
((uint)(srcb_data_format & 0xF) == (uint)DataFormat::Int8) ||
((uint)srca_data_format == (uint)DataFormat::Int32) ||
((uint)srcb_data_format == (uint)DataFormat::Int32);
cfg_reg_rmw_tensix<ALU_ACC_CTRL_INT8_math_enabled_RMW>(int8_math_enabled);
}
14 changes: 4 additions & 10 deletions llk_lib/llk_unpack_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,23 +82,17 @@ inline void _llk_unpack_config_tile_dim_srcb_impl_(const std::uint32_t face_r_di

// Reconfigures the source-A unpacker settings for a new data format.
//   unpack_src_format: compared against DataFormat::UInt8 to drive the SrcAUnsigned
//                      flag, and written to THCON_SEC0_REG0_TileDescriptor under mask 0x0f.
//   unpack_dst_format: written to THCON_SEC0_REG2_Out_data_format.
//   tile_size:         its lower halfword is loaded into the p_gpr_unpack::TILE_SIZE_A GPR.
//
// NOTE(review): this listing comes from a commit view rendered without +/- markers.
// The alu_payload section (down to the first cfg_reg_rmw_tensix call) looks like the
// pre-change code that the commit removes, superseded by the TTI_STALLWAIT plus the
// single-field SrcAUnsigned write below -- confirm against the actual header.
inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size)
{
// Build one payload carrying the SrcA format, SrcA unsigned flag and INT8-math flag.
alu_config_u alu_payload = {.val = 0};
alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcA = unpack_dst_format;
if ((uint)unpack_src_format == (uint)DataFormat::UInt8) {
alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcAUnsigned = 1;
}
// INT8 math is flagged when the low nibble of the dst format is Int8 or the dst format is Int32.
alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = ((uint)(unpack_dst_format & 0xF) == (uint)DataFormat::Int8) ||
((uint)unpack_dst_format == (uint)DataFormat::Int32);
// Restrict the write to the three fields set above.
constexpr uint alu_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG0_SrcAUnsigned_MASK | ALU_ACC_CTRL_INT8_math_enabled_MASK;
cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcA_ADDR32, 0, alu_mask>(alu_payload.val);

// NOTE(review): presumably holds the following config writes until unpacker 0 is idle -- confirm.
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK0);
// SrcAUnsigned is written explicitly as 1 or 0, so a previous UInt8 setting is cleared.
cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcAUnsigned_RMW>(((uint)unpack_src_format == (uint)DataFormat::UInt8) ? 1 : 0);
cfg_reg_rmw_tensix<THCON_SEC0_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format);
cfg_reg_rmw_tensix<THCON_SEC0_REG2_Out_data_format_RMW>(unpack_dst_format);
TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A
}

inline void _llk_unpack_reconfig_data_format_srcb_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size)
{
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::UNPACK1);
cfg_reg_rmw_tensix<ALU_FORMAT_SPEC_REG0_SrcBUnsigned_RMW>(((uint)unpack_src_format == (uint)DataFormat::UInt8) ? 1 : 0);
cfg_reg_rmw_tensix<THCON_SEC1_REG0_TileDescriptor_ADDR32, 0, 0x0f>(unpack_src_format);
cfg_reg_rmw_tensix<THCON_SEC1_REG2_Out_data_format_RMW>(unpack_dst_format);
TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_B)); // update gpr which holds tile size B
Expand Down

0 comments on commit f4e986d

Please sign in to comment.