diff --git a/transcoder/basisu_transcoder.cpp b/transcoder/basisu_transcoder.cpp
index 3aeba0ee..1899bb46 100644
--- a/transcoder/basisu_transcoder.cpp
+++ b/transcoder/basisu_transcoder.cpp
@@ -17,6 +17,21 @@
 #include <limits.h>
 #include "basisu_containers_impl.h"
 
+#define BASISD_ASTC_SIMD
+
+#if defined(BASISD_ASTC_SIMD)
+#if defined(__clang__)
+	#if defined(__arm__) || defined(__aarch64__)
+		#include <arm_neon.h>
+		#include <arm_acle.h>
+	#else
+		#include <immintrin.h>
+	#endif
+#else // !defined(__clang__)
+	#include <immintrin.h>
+#endif // !defined(__clang__)
+#endif // defined(BASISD_ASTC_SIMD)
+
 #ifndef BASISD_IS_BIG_ENDIAN
 // TODO: This doesn't work on OSX. How can this be so difficult?
 //#if defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN)
@@ -5101,6 +5116,7 @@ namespace basist
 		// First extract the trits and the bits from the 5 input values
 		int trits = 0, bits[5];
 		const uint32_t bit_mask = (1 << n) - 1;
+
 		for (int i = 0; i < 5; i++)
 		{
 			static const int s_muls[5] = { 1, 3, 9, 27, 81 };
@@ -9308,7 +9324,6 @@ namespace basist
 		for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y)
 		{
 			void* pDst_block = (uint8_t*)pDst_blocks + block_y * output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes;
-
 			for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t *)pDst_block + output_block_or_pixel_stride_in_bytes)
 			{
 				switch (fmt)
@@ -10288,7 +10303,7 @@ namespace basist
 		return true;
 	}
-
+	
 	bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size)
 	{
 		if (!validate_header_quick(pData, data_size))
@@ -10398,7 +10413,6 @@ namespace basist
 		}
 
 		m_ready_to_transcode = true;
-
 		return true;
 	}
 
@@ -11878,100 +11892,233 @@ namespace basist
 		pDst[2] |= temp[2];
 		pDst[3] |= temp[3];
 	}
 
-	const uint32_t ASTC_BLOCK_MODE_BITS = 11;
-	const uint32_t ASTC_PART_BITS = 2;
-	const uint32_t ASTC_CEM_BITS = 4;
-	const uint32_t ASTC_PARTITION_INDEX_BITS = 10;
-	const uint32_t ASTC_CCS_BITS = 2;
-
-	const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 };
-
-	bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode)
+	void pack_astc_block_weights( uint8_t* pDst_bytes, const uint8_t* pBlockWeights, int bits_per_weight, int blockNum )
 	{
-		assert(uastc_mode < TOTAL_UASTC_MODES);
-		uint8_t* pDst_bytes = reinterpret_cast<uint8_t*>(pDst);
-
-		const int total_weights = pBlock->m_dual_plane ? 32 : 16;
-
-		// Set mode bits - see Table 146-147
-		uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode];
-		pDst_bytes[0] = (uint8_t)mode;
-		pDst_bytes[1] = (uint8_t)(mode >> 8);
-
+		#if defined(BASISD_ASTC_SIMD)
+		#else // C
 		memset(pDst_bytes + 2, 0, 16 - 2);
+		#endif // C
 
-		int bit_pos = ASTC_BLOCK_MODE_BITS;
-
-		// We only support 1-5 bit weight indices
-		assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]);
-		const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0];
-
-		// See table 143 - PART
-		astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS);
-
-		if (pBlock->m_subsets == 1)
-			astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS);
-		else
-		{
-			// See table 145
-			astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS);
-
-			// Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM
-			astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2);
-		}
-
-		if (pBlock->m_dual_plane)
-		{
-			const int total_weight_bits = total_weights * bits_per_weight;
-
-			// See Illegal Encodings 23.24
-			// https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings
-			assert((total_weight_bits >= 24) && (total_weight_bits <= 96));
-
-			int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS;
-			astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS);
-		}
-
-		const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets;
-		assert(num_cem_pairs <= 9);
-
-		astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]);
-
-		// Write the weight bits in reverse bit order.
 		switch (bits_per_weight)
 		{
 		case 1:
 		{
-			const uint32_t N = 1;
-			for (int i = 0; i < total_weights; i++)
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			static const uint8_t s_reverse_bits1[16] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+			uint8x16_t revmask1 = vld1q_u8( s_reverse_bits1 );
+			uint8x8_t rev8;
+			if ( blockNum == 0 )
+			{
+				uint8x16_t src1 = vld1q_u8( pBlockWeights );   // load 16x1
+				uint8x16_t test1 = vceqq_u8( src1, vdupq_n_u8(0) );   // mask if x == 0
+				uint8x16_t rev1 = vandq_u8( revmask1, vmvnq_u8(test1) );   // final byte shifted value
+				uint8x8_t rev2 = vpadd_u8( vget_high_u8(rev1), vget_low_u8(rev1) );   // swap(hi,lo) & merge 2 bits
+				uint8x8_t rev4 = vpadd_u8( rev2, vdup_n_u8(0) );   // merge 4 bits
+				rev8 = vpadd_u8( rev4, vdup_n_u8(0) );   // merge 8 bits
+				rev8 = vext_u8( vdup_n_u8(0), rev8, 6 );   // shl64(6)
+			}
+			else
+			{
+				uint8x16_t src1_0 = vld1q_u8( pBlockWeights );   // load 16x1
+				uint8x16_t src1_1 = vld1q_u8( pBlockWeights + 16 );   // load 16x1
+				uint8x16_t test1_0 = vceqq_u8( src1_0, vdupq_n_u8(0) );   // mask if x == 0
+				uint8x16_t test1_1 = vceqq_u8( src1_1, vdupq_n_u8(0) );   // mask if x == 0
+				uint8x16_t rev1_0 = vandq_u8( revmask1, vmvnq_u8(test1_0) );   // final byte shifted value
+				uint8x16_t rev1_1 = vandq_u8( revmask1, vmvnq_u8(test1_1) );   // final byte shifted value
+				uint8x8_t rev2_0 = vpadd_u8( vget_high_u8(rev1_0), vget_low_u8(rev1_0) );   // swap(hi,lo) & merge 2 bits
+				uint8x8_t rev2_1 = vpadd_u8( vget_high_u8(rev1_1), vget_low_u8(rev1_1) );   // swap(hi,lo) & merge 2 bits
+				uint8x8_t rev4 = vpadd_u8( rev2_1, rev2_0 );   // swap(hi,lo) & merge 4 bits
+				rev8 = vpadd_u8( rev4, vdup_n_u8(0) );   // merge 8 bits
+				rev8 = vext_u8( vdup_n_u8(0), rev8, 4 );   // shl64(4)
+			}
+			vst1_u8( pDst_bytes + 8, rev8 );   // store [8,15]
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			static const uint8_t s_reverse_bits1[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+			__m128i bitsmask1 = _mm_loadu_si128( (const __m128i *) s_reverse_bits1 );
+			if ( blockNum == 0 )
+			{
+				__m128i src1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 );   // load 16x8
+				__m128i rev1 = _mm_shuffle_epi8( src1, bitsmask1 );   // byte reverse
+				uint32_t rev16 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1, 7 ) );   // move to high bit and get mask
+				*((uint32_t*)(pDst_bytes + 12)) = (rev16 << 16U);   // store [12,15]
+			}
+			else
+			{
+				__m128i src1_0 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 );   // load 16x8
+				__m128i src1_1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 1 );   // load 16x8
+				__m128i rev1_0 = _mm_shuffle_epi8( src1_0, bitsmask1 );   // byte reverse
+				__m128i rev1_1 = _mm_shuffle_epi8( src1_1, bitsmask1 );   // byte reverse
+				uint32_t rev16_0 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1_0, 7 ) );   // move to high bit and get mask
+				uint32_t rev16_1 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1_1, 7 ) );   // move to high bit and get mask
+				*((uint32_t*)(pDst_bytes + 12)) = (rev16_0 << 16U) | rev16_1;   // store [12,15]
+			}
+			*((uint32_t*)(pDst_bytes + 8)) = 0U;   // store [8,11]
+			*((uint64_t*)(pDst_bytes + 0)) = 0ULL;   // store [0,7]
+			#else
 			{
-				const uint32_t ofs = 128 - N - i;
-				assert((ofs >> 3) < 16);
-				pDst_bytes[ofs >> 3] |= (pBlock->m_weights[i] << (ofs & 7));
+				const uint32_t N = 1;
+				for (int i = 0; i < ((blockNum + 1) << 4); i++)
+				{
+					const uint32_t ofs = 128 - N - i;
+					assert((ofs >> 3) < 16);
+					pDst_bytes[ofs >> 3] |= (pBlockWeights[i] << (ofs & 7));
+				}
 			}
+			#endif
 			break;
 		}
 		case 2:
 		{
-			const uint32_t N = 2;
-			for (int i = 0; i < total_weights; i++)
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			if ( blockNum == 0 )
+			{
+				uint8x16_t src2 = vld1q_u8( pBlockWeights );
+				uint16x8_t src2hi = vshrq_n_u16( vreinterpretq_u16_u8(src2), 6 );   // shr16(8-2) to get in u16[2,3]
+				uint16x8_t src2lo = vandq_u16( vreinterpretq_u16_u8(src2), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x8_t src4lohi = vorrq_u16( src2hi, src2lo );   // 4bits in 8 u16[0,3]
+				uint8x8_t src4 = vqmovn_u16( src4lohi );   // 4bits in 8 u8[0,3]
+				uint16x4_t rev4hi = vshr_n_u16( vreinterpret_u16_u8(src4), 4 );   // shr16(8-4) to get in u16[4,7]
+				uint16x4_t rev4lo = vand_u16( vreinterpret_u16_u8(src4), vdup_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x4_t rev8lohi = vorr_u16( rev4hi, rev4lo );   // 8bits in 4 u16[0,7]
+				uint8x8_t rev8 = vqmovn_u16( vcombine_u16( rev8lohi, vdup_n_u16(0) ) );   // 8bits in 4 u8 (clear lower 32)
+				#if defined(__aarch64__)
+				uint8x8_t rev64 = vrev64_u8( vrbit_u8( rev8 ) );   // bit reverse
+				vst1_u8( pDst_bytes + 8, rev64 );   // store [8,15]
+				#else // !defined(__aarch64__)
+				uint32_t rev32 = vget_lane_u32( vreinterpret_u32_u8(rev8), 0 );   // get the 32 bits
+				rev32 = __rbit( rev32 );   // reverse
+				*((uint32_t*)(pDst_bytes + 12)) = rev32;   // store [12,15]
+				*((uint32_t*)(pDst_bytes + 8)) = 0U;   // store [8,11]
+				#endif // !defined(__aarch64__)
+			}
+			else
 			{
-				static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 };
-				const uint32_t ofs = 128 - N - (i * N);
-				assert((ofs >> 3) < 16);
-				pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlock->m_weights[i]] << (ofs & 7));
+				uint8x16_t src2_0 = vld1q_u8( pBlockWeights );
+				uint8x16_t src2_1 = vld1q_u8( pBlockWeights + 16 );
+				uint16x8_t src2hi_0 = vshrq_n_u16( vreinterpretq_u16_u8(src2_0), 6 );   // shr16(8-2) to get in u16[2,3]
+				uint16x8_t src2hi_1 = vshrq_n_u16( vreinterpretq_u16_u8(src2_1), 6 );   // shr16(8-2) to get in u16[2,3]
+				uint16x8_t src2lo_0 = vandq_u16( vreinterpretq_u16_u8(src2_0), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x8_t src2lo_1 = vandq_u16( vreinterpretq_u16_u8(src2_1), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x8_t src4lohi_0 = vorrq_u16( src2hi_0, src2lo_0 );   // 4bits in 8 u16[0,3]
+				uint16x8_t src4lohi_1 = vorrq_u16( src2hi_1, src2lo_1 );   // 4bits in 8 u16[0,3]
+				uint16x4_t src4_0 = vreinterpret_u16_u8( vqmovn_u16( src4lohi_0 ) );   // 4bits in 8 u8[0,3]
+				uint16x4_t src4_1 = vreinterpret_u16_u8( vqmovn_u16( src4lohi_1 ) );   // 4bits in 8 u8[0,3]
+				uint16x8_t src4 = vcombine_u16( src4_0, src4_1 );   // 4bits in 8 u16[0,3]
+				uint16x8_t rev4hi = vshrq_n_u16( src4, 4 );   // shr16(8-4) to get in u16[4,7]
+				uint16x8_t rev4lo = vandq_u16( src4, vdupq_n_u16(0x00FF) );   // have to mask remainder
+				uint16x8_t rev8lohi = vorrq_u16( rev4hi, rev4lo );   // 8bits in 8 u16[0,7]
+				uint8x8_t rev8 = vqmovn_u16( rev8lohi );   // 8bits in 8 u8[0,7]
+				#if defined(__aarch64__)
+				uint8x8_t rev64 = vrev64_u8( vrbit_u8( rev8 ) );   // bit reverse
+				vst1_u8( pDst_bytes + 8, rev64 );   // store result
+				#else // !defined(__aarch64__)
+				uint64_t rev64 = vget_lane_u64( vreinterpret_u64_u8(rev8), 0 );   // get the 64 bits
+				rev64 = __rbitll( rev64 );   // bit reverse
+				*((uint64_t*)(pDst_bytes + 8)) = rev64;   // store [8,15]
+				#endif // !defined(__aarch64__)
+			}
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			__m128i bitsmask2 = _mm_cvtsi32_si128( 0x03010200 );   // { 0, 2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+			if ( blockNum == 0 )
+			{
+				__m128i src8 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );
+				__m128i rev8 = _mm_shuffle_epi8( bitsmask2, src8 );   // bit reverse
+				__m128i rev8lo = _mm_slli_epi16( rev8, 10 );   // shl16(8+2) to get lo2 in u16[10,11]
+				__m128i rev8lohi = _mm_or_si128( rev8lo, rev8 );   // 4bits in 8 u16[8,11]
+				__m128i rev16lo = _mm_slli_epi32( rev8lohi, 20 );   // shl32(16+4) to get lo4 in u32[20,23]
+				__m128i rev16lohi = _mm_or_si128( rev8lohi, rev16lo );   // 8bits in 8 u32[16,23]
+				static const uint8_t s_reverse_bytes2[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0F, 0x0B, 0x07, 0x03 };
+				__m128i bytesmask2 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes2 );
+				__m128i rev32 = _mm_shuffle_epi8( rev16lohi, bytesmask2 );   // 32bits in u128[96,127]
+				_mm_store_si128( (__m128i*)pDst_bytes, rev32 );   // store [0,15]
+			}
+			else // assume only 2 blocks
+			{
+				__m128i src8_0 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 );
+				__m128i src8_1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 1 );
+				__m128i rev8_0 = _mm_shuffle_epi8( bitsmask2, src8_0 );   // bit reverse
+				__m128i rev8_1 = _mm_shuffle_epi8( bitsmask2, src8_1 );   // bit reverse
+				__m128i rev8lo_0 = _mm_slli_epi16( rev8_0, 10 );   // shl16(8+2) to get lo2 in u16[10,11]
+				__m128i rev8lo_1 = _mm_slli_epi16( rev8_1, 10 );   // shl16(8+2) to get lo2 in u16[10,11]
+				__m128i rev8lohi_0 = _mm_or_si128( rev8lo_0, rev8_0 );   // 4bits in 8 u16[8,11]
+				__m128i rev8lohi_1 = _mm_or_si128( rev8lo_1, rev8_1 );   // 4bits in 8 u16[8,11]
+				__m128i rev16lo_0 = _mm_slli_epi32( rev8lohi_0, 20 );   // shl32(16+4) to get lo4 in u32[20,23]
+				__m128i rev16lo_1 = _mm_slli_epi32( rev8lohi_1, 20 );   // shl32(16+4) to get lo4 in u32[20,23]
+				__m128i rev16lohi_0 = _mm_or_si128( rev8lohi_0, rev16lo_0 );   // 8bits in 8 u32[16,23]
+				__m128i rev16lohi_1 = _mm_or_si128( rev8lohi_1, rev16lo_1 );   // 8bits in 8 u32[16,23]
+				// try to avoid these 2 instructions
+				__m128i rev32_0 = _mm_srli_epi32( rev16lohi_0, 24 );   // srl32(24) to get 8bits in u32[0,7]
+				__m128i rev32_1 = _mm_srli_epi32( rev16lohi_1, 24 );   // srl32(24) to get 8bits in u32[0,7]
+				__m128i rev64 = _mm_packs_epi32( rev32_1, rev32_0 );   // 8bits in u16[0,7]
+				static const uint8_t s_reverse_bytes2[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x06, 0x04, 0x02, 0x00, 0x0E, 0x0C, 0x0A, 0x08 };
+				__m128i bytesmask2 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes2 );
+				__m128i dst = _mm_shuffle_epi8( rev64, bytesmask2 );   // 64bits high byte in reverse order
+				_mm_store_si128( (__m128i*)pDst_bytes, dst );   // store [0,15]
+			}
+			#else // C
+			{
+				const uint32_t N = 2;
+				for (int i = 0; i < ((blockNum + 1) << 4); i++)
+				{
+					static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 };
+					const uint32_t ofs = 128 - N - (i * N);
+					assert((ofs >> 3) < 16);
+					pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlockWeights[i]] << (ofs & 7));
+				}
 			}
+			#endif // C
 			break;
 		}
-		case 3:
+		case 3: // 295 => 285
 		{
+			assert(blockNum == 0);
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			uint8x16_t src3 = vld1q_u8( pBlockWeights );
+			uint16x8_t src3hi = vshrq_n_u16( vreinterpretq_u16_u8(src3), 5 );   // shr16(8-3) to get hi3 in u16[3,5]
+			uint16x8_t src3lo = vandq_u16( vreinterpretq_u16_u8(src3), vdupq_n_u16(0x007) );   // have to mask remainder to avoid bit collision
+			uint16x8_t src3lohi = vorrq_u16( src3hi, src3lo );   // 6bits in 8 u16[0,5]
+			uint8x8_t src6 = vqmovn_u16( src3lohi );   // 6bits in 8 u8[0,5]
+			uint16x4_t src6hi = vshr_n_u16( vreinterpret_u16_u8(src6), 2 );   // shr16(8-2) to get hi6 in u16[6,11]
+			uint16x4_t src12 = vbsl_u16( vdup_n_u16(0x003F), vreinterpret_u16_u8(src6), src6hi );   // 12bits in 4 u16[0,11]
+			uint32x2_t src12hi = vshr_n_u32( vreinterpret_u32_u16(src12), 4 );   // shr32(16-12) to get in hi12 u32[12,23]
+			uint32x2_t src24 = vbsl_u32( vdup_n_u32(0x0FFF), vreinterpret_u32_u16(src12), src12hi );   // 24bits in 2 u32[0,23]
+			uint32_t src24lo = vget_lane_u32( src24, 0 );   // lo 8.24
+			uint32_t src24hi = vget_lane_u32( src24, 1 );   // hi 8.24
+			uint32_t rev24hi = __rbit( src24lo );   // hi rev 24.8
+			uint32_t rev24lo = __rbit( src24hi );   // lo rev 24.8
+			uint32_t rev32hi = (rev24hi) | (rev24lo >> 24U);   // [32,63]
+			uint32_t rev32lo = (rev24lo << 8U);   // [0,31]
+			*((uint32_t*)(pDst_bytes + 12)) = rev32hi;   // store [12,15]
+			*((uint32_t*)(pDst_bytes + 8)) = rev32lo;   // store [8,11]
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			static const uint8_t s_reverse_bits3[16] = { 0, 4, 2, 6, 1, 5, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0 };
+			__m128i bitsmask3 = _mm_loadu_si128( (const __m128i *) s_reverse_bits3 );
+			__m128i src3 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );
+			__m128i rev3 = _mm_shuffle_epi8( bitsmask3, src3 );   // bit reverse
+			__m128i rev3lo = _mm_slli_epi16( rev3, 11 );   // shl16(8+3) to get lo3 in u16[11,13]
+			__m128i rev3hi = _mm_and_si128( rev3, _mm_set1_epi32(0x07000700) );   // have to mask remainder to avoid bit collision
+			__m128i rev6 = _mm_or_si128( rev3hi, rev3lo );   // 6bits in 8 u16[8,13] : BA,DC,FE,HG,JI,LK,NM,PO
+			__m128i rev6hi = _mm_srli_epi32( rev6, 24 );   // shr32(24) to get hi6[24,29] in u32[0,5]
+			__m128i rev6lo = _mm_srli_epi32( rev6, 2 );   // shr32(2) to get lo6[8,13] in u32[6,11]
+			__m128i rev12 = _mm_or_si128( rev6hi, rev6lo );   // 12bits in 4 u32[0,11] + garbage in u32[22,31]
+			__m128i rev24lo = _mm_slli_epi64( rev12, 44 );   // shl64(32+12) to get lo12 in u64[44,55]
+			__m128i rev24hi = _mm_and_si128( rev12, _mm_set1_epi32(0x0000FFFF) );   // clean garbage in u32[22,31]
+			__m128i rev24 = _mm_or_si128( rev24hi, rev24lo );   // 24bits in 2 u64[32,55]
+			static const uint8_t s_reverse_bytes3[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0C, 0x0D, 0x0E, 0x04, 0x05, 0x06 };
+			__m128i bytesmask3 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes3 );
+			__m128i rev48 = _mm_shuffle_epi8( rev24, bytesmask3 );   // 48bits in u128[80,127]
+			_mm_store_si128( (__m128i*)pDst_bytes, rev48 );   // store [0,15]
+			#else // C
 			const uint32_t N = 3;
-			for (int i = 0; i < total_weights; i++)
+			for (int i = 0; i < ((blockNum + 1) << 4); i++)
 			{
 				static const uint8_t s_reverse_bits3[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 				const uint32_t ofs = 128 - N - (i * N);
-				const uint32_t rev = s_reverse_bits3[pBlock->m_weights[i]] << (ofs & 7);
+				const uint32_t rev = s_reverse_bits3[pBlockWeights[i]] << (ofs & 7);
 				uint32_t index = ofs >> 3;
 				assert(index < 16);
 
@@ -11979,29 +12126,108 @@ namespace basist
 				if (index < 16)
 					pDst_bytes[index++] |= (rev >> 8);
 			}
+			#endif // C
 			break;
 		}
 		case 4:
 		{
-			const uint32_t N = 4;
-			for (int i = 0; i < total_weights; i++)
-			{
-				static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
-				const int ofs = 128 - N - (i * N);
-				assert(ofs >= 0 && (ofs >> 3) < 16);
-				pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlock->m_weights[i]] << (ofs & 7));
+			assert(blockNum == 0);
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			uint8x16_t src4 = vld1q_u8( pBlockWeights );
+			uint16x8_t src8hi = vshrq_n_u16( vreinterpretq_u16_u8(src4), 4 );   // shr16(8-4) to get hi4 in u16[4,7]
+			uint16x8_t src8lo = vandq_u16( vreinterpretq_u16_u8(src4), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+			uint16x8_t src8lohi = vorrq_u16( src8hi, src8lo );   // 8bits in 8 u16[0,7]
+//			uint16x8_t src8lohi = vbslq_u16( vdupq_n_u16(0x0F), vreinterpretq_u16_u8(src4), src8hi );   // slower than (and + or)
+			uint8x8_t src8 = vqmovn_u16( src8lohi );   // 8bits in 8 u8[0,7]
+			#if defined(__aarch64__)
+			uint8x8_t rev64 = vrev64_u8( vrbit_u8( src8 ) );   // bit reverse
+			vst1_u8( pDst_bytes + 8, rev64 );   // store [8,15]
+			#else // !defined(__aarch64__)
+			uint64_t rev64 = vget_lane_u64( vreinterpret_u64_u8(src8), 0 );   // get the 64 bits
+			rev64 = __rbitll( rev64 );   // bit reverse
+			*((uint64_t*)(pDst_bytes + 8)) = rev64;   // store [8,15]
+			#endif // !defined(__aarch64__)
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+			__m128i bitsmask4 = _mm_loadu_si128( (const __m128i *) s_reverse_bits4 );
+			__m128i src8 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );   // load 16x8
+			__m128i rev8 = _mm_shuffle_epi8( bitsmask4, src8 );   // bit reverse
+			__m128i rev8lo = _mm_slli_epi16( rev8, 12 );   // shl16(8+4) to get lo in u16[12,15]
+			__m128i rev8lohi = _mm_or_si128( rev8lo, rev8 );   // 8bits in 8 u16[8,15]
+			static const uint8_t s_reverse_bytes4[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 };
+			__m128i bytesmask4 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes4 );
+			__m128i rev64 = _mm_shuffle_epi8( rev8lohi, bytesmask4 );   // extract bytes in reverse order
+			_mm_store_si128( (__m128i*)pDst_bytes, rev64 );   // store [0,15]
+			#else // C
+			{
+				const uint32_t N = 4;
+				for (int i = 0; i < ((blockNum + 1) << 4); i++)
+				{
+					static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+					const int ofs = 128 - N - (i * N);
+					assert(ofs >= 0 && (ofs >> 3) < 16);
+					pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlockWeights[i]] << (ofs & 7));
+				}
 			}
+			#endif // C
 			break;
 		}
 		case 5:
 		{
+			assert(blockNum == 0);
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			uint8x16_t src5 = vld1q_u8( pBlockWeights );
+			uint16x8_t src5hi = vshrq_n_u16( vreinterpretq_u16_u8(src5), 3 );   // shiftRight 8-5 to get in u16[5,9]
+			uint16x8_t src10 = vbslq_u16( vdupq_n_u16(0x01F), vreinterpretq_u16_u8(src5), src5hi );   // 10bits in 8 u16[0,9]
+			uint32x4_t src10hi = vshrq_n_u32( vreinterpretq_u32_u16(src10), 6 );   // shiftRight 16-10 to get in u32[10,19]
+			uint32x4_t src20 = vbslq_u32( vdupq_n_u32(0x03FF), vreinterpretq_u32_u16(src10), src10hi );   // 20bits in 4 u32[0,19]
+			uint64x2_t src20hi = vshrq_n_u64( vreinterpretq_u64_u32(src20), 12 );   // shiftRight 32-20 to get in u64[20,39]
+			uint64x2_t src40 = vbslq_u64( vdupq_n_u64(0x0FFFFF), vreinterpretq_u64_u32(src20), src20hi );   // 40bits in 2 u64[0,39]
+			uint64x1_t src40hilo = vshl_n_u64( vget_high_u64(src40), 40 );   // [40..64]
+			uint64x1_t src40lo = vorr_u64( vget_low_u64(src40), src40hilo );   // [0..63]
+			uint64x1_t src40hi = vshr_n_u64( vget_high_u64(src40), 24 );   // [64..79]
+//			try to use vrev32_u8( vrbit_u8( rev8 ) )
+			uint32_t src80lo = vget_lane_u32( vreinterpret_u32_u64(src40lo), 0 );   // [0..31]
+			uint32_t src80mid = vget_lane_u32( vreinterpret_u32_u64(src40lo), 1 );   // [32..63]
+			uint32_t src80hi = vget_lane_u32( vreinterpret_u32_u64(src40hi), 0 );   // [64..79]
+			uint32_t rev80hi = __rbit( src80lo );   // [48..79]
+			uint32_t rev80mid = __rbit( src80mid );   // [16..47]
+			uint32_t rev80lo = __rbit( src80hi );   // [16][0..15]
+			*((uint32_t*)(pDst_bytes + 12)) = rev80hi;   // store [12,15]
+			*((uint32_t*)(pDst_bytes + 8)) = rev80mid;   // store [8,11]
+			*((uint32_t*)(pDst_bytes + 4)) = rev80lo;   // store [4,7]
+			*((uint32_t*)(pDst_bytes + 0)) = 0U;   // store [0,3]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			__m128i src5 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );   // load 16x8
+			__m128i src4 = _mm_and_si128( _mm_set1_epi32(0x0F0F0F0F), src5 );   // get 4 bits for shuffle
+			__m128i src54 = _mm_andnot_si128( _mm_set1_epi32(0x0F0F0F0F), src5 );   // keep 5th bit
+			static const uint8_t s_reverse_bits5[16] = { 0x00, 0x10, 0x08, 0x18, 0x04, 0x14, 0x0C, 0x1C, 0x02, 0x12, 0x0A, 0x1A, 0x06, 0x16, 0x0E, 0x1E };
+			__m128i bitsmask5 = _mm_loadu_si128( (const __m128i *) s_reverse_bits5 );
+			__m128i rev4 = _mm_shuffle_epi8( bitsmask5, src4 );   // bit reverse
+			__m128i rev0 = _mm_srli_epi16( src54, 4 );   // shr16(4) because shl8 doesn't exist => put bit[0] at right position
+			__m128i rev5 = _mm_or_si128( rev0, rev4 );   // 5bits in 16 u8[0,4]
+			__m128i rev5lo = _mm_slli_epi16( rev5, 8 );   // shl16(8) to get lo5[0,4] in u16[8,12]
+			__m128i rev5hi = _mm_srli_epi16( rev5, 5 );   // shr16(5) to get hi5[8,12] in u16[3,7]
+			__m128i rev10 = _mm_or_si128( rev5hi, rev5lo );   // 10bits in 8 u16[3,12]
+			__m128i rev10lo = _mm_slli_epi32( rev10, 13 );   // shl32(13) to get lo10[3,12] in u32[16,25]
+			__m128i rev10hi = _mm_srli_epi32( rev10, 13 );   // shr32(13) to get hi10[35,44] in u32[6,15]
+			__m128i rev20 = _mm_or_si128( rev10hi, rev10lo );   // 20bits in 4 u32[6,25]
+			__m128i rev20lo = _mm_slli_epi64( rev20, 14 );   // shl64(14) to get lo20[6,25] in u64[20,39] garbage in [52,63]
+			__m128i rev20hi = _mm_srli_epi64( rev20, 38 );   // shr64(38) to get hi20[38,57] in u64[0,19]
+			__m128i rev40 = _mm_or_si128( rev20hi, rev20lo );   // 40bits in 2 u64[0,39] garbage in [52,63]
+			static const uint8_t s_reverse_bytes5[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x00, 0x01, 0x02, 0x03, 0x04 };
+			__m128i bytesmask5 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes5 );
+			__m128i rev80 = _mm_shuffle_epi8( rev40, bytesmask5 );   // swizzle bytes to store (can do a single store if done first)
+			_mm_store_si128( (__m128i*)pDst_bytes, rev80 );   // store [0,15]
+			#else // C
 			const uint32_t N = 5;
-			for (int i = 0; i < total_weights; i++)
+			for (int i = 0; i < ((blockNum + 1) << 4); i++)
 			{
 				static const uint8_t s_reverse_bits5[32] = { 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31 };
 				const uint32_t ofs = 128 - N - (i * N);
-				const uint32_t rev = s_reverse_bits5[pBlock->m_weights[i]] << (ofs & 7);
+				const uint32_t rev = s_reverse_bits5[pBlockWeights[i]] << (ofs & 7);
 				uint32_t index = ofs >> 3;
 				assert(index < 16);
 
@@ -12009,13 +12235,75 @@ namespace basist
 				if (index < 16)
 					pDst_bytes[index++] |= (rev >> 8);
 			}
-
+			#endif // C
 			break;
 		}
 		default:
 			assert(0);
 			break;
 		}
+	}
+
+	const uint32_t ASTC_BLOCK_MODE_BITS = 11;
+	const uint32_t ASTC_PART_BITS = 2;
+	const uint32_t ASTC_CEM_BITS = 4;
+	const uint32_t ASTC_PARTITION_INDEX_BITS = 10;
+	const uint32_t ASTC_CCS_BITS = 2;
+
+	const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 };
+
+	bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode)
+	{
+		assert(uastc_mode < TOTAL_UASTC_MODES);
+		uint8_t* pDst_bytes = reinterpret_cast<uint8_t*>(pDst);
+
+		const int total_weights = pBlock->m_dual_plane ? 32 : 16;
+		const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0];
+
+		// will clear pDst_bytes[16]
+		pack_astc_block_weights( pDst_bytes, pBlock->m_weights, bits_per_weight, pBlock->m_dual_plane ? 1 : 0 );
+
+		// Set mode bits - see Table 146-147
+		uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode];
+		pDst_bytes[0] = (uint8_t)mode;
+		pDst_bytes[1] = (uint8_t)(mode >> 8);
+
+		int bit_pos = ASTC_BLOCK_MODE_BITS;
+
+		// We only support 1-5 bit weight indices
+		assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]);
+
+		// See table 143 - PART
+		astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS);
+
+		if (pBlock->m_subsets == 1)
+			astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS);
+		else
+		{
+			// See table 145
+			astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS);
+
+			// Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM
+			astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2);
+		}
+
+		if (pBlock->m_dual_plane)
+		{
+			const int total_weight_bits = total_weights * bits_per_weight;
+
+			// See Illegal Encodings 23.24
+			// https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings
+			assert((total_weight_bits >= 24) && (total_weight_bits <= 96));
+
+			int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS;
+			astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS);
+		}
+
+		const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets;
+		assert(num_cem_pairs <= 9);
+
+		astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]);
+
 		return true;
 	}
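
A minimal scalar cross-check for the new pack_astc_block_weights() paths is sketched below; it is not part of the patch. It assumes the function can be forward-declared from the basist namespace and linked against basisu_transcoder.cpp (it is an internal function, so that declaration is an assumption, not an existing header API), and that the build selects whichever BASISD_ASTC_SIMD branch is being tested. The reference packer mirrors the C fallback above: reverse each weight's bits, then OR the weight into the 128-bit block downward from bit 127. The destination buffer is aligned to 16 bytes because the SSSE3 paths use _mm_store_si128.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Assumed forward declaration of the internal packer defined in basisu_transcoder.cpp.
namespace basist { void pack_astc_block_weights(uint8_t* pDst_bytes, const uint8_t* pBlockWeights, int bits_per_weight, int blockNum); }

// Scalar reference: reverse each weight's bits, OR it in from bit 127 downward (same math as the C fallback).
static void ref_pack_weights(uint8_t* pDst, const uint8_t* pW, int bits, int blockNum)
{
	memset(pDst, 0, 16);
	const int total_weights = (blockNum + 1) * 16; // 16 weights, or 32 for dual-plane
	for (int i = 0; i < total_weights; i++)
	{
		uint32_t rev = 0;
		for (int b = 0; b < bits; b++)
			rev |= ((pW[i] >> b) & 1U) << (bits - 1 - b);
		const int ofs = 128 - bits - (i * bits);
		const uint32_t shifted = rev << (ofs & 7);
		pDst[ofs >> 3] |= (uint8_t)shifted;
		if ((ofs >> 3) + 1 < 16)
			pDst[(ofs >> 3) + 1] |= (uint8_t)(shifted >> 8); // spill into the next byte for 3/5-bit weights
	}
}

int main()
{
	for (int bits = 1; bits <= 5; bits++)
	{
		// Dual-plane (blockNum == 1) is only exercised for 1/2-bit weights, matching the asserts above.
		for (int blockNum = 0; blockNum <= ((bits <= 2) ? 1 : 0); blockNum++)
		{
			uint8_t w[32], expected[16];
			alignas(16) uint8_t actual[16];
			for (int i = 0; i < 32; i++)
				w[i] = (uint8_t)(rand() & ((1 << bits) - 1));
			ref_pack_weights(expected, w, bits, blockNum);
			memset(actual, 0, 16); // the C fallback leaves bytes [0,1] to the caller
			basist::pack_astc_block_weights(actual, w, bits, blockNum);
			if (memcmp(expected, actual, 16) != 0)
				printf("mismatch: bits=%d blockNum=%d\n", bits, blockNum);
		}
	}
	printf("done\n");
	return 0;
}

Running this against both a scalar build and a NEON or SSSE3 build should report no mismatches for any of the weight widths the patch handles.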