From b781af6136de226a89f416641692b7084d72755b Mon Sep 17 00:00:00 2001
From: "ronan.bel"
Date: Thu, 12 May 2022 16:50:37 +0200
Subject: [PATCH] ASTC weights SIMD encoding

ssse3 (i5 6300): 163 => 136 ms
arm (A53): 340 => 282 ms

I moved the block weight transform code into a single function: pack_astc_block_weights.
You can enable/disable the SIMD code with the BASISD_ASTC_SIMD define.
All the SIMD code is annotated.

Tested x86_64 on Windows, compiled with VS2019 and clang 11.
Tested arm & arm64 on Android, compiled with the latest NDK (clang 11).

If needed, you can get in touch at: ronan.bel@gmail.com / ronan.bel@ubisoft.com

Fix previous issue + optimize unpack.
---
 transcoder/basisu_transcoder.cpp | 1061 ++++++++++++++++++++++++++----
 1 file changed, 918 insertions(+), 143 deletions(-)

diff --git a/transcoder/basisu_transcoder.cpp b/transcoder/basisu_transcoder.cpp
index 3aeba0ee..633a1322 100644
--- a/transcoder/basisu_transcoder.cpp
+++ b/transcoder/basisu_transcoder.cpp
@@ -37,6 +37,32 @@
 #endif
 #endif
 
+#if defined(__arm__) || defined(__aarch64__) || defined(__SSSE3__)
+#define BASISD_ASTC_SIMD // 287 => 240
+#define BASISD_UASTC_SIMD // 240 => 222 (231 without BASISD_USE_UNALIGNED_WORD_READS)
+#define BASISD_ASTC_24WRITE // 200 !!!
+
+#if defined(BASISD_ASTC_SIMD) || defined(BASISD_UASTC_SIMD)
+#undef BASISD_USE_UNALIGNED_WORD_READS
+#define BASISD_USE_UNALIGNED_WORD_READS (1)
+#if defined(_MSC_VER)
+   #include <intrin.h>
+   #define BREAK_EXECUTION() __debugbreak();
+   #define CLASSALIGN(n) __declspec(align(n))
+#else // !defined(_MSC_VER)
+   #if defined(__arm__) || defined(__aarch64__)
+      #include <arm_neon.h>
+      #include <arm_acle.h>
+   #else
+      #include <tmmintrin.h>
+   #endif
+   #define BREAK_EXECUTION() __builtin_trap();
+   #define CLASSALIGN(n) __attribute__ ((aligned(n)))
+#endif // !defined(_MSC_VER)
+#endif // defined(BASISD_ASTC_SIMD)
+
+#endif // defined(__arm__) || defined(__aarch64__) || defined(__SSSE3__)
+
 // Using unaligned loads and stores causes errors when using UBSan. Jam it off.
 #if defined(__has_feature)
 #if __has_feature(undefined_behavior_sanitizer)
@@ -5073,13 +5099,28 @@ namespace basist
       191, 223, 124, 125, 126
    };
 
    // Extracts bits [low,high]
+// ronan
+#if 0
    static inline uint32_t astc_extract_bits(uint32_t bits, int low, int high)
    {
       return (bits >> low) & ((1 << (high - low + 1)) - 1);
    }
+#else
+   template <int low, int high>
+   inline uint32_t astc_extract_bits(uint32_t bits) { return (bits >> low) & ((1 << (high - low + 1)) - 1); }
+#endif
 
-   // Writes bits to output in an endian safe way
    static inline void astc_set_bits(uint32_t* pOutput, int& bit_pos, uint32_t value, uint32_t total_bits)
+#if defined(BASISD_ASTC_24WRITE)
+   {
+      assert( total_bits <= 24U );
+      uint8_t* pBytes = reinterpret_cast<uint8_t*>(pOutput);
+      pBytes += (bit_pos >> 3);
+      *((uint32_t*)pBytes) |= (value << (bit_pos & 7));
+      bit_pos += total_bits;
+   }
+#else
+   // Writes bits to output in an endian safe way
    {
      uint8_t* pBytes = reinterpret_cast<uint8_t*>(pOutput);
@@ -5094,13 +5135,14 @@ namespace basist
         value >>= bits_to_write;
      }
    }
-
+#endif
    // Encodes 5 values to output, usable for any range that uses trits and bits
    static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, int& bit_pos, int n)
    {
      // First extract the trits and the bits from the 5 input values
      int trits = 0, bits[5];
      const uint32_t bit_mask = (1 << n) - 1;
+
      for (int i = 0; i < 5; i++)
      {
         static const int s_muls[5] = { 1, 3, 9, 27, 81 };
@@ -5118,10 +5160,10 @@ namespace basist
      const int T = g_astc_trit_encode[trits];
 
      // Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94.
- astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 1) << n) | (bits[1] << (2 + n)), n * 2 + 2); + astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits<0,1>(T) << n) | (bits[1] << (2 + n)), n * 2 + 2); - astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) | - (bits[4] << (5 + n * 2)) | (astc_extract_bits(T, 7, 7) << (5 + n * 3)), n * 3 + 6); + astc_set_bits(pOutput, bit_pos, astc_extract_bits<2,3>(T) | (bits[2] << 2) | (astc_extract_bits<4,4>(T) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits<5,6>(T) << (3 + n * 2)) | + (bits[4] << (5 + n * 2)) | (astc_extract_bits<7,7>(T) << (5 + n * 3)), n * 3 + 6); } #endif // #if BASISD_SUPPORT_UASTC || BASISD_SUPPORT_ASTC @@ -9308,7 +9350,6 @@ namespace basist for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y) { void* pDst_block = (uint8_t*)pDst_blocks + block_y * output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes; - for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t *)pDst_block + output_block_or_pixel_stride_in_bytes) { switch (fmt) @@ -10288,7 +10329,7 @@ namespace basist return true; } - + bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size) { if (!validate_header_quick(pData, data_size)) @@ -10398,7 +10439,6 @@ namespace basist } m_ready_to_transcode = true; - return true; } @@ -11832,8 +11872,8 @@ namespace basist const int T = g_astc_quint_encode[quints]; // Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96. - astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) | - (bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3); + astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits<0,2>(T) << n) | (bits[1] << (3 + n)) | (astc_extract_bits<3,4>(T) << (3 + n * 2)) | + (bits[2] << (5 + n * 2)) | (astc_extract_bits<5,6>(T) << (5 + n * 3)), 7 + n * 3); } // Packs values using ASTC's BISE to output buffer. @@ -11878,100 +11918,233 @@ namespace basist pDst[2] |= temp[2]; pDst[3] |= temp[3]; } - const uint32_t ASTC_BLOCK_MODE_BITS = 11; - const uint32_t ASTC_PART_BITS = 2; - const uint32_t ASTC_CEM_BITS = 4; - const uint32_t ASTC_PARTITION_INDEX_BITS = 10; - const uint32_t ASTC_CCS_BITS = 2; - - const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 }; - - bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode) + void pack_astc_block_weights( uint8_t* pDst_bytes, const uint8_t* pBlockWeights, int bits_per_weight, int blockNum ) { - assert(uastc_mode < TOTAL_UASTC_MODES); - uint8_t* pDst_bytes = reinterpret_cast(pDst); - - const int total_weights = pBlock->m_dual_plane ? 
32 : 16; - - // Set mode bits - see Table 146-147 - uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode]; - pDst_bytes[0] = (uint8_t)mode; - pDst_bytes[1] = (uint8_t)(mode >> 8); - + #if defined(BASISD_ASTC_SIMD) + #else // C memset(pDst_bytes + 2, 0, 16 - 2); + #endif // C - int bit_pos = ASTC_BLOCK_MODE_BITS; - - // We only support 1-5 bit weight indices - assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]); - const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0]; - - // See table 143 - PART - astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS); - - if (pBlock->m_subsets == 1) - astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS); - else - { - // See table 145 - astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS); - - // Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM - astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2); - } - - if (pBlock->m_dual_plane) - { - const int total_weight_bits = total_weights * bits_per_weight; - - // See Illegal Encodings 23.24 - // https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings - assert((total_weight_bits >= 24) && (total_weight_bits <= 96)); - - int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS; - astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS); - } - - const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets; - assert(num_cem_pairs <= 9); - - astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]); - - // Write the weight bits in reverse bit order. 
switch (bits_per_weight) { case 1: { - const uint32_t N = 1; - for (int i = 0; i < total_weights; i++) + #if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) ) + static const uint8_t CLASSALIGN(16) s_reverse_bits1[16] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; + uint8x16_t revmask1 = vld1q_u8( s_reverse_bits1 ); + uint8x8_t rev8; + if ( blockNum == 0 ) + { + uint8x16_t src1 = vld1q_u8( pBlockWeights ); // load 16x1 + uint8x16_t test1 = vceqq_u8( src1, vdupq_n_u8(0) ); // mask if x == 0 + uint8x16_t rev1 = vandq_u8( revmask1, vmvnq_u8(test1) ); // final byte shifted value + uint8x8_t rev2 = vpadd_u8( vget_high_u8(rev1), vget_low_u8(rev1) ); // swap(hi,lo) & merge 2 bits + uint8x8_t rev4 = vpadd_u8( rev2, vdup_n_u8(0) ); // merge 4 bits + rev8 = vpadd_u8( rev4, vdup_n_u8(0) ); // merge 8 bits + rev8 = vext_u8( vdup_n_u8(0), rev8, 6 ); // shl64(6) + } + else + { + uint8x16_t src1_0 = vld1q_u8( pBlockWeights ); // load 16x1 + uint8x16_t src1_1 = vld1q_u8( pBlockWeights + 16 ); // load 16x1 + uint8x16_t test1_0 = vceqq_u8( src1_0, vdupq_n_u8(0) ); // mask if x == 0 + uint8x16_t test1_1 = vceqq_u8( src1_1, vdupq_n_u8(0) ); // mask if x == 0 + uint8x16_t rev1_0 = vandq_u8( revmask1, vmvnq_u8(test1_0) ); // final byte shifted value + uint8x16_t rev1_1 = vandq_u8( revmask1, vmvnq_u8(test1_1) ); // final byte shifted value + uint8x8_t rev2_0 = vpadd_u8( vget_high_u8(rev1_0), vget_low_u8(rev1_0) ); // swap(hi,lo) & merge 2 bits + uint8x8_t rev2_1 = vpadd_u8( vget_high_u8(rev1_1), vget_low_u8(rev1_1) ); // swap(hi,lo) & merge 2 bits + uint8x8_t rev4 = vpadd_u8( rev2_1, rev2_0 ); // swap(hi,lo) & merge 4 buts + rev8 = vpadd_u8( rev4, vdup_n_u8(0) ); // merge 8 bits + rev8 = vext_u8( vdup_n_u8(0), rev8, 4 ); // shl64(4) + } + vst1_u8( pDst_bytes + 8, rev8 ); // store [8,15] + vst1_u8( pDst_bytes + 0, vdup_n_u8(0) ); // store [0,8] + #elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__) + static const uint8_t CLASSALIGN(16) s_reverse_bits1[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + __m128i bitsmask1 = _mm_load_si128( (const __m128i *) s_reverse_bits1 ); + if ( blockNum == 0 ) + { + __m128i src1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 ); // load 16x8 + __m128i rev1 = _mm_shuffle_epi8( src1, bitsmask1 ); // byte reverse + uint32_t rev16 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1, 7 ) ); // move to high bit and get mask + *((uint32_t*)(pDst_bytes + 12)) = (rev16 << 16U); // store [12,15] + } + else { - const uint32_t ofs = 128 - N - i; - assert((ofs >> 3) < 16); - pDst_bytes[ofs >> 3] |= (pBlock->m_weights[i] << (ofs & 7)); + __m128i src1_0 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 ); // load 16x8 + __m128i src1_1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 1 ); // load 16x8 + __m128i rev1_0 = _mm_shuffle_epi8( src1_0, bitsmask1 ); // byte reverse + __m128i rev1_1 = _mm_shuffle_epi8( src1_1, bitsmask1 ); // byte reverse + uint32_t rev16_0 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1_0, 7 ) ); // move to high bit and get mask + uint32_t rev16_1 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1_1, 7 ) ); // move to high bit and get mask + *((uint32_t*)(pDst_bytes + 12)) = (rev16_0 << 16U) | rev16_1; // store [12,15] } + *((uint32_t*)(pDst_bytes + 8)) = 0U; // store [8,11] + *((uint64_t*)(pDst_bytes + 0)) = 0ULL; // store [0,7] + #else + { + const uint32_t N = 1; + for (int i = 0; i < ((blockNum + 1) << 4); i++) + { + const uint32_t ofs = 128 - N 
- i; + assert((ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (pBlockWeights[i] << (ofs & 7)); + } + } + #endif break; } case 2: { - const uint32_t N = 2; - for (int i = 0; i < total_weights; i++) + #if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) ) + if ( blockNum == 0 ) + { + uint8x16_t src2 = vld1q_u8( pBlockWeights ); + uint16x8_t src2hi = vshrq_n_u16( vreinterpretq_u16_u8(src2), 6 ); // shr16(8-2) to get in u16[2,3] + uint16x8_t src2lo = vandq_u16( vreinterpretq_u16_u8(src2), vdupq_n_u16(0x00FF) ); // have to mask remainder to avoid bit collision + uint16x8_t src4lohi = vorrq_u16( src2hi, src2lo ); // 4bits in 8 u16[0,3] + uint8x8_t src4 = vqmovn_u16( src4lohi ); // 4bits in 8 u8[0,3] + uint16x4_t rev4hi = vshr_n_u16( vreinterpret_u16_u8(src4), 4 ); // shr16(8-4) to get in u16[4,7] + uint16x4_t rev4lo = vand_u16( vreinterpret_u16_u8(src4), vdup_n_u16(0x00FF) ); // have to mask remainder to avoid bit collision + uint16x4_t rev8lohi = vorr_u16( rev4hi, rev4lo ); // 8bits in 4 u16[0,7] + uint8x8_t rev8 = vqmovn_u16( vcombine_u16( rev8lohi, vdup_n_u8(0) ) ); // 8bits in 4 u8 (clear lower 32) + #if defined(__aarch64__) + uint8x8_t rev64 = vrev64_u8( vrbit_u8( rev8 ) ); // bit reverse + vst1_u8( pDst_bytes + 8, rev64 ); // store [8,15] + #else // !defined(__aarch64__) + uint32_t rev32 = vget_lane_u32( vreinterpret_u32_u8(rev8), 0 ); // get the 32 bits + rev32 = __rbit( rev32 ); // reverse + *((uint32_t*)(pDst_bytes + 12)) = rev32; // store [12,15] + *((uint32_t*)(pDst_bytes + 8)) = 0U; // store [8,11] + #endif // !defined(__aarch64__) + } + else { - static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 }; - const uint32_t ofs = 128 - N - (i * N); - assert((ofs >> 3) < 16); - pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlock->m_weights[i]] << (ofs & 7)); + uint8x16_t src2_0 = vld1q_u8( pBlockWeights ); + uint8x16_t src2_1 = vld1q_u8( pBlockWeights + 16 ); + uint16x8_t src2hi_0 = vshrq_n_u16( vreinterpretq_u16_u8(src2_0), 6 ); // shr16(8-2) to get in u16[2,3] + uint16x8_t src2hi_1 = vshrq_n_u16( vreinterpretq_u16_u8(src2_1), 6 ); // shr16(8-2) to get in u16[2,3] + uint16x8_t src2lo_0 = vandq_u16( vreinterpretq_u16_u8(src2_0), vdupq_n_u16(0x00FF) ); // have to mask remainder to avoid bit collision + uint16x8_t src2lo_1 = vandq_u16( vreinterpretq_u16_u8(src2_1), vdupq_n_u16(0x00FF) ); // have to mask remainder to avoid bit collision + uint16x8_t src4lohi_0 = vorrq_u16( src2hi_0, src2lo_0 ); // 4bits in 8 u16[0,3] + uint16x8_t src4lohi_1 = vorrq_u16( src2hi_1, src2lo_1 ); // 4bits in 8 u16[0,3] + uint16x4_t src4_0 = vreinterpret_u16_u8( vqmovn_u16( src4lohi_0 ) ); // 4bits in 8 u8[0,3] + uint16x4_t src4_1 = vreinterpret_u16_u8( vqmovn_u16( src4lohi_1 ) ); // 4bits in 8 u8[0,3] + uint16x8_t src4 = vcombine_u16( src4_0, src4_1 ); // 4bits in 8 u16[0,3] + uint16x8_t rev4hi = vshrq_n_u16( src4, 4 ); // shr16(8-4) to get in u16[4,7] + uint16x8_t rev4lo = vandq_u16( src4, vdupq_n_u16(0x00FF) ); // have to mask remainder + uint16x8_t rev8lohi = vorrq_u16( rev4hi, rev4lo ); // 8bits in 8 u16[0,7] + uint8x8_t rev8 = vqmovn_u16( rev8lohi ); // 8bits in 8 u8[0,7] + #if defined(__aarch64__) + uint8x8_t rev64 = vrev64_u8( vrbit_u8( rev8 ) ); // bit reverse + vst1_u8( pDst_bytes + 8, rev64 ); // store result + #else // !defined(__aarch64__) + uint64_t rev64 = vget_lane_u64( vreinterpret_u64_u8(rev8), 0 ); // get the 64 bits + rev64 = __rbitll( rev64 ); // bit reverse + *((uint64_t*)(pDst_bytes + 8)) = rev64; // store [8,15] + #endif // !defined(__aarch64__) + } + vst1_u8( pDst_bytes + 0, 
vdup_n_u8(0) ); // store [0,7] + #elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__) + __m128i bitsmask2 = _mm_cvtsi32_si128( 0x03010200 ); // { 0, 2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + if ( blockNum == 0 ) + { + __m128i src8 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) ); + __m128i rev8 = _mm_shuffle_epi8( bitsmask2, src8 ); // bit reverse + __m128i rev8lo = _mm_slli_epi16( rev8, 10 ); // shl16(8+2) to get lo2 in u16[10,11] + __m128i rev8lohi = _mm_or_si128( rev8lo, rev8 ); // 4bits in 8 u16[8,11] + __m128i rev16lo = _mm_slli_epi32( rev8lohi, 20 ); // shl32(16+4) to get lo4 in u32[20,23] + __m128i rev16lohi = _mm_or_si128( rev8lohi, rev16lo ); // 8bits in 8 u32[16,23] + static const uint8_t CLASSALIGN(16) s_reverse_bytes2[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0F, 0x0B, 0x07, 0x03 }; + __m128i bytesmask2 = _mm_load_si128( (const __m128i *) s_reverse_bytes2 ); + __m128i rev32 = _mm_shuffle_epi8( rev16lohi, bytesmask2 ); // 32bits in u128[96,127] + _mm_store_si128( (__m128i*)pDst_bytes, rev32 ); // store [0,15] + } + else // assume only 2 blocks + { + __m128i src8_0 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 ); + __m128i src8_1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 1 ); + __m128i rev8_0 = _mm_shuffle_epi8( bitsmask2, src8_0 ); // bit reverse + __m128i rev8_1 = _mm_shuffle_epi8( bitsmask2, src8_1 ); // bit reverse + __m128i rev8lo_0 = _mm_slli_epi16( rev8_0, 10 ); // shl16(8+2) to get lo2 in u16[10,11] + __m128i rev8lo_1 = _mm_slli_epi16( rev8_1, 10 ); // shl16(8+2) to get lo2 in u16[10,11] + __m128i rev8lohi_0 = _mm_or_si128( rev8lo_0, rev8_0 ); // 4bits in 8 u16[8,11] + __m128i rev8lohi_1 = _mm_or_si128( rev8lo_1, rev8_1 ); // 4bits in 8 u16[8,11] + __m128i rev16lo_0 = _mm_slli_epi32( rev8lohi_0, 20 ); // shl32(16+4) to get lo4 in u32[20,23] + __m128i rev16lo_1 = _mm_slli_epi32( rev8lohi_1, 20 ); // shl32(16+4) to get lo4 in u32[20,23] + __m128i rev16lohi_0 = _mm_or_si128( rev8lohi_0, rev16lo_0 ); // 8bits in 8 u32[16,23] + __m128i rev16lohi_1 = _mm_or_si128( rev8lohi_1, rev16lo_1 ); // 8bits in 8 u32[16,23] + // try to avoid these 2 instructions + __m128i rev32_0 = _mm_srli_epi32( rev16lohi_0, 24 ); // srl32(24) to get 8bits in u32[0,7] + __m128i rev32_1 = _mm_srli_epi32( rev16lohi_1, 24 ); // srl32(24) to get 8bits in u32[0,7] + __m128i rev64 = _mm_packs_epi32( rev32_1, rev32_0 ); // 8bits in u16[0,7] + static const uint8_t CLASSALIGN(16) s_reverse_bytes2[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x06, 0x04, 0x02, 0x00, 0x0E, 0x0C, 0x0A, 0x08 }; + __m128i bytesmask2 = _mm_load_si128( (const __m128i *) s_reverse_bytes2 ); + __m128i dst = _mm_shuffle_epi8( rev64, bytesmask2 ); // 64bits high byte in reverse order + _mm_store_si128( (__m128i*)pDst_bytes, dst ); // store [0,15] + } + #else // C + { + const uint32_t N = 2; + for (int i = 0; i < ((blockNum + 1) << 4); i++) + { + static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 }; + const uint32_t ofs = 128 - N - (i * N); + assert((ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlockWeights[i]] << (ofs & 7)); + } } + #endif // C break; } - case 3: - { + case 3: // 295 => 285 + { + assert(blockNum == 0); + #if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) ) + uint8x16_t src3 = vld1q_u8( pBlockWeights ); + uint16x8_t src3hi = vshrq_n_u16( vreinterpretq_u16_u8(src3), 5 ); // shr16(8-3) to get hi3 in u16[3,5] + uint16x8_t src3lo = vandq_u16( vreinterpretq_u16_u8(src3), vdupq_n_u16(0x007) ); // have 
to mask remainder to avoid bit collision + uint16x8_t src3lohi = vorrq_u16( src3hi, src3lo ); // 6bits in 8 u16[0,5] + uint8x8_t src6 = vqmovn_u16( src3lohi ); // 6bits in 8 u8[0,5] + uint16x4_t src6hi = vshr_n_u16( vreinterpret_u16_u8(src6), 2 ); // shr16(8-2) to get hi6 in u16[6,11] + uint16x4_t src12 = vbsl_u16( vdup_n_u16(0x003F), vreinterpret_u16_u8(src6), src6hi ); // 12bits in 4 u16[0,11] + uint32x2_t src12hi = vshr_n_u32( vreinterpret_u32_u16(src12), 4 ); // shr32(16-12) to get in hi12 u32[12,23] + uint32x2_t src24 = vbsl_u32( vdup_n_u32(0x0FFF), vreinterpret_u32_u16(src12), src12hi ); // 24bits in 2 u32[0,23] + uint32_t src24lo = vget_lane_u32( src24, 0 ); // lo 8.24 + uint32_t src24hi = vget_lane_u32( src24, 1 ); // hi 8.24 + uint32_t rev24hi = __rbit( src24lo ); // hi rev 24.8 + uint32_t rev24lo = __rbit( src24hi ); // lo rev 24.8 + uint32_t rev32hi = (rev24hi) | (rev24lo >> 24U); // [32,63] + uint32_t rev32lo = (rev24lo << 8U); // [0,31] + *((uint32_t*)(pDst_bytes + 12)) = rev32hi; // store [12,15] + *((uint32_t*)(pDst_bytes + 8)) = rev32lo; // store [8,11] + vst1_u8( pDst_bytes + 0, vdup_n_u8(0) ); // store [0,7] + #elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__) + static const uint8_t CLASSALIGN(16) s_reverse_bits3[16] = { 0, 4, 2, 6, 1, 5, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0 }; + __m128i bitsmask3 = _mm_load_si128( (const __m128i *) s_reverse_bits3 ); + __m128i src3 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) ); + __m128i rev3 = _mm_shuffle_epi8( bitsmask3, src3 ); // bit reverse + __m128i rev3lo = _mm_slli_epi16( rev3, 11 ); // shl16(8+3) to get lo3 in u16[11,13] + __m128i rev3hi = _mm_and_si128( rev3, _mm_set1_epi32(0x07000700) ); // have to mask remainder to avoid bit collision + __m128i rev6 = _mm_or_si128( rev3hi, rev3lo ); // 6bits in 8 u16[8,13] : BA,DC,FE,HG,JI,LK,NM,PO + __m128i rev6hi = _mm_srli_epi32( rev6, 24 ); // shr32(24) to get hi6[24,29] in u32[0,5] + __m128i rev6lo = _mm_srli_epi32( rev6, 2 ); // shr32(2) to get lo6[8,13] in u32[6,11] + __m128i rev12 = _mm_or_si128( rev6hi, rev6lo ); // 12bits in 4 u32[0,11] + garbage in u32[22,31] + __m128i rev24lo = _mm_slli_epi64( rev12, 44 ); // shl64(32+12) to get lo12 in u64[44,55] + __m128i rev24hi = _mm_and_si128( rev12, _mm_set1_epi32(0x0000FFFF) ); // clean garbage in u32[22,31] + __m128i rev24 = _mm_or_si128( rev24hi, rev24lo ); // 24bits in 2 u64[32,55] + static const uint8_t CLASSALIGN(16) s_reverse_bytes3[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0C, 0x0D, 0x0E, 0x04, 0x05, 0x06 }; + __m128i bytesmask3 = _mm_load_si128( (const __m128i *) s_reverse_bytes3 ); + __m128i rev48 = _mm_shuffle_epi8( rev24, bytesmask3 ); // 48bits in u128[80,127] + _mm_store_si128( (__m128i*)pDst_bytes, rev48 ); // store [0,15] + #else // C const uint32_t N = 3; - for (int i = 0; i < total_weights; i++) + for (int i = 0; i < ((blockNum + 1) << 4); i++) { static const uint8_t s_reverse_bits3[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; const uint32_t ofs = 128 - N - (i * N); - const uint32_t rev = s_reverse_bits3[pBlock->m_weights[i]] << (ofs & 7); + const uint32_t rev = s_reverse_bits3[pBlockWeights[i]] << (ofs & 7); uint32_t index = ofs >> 3; assert(index < 16); @@ -11979,29 +12152,108 @@ namespace basist if (index < 16) pDst_bytes[index++] |= (rev >> 8); } + #endif // C break; } case 4: { - const uint32_t N = 4; - for (int i = 0; i < total_weights; i++) - { - static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; - const int ofs = 128 - N - (i * N); - assert(ofs >= 
0 && (ofs >> 3) < 16); - pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlock->m_weights[i]] << (ofs & 7)); + assert(blockNum == 0); + #if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) ) + uint8x16_t src4 = vld1q_u8( pBlockWeights ); + uint16x8_t src8hi = vshrq_n_u16( vreinterpretq_u16_u8(src4), 4 ); // shr16(8-4) to get hi4 in u16[4,7] + uint16x8_t src8lo = vandq_u16( vreinterpretq_u16_u8(src4), vdupq_n_u16(0x00FF) ); // have to mask remainder to avoid bit collision + uint16x8_t src8lohi = vorrq_u16( src8hi, src8lo ); // 8bits in 8 u16[0,7] +// uint16x8_t src8lohi = vbslq_u16( vdupq_n_u16(0x0F), vreinterpretq_u16_u8(src4), src8hi ); // slower than (and + or) + uint8x8_t src8 = vqmovn_u16( src8lohi ); // 8bits in 8 u8[0,7] + #if defined(__aarch64__) + uint8x8_t rev64 = vrev64_u8( vrbit_u8( src8 ) ); // bit reverse + vst1_u8( pDst_bytes + 8, rev64 ); // store [8,15] + #else // !defined(__aarch64__) + uint64_t rev64 = vget_lane_u64( vreinterpret_u64_u8(src8), 0 ); // get the 64 bits + rev64 = __rbitll( rev64 ); // bit reverse + *((uint64_t*)(pDst_bytes + 8)) = rev64; // store [8,15] + #endif // !defined(__aarch64__) + vst1_u8( pDst_bytes + 0, vdup_n_u8(0) ); // store [0,7] + #elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__) + static const uint8_t CLASSALIGN(16) s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; + __m128i bitsmask4 = _mm_load_si128( (const __m128i *) s_reverse_bits4 ); + __m128i src8 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) ); // load 16x8 + __m128i rev8 = _mm_shuffle_epi8( bitsmask4, src8 ); // bit reverse + __m128i rev8lo = _mm_slli_epi16( rev8, 12 ); // shr16(8+4) to get lo in u16[12,15] + __m128i rev8lohi = _mm_or_si128( rev8lo, rev8 ); // 8bits in 8 u16[8,15] + static const uint8_t CLASSALIGN(16) s_reverse_bytes4[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 }; + __m128i bytesmask4 = _mm_load_si128( (const __m128i *) s_reverse_bytes4 ); + __m128i rev64 = _mm_shuffle_epi8( rev8lohi, bytesmask4 ); // extract bytes in reverse order + _mm_store_si128( (__m128i*)pDst_bytes, rev64 ); // store [0,15] + #else // C + { + const uint32_t N = 4; + for (int i = 0; i < ((blockNum + 1) << 4); i++) + { + static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; + const int ofs = 128 - N - (i * N); + assert(ofs >= 0 && (ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlockWeights[i]] << (ofs & 7)); + } } + #endif // C break; } case 5: { + assert(blockNum == 0); + #if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) ) + uint8x16_t src5 = vld1q_u8( pBlockWeights ); + uint16x8_t src5hi = vshrq_n_u16( vreinterpretq_u16_u8(src5), 3 ); // shiftRight 8-5 to get in u16[5,9] + uint16x8_t src10 = vbslq_u16( vdupq_n_u16(0x01F), vreinterpretq_u16_u8(src5), src5hi ); // 10bits in 8 u16[0,9] + uint32x4_t src10hi = vshrq_n_u32( vreinterpretq_u32_u16(src10), 6 ); // shiftRight 16-10 to get in u32[10,19] + uint32x4_t src20 = vbslq_u32( vdupq_n_u32(0x03FF), vreinterpretq_u32_u16(src10), src10hi ); // 20bits in 4 u32[0,19] + uint64x2_t src20hi = vshrq_n_u64( vreinterpretq_u64_u32(src20), 12 ); // shiftRight 32-20 to get in u64[20,39] + uint64x2_t src40 = vbslq_u64( vdupq_n_u64(0x0FFFFF), vreinterpretq_u64_u32(src20), src20hi ); // 40bits in 2 u64[0,39] + uint64x1_t src40hilo = vshl_n_u64( vget_high_u64(src40), 40 ); // [40..64] + uint64x1_t src40lo = vorr_u64( vget_low_u64(src40), src40hilo ); // [0..63] + 
uint64x1_t src40hi = vshr_n_u64( vget_high_u64(src40), 24 ); // [64..79] +// try to use vrev32_u8( vrbit_u8( rev8 ) ) + uint32_t src80lo = vget_lane_u32( vreinterpret_u32_u64(src40lo), 0 ); // [0..31] + uint32_t src80mid = vget_lane_u32( vreinterpret_u32_u64(src40lo), 1 ); // [32..63] + uint32_t src80hi = vget_lane_u32( vreinterpret_u32_u64(src40hi), 0 ); // [64..79] + uint32_t rev80hi = __rbit( src80lo ); // [48..79] + uint32_t rev80mid = __rbit( src80mid ); // [16..47] + uint32_t rev80lo = __rbit( src80hi ); // [16][0..15] + *((uint32_t*)(pDst_bytes + 12)) = rev80hi; // store [12,15] + *((uint32_t*)(pDst_bytes + 8)) = rev80mid; // store [8,11] + *((uint32_t*)(pDst_bytes + 4)) = rev80lo; // store [4,7] + *((uint32_t*)(pDst_bytes + 0)) = 0U; // store [0,3] + #elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__) + __m128i src5 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) ); // load 16x8 + __m128i src4 = _mm_and_si128( _mm_set1_epi32(0x0F0F0F0F), src5 ); // get 4 bits for shuffle + __m128i src54 = _mm_andnot_si128( _mm_set1_epi32(0x0F0F0F0F), src5 ); // keep 5th bit + static const uint8_t CLASSALIGN(16) s_reverse_bits5[16] = { 0x00, 0x10, 0x08, 0x18, 0x04, 0x14, 0x0C, 0x1C, 0x02, 0x12, 0x0A, 0x1A, 0x06, 0x16, 0x0E, 0x1E }; + __m128i bitsmask5 = _mm_load_si128( (const __m128i *) s_reverse_bits5 ); + __m128i rev4 = _mm_shuffle_epi8( bitsmask5, src4 ); // bit reverse + __m128i rev0 = _mm_srli_epi16( src54, 4 ); // shr16(4) because shl8 doesn't exist => put bit[0] at right position + __m128i rev5 = _mm_or_si128( rev0, rev4 ); // 5bits in 16 u8[0,4] + __m128i rev5lo = _mm_slli_epi16( rev5, 8 ); // shl16(8) to get lo5[0,4] in u16[8,12] + __m128i rev5hi = _mm_srli_epi16( rev5, 5 ); // shr16(5) to get hi5[8,12] in u16[3,7] + __m128i rev10 = _mm_or_si128( rev5hi, rev5lo ); // 10bits in 8 u16[3,12] + __m128i rev10lo = _mm_slli_epi32( rev10, 13 ); // shl32(13) to get lo10[3,12] in u32[16,25] + __m128i rev10hi = _mm_srli_epi32( rev10, 13 ); // shr32(13) to get hi10[35,44] in u32[6,15] + __m128i rev20 = _mm_or_si128( rev10hi, rev10lo ); // 20bits in 4 u32[6,25] + __m128i rev20lo = _mm_slli_epi64( rev20, 14 ); // shl64(14) to get lo20[6,25] in u64[20,39] garbage in [52,63] + __m128i rev20hi = _mm_srli_epi64( rev20, 38 ); // shr64(38) to get hi20[38,57] in u64[0,19] + __m128i rev40 = _mm_or_si128( rev20hi, rev20lo ); // 40bits in 2 u64[0,39] garbage in [52,63] + static const uint8_t CLASSALIGN(16) s_reverse_bytes5[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x00, 0x01, 0x02, 0x03, 0x04 }; + __m128i bytesmask5 = _mm_load_si128( (const __m128i *) s_reverse_bytes5 ); + __m128i rev80 = _mm_shuffle_epi8( rev40, bytesmask5 ); // swizzle bytes to store (can do a single store if done first) + _mm_store_si128( (__m128i*)pDst_bytes, rev80 ); // store [0,15] + #else // C const uint32_t N = 5; - for (int i = 0; i < total_weights; i++) + for (int i = 0; i < ((blockNum + 1) << 4); i++) { static const uint8_t s_reverse_bits5[32] = { 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31 }; const uint32_t ofs = 128 - N - (i * N); - const uint32_t rev = s_reverse_bits5[pBlock->m_weights[i]] << (ofs & 7); + const uint32_t rev = s_reverse_bits5[pBlockWeights[i]] << (ofs & 7); uint32_t index = ofs >> 3; assert(index < 16); @@ -12009,13 +12261,75 @@ namespace basist if (index < 16) pDst_bytes[index++] |= (rev >> 8); } - + #endif // C break; } default: assert(0); break; } + } + + const uint32_t ASTC_BLOCK_MODE_BITS = 11; + const 
uint32_t ASTC_PART_BITS = 2; + const uint32_t ASTC_CEM_BITS = 4; + const uint32_t ASTC_PARTITION_INDEX_BITS = 10; + const uint32_t ASTC_CCS_BITS = 2; + + const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 }; + + bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode) + { + assert(uastc_mode < TOTAL_UASTC_MODES); + uint8_t* pDst_bytes = reinterpret_cast(pDst); + + const int total_weights = pBlock->m_dual_plane ? 32 : 16; + const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0]; + + // will clear pDst_bytes[16] + pack_astc_block_weights( pDst_bytes, pBlock->m_weights, bits_per_weight, pBlock->m_dual_plane ? 1 : 0 ); + + // Set mode bits - see Table 146-147 + uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode]; + pDst_bytes[0] = (uint8_t)mode; + pDst_bytes[1] = (uint8_t)(mode >> 8); + + int bit_pos = ASTC_BLOCK_MODE_BITS; + + // We only support 1-5 bit weight indices + assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]); + + // See table 143 - PART + astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS); + + if (pBlock->m_subsets == 1) + astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS); + else + { + // See table 145 + astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS); + + // Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM + astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2); + } + + if (pBlock->m_dual_plane) + { + const int total_weight_bits = total_weights * bits_per_weight; + + // See Illegal Encodings 23.24 + // https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings + assert((total_weight_bits >= 24) && (total_weight_bits <= 96)); + + int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS; + astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS); + } + + const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets; + assert(num_cem_pairs <= 9); + + astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]); + return true; } @@ -12138,6 +12452,463 @@ namespace basist return (w >> byte_bit_offset)& ((1U << codesize) - 1U); } +#if defined(BASISD_UASTC_SIMD) + + +class unpack_uastc_weights +{ +private: + +// 1 block +#define D_unpack_uast_caseMono( _base, _bitnum, _anchor ) k##_bitnum##_##_base, +// 2 blocks +#define D_unpack_uast_caseDual( _base, _bitnum, _anchor ) k##_bitnum##_##_base, k##_bitnum##_##_base##_1, +// 19 anchors combinaisons distinct cases, bit 0 always set, highest bit = 12 +#define D_unpack_uast_casePack( _base, _bitnum ) \ +k##_bitnum##_##_base, k##_bitnum##_##_base##_1, k##_bitnum##_##_base##_2, k##_bitnum##_##_base##_3, \ +k##_bitnum##_##_base##_4, k##_bitnum##_##_base##_5, k##_bitnum##_##_base##_6, k##_bitnum##_##_base##_7, \ +k##_bitnum##_##_base##_8, k##_bitnum##_##_base##_9, k##_bitnum##_##_base##_10, k##_bitnum##_##_base##_11, \ +k##_bitnum##_##_base##_12, k##_bitnum##_##_base##_13, k##_bitnum##_##_base##_14, k##_bitnum##_##_base##_15, \ +k##_bitnum##_##_base##_16, k##_bitnum##_##_base##_17, k##_bitnum##_##_base##_18, + + enum param_case : uint16_t + { + D_unpack_uast_caseDual(66,2, 3) // mode == 6 and 11 + D_unpack_uast_caseDual(94,1, 3) // mode == 13 + 
D_unpack_uast_caseDual(61,2, 3) // mode == 17 + + D_unpack_uast_caseMono(65,4, 1) // mode == 0 and 10 + D_unpack_uast_caseMono(69,2, 1) // mode == 1 + D_unpack_uast_caseMono(62,4, 1) // mode == 4 and 15 + D_unpack_uast_caseMono(68,3, 1) // mode == 5 + D_unpack_uast_caseMono(81,3, 1) // mode == 12 + D_unpack_uast_caseMono(92,2, 1) // mode == 14 + D_unpack_uast_caseMono(49,5, 1) // mode == 18 + + D_unpack_uast_casePack(73,3) + D_unpack_uast_casePack(89,2) + D_unpack_uast_casePack(97,2) + D_unpack_uast_casePack(98,2) + kLast + }; + +// 19 anchors combinaisons distinct cases, bit 0 always set, highest bit = 12 + enum anchorArrayCasesName : uint8_t + { k0001, k0003, k0005, k0007, k0009, k000B, k000D, k0011, k0045, k0081, + k0101, k0111, k0201, k0203, k0501, k0801, k1001, k1011, k1101 }; + + struct CLASSALIGN(16) param + { uint16_t m_ByteShiftByteMask[16]; }; + + struct CLASSALIGN(16) partition + { uint8_t m_Byte[16]; }; + + + static constexpr uint8_t min8( uint8_t a, uint8_t b ) { return (a > b) ? b : a; } + + static constexpr bool unpack_uastc_isAnchor( uint8_t _index, uint32_t _anchor ) + { return ((1U << _index) & _anchor) != 0U; } + + static constexpr uint8_t unpack_uastc_bitMask( uint8_t _bitnum, uint8_t _index, uint32_t _anchor ) + { return unpack_uastc_isAnchor( _index, _anchor ) ? (_bitnum - 1U) : _bitnum; } + + static constexpr uint8_t unpack_uastc_bitSizeSum( uint8_t _bitnum, uint8_t _index, uint32_t _anchor ) + { return ((_index > 0U) ? unpack_uastc_bitSizeSum( _bitnum, _index - 1U, _anchor ) : 0U) + _bitnum - ( unpack_uastc_isAnchor( _index, _anchor ) ? 1U : 0U); } + + static constexpr uint8_t unpack_uastc_bitOffset( uint8_t _base, uint8_t _bitnum, uint8_t _index, uint32_t _anchor ) + { return _base + ((_index > 0U) ? unpack_uastc_bitSizeSum( _bitnum, _index - 1U, _anchor ) : 0U); } + + static constexpr uint8_t unpack_uastc_bitBase( uint8_t _base, uint8_t _bitnum, uint8_t _index, uint32_t _anchor ) + { return unpack_uastc_bitOffset( _base, _bitnum, _index, _anchor ) & 7U; } + + static constexpr uint8_t unpack_uastc_bitShift( uint8_t _base, uint8_t _bitnum, uint8_t _index, uint32_t _anchor ) + { + uint8_t bitBase = unpack_uastc_bitBase( _base, _bitnum, _index, _anchor ); + #if ( defined(__arm__) || defined(__aarch64__) ) // ARM has shrv + return bitBase; } + #elif defined(__SSSE3__) // Intel : convert shrv to mulhi + // (bit & 7U) > 0 => byte0 = (bit >> 3U) ; byte1 = ((bit >> 3U) + 1) & 15U ; mulhi = 1U << (8U + 8U - (bit & 7U)) + // (bit & 7U) = 0 => byte0 = (bit >> 3U) ; byte1 = ((bit >> 3U) + 0) & 15U ; mulhi = 1U << 8U + return (bitBase == 0U) ? 0U : (8U - bitBase); } // real value is 1U << (v + 8U) + #endif + + static constexpr uint16_t byteOffset( uint8_t _base, uint8_t _bitnum, uint8_t _index, uint32_t _anchor ) + { + uint8_t bitOffset = unpack_uastc_bitOffset( _base, _bitnum, _index, _anchor ); + uint8_t lo = (bitOffset >> 3U); // 4bits + uint8_t hi = min8( 15U, lo + (((bitOffset & 7U) == 0) ? 
0U : 1U) ); // may use only 1bit + lo |= (unpack_uastc_bitShift( _base, _bitnum, _index, _anchor ) << 4U); // 3bits + hi |= (unpack_uastc_bitMask( _bitnum, _index, _anchor ) << 4U); // 3bits + return (((uint16_t)hi) << 8U) | ((uint16_t)lo); + } + + static const anchorArrayCasesName ms_anchorArrayDict[ 1 + TOTAL_ASTC_BC7_COMMON_PARTITIONS3 + TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS + TOTAL_ASTC_BC7_COMMON_PARTITIONS2 ]; + + static const partition ms_PartitionDict[ 1 + TOTAL_ASTC_BC7_COMMON_PARTITIONS3 + TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS + TOTAL_ASTC_BC7_COMMON_PARTITIONS2 ]; + + static const param ms_param_group[param_case::kLast]; + + static unpack_uastc_weights ms_last; // to monitor unhandled cases + + static void compute( const param * _param, const uint8_t * _src, const partition * _pattern, uint8_t * _dst, uint8_t _weight_mask8, uint8_t _invert_subset_mask8 ) + { + #if ( defined(__arm__) || defined(__aarch64__) ) + const uint8_t * param = (const uint8_t *)_param; + uint8x16_t bitLoad = vld1q_u8( _src ); // load 8x16 + uint8x8x2_t bitSrc{ vget_low_u8( bitLoad ), vget_high_u8( bitLoad ) }; + + uint8x16_t bitByte0 = vld1q_u8( param + 0 ); + uint8x16_t bitByte1 = vld1q_u8( param + 16 ); + uint16x8_t bitCtrl0 = vreinterpretq_u16_u8( vshrq_n_u8( bitByte0, 4 ) ); // remove bytePos + uint16x8_t bitCtrl1 = vreinterpretq_u16_u8( vshrq_n_u8( bitByte1, 4 ) ); // remove bytePos + bitByte0 = vandq_u8( bitByte0, vdupq_n_u8(0x0F) ); // filter bytePos + bitByte1 = vandq_u8( bitByte1, vdupq_n_u8(0x0F) ); // filter bytePos + uint8x8_t word03 = vtbl2_u8( bitSrc, vget_low_u8( bitByte0 ) ); // translate 8x8 + uint8x8_t word47 = vtbl2_u8( bitSrc, vget_high_u8( bitByte0 ) ); // translate 8x8 + uint16x8_t word07 = vreinterpretq_u16_u8( vcombine_u8( word03, word47 ) ); // get word 16x8 + uint8x8_t word8B = vtbl2_u8( bitSrc, vget_low_u8( bitByte1 ) ); // translate 8x8 + uint8x8_t wordCF = vtbl2_u8( bitSrc, vget_high_u8( bitByte1 ) ); // translate 8x8 + uint16x8_t word8F = vreinterpretq_u16_u8( vcombine_u8( word8B, wordCF ) ); // get word 16x8 + uint16x8_t bitShift0 = vandq_u16( bitCtrl0, vdupq_n_u16( 0x000F ) ); // filter Shift + uint16x8_t bitShift1 = vandq_u16( bitCtrl1, vdupq_n_u16( 0x000F ) ); // filter Shift + int16x8_t shr0 = vnegq_s16( vreinterpretq_s16_u16( bitShift0 ) ); // neg shift 16x8 + int16x8_t shr1 = vnegq_s16( vreinterpretq_s16_u16( bitShift1 ) ); // neg shift 16x8 + word07 = vshlq_u16( word07, shr0 ); // align 16x8 + word8F = vshlq_u16( word8F, shr1 ); // align 16x8 + uint16x8_t bitNum0 = vshrq_n_u16( bitCtrl0, 8 ); // filter bitNum + uint16x8_t bitNum1 = vshrq_n_u16( bitCtrl1, 8 ); // filter bitNum + uint16x8_t bitMask0 = vshlq_u16( vdupq_n_u16(1), bitNum0 ); // bitMask = (1U << n) - 1U + uint16x8_t bitMask1 = vshlq_u16( vdupq_n_u16(1), bitNum1 ); // bitMask = (1U << n) - 1U + bitMask0 = vsubq_u16( bitMask0, vdupq_n_u16(1) ); // bitMask = (1U << n) - 1U + bitMask1 = vsubq_u16( bitMask1, vdupq_n_u16(1) ); // bitMask = (1U << n) - 1U + word07 = vandq_u16( word07, bitMask0 ); // mask 16x8 + word8F = vandq_u16( word8F, bitMask1 ); // mask 16x8 + uint8x16_t weight = vcombine_u8( vqmovn_u16(word07), vqmovn_u16(word8F) ); // pack to 8x16 + uint8x16_t pattern = vld1q_u8( (const uint8_t *)_pattern ); // load 8x16 + uint8x16_t test = vtstq_u8( pattern, vdupq_n_u8(_invert_subset_mask8) ); // mask if any + test = vandq_u8( test, vdupq_n_u8(_weight_mask8) ); // weightMask if set + weight = veorq_u8( weight, test ); // weightMask - weight + vst1q_u8( _dst, weight ); // store 8x16 + #elif defined(__SSSE3__) + 
const __m128i * param = (const __m128i *)_param; // msvc seems to compile worst with load aligned + __m128i bitSrc = _mm_loadu_si128( (const __m128i *)_src ); // load 8x16 + __m128i bitByte0 = _mm_load_si128( param + 0 ); // load 16x8 + __m128i bitByte1 = _mm_load_si128( param + 1 ); // load 16x8 + __m128i bitCtrl0 = _mm_srli_epi16( bitByte0, 4 ); // remove bytePos + __m128i bitCtrl1 = _mm_srli_epi16( bitByte1, 4 ); // remove bytePos + bitByte0 = _mm_and_si128( bitByte0, _mm_set1_epi32( 0x0F0F0F0F ) ); // filter bytePos + bitByte1 = _mm_and_si128( bitByte1, _mm_set1_epi32( 0x0F0F0F0F ) ); // filter bytePos + __m128i word07 = _mm_shuffle_epi8( bitSrc, bitByte0 ); // get word 16x8 + __m128i word8F = _mm_shuffle_epi8( bitSrc, bitByte1 ); // get word 16x8 + __m128i bitShift0 = _mm_and_si128( bitCtrl0, _mm_set1_epi32( 0x000F000F ) ); // filter Shift + __m128i bitShift1 = _mm_and_si128( bitCtrl1, _mm_set1_epi32( 0x000F000F ) ); // filter Shift + __m128i bitShift = _mm_packus_epi16( bitShift0, bitShift1 ); // pack to 8x16 + __m128i bitNum0 = _mm_srli_epi16( bitCtrl0, 8 ); // filter bitNum + __m128i bitNum1 = _mm_srli_epi16( bitCtrl1, 8 ); // filter bitNum + __m128i bitNum = _mm_packus_epi16( bitNum0, bitNum1 ); // pack to 8x16 + __m128i shiftToMul = _mm_setr_epi8( 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, -128, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, -128 ); // 8 used values + __m128i bitMul = _mm_shuffle_epi8( shiftToMul, bitShift ); // mulhi[8] = shuffle( shiftToMul, bitShift ) + __m128i bitMul0 = _mm_unpacklo_epi8( _mm_setzero_si128(), bitMul ); // mulhi[16] = (1U << (shift + 8U)) + __m128i bitMul1 = _mm_unpackhi_epi8( _mm_setzero_si128(), bitMul ); // mulhi[16] = (1U << (shift + 8U)) + word07 = _mm_mulhi_epu16( word07, bitMul0 ); // align 16x8 (shrv unavailable) + word8F = _mm_mulhi_epu16( word8F, bitMul1 ); // align 16x8 (shrv unavailable) + __m128i numToMask = _mm_setr_epi8( 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F ); // 8 used values + __m128i bitMask = _mm_shuffle_epi8( numToMask, bitNum ); // could use : mask = shuffle( shiftToMul, bitNum ) - 1U + __m128i bitMask0 = _mm_unpacklo_epi8( bitMask, _mm_setzero_si128() ); // zeroExtend to u16 + __m128i bitMask1 = _mm_unpackhi_epi8( bitMask, _mm_setzero_si128() ); // zeroExtend to u16 + word07 = _mm_and_si128( word07, bitMask0 ); // mask 16x8 + word8F = _mm_and_si128( word8F, bitMask1 ); // mask 16x8 + __m128i weight = _mm_packus_epi16( word07, word8F ); // pack to 8x16 + __m128i pattern = _mm_load_si128( (const __m128i *)_pattern ); // load 8x16 + __m128i test = _mm_and_si128( pattern, _mm_set1_epi8(_invert_subset_mask8) ); // weightMask bit + test = _mm_cmpeq_epi8( test, _mm_setzero_si128() ); // weightMask propagate + test = _mm_andnot_si128( test, _mm_set1_epi8(_weight_mask8) ); // weightMask if set + weight = _mm_xor_si128( weight, test ); // weightMask - weight + _mm_storeu_si128( (__m128i*)_dst, weight ); // store 8x16 + #else // error + #error "undefined architecture" + #endif // error + } + + static const anchorArrayCasesName getAnchorDict( uint8_t index ) { return ms_anchorArrayDict[index]; } + + uint8_t get_anchorTableIndex() const + { + uint8_t index = 0U; + + if (m_subsets >= 2) + { + if (m_subsets == 3) + index = 1U + m_pattern; + else if (m_mode == 7) + index = 1U + TOTAL_ASTC_BC7_COMMON_PARTITIONS3 + m_pattern; + else + index = 1U + TOTAL_ASTC_BC7_COMMON_PARTITIONS3 + TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS + m_pattern; + } + return index; + } + + uint8_t m_bit_ofs; // 7 + 
uint8_t m_total_planes; // 1 + uint8_t m_weight_bits; // 5 + uint8_t m_invert_subset_mask8; // 3 + uint8_t m_mode; // 5 + uint8_t m_subsets; // 2 + uint8_t m_pattern; // 5 + uint8_t m_pad; + +public: + + unpack_uastc_weights() = default; + unpack_uastc_weights( uint8_t _total_planes, uint8_t _invert_subset_mask8, uint8_t weight_bits, uint8_t bit_ofs, uint8_t subsets, uint8_t _mode, uint8_t _pattern ) + : m_bit_ofs( bit_ofs ) + , m_total_planes( _total_planes ) + , m_weight_bits( weight_bits ) + , m_invert_subset_mask8( _invert_subset_mask8 ) + , m_mode( _mode ) + , m_subsets( subsets ) + , m_pattern( _pattern ) + , m_pad( 0U ) + {} + ~unpack_uastc_weights() = default; + + void run( const uint8_t * _src, uint8_t * _dst ) const + { + ms_last = *this; // to monitor unhandled cases + + uint8_t paramIndex = get_anchorTableIndex(); + + uint16_t packCase = unpack_uastc_weights::param_case::kLast; + bool dual = false; + if (m_mode == 18) + packCase = unpack_uastc_weights::param_case::k5_49; + else + { + if (m_total_planes == 2) + { + if ( m_bit_ofs == 61 ) + packCase = unpack_uastc_weights::param_case::k2_61; + else if ( m_bit_ofs == 66 ) + packCase = unpack_uastc_weights::param_case::k2_66; + else if ( m_bit_ofs == 94 ) + packCase = unpack_uastc_weights::param_case::k1_94; + else + BREAK_EXECUTION(); + } + else + { + if (m_subsets == 1) + { + if (m_weight_bits == 4) + { + if ( m_bit_ofs == 65 ) + packCase = unpack_uastc_weights::param_case::k4_65; + else if ( m_bit_ofs == 62 ) + packCase = unpack_uastc_weights::param_case::k4_62; + else + BREAK_EXECUTION(); + } + else + { + if (m_weight_bits == 3) + { + if ( ( m_bit_ofs == 68 ) && ( m_mode == 5 ) ) + packCase = unpack_uastc_weights::param_case::k3_68; + else if ( ( m_bit_ofs == 81 ) && ( m_mode == 12 ) ) + packCase = unpack_uastc_weights::param_case::k3_81; + else + BREAK_EXECUTION(); + } + else if (m_weight_bits == 2) + { + if ( ( m_bit_ofs == 92 ) && ( m_mode == 14 ) ) + packCase = unpack_uastc_weights::param_case::k2_92; + else if ( ( m_bit_ofs == 69 ) && ( m_mode == 1 ) ) + packCase = unpack_uastc_weights::param_case::k2_69; + else + BREAK_EXECUTION(); + } + else if (m_weight_bits == 1) + { + BREAK_EXECUTION(); + } + } + } + else + { + if ( (m_weight_bits == 3) && ( m_bit_ofs == 73 ) && ( m_mode == 2 ) ) + packCase = unpack_uastc_weights::param_case::k3_73 + getAnchorDict(paramIndex); + else if ( (m_weight_bits == 3) && ( m_bit_ofs == 73 ) && ( m_mode == 3 ) ) + packCase = unpack_uastc_weights::param_case::k3_73 + getAnchorDict(paramIndex); + else if ( (m_weight_bits == 2) && ( m_bit_ofs == 89 ) && ( m_mode == 3 ) ) + packCase = unpack_uastc_weights::param_case::k2_89 + getAnchorDict(paramIndex); + else if ( (m_weight_bits == 2) && ( m_bit_ofs == 89 ) && ( m_mode == 4 ) ) + packCase = unpack_uastc_weights::param_case::k2_89 + getAnchorDict(paramIndex); + else if ( (m_weight_bits == 2) && ( m_bit_ofs == 89 ) && ( m_mode == 7 ) ) + packCase = unpack_uastc_weights::param_case::k2_89 + getAnchorDict(paramIndex); + else if ( (m_weight_bits == 2) && ( m_bit_ofs == 97 ) && ( m_mode == 9 ) ) + packCase = unpack_uastc_weights::param_case::k2_97 + getAnchorDict(paramIndex); + else if ( (m_weight_bits == 2) && ( m_bit_ofs == 98 ) && ( m_mode == 16 ) ) + packCase = unpack_uastc_weights::param_case::k2_98 + getAnchorDict(paramIndex); + else + BREAK_EXECUTION(); + } + } + } + + if ( packCase != unpack_uastc_weights::param_case::kLast ) + { + const uint8_t weight_mask8 = (1U << m_weight_bits) - 1; + compute( &ms_param_group[packCase], _src, 
&ms_PartitionDict[paramIndex], _dst + 0, weight_mask8, m_invert_subset_mask8 ); + if (m_total_planes == 2) + compute( &ms_param_group[packCase + 1], _src, &ms_PartitionDict[paramIndex], _dst + 16, weight_mask8, m_invert_subset_mask8 ); + } + else + BREAK_EXECUTION(); + } + + void testPattern( const uint8_t * pPartition_pattern ) const + { + uint8_t paramIndex = get_anchorTableIndex(); + for (uint32_t i = 0; i < 16; i++) + if ( (1U << pPartition_pattern[i]) != ms_PartitionDict[paramIndex].m_Byte[i] ) + BREAK_EXECUTION(); + } + +}; // class unpack_uastc_weights + + + +// block constructor (16 items) +#define D_unpack_uastc_byteOffset( _base, _bitnum, _anchor, _start ) \ +{ unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 0, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 1, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 2, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 3, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 4, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 5, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 6, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 7, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 8, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 9, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 10, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 11, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 12, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 13, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 14, _anchor ), \ + unpack_uastc_weights::byteOffset( _base, _bitnum, _start + 15, _anchor ) }, + +// 1 block +#define D_unpack_uast_paramMono( _base, _bitnum, _anchor ) D_unpack_uastc_byteOffset( _base, _bitnum, _anchor, 0 ) +// 2 blocks +#define D_unpack_uast_paramDual( _base, _bitnum, _anchor ) D_unpack_uastc_byteOffset( _base, _bitnum, _anchor, 0 ) D_unpack_uastc_byteOffset( _base, _bitnum, _anchor, 16 ) +// 19 anchors combinaisons distinct cases, bit 0 always set, highest bit = 12 +#define D_unpack_uast_paramPack( _base, _bitnum ) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0001) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0003) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0005) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0007) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0009) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x000B) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x000D) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0011) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0045) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0081) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0101) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0111) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0201) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0203) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0501) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x0801) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x1001) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x1011) \ +D_unpack_uast_paramMono(_base, _bitnum, 0x1101) + + +// 19 anchors combinaisons distinct cases : dictonnary +const unpack_uastc_weights::anchorArrayCasesName unpack_uastc_weights::ms_anchorArrayDict[ 1 + TOTAL_ASTC_BC7_COMMON_PARTITIONS3 + 
TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS + TOTAL_ASTC_BC7_COMMON_PARTITIONS2 ] +{ +// default +// { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +k0001, +// const uint8_t g_astc_bc7_pattern3_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][3] = +// { 0, 8, 10 }, { 8, 0, 12 }, { 4, 0, 12 }, { 8, 0, 4 }, { 3, 0, 2 }, { 0, 1, 3 }, { 0, 2, 1 }, { 1, 9, 0 }, { 1, 2, 0 }, { 4, 0, 8 }, { 0, 6, 2 } +k0501, k1101, k1011, k0111, k000D, k000B, k0007, k0203, k0007, k0111, k0045, +// const uint8_t g_bc7_3_astc2_patterns2_anchors[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][3] = +// { 0, 4 }, { 0, 2 }, { 2, 0 }, { 0, 7 }, { 8, 0 }, { 0, 1 }, { 0, 3 }, { 0, 1 }, +k0011, k0005, k0005, k0081, k0101, k0003, k0009, k0003, +// { 2, 0 }, { 0, 1 }, { 0, 8 }, { 2, 0 }, { 0, 1 }, { 0, 7 }, { 12, 0 }, { 2, 0 }, +k0005, k0003, k0101, k0005, k0003, k0081, k1001, k0005, +// { 9, 0 }, { 0, 2 }, { 4, 0 } +k0201, k0005, k0011, +// const uint8_t g_astc_bc7_pattern2_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][3] = +// { 0, 2 }, { 0, 3 }, { 1, 0 }, { 0, 3 }, { 7, 0 }, { 0, 2 }, { 3, 0 }, { 7, 0 }, +k0005, k0009, k0003, k0009, k0081, k0005, k0009, k0081, +// { 0, 11 }, { 2, 0 }, { 0, 7 }, { 11, 0 }, { 3, 0 }, { 8, 0 }, { 0, 4 }, { 12, 0 }, +k0801, k0005, k0081, k0801, k0009, k0101, k0011, k1001, +// { 1, 0 }, { 8, 0 }, { 0, 1 }, { 0, 2 }, { 0, 4 }, { 8, 0 }, { 1, 0 }, { 0, 2 }, +k0003, k0101, k0003, k0005, k0011, k0101, k0003, k0005, +// { 4, 0 }, { 0, 1 }, { 4, 0 }, { 1, 0 }, { 4, 0 }, { 1, 0 } +k0011, k0003, k0011, k0003, k0011, k0003, +}; + + +#define BPF( _B15, _B14, _B13, _B12, _B11, _B10, _B9, _B8, _B7, _B6, _B5, _B4, _B3, _B2, _B1, _B0) \ +{ 1<<_B15, 1<<_B14, 1<<_B13, 1<<_B12, 1<<_B11, 1<<_B10, 1<<_B9, 1<<_B8, 1<<_B7, 1<<_B6, 1<<_B5, 1<<_B4, 1<<_B3, 1<<_B2, 1<<_B1, 1<<_B0 } +// pixel partitions converted to bitfield +const unpack_uastc_weights::partition unpack_uastc_weights::ms_PartitionDict[ 1 + TOTAL_ASTC_BC7_COMMON_PARTITIONS3 + TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS + TOTAL_ASTC_BC7_COMMON_PARTITIONS2 ] +{ +// default + BPF( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), +// const uint8_t g_astc_bc7_patterns3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][16] = + BPF( 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2 ), BPF( 1,1,1,1,1,1,1,1,0,0,0,0,2,2,2,2 ), BPF( 1,1,1,1,0,0,0,0,0,0,0,0,2,2,2,2 ), BPF( 1,1,1,1,2,2,2,2,0,0,0,0,0,0,0,0 ), + BPF( 1,1,2,0,1,1,2,0,1,1,2,0,1,1,2,0 ), BPF( 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2 ), BPF( 0,2,1,1,0,2,1,1,0,2,1,1,0,2,1,1 ), BPF( 2,0,0,0,2,0,0,0,2,1,1,1,2,1,1,1 ), + BPF( 2,0,1,2,2,0,1,2,2,0,1,2,2,0,1,2 ), BPF( 1,1,1,1,0,0,0,0,2,2,2,2,1,1,1,1 ), BPF( 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2 ), +// const uint8_t g_bc7_3_astc2_patterns2[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][16] = + BPF( 0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0 ), BPF( 0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0 ), BPF( 1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0 ), BPF( 0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1 ), + BPF( 1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1 ), BPF( 0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0 ), BPF( 0,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1 ), BPF( 0,1,1,1,0,0,1,1,0,0,1,1,0,0,1,1 ), + BPF( 1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0 ), BPF( 0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 ), BPF( 0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,0 ), BPF( 1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0 ), + BPF( 0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0 ), BPF( 0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1 ), BPF( 1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0 ), BPF( 1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0 ), + BPF( 1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0 ), BPF( 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0 ), BPF( 1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 ), +// const uint8_t g_astc_bc7_patterns2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][16] = + BPF( 
0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1 ), BPF( 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1 ), BPF( 1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0 ), BPF( 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1 ), + BPF( 1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,0 ), BPF( 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1 ), BPF( 1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0 ), BPF( 1,1,1,1,1,1,1,0,1,1,0,0,1,0,0,0 ), + BPF( 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1 ), BPF( 1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0 ), BPF( 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1 ), BPF( 1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0 ), + BPF( 1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 ), BPF( 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0 ), BPF( 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1 ), BPF( 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0 ), + BPF( 1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1 ), BPF( 1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1 ), BPF( 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0 ), BPF( 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0 ), + BPF( 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0 ), BPF( 1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1 ), BPF( 1,0,0,0,1,1,0,0,1,1,0,0,1,1,1,0 ), BPF( 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0 ), + BPF( 1,1,1,1,0,1,1,1,0,1,1,1,0,0,1,1 ), BPF( 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0 ), BPF( 1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1 ), BPF( 1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0 ), + BPF( 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0 ), BPF( 1,0,0,1,0,0,1,1,0,1,1,0,1,1,0,0 ) +}; + + + +const unpack_uastc_weights::param unpack_uastc_weights::ms_param_group[unpack_uastc_weights::param_case::kLast] { +D_unpack_uast_paramDual(66,2, 3) // mode == 6 and 11 +D_unpack_uast_paramDual(94,1, 3) // mode == 13 +D_unpack_uast_paramDual(61,2, 3) // mode == 17 + +D_unpack_uast_paramMono(65,4, 1) // mode == 0 and 10 +D_unpack_uast_paramMono(69,2, 1) // mode == 1 +D_unpack_uast_paramMono(62,4, 1) // mode == 4 and 15 +D_unpack_uast_paramMono(68,3, 1) // mode == 5 +D_unpack_uast_paramMono(81,3, 1) // mode == 12 +D_unpack_uast_paramMono(92,2, 1) // mode == 14 +D_unpack_uast_paramMono(49,5, 1) // mode == 18 + +D_unpack_uast_paramPack(73,3) +D_unpack_uast_paramPack(89,2) +D_unpack_uast_paramPack(97,2) +D_unpack_uast_paramPack(98,2) + +// if (mode == UASTC_MODE_INDEX_SOLID_COLOR) // mode == 8 +}; + +unpack_uastc_weights unpack_uastc_weights::ms_last; // to monitor unhandled cases + +#endif // defined(BASISD_UASTC_SIMD) + + bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool blue_contract_check, bool read_hints) { //memset(&unpacked, 0, sizeof(unpacked)); @@ -12316,19 +13087,10 @@ namespace basist const uint32_t ep_quints = g_astc_bise_range_table[endpoint_range][2]; uint32_t total_tqs = 0; - uint32_t bundle_size = 0, mul = 0; if (ep_trits) - { total_tqs = (total_values + 4) / 5; - bundle_size = 5; - mul = 3; - } else if (ep_quints) - { total_tqs = (total_values + 2) / 3; - bundle_size = 3; - mul = 5; - } uint32_t tq_values[8]; for (uint32_t i = 0; i < total_tqs; i++) @@ -12336,7 +13098,8 @@ namespace basist uint32_t num_bits = ep_trits ? 8 : 7; if (i == (total_tqs - 1)) { - uint32_t num_remaining = total_values - (total_tqs - 1) * bundle_size; +// compiler knows how to fast-mul by 3 & 5 + uint32_t num_remaining = total_values - (total_tqs - 1) * ((ep_trits) ? 5U : 3U); if (ep_trits) { switch (num_remaining) @@ -12376,12 +13139,20 @@ namespace basist { assert(next_tq_index < total_tqs); accum = tq_values[next_tq_index++]; - accum_remaining = bundle_size; + accum_remaining = ((ep_trits) ? 
5U : 3U); + } +// compiler knows how to fast-divide by 3 & 5 + uint32_t v; + if ( ep_trits ) + { + v = (uint32_t) (((uint8_t)accum) % 3); + accum = (uint32_t) (((uint8_t)accum) / 3); + } + else + { + v = (uint32_t) (((uint8_t)accum) % 5); + accum = (uint32_t) (((uint8_t)accum) / 5); } - - // TODO: Optimize with tables - uint32_t v = accum % mul; - accum /= mul; accum_remaining--; value |= (v << ep_bits); @@ -12390,7 +13161,7 @@ namespace basist unpacked.m_astc.m_endpoints[i] = (uint8_t)value; } - const uint8_t* pPartition_pattern; + const uint8_t* pPartition_pattern; // used to invert weights const uint8_t* pSubset_anchor_indices = get_anchor_indices(subsets, mode, unpacked.m_common_pattern, pPartition_pattern); #ifdef _DEBUG @@ -12432,6 +13203,35 @@ namespace basist } #endif + uint8_t invert_subset_mask8 = 0U; + if ((blue_contract_check) && (total_comps >= 3)) + { + // We only need to disable ASTC Blue Contraction when we'll be packing to ASTC. The other transcoders don't care. + + for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) + { + const int s0 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 0]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 2]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 4]].m_unquant; + + const int s1 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 1]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 3]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 5]].m_unquant; + + if (s1 < s0) + { + for (uint32_t c = 0; c < total_comps; c++) + std::swap(unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 0], unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 1]); + + invert_subset_mask8 |= 1U << subset_index; + } + } + } +#if defined(BASISD_UASTC_SIMD) + unpack_uastc_weights job( (uint8_t)total_planes, invert_subset_mask8, (uint8_t)weight_bits, (uint8_t)bit_ofs, (uint8_t)subsets, (uint8_t)mode, (uint8_t)unpacked.m_common_pattern ); +// job.testPattern( pPartition_pattern ); + job.run( blk.m_bytes, unpacked.m_astc.m_weights ); +#else // !defined(BASISD_UASTC_SIMD) if (mode == 18) { // Mode 18 is the only mode with more than 64 weight bits. @@ -12471,7 +13271,6 @@ namespace basist if (total_planes == 2) { // Dual plane modes always have a single subset, and the first 2 weights are anchors. - unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); bit_ofs += (weight_bits - 1); @@ -12548,50 +13347,26 @@ namespace basist } } - if ((blue_contract_check) && (total_comps >= 3)) + if (invert_subset_mask8 != 0U) { - // We only need to disable ASTC Blue Contraction when we'll be packing to ASTC. The other transcoders don't care. 
- bool invert_subset[3] = { false, false, false }; - bool any_flag = false; + const uint32_t weight_mask = (1 << weight_bits) - 1; - for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) + for (uint32_t i = 0; i < 16; i++) { - const int s0 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 0]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 2]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 4]].m_unquant; + uint32_t subset = pPartition_pattern[i]; - const int s1 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 1]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 3]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 5]].m_unquant; - - if (s1 < s0) + if ( (invert_subset_mask8 & (1U << subset)) != 0U ) { - for (uint32_t c = 0; c < total_comps; c++) - std::swap(unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 0], unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 1]); + unpacked.m_astc.m_weights[i * total_planes] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes]); - invert_subset[subset_index] = true; - any_flag = true; + if (total_planes == 2) + unpacked.m_astc.m_weights[i * total_planes + 1] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes + 1]); } } + } +#endif // !defined(BASISD_UASTC_SIMD) - if (any_flag) - { - const uint32_t weight_mask = (1 << weight_bits) - 1; - - for (uint32_t i = 0; i < 16; i++) - { - uint32_t subset = pPartition_pattern[i]; - - if (invert_subset[subset]) - { - unpacked.m_astc.m_weights[i * total_planes] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes]); - if (total_planes == 2) - unpacked.m_astc.m_weights[i * total_planes + 1] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes + 1]); - } - } - } - } return true; }
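
For reference, a minimal scalar sketch of the weight transform that the NEON/SSSE3 paths implement, mirroring the C fallback kept under the #else branches of pack_astc_block_weights: weight i is bit-reversed and ORed into the 16-byte block at bit offset 128 - n - i*n. The helper names (pack_weights_reference, reverse_bits) are illustrative only and are not part of the transcoder; the sketch is mainly useful to cross-check the SIMD output byte-for-byte.

#include <cstdint>
#include <cstring>
#include <cassert>

// Reverse the low n bits of v (n <= 5 here).
static inline uint32_t reverse_bits(uint32_t v, uint32_t n)
{
    uint32_t r = 0;
    for (uint32_t b = 0; b < n; b++)
        r |= ((v >> b) & 1U) << (n - 1U - b);
    return r;
}

// dst: 16-byte ASTC block (cleared here, as pack_astc_block_weights does),
// weights: 16 values (single plane) or 32 (dual plane), n: bits per weight (1..5).
static void pack_weights_reference(uint8_t dst[16], const uint8_t* weights,
                                   uint32_t total_weights, uint32_t n)
{
    memset(dst, 0, 16);
    for (uint32_t i = 0; i < total_weights; i++)
    {
        const uint32_t ofs = 128 - n - i * n;              // bit offset from the top of the block
        const uint32_t rev = reverse_bits(weights[i], n) << (ofs & 7);
        uint32_t index = ofs >> 3;
        assert(index < 16);
        dst[index] |= (uint8_t)rev;                        // low byte
        if ((rev > 0xFF) && (index + 1 < 16))
            dst[index + 1] |= (uint8_t)(rev >> 8);         // spill into the next byte (n = 3, 5)
    }
}

Feeding random weights through this and through pack_astc_block_weights (built with and without BASISD_ASTC_SIMD) and comparing the 16 output bytes is a cheap way to validate each SIMD path on a new target.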