diff --git a/transcoder/basisu_transcoder.cpp b/transcoder/basisu_transcoder.cpp
index 3aeba0ee..1899bb46 100644
--- a/transcoder/basisu_transcoder.cpp
+++ b/transcoder/basisu_transcoder.cpp
@@ -17,6 +17,21 @@
 #include <limits.h>
 #include "basisu_containers_impl.h"
 
+#define BASISD_ASTC_SIMD
+
+#if defined(BASISD_ASTC_SIMD)
+#if defined(__clang__)
+	#if defined(__arm__) || defined(__aarch64__)
+		#include <arm_neon.h>
+		#include <arm_acle.h>
+	#else
+		#include <immintrin.h>
+	#endif
+#else // !defined(__clang__)
+	#include <immintrin.h>
+#endif // !defined(__clang__)
+#endif // defined(BASISD_ASTC_SIMD)
+
 #ifndef BASISD_IS_BIG_ENDIAN
 // TODO: This doesn't work on OSX. How can this be so difficult?
 //#if defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN)
@@ -5101,6 +5116,7 @@ namespace basist
 		// First extract the trits and the bits from the 5 input values
 		int trits = 0, bits[5];
 		const uint32_t bit_mask = (1 << n) - 1;
+
 		for (int i = 0; i < 5; i++)
 		{
 			static const int s_muls[5] = { 1, 3, 9, 27, 81 };
@@ -9308,7 +9324,6 @@ namespace basist
 		for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y)
 		{
 			void* pDst_block = (uint8_t*)pDst_blocks + block_y * output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes;
-
 			for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t *)pDst_block + output_block_or_pixel_stride_in_bytes)
 			{
 				switch (fmt)
@@ -10288,7 +10303,7 @@ namespace basist
 		return true;
 	}
-
+	
 	bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size)
 	{
 		if (!validate_header_quick(pData, data_size))
@@ -10398,7 +10413,6 @@ namespace basist
 		}
 
 		m_ready_to_transcode = true;
-
 		return true;
 	}
 
@@ -11878,100 +11892,233 @@ namespace basist
 		pDst[2] |= temp[2];
 		pDst[3] |= temp[3];
 	}
 
-	const uint32_t ASTC_BLOCK_MODE_BITS = 11;
-	const uint32_t ASTC_PART_BITS = 2;
-	const uint32_t ASTC_CEM_BITS = 4;
-	const uint32_t ASTC_PARTITION_INDEX_BITS = 10;
-	const uint32_t ASTC_CCS_BITS = 2;
-
-	const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 };
-
-	bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode)
+	void pack_astc_block_weights( uint8_t* pDst_bytes, const uint8_t* pBlockWeights, int bits_per_weight, int blockNum )
 	{
-		assert(uastc_mode < TOTAL_UASTC_MODES);
-		uint8_t* pDst_bytes = reinterpret_cast<uint8_t*>(pDst);
-
-		const int total_weights = pBlock->m_dual_plane ? 32 : 16;
-
-		// Set mode bits - see Table 146-147
-		uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode];
-		pDst_bytes[0] = (uint8_t)mode;
-		pDst_bytes[1] = (uint8_t)(mode >> 8);
-
+		#if defined(BASISD_ASTC_SIMD)
+		#else // C
 		memset(pDst_bytes + 2, 0, 16 - 2);
+		#endif // C
 
-		int bit_pos = ASTC_BLOCK_MODE_BITS;
-
-		// We only support 1-5 bit weight indices
-		assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]);
-		const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0];
-
-		// See table 143 - PART
-		astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS);
-
-		if (pBlock->m_subsets == 1)
-			astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS);
-		else
-		{
-			// See table 145
-			astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS);
-
-			// Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM
-			astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2);
-		}
-
-		if (pBlock->m_dual_plane)
-		{
-			const int total_weight_bits = total_weights * bits_per_weight;
-
-			// See Illegal Encodings 23.24
-			// https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings
-			assert((total_weight_bits >= 24) && (total_weight_bits <= 96));
-
-			int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS;
-			astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS);
-		}
-
-		const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets;
-		assert(num_cem_pairs <= 9);
-
-		astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]);
-
-		// Write the weight bits in reverse bit order.
 		switch (bits_per_weight)
 		{
 		case 1:
 		{
-			const uint32_t N = 1;
-			for (int i = 0; i < total_weights; i++)
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			static const uint8_t s_reverse_bits1[16] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+			uint8x16_t revmask1 = vld1q_u8( s_reverse_bits1 );
+			uint8x8_t rev8;
+			if ( blockNum == 0 )
+			{
+				uint8x16_t src1 = vld1q_u8( pBlockWeights );   // load 16x1
+				uint8x16_t test1 = vceqq_u8( src1, vdupq_n_u8(0) );   // mask if x == 0
+				uint8x16_t rev1 = vandq_u8( revmask1, vmvnq_u8(test1) );   // final byte shifted value
+				uint8x8_t rev2 = vpadd_u8( vget_high_u8(rev1), vget_low_u8(rev1) );   // swap(hi,lo) & merge 2 bits
+				uint8x8_t rev4 = vpadd_u8( rev2, vdup_n_u8(0) );   // merge 4 bits
+				rev8 = vpadd_u8( rev4, vdup_n_u8(0) );   // merge 8 bits
+				rev8 = vext_u8( vdup_n_u8(0), rev8, 6 );   // shl64(6)
+			}
+			else
+			{
+				uint8x16_t src1_0 = vld1q_u8( pBlockWeights );   // load 16x1
+				uint8x16_t src1_1 = vld1q_u8( pBlockWeights + 16 );   // load 16x1
+				uint8x16_t test1_0 = vceqq_u8( src1_0, vdupq_n_u8(0) );   // mask if x == 0
+				uint8x16_t test1_1 = vceqq_u8( src1_1, vdupq_n_u8(0) );   // mask if x == 0
+				uint8x16_t rev1_0 = vandq_u8( revmask1, vmvnq_u8(test1_0) );   // final byte shifted value
+				uint8x16_t rev1_1 = vandq_u8( revmask1, vmvnq_u8(test1_1) );   // final byte shifted value
+				uint8x8_t rev2_0 = vpadd_u8( vget_high_u8(rev1_0), vget_low_u8(rev1_0) );   // swap(hi,lo) & merge 2 bits
+				uint8x8_t rev2_1 = vpadd_u8( vget_high_u8(rev1_1), vget_low_u8(rev1_1) );   // swap(hi,lo) & merge 2 bits
+				uint8x8_t rev4 = vpadd_u8( rev2_1, rev2_0 );   // swap(hi,lo) & merge 4 bits
+				rev8 = vpadd_u8( rev4, vdup_n_u8(0) );   // merge 8 bits
+				rev8 = vext_u8( vdup_n_u8(0), rev8, 4 );   // shl64(4)
+			}
+			vst1_u8( pDst_bytes + 8, rev8 );   // store [8,15]
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			static const uint8_t s_reverse_bits1[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+			__m128i bitsmask1 = _mm_loadu_si128( (const __m128i *) s_reverse_bits1 );
+			if ( blockNum == 0 )
+			{
+				__m128i src1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 );   // load 16x8
+				__m128i rev1 = _mm_shuffle_epi8( src1, bitsmask1 );   // byte reverse
+				uint32_t rev16 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1, 7 ) );   // move to high bit and get mask
+				*((uint32_t*)(pDst_bytes + 12)) = (rev16 << 16U);   // store [12,15]
+			}
+			else
+			{
+				__m128i src1_0 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 );   // load 16x8
+				__m128i src1_1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 1 );   // load 16x8
+				__m128i rev1_0 = _mm_shuffle_epi8( src1_0, bitsmask1 );   // byte reverse
+				__m128i rev1_1 = _mm_shuffle_epi8( src1_1, bitsmask1 );   // byte reverse
+				uint32_t rev16_0 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1_0, 7 ) );   // move to high bit and get mask
+				uint32_t rev16_1 = (uint32_t) _mm_movemask_epi8( _mm_slli_epi32( rev1_1, 7 ) );   // move to high bit and get mask
+				*((uint32_t*)(pDst_bytes + 12)) = (rev16_0 << 16U) | rev16_1;   // store [12,15]
+			}
+			*((uint32_t*)(pDst_bytes + 8)) = 0U;   // store [8,11]
+			*((uint64_t*)(pDst_bytes + 0)) = 0ULL;   // store [0,7]
+			#else
 			{
-				const uint32_t ofs = 128 - N - i;
-				assert((ofs >> 3) < 16);
-				pDst_bytes[ofs >> 3] |= (pBlock->m_weights[i] << (ofs & 7));
+				const uint32_t N = 1;
+				for (int i = 0; i < ((blockNum + 1) << 4); i++)
+				{
+					const uint32_t ofs = 128 - N - i;
+					assert((ofs >> 3) < 16);
+					pDst_bytes[ofs >> 3] |= (pBlockWeights[i] << (ofs & 7));
+				}
 			}
+			#endif
 			break;
 		}
 		case 2:
 		{
-			const uint32_t N = 2;
-			for (int i = 0; i < total_weights; i++)
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			if ( blockNum == 0 )
+			{
+				uint8x16_t src2 = vld1q_u8( pBlockWeights );
+				uint16x8_t src2hi = vshrq_n_u16( vreinterpretq_u16_u8(src2), 6 );   // shr16(8-2) to get in u16[2,3]
+				uint16x8_t src2lo = vandq_u16( vreinterpretq_u16_u8(src2), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x8_t src4lohi = vorrq_u16( src2hi, src2lo );   // 4bits in 8 u16[0,3]
+				uint8x8_t src4 = vqmovn_u16( src4lohi );   // 4bits in 8 u8[0,3]
+				uint16x4_t rev4hi = vshr_n_u16( vreinterpret_u16_u8(src4), 4 );   // shr16(8-4) to get in u16[4,7]
+				uint16x4_t rev4lo = vand_u16( vreinterpret_u16_u8(src4), vdup_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x4_t rev8lohi = vorr_u16( rev4hi, rev4lo );   // 8bits in 4 u16[0,7]
+				uint8x8_t rev8 = vqmovn_u16( vcombine_u16( rev8lohi, vdup_n_u16(0) ) );   // 8bits in 4 u8 (clear lower 32)
+				#if defined(__aarch64__)
+				uint8x8_t rev64 = vrev64_u8( vrbit_u8( rev8 ) );   // bit reverse
+				vst1_u8( pDst_bytes + 8, rev64 );   // store [8,15]
+				#else // !defined(__aarch64__)
+				uint32_t rev32 = vget_lane_u32( vreinterpret_u32_u8(rev8), 0 );   // get the 32 bits
+				rev32 = __rbit( rev32 );   // reverse
+				*((uint32_t*)(pDst_bytes + 12)) = rev32;   // store [12,15]
+				*((uint32_t*)(pDst_bytes + 8)) = 0U;   // store [8,11]
+				#endif // !defined(__aarch64__)
+			}
+			else
 			{
-				static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 };
-				const uint32_t ofs = 128 - N - (i * N);
-				assert((ofs >> 3) < 16);
-				pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlock->m_weights[i]] << (ofs & 7));
+				uint8x16_t src2_0 = vld1q_u8( pBlockWeights );
+				uint8x16_t src2_1 = vld1q_u8( pBlockWeights + 16 );
+				uint16x8_t src2hi_0 = vshrq_n_u16( vreinterpretq_u16_u8(src2_0), 6 );   // shr16(8-2) to get in u16[2,3]
+				uint16x8_t src2hi_1 = vshrq_n_u16( vreinterpretq_u16_u8(src2_1), 6 );   // shr16(8-2) to get in u16[2,3]
+				uint16x8_t src2lo_0 = vandq_u16( vreinterpretq_u16_u8(src2_0), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x8_t src2lo_1 = vandq_u16( vreinterpretq_u16_u8(src2_1), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+				uint16x8_t src4lohi_0 = vorrq_u16( src2hi_0, src2lo_0 );   // 4bits in 8 u16[0,3]
+				uint16x8_t src4lohi_1 = vorrq_u16( src2hi_1, src2lo_1 );   // 4bits in 8 u16[0,3]
+				uint16x4_t src4_0 = vreinterpret_u16_u8( vqmovn_u16( src4lohi_0 ) );   // 4bits in 8 u8[0,3]
+				uint16x4_t src4_1 = vreinterpret_u16_u8( vqmovn_u16( src4lohi_1 ) );   // 4bits in 8 u8[0,3]
+				uint16x8_t src4 = vcombine_u16( src4_0, src4_1 );   // 4bits in 8 u16[0,3]
+				uint16x8_t rev4hi = vshrq_n_u16( src4, 4 );   // shr16(8-4) to get in u16[4,7]
+				uint16x8_t rev4lo = vandq_u16( src4, vdupq_n_u16(0x00FF) );   // have to mask remainder
+				uint16x8_t rev8lohi = vorrq_u16( rev4hi, rev4lo );   // 8bits in 8 u16[0,7]
+				uint8x8_t rev8 = vqmovn_u16( rev8lohi );   // 8bits in 8 u8[0,7]
+				#if defined(__aarch64__)
+				uint8x8_t rev64 = vrev64_u8( vrbit_u8( rev8 ) );   // bit reverse
+				vst1_u8( pDst_bytes + 8, rev64 );   // store result
+				#else // !defined(__aarch64__)
+				uint64_t rev64 = vget_lane_u64( vreinterpret_u64_u8(rev8), 0 );   // get the 64 bits
+				rev64 = __rbitll( rev64 );   // bit reverse
+				*((uint64_t*)(pDst_bytes + 8)) = rev64;   // store [8,15]
+				#endif // !defined(__aarch64__)
+			}
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			__m128i bitsmask2 = _mm_cvtsi32_si128( 0x03010200 );   // { 0, 2, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+			if ( blockNum == 0 )
+			{
+				__m128i src8 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );
+				__m128i rev8 = _mm_shuffle_epi8( bitsmask2, src8 );   // bit reverse
+				__m128i rev8lo = _mm_slli_epi16( rev8, 10 );   // shl16(8+2) to get lo2 in u16[10,11]
+				__m128i rev8lohi = _mm_or_si128( rev8lo, rev8 );   // 4bits in 8 u16[8,11]
+				__m128i rev16lo = _mm_slli_epi32( rev8lohi, 20 );   // shl32(16+4) to get lo4 in u32[20,23]
+				__m128i rev16lohi = _mm_or_si128( rev8lohi, rev16lo );   // 8bits in 8 u32[16,23]
+				static const uint8_t s_reverse_bytes2[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0F, 0x0B, 0x07, 0x03 };
+				__m128i bytesmask2 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes2 );
+				__m128i rev32 = _mm_shuffle_epi8( rev16lohi, bytesmask2 );   // 32bits in u128[96,127]
+				_mm_store_si128( (__m128i*)pDst_bytes, rev32 );   // store [0,15]
+			}
+			else // assume only 2 blocks
+			{
+				__m128i src8_0 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 0 );
+				__m128i src8_1 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) + 1 );
+				__m128i rev8_0 = _mm_shuffle_epi8( bitsmask2, src8_0 );   // bit reverse
+				__m128i rev8_1 = _mm_shuffle_epi8( bitsmask2, src8_1 );   // bit reverse
+				__m128i rev8lo_0 = _mm_slli_epi16( rev8_0, 10 );   // shl16(8+2) to get lo2 in u16[10,11]
+				__m128i rev8lo_1 = _mm_slli_epi16( rev8_1, 10 );   // shl16(8+2) to get lo2 in u16[10,11]
+				__m128i rev8lohi_0 = _mm_or_si128( rev8lo_0, rev8_0 );   // 4bits in 8 u16[8,11]
+				__m128i rev8lohi_1 = _mm_or_si128( rev8lo_1, rev8_1 );   // 4bits in 8 u16[8,11]
+				__m128i rev16lo_0 = _mm_slli_epi32( rev8lohi_0, 20 );   // shl32(16+4) to get lo4 in u32[20,23]
+				__m128i rev16lo_1 = _mm_slli_epi32( rev8lohi_1, 20 );   // shl32(16+4) to get lo4 in u32[20,23]
+				__m128i rev16lohi_0 = _mm_or_si128( rev8lohi_0, rev16lo_0 );   // 8bits in 8 u32[16,23]
+				__m128i rev16lohi_1 = _mm_or_si128( rev8lohi_1, rev16lo_1 );   // 8bits in 8 u32[16,23]
+				// try to avoid these 2 instructions
+				__m128i rev32_0 = _mm_srli_epi32( rev16lohi_0, 24 );   // srl32(24) to get 8bits in u32[0,7]
+				__m128i rev32_1 = _mm_srli_epi32( rev16lohi_1, 24 );   // srl32(24) to get 8bits in u32[0,7]
+				__m128i rev64 = _mm_packs_epi32( rev32_1, rev32_0 );   // 8bits in u16[0,7]
+				static const uint8_t s_reverse_bytes2[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x06, 0x04, 0x02, 0x00, 0x0E, 0x0C, 0x0A, 0x08 };
+				__m128i bytesmask2 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes2 );
+				__m128i dst = _mm_shuffle_epi8( rev64, bytesmask2 );   // 64bits high byte in reverse order
+				_mm_store_si128( (__m128i*)pDst_bytes, dst );   // store [0,15]
+			}
+			#else // C
+			{
+				const uint32_t N = 2;
+				for (int i = 0; i < ((blockNum + 1) << 4); i++)
+				{
+					static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 };
+					const uint32_t ofs = 128 - N - (i * N);
+					assert((ofs >> 3) < 16);
+					pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlockWeights[i]] << (ofs & 7));
+				}
 			}
+			#endif // C
 			break;
 		}
-		case 3:
+		case 3: // 295 => 285
 		{
+			assert(blockNum == 0);
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			uint8x16_t src3 = vld1q_u8( pBlockWeights );
+			uint16x8_t src3hi = vshrq_n_u16( vreinterpretq_u16_u8(src3), 5 );   // shr16(8-3) to get hi3 in u16[3,5]
+			uint16x8_t src3lo = vandq_u16( vreinterpretq_u16_u8(src3), vdupq_n_u16(0x007) );   // have to mask remainder to avoid bit collision
+			uint16x8_t src3lohi = vorrq_u16( src3hi, src3lo );   // 6bits in 8 u16[0,5]
+			uint8x8_t src6 = vqmovn_u16( src3lohi );   // 6bits in 8 u8[0,5]
+			uint16x4_t src6hi = vshr_n_u16( vreinterpret_u16_u8(src6), 2 );   // shr16(8-2) to get hi6 in u16[6,11]
+			uint16x4_t src12 = vbsl_u16( vdup_n_u16(0x003F), vreinterpret_u16_u8(src6), src6hi );   // 12bits in 4 u16[0,11]
+			uint32x2_t src12hi = vshr_n_u32( vreinterpret_u32_u16(src12), 4 );   // shr32(16-12) to get in hi12 u32[12,23]
+			uint32x2_t src24 = vbsl_u32( vdup_n_u32(0x0FFF), vreinterpret_u32_u16(src12), src12hi );   // 24bits in 2 u32[0,23]
+			uint32_t src24lo = vget_lane_u32( src24, 0 );   // lo 8.24
+			uint32_t src24hi = vget_lane_u32( src24, 1 );   // hi 8.24
+			uint32_t rev24hi = __rbit( src24lo );   // hi rev 24.8
+			uint32_t rev24lo = __rbit( src24hi );   // lo rev 24.8
+			uint32_t rev32hi = (rev24hi) | (rev24lo >> 24U);   // [32,63]
+			uint32_t rev32lo = (rev24lo << 8U);   // [0,31]
+			*((uint32_t*)(pDst_bytes + 12)) = rev32hi;   // store [12,15]
+			*((uint32_t*)(pDst_bytes + 8)) = rev32lo;   // store [8,11]
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			static const uint8_t s_reverse_bits3[16] = { 0, 4, 2, 6, 1, 5, 3, 7, 0, 0, 0, 0, 0, 0, 0, 0 };
+			__m128i bitsmask3 = _mm_loadu_si128( (const __m128i *) s_reverse_bits3 );
+			__m128i src3 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );
+			__m128i rev3 = _mm_shuffle_epi8( bitsmask3, src3 );   // bit reverse
+			__m128i rev3lo = _mm_slli_epi16( rev3, 11 );   // shl16(8+3) to get lo3 in u16[11,13]
+			__m128i rev3hi = _mm_and_si128( rev3, _mm_set1_epi32(0x07000700) );   // have to mask remainder to avoid bit collision
+			__m128i rev6 = _mm_or_si128( rev3hi, rev3lo );   // 6bits in 8 u16[8,13] : BA,DC,FE,HG,JI,LK,NM,PO
+			__m128i rev6hi = _mm_srli_epi32( rev6, 24 );   // shr32(24) to get hi6[24,29] in u32[0,5]
+			__m128i rev6lo = _mm_srli_epi32( rev6, 2 );   // shr32(2) to get lo6[8,13] in u32[6,11]
+			__m128i rev12 = _mm_or_si128( rev6hi, rev6lo );   // 12bits in 4 u32[0,11] + garbage in u32[22,31]
+			__m128i rev24lo = _mm_slli_epi64( rev12, 44 );   // shl64(32+12) to get lo12 in u64[44,55]
+			__m128i rev24hi = _mm_and_si128( rev12, _mm_set1_epi32(0x0000FFFF) );   // clean garbage in u32[22,31]
+			__m128i rev24 = _mm_or_si128( rev24hi, rev24lo );   // 24bits in 2 u64[32,55]
+			static const uint8_t s_reverse_bytes3[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0C, 0x0D, 0x0E, 0x04, 0x05, 0x06 };
+			__m128i bytesmask3 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes3 );
+			__m128i rev48 = _mm_shuffle_epi8( rev24, bytesmask3 );   // 48bits in u128[80,127]
+			_mm_store_si128( (__m128i*)pDst_bytes, rev48 );   // store [0,15]
+			#else // C
 			const uint32_t N = 3;
-			for (int i = 0; i < total_weights; i++)
+			for (int i = 0; i < ((blockNum + 1) << 4); i++)
 			{
 				static const uint8_t s_reverse_bits3[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 				const uint32_t ofs = 128 - N - (i * N);
-				const uint32_t rev = s_reverse_bits3[pBlock->m_weights[i]] << (ofs & 7);
+				const uint32_t rev = s_reverse_bits3[pBlockWeights[i]] << (ofs & 7);
 				uint32_t index = ofs >> 3;
 				assert(index < 16);
 
@@ -11979,29 +12126,108 @@ namespace basist
 				if (index < 16)
 					pDst_bytes[index++] |= (rev >> 8);
 			}
+			#endif // C
 			break;
 		}
 		case 4:
 		{
-			const uint32_t N = 4;
-			for (int i = 0; i < total_weights; i++)
-			{
-				static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
-				const int ofs = 128 - N - (i * N);
-				assert(ofs >= 0 && (ofs >> 3) < 16);
-				pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlock->m_weights[i]] << (ofs & 7));
+			assert(blockNum == 0);
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			uint8x16_t src4 = vld1q_u8( pBlockWeights );
+			uint16x8_t src8hi = vshrq_n_u16( vreinterpretq_u16_u8(src4), 4 );   // shr16(8-4) to get hi4 in u16[4,7]
+			uint16x8_t src8lo = vandq_u16( vreinterpretq_u16_u8(src4), vdupq_n_u16(0x00FF) );   // have to mask remainder to avoid bit collision
+			uint16x8_t src8lohi = vorrq_u16( src8hi, src8lo );   // 8bits in 8 u16[0,7]
+//			uint16x8_t src8lohi = vbslq_u16( vdupq_n_u16(0x0F), vreinterpretq_u16_u8(src4), src8hi );   // slower than (and + or)
+			uint8x8_t src8 = vqmovn_u16( src8lohi );   // 8bits in 8 u8[0,7]
+			#if defined(__aarch64__)
+			uint8x8_t rev64 = vrev64_u8( vrbit_u8( src8 ) );   // bit reverse
+			vst1_u8( pDst_bytes + 8, rev64 );   // store [8,15]
+			#else // !defined(__aarch64__)
+			uint64_t rev64 = vget_lane_u64( vreinterpret_u64_u8(src8), 0 );   // get the 64 bits
+			rev64 = __rbitll( rev64 );   // bit reverse
+			*((uint64_t*)(pDst_bytes + 8)) = rev64;   // store [8,15]
+			#endif // !defined(__aarch64__)
+			vst1_u8( pDst_bytes + 0, vdup_n_u8(0) );   // store [0,7]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+			__m128i bitsmask4 = _mm_loadu_si128( (const __m128i *) s_reverse_bits4 );
+			__m128i src8 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );   // load 16x8
+			__m128i rev8 = _mm_shuffle_epi8( bitsmask4, src8 );   // bit reverse
+			__m128i rev8lo = _mm_slli_epi16( rev8, 12 );   // shl16(8+4) to get lo in u16[12,15]
+			__m128i rev8lohi = _mm_or_si128( rev8lo, rev8 );   // 8bits in 8 u16[8,15]
+			static const uint8_t s_reverse_bytes4[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 };
+			__m128i bytesmask4 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes4 );
+			__m128i rev64 = _mm_shuffle_epi8( rev8lohi, bytesmask4 );   // extract bytes in reverse order
+			_mm_store_si128( (__m128i*)pDst_bytes, rev64 );   // store [0,15]
+			#else // C
+			{
+				const uint32_t N = 4;
+				for (int i = 0; i < ((blockNum + 1) << 4); i++)
+				{
+					static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+					const int ofs = 128 - N - (i * N);
+					assert(ofs >= 0 && (ofs >> 3) < 16);
+					pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlockWeights[i]] << (ofs & 7));
+				}
 			}
+			#endif // C
 			break;
 		}
 		case 5:
 		{
+			assert(blockNum == 0);
+			#if defined(BASISD_ASTC_SIMD) && ( defined(__arm__) || defined(__aarch64__) )
+			uint8x16_t src5 = vld1q_u8( pBlockWeights );
+			uint16x8_t src5hi = vshrq_n_u16( vreinterpretq_u16_u8(src5), 3 );   // shiftRight 8-5 to get in u16[5,9]
+			uint16x8_t src10 = vbslq_u16( vdupq_n_u16(0x01F), vreinterpretq_u16_u8(src5), src5hi );   // 10bits in 8 u16[0,9]
+			uint32x4_t src10hi = vshrq_n_u32( vreinterpretq_u32_u16(src10), 6 );   // shiftRight 16-10 to get in u32[10,19]
+			uint32x4_t src20 = vbslq_u32( vdupq_n_u32(0x03FF), vreinterpretq_u32_u16(src10), src10hi );   // 20bits in 4 u32[0,19]
+			uint64x2_t src20hi = vshrq_n_u64( vreinterpretq_u64_u32(src20), 12 );   // shiftRight 32-20 to get in u64[20,39]
+			uint64x2_t src40 = vbslq_u64( vdupq_n_u64(0x0FFFFF), vreinterpretq_u64_u32(src20), src20hi );   // 40bits in 2 u64[0,39]
+			uint64x1_t src40hilo = vshl_n_u64( vget_high_u64(src40), 40 );   // [40..64]
+			uint64x1_t src40lo = vorr_u64( vget_low_u64(src40), src40hilo );   // [0..63]
+			uint64x1_t src40hi = vshr_n_u64( vget_high_u64(src40), 24 );   // [64..79]
+//			try to use vrev32_u8( vrbit_u8( rev8 ) )
+			uint32_t src80lo = vget_lane_u32( vreinterpret_u32_u64(src40lo), 0 );   // [0..31]
+			uint32_t src80mid = vget_lane_u32( vreinterpret_u32_u64(src40lo), 1 );   // [32..63]
+			uint32_t src80hi = vget_lane_u32( vreinterpret_u32_u64(src40hi), 0 );   // [64..79]
+			uint32_t rev80hi = __rbit( src80lo );   // [48..79]
+			uint32_t rev80mid = __rbit( src80mid );   // [16..47]
+			uint32_t rev80lo = __rbit( src80hi );   // [16][0..15]
+			*((uint32_t*)(pDst_bytes + 12)) = rev80hi;   // store [12,15]
+			*((uint32_t*)(pDst_bytes + 8)) = rev80mid;   // store [8,11]
+			*((uint32_t*)(pDst_bytes + 4)) = rev80lo;   // store [4,7]
+			*((uint32_t*)(pDst_bytes + 0)) = 0U;   // store [0,3]
+			#elif defined(BASISD_ASTC_SIMD) && defined(__SSSE3__)
+			__m128i src5 = _mm_loadu_si128( ((const __m128i *)pBlockWeights) );   // load 16x8
+			__m128i src4 = _mm_and_si128( _mm_set1_epi32(0x0F0F0F0F), src5 );   // get 4 bits for shuffle
+			__m128i src54 = _mm_andnot_si128( _mm_set1_epi32(0x0F0F0F0F), src5 );   // keep 5th bit
+			static const uint8_t s_reverse_bits5[16] = { 0x00, 0x10, 0x08, 0x18, 0x04, 0x14, 0x0C, 0x1C, 0x02, 0x12, 0x0A, 0x1A, 0x06, 0x16, 0x0E, 0x1E };
+			__m128i bitsmask5 = _mm_loadu_si128( (const __m128i *) s_reverse_bits5 );
+			__m128i rev4 = _mm_shuffle_epi8( bitsmask5, src4 );   // bit reverse
+			__m128i rev0 = _mm_srli_epi16( src54, 4 );   // shr16(4) because shl8 doesn't exist => put bit[0] at right position
+			__m128i rev5 = _mm_or_si128( rev0, rev4 );   // 5bits in 16 u8[0,4]
+			__m128i rev5lo = _mm_slli_epi16( rev5, 8 );   // shl16(8) to get lo5[0,4] in u16[8,12]
+			__m128i rev5hi = _mm_srli_epi16( rev5, 5 );   // shr16(5) to get hi5[8,12] in u16[3,7]
+			__m128i rev10 = _mm_or_si128( rev5hi, rev5lo );   // 10bits in 8 u16[3,12]
+			__m128i rev10lo = _mm_slli_epi32( rev10, 13 );   // shl32(13) to get lo10[3,12] in u32[16,25]
+			__m128i rev10hi = _mm_srli_epi32( rev10, 13 );   // shr32(13) to get hi10[35,44] in u32[6,15]
+			__m128i rev20 = _mm_or_si128( rev10hi, rev10lo );   // 20bits in 4 u32[6,25]
+			__m128i rev20lo = _mm_slli_epi64( rev20, 14 );   // shl64(14) to get lo20[6,25] in u64[20,39] garbage in [52,63]
+			__m128i rev20hi = _mm_srli_epi64( rev20, 38 );   // shr64(38) to get hi20[38,57] in u64[0,19]
+			__m128i rev40 = _mm_or_si128( rev20hi, rev20lo );   // 40bits in 2 u64[0,39] garbage in [52,63]
+			static const uint8_t s_reverse_bytes5[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x00, 0x01, 0x02, 0x03, 0x04 };
+			__m128i bytesmask5 = _mm_loadu_si128( (const __m128i *) s_reverse_bytes5 );
+			__m128i rev80 = _mm_shuffle_epi8( rev40, bytesmask5 );   // swizzle bytes to store (can do a single store if done first)
+			_mm_store_si128( (__m128i*)pDst_bytes, rev80 );   // store [0,15]
+			#else // C
 			const uint32_t N = 5;
-			for (int i = 0; i < total_weights; i++)
+			for (int i = 0; i < ((blockNum + 1) << 4); i++)
 			{
 				static const uint8_t s_reverse_bits5[32] = { 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31 };
 				const uint32_t ofs = 128 - N - (i * N);
-				const uint32_t rev = s_reverse_bits5[pBlock->m_weights[i]] << (ofs & 7);
+				const uint32_t rev = s_reverse_bits5[pBlockWeights[i]] << (ofs & 7);
 				uint32_t index = ofs >> 3;
 				assert(index < 16);
 
@@ -12009,13 +12235,75 @@ namespace basist
 				if (index < 16)
 					pDst_bytes[index++] |= (rev >> 8);
 			}
-
+			#endif // C
 			break;
 		}
 		default:
 			assert(0);
 			break;
 		}
+	}
+
+	const uint32_t ASTC_BLOCK_MODE_BITS = 11;
+	const uint32_t ASTC_PART_BITS = 2;
+	const uint32_t ASTC_CEM_BITS = 4;
+	const uint32_t ASTC_PARTITION_INDEX_BITS = 10;
+	const uint32_t ASTC_CCS_BITS = 2;
+
+	const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 };
+
+	bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode)
+	{
+		assert(uastc_mode < TOTAL_UASTC_MODES);
+		uint8_t* pDst_bytes = reinterpret_cast<uint8_t*>(pDst);
+
+		const int total_weights = pBlock->m_dual_plane ? 32 : 16;
+		const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0];
+
+		// will clear pDst_bytes[16]
+		pack_astc_block_weights( pDst_bytes, pBlock->m_weights, bits_per_weight, pBlock->m_dual_plane ? 1 : 0 );
+
+		// Set mode bits - see Table 146-147
+		uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode];
+		pDst_bytes[0] = (uint8_t)mode;
+		pDst_bytes[1] = (uint8_t)(mode >> 8);
+
+		int bit_pos = ASTC_BLOCK_MODE_BITS;
+
+		// We only support 1-5 bit weight indices
+		assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]);
+
+		// See table 143 - PART
+		astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS);
+
+		if (pBlock->m_subsets == 1)
+			astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS);
+		else
+		{
+			// See table 145
+			astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS);
+
+			// Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM
+			astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2);
+		}
+
+		if (pBlock->m_dual_plane)
+		{
+			const int total_weight_bits = total_weights * bits_per_weight;
+
+			// See Illegal Encodings 23.24
+			// https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings
+			assert((total_weight_bits >= 24) && (total_weight_bits <= 96));
+
+			int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS;
+			astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS);
+		}
+
+		const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets;
+		assert(num_cem_pairs <= 9);
+
+		astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]);
+
 		return true;
 	}
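
A minimal scalar cross-check for the new pack_astc_block_weights() paths is sketched below; it is not part of the patch. It assumes the function can be forward-declared from the basist namespace and linked against basisu_transcoder.cpp (it is an internal function, so that declaration is an assumption, not an existing header API), and that the build selects whichever BASISD_ASTC_SIMD branch is being tested. The reference packer mirrors the C fallback above: reverse each weight's bits, then OR the weight into the 128-bit block downward from bit 127. The destination buffer is aligned to 16 bytes because the SSSE3 paths use _mm_store_si128.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Assumed forward declaration of the internal packer defined in basisu_transcoder.cpp.
namespace basist { void pack_astc_block_weights(uint8_t* pDst_bytes, const uint8_t* pBlockWeights, int bits_per_weight, int blockNum); }

// Scalar reference: reverse each weight's bits, OR it in from bit 127 downward (same math as the C fallback).
static void ref_pack_weights(uint8_t* pDst, const uint8_t* pW, int bits, int blockNum)
{
	memset(pDst, 0, 16);
	const int total_weights = (blockNum + 1) * 16; // 16 weights, or 32 for dual-plane
	for (int i = 0; i < total_weights; i++)
	{
		uint32_t rev = 0;
		for (int b = 0; b < bits; b++)
			rev |= ((pW[i] >> b) & 1U) << (bits - 1 - b);
		const int ofs = 128 - bits - (i * bits);
		const uint32_t shifted = rev << (ofs & 7);
		pDst[ofs >> 3] |= (uint8_t)shifted;
		if ((ofs >> 3) + 1 < 16)
			pDst[(ofs >> 3) + 1] |= (uint8_t)(shifted >> 8); // spill into the next byte for 3/5-bit weights
	}
}

int main()
{
	for (int bits = 1; bits <= 5; bits++)
	{
		// Dual-plane (blockNum == 1) is only exercised for 1/2-bit weights, matching the asserts above.
		for (int blockNum = 0; blockNum <= ((bits <= 2) ? 1 : 0); blockNum++)
		{
			uint8_t w[32], expected[16];
			alignas(16) uint8_t actual[16];
			for (int i = 0; i < 32; i++)
				w[i] = (uint8_t)(rand() & ((1 << bits) - 1));
			ref_pack_weights(expected, w, bits, blockNum);
			memset(actual, 0, 16); // the C fallback leaves bytes [0,1] to the caller
			basist::pack_astc_block_weights(actual, w, bits, blockNum);
			if (memcmp(expected, actual, 16) != 0)
				printf("mismatch: bits=%d blockNum=%d\n", bits, blockNum);
		}
	}
	printf("done\n");
	return 0;
}

Running this against both a scalar build and a NEON or SSSE3 build should report no mismatches for any of the weight widths the patch handles.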