Skip to content

Commit

Permalink
Integrate lossless forward transform intrinsics
Browse files Browse the repository at this point in the history
Integrating the intrinsics requires adding WHT entries to the 1D transform
function tables, so the same is done for inverse_transform_add and the separate lossless code paths are merged into the general ones.
  • Loading branch information
barrbrain committed Oct 20, 2023
1 parent 620d541 commit 05ec6f4
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 153 deletions.
31 changes: 13 additions & 18 deletions src/asm/aarch64/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,25 @@ use crate::{Pixel, PixelType};
use crate::asm::shared::transform::inverse::*;
use crate::asm::shared::transform::*;

#[inline]
pub fn inverse_transform_add_lossless<T: Pixel>(
pub fn inverse_transform_add<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
bd: usize, cpu: CpuFeatureLevel,
tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
) {
match T::type_enum() {
PixelType::U8 => {
if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] {
return call_inverse_func(func, input, output, eob, 4, 4, bd);
if tx_type == TxType::WHT_WHT {
debug_assert!(tx_size == TxSize::TX_4X4);
match T::type_enum() {
PixelType::U8 => {
if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] {
return call_inverse_func(func, input, output, eob, 4, 4, bd);
}
}
}
PixelType::U16 => {
if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] {
return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd);
PixelType::U16 => {
if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] {
return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd);
}
}
}
}
rust::inverse_transform_add_lossless(input, output, eob, bd, cpu);
}

pub fn inverse_transform_add<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
) {
match T::type_enum() {
PixelType::U8 => {
if let Some(func) = INV_TXFM_FNS[cpu.as_index()]
Expand Down
4 changes: 2 additions & 2 deletions src/asm/x86/transform/forward.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

pub use crate::transform::forward::rust::forward_transform_lossless;

type TxfmFuncI32X8 = unsafe fn(&mut [I32X8]);

#[inline]
Expand All @@ -41,6 +39,7 @@ fn get_func_i32x8(t: TxfmType) -> TxfmFuncI32X8 {
Identity8 => fidentity,
Identity16 => fidentity,
Identity32 => fidentity,
WHT4 => fwht4,
}
}

Expand Down Expand Up @@ -509,6 +508,7 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
/// # Panics
///
/// - If called with an invalid combination of `tx_size` and `tx_type`
#[inline]
pub fn forward_transform<T: Coefficient>(
input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
Expand Down
31 changes: 13 additions & 18 deletions src/asm/x86/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,25 @@ use crate::{Pixel, PixelType};
use crate::asm::shared::transform::inverse::*;
use crate::asm::shared::transform::*;

#[inline]
pub fn inverse_transform_add_lossless<T: Pixel>(
pub fn inverse_transform_add<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
bd: usize, cpu: CpuFeatureLevel,
tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
) {
match T::type_enum() {
PixelType::U8 => {
if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] {
return call_inverse_func(func, input, output, eob, 4, 4, bd);
if tx_type == TxType::WHT_WHT {
debug_assert!(tx_size == TxSize::TX_4X4);
match T::type_enum() {
PixelType::U8 => {
if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] {
return call_inverse_func(func, input, output, eob, 4, 4, bd);
}

Check warning on line 29 in src/asm/x86/transform/inverse.rs

View check run for this annotation

Codecov / codecov/patch

src/asm/x86/transform/inverse.rs#L29

Added line #L29 was not covered by tests
}
}
PixelType::U16 => {
if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] {
return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd);
PixelType::U16 => {
if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] {
return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd);
}

Check warning on line 34 in src/asm/x86/transform/inverse.rs

View check run for this annotation

Codecov / codecov/patch

src/asm/x86/transform/inverse.rs#L34

Added line #L34 was not covered by tests
}
}
}
rust::inverse_transform_add_lossless(input, output, eob, bd, cpu);
}

pub fn inverse_transform_add<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
) {
match T::type_enum() {
PixelType::U8 => {
if let Some(func) = INV_TXFM_FNS[cpu.as_index()]
Expand Down
31 changes: 1 addition & 30 deletions src/transform/forward.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,36 +92,7 @@ pub mod rust {
Identity8 => fidentity,
Identity16 => fidentity,
Identity32 => fidentity,
}
}

pub fn forward_transform_lossless<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, _cpu: CpuFeatureLevel,
) {
let mut tmp = [0i32; 4 * 4];
let buf = &mut tmp[..];
let mut col_coeffs_backing = [0i32; 4];
let col_coeffs = &mut col_coeffs_backing[..];

// Columns
for c in 0..4 {
for r in 0..4 {
col_coeffs[r] = (input[r * stride + c]).into();
}
fwht4(col_coeffs);
for r in 0..4 {
buf[r * 4 + c] = col_coeffs[r];
}
}

// Rows
for r in 0..4 {
let row_coeffs = &mut buf[r * 4..];
fwht4(row_coeffs);
av1_round_shift_array(row_coeffs, 4, -2);
for c in 0..4 {
output[c * 4 + r] = T::cast_from(row_coeffs[c]);
}
WHT4 => fwht4,
}
}

Expand Down
22 changes: 16 additions & 6 deletions src/transform/forward_shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ const FWD_SHIFT_32X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X64: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_64X16: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];

const FWD_SHIFT_4X4_WHT: TxfmShift = [0, 0, 2];

pub const FWD_TXFM_SHIFT_LS: [TxfmShifts; TxSize::TX_SIZES_ALL] = [
FWD_SHIFT_4X4,
FWD_SHIFT_8X8,
Expand Down Expand Up @@ -75,31 +77,35 @@ pub enum TxfmType {
Identity8,
Identity16,
Identity32,
WHT4,
}

impl TxfmType {
const TX_TYPES_1D: usize = 4;
const TX_TYPES_1D: usize = 5;
const AV1_TXFM_TYPE_LS: [[Option<TxfmType>; Self::TX_TYPES_1D]; 5] = [
[
Some(TxfmType::DCT4),
Some(TxfmType::ADST4),
Some(TxfmType::ADST4),
Some(TxfmType::Identity4),
Some(TxfmType::WHT4),
],
[
Some(TxfmType::DCT8),
Some(TxfmType::ADST8),
Some(TxfmType::ADST8),
Some(TxfmType::Identity8),
None,
],
[
Some(TxfmType::DCT16),
Some(TxfmType::ADST16),
Some(TxfmType::ADST16),
Some(TxfmType::Identity16),
None,
],
[Some(TxfmType::DCT32), None, None, Some(TxfmType::Identity32)],
[Some(TxfmType::DCT64), None, None, None],
[Some(TxfmType::DCT32), None, None, Some(TxfmType::Identity32), None],
[Some(TxfmType::DCT64), None, None, None, None],
];
}

Expand Down Expand Up @@ -129,12 +135,17 @@ impl Txfm2DFlipCfg {
let txfm_type_row =
TxfmType::AV1_TXFM_TYPE_LS[txw_idx][tx_type_1d_row as usize].unwrap();
let (ud_flip, lr_flip) = Self::get_flip_cfg(tx_type);
let shift = if tx_type == TxType::WHT_WHT {
FWD_SHIFT_4X4_WHT
} else {
FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2]
};

Txfm2DFlipCfg {
tx_size,
ud_flip,
lr_flip,
shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2],
shift,
txfm_type_col,
txfm_type_row,
}
Expand All @@ -145,7 +156,7 @@ impl Txfm2DFlipCfg {
use self::TxType::*;
match tx_type {
DCT_DCT | ADST_DCT | DCT_ADST | ADST_ADST | IDTX | V_DCT | H_DCT
| V_ADST | H_ADST => (false, false),
| V_ADST | H_ADST | WHT_WHT => (false, false),
FLIPADST_DCT | FLIPADST_ADST | V_FLIPADST => (true, false),
DCT_FLIPADST | ADST_FLIPADST | H_FLIPADST => (false, true),
FLIPADST_FLIPADST => (true, true),
Expand Down Expand Up @@ -1728,7 +1739,6 @@ $($s)* fn daala_fdct64<T: TxOperations>(coeffs: &mut [T]) {
#[$m]
$($s)* fn fidentity<T: TxOperations>(_coeffs: &mut [T]) {}

#[allow(unused)]
#[$m]
$($s)* fn fwht4<T: TxOperations>(coeffs: &mut [T]) {
assert!(coeffs.len() >= 4);
Expand Down
63 changes: 14 additions & 49 deletions src/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use super::TxType;
/// # Panics
///
/// - If `input` or `output` have fewer than 4 items.
pub fn av1_iwht4(input: &[i32], output: &mut [i32]) {
pub fn av1_iwht4(input: &[i32], output: &mut [i32], _range: usize) {

Check warning on line 36 in src/transform/inverse.rs

View check run for this annotation

Codecov / codecov/patch

src/transform/inverse.rs#L36

Added line #L36 was not covered by tests
assert!(input.len() >= 4);
assert!(output.len() >= 4);

Expand Down Expand Up @@ -1591,7 +1591,7 @@ fn av1_idct64(input: &[i32], output: &mut [i32], range: usize) {

type InvTxfmFn = fn(input: &[i32], output: &mut [i32], range: usize);

static INV_TXFM_FNS: [[InvTxfmFn; 5]; 4] = [
static INV_TXFM_FNS: [[InvTxfmFn; 5]; 5] = [
[av1_idct4, av1_idct8, av1_idct16, av1_idct32, av1_idct64],
[
av1_iadst4,
Expand All @@ -1614,6 +1614,13 @@ static INV_TXFM_FNS: [[InvTxfmFn; 5]; 4] = [
av1_iidentity32,
|_, _, _| unimplemented!(),
],
[
av1_iwht4,
|_, _, _| unimplemented!(),
|_, _, _| unimplemented!(),
|_, _, _| unimplemented!(),
|_, _, _| unimplemented!(),

Check warning on line 1622 in src/transform/inverse.rs

View check run for this annotation

Codecov / codecov/patch

src/transform/inverse.rs#L1619-L1622

Added lines #L1619 - L1622 were not covered by tests
],
];

pub(crate) mod rust {
Expand All @@ -1624,52 +1631,6 @@ pub(crate) mod rust {
use simd_helpers::cold_for_target_arch;
use std::cmp;

#[cold_for_target_arch("x86_64", "aarch64")]
pub fn inverse_transform_add_lossless<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize,
_bd: usize, _cpu: CpuFeatureLevel,
) {
// <https://aomediacodec.github.io/av1-spec/#2d-inverse-transform-process>
let input: &[T::Coeff] = &input[..4 * 4];
let mut buffer = [0i32; 4 * 4];

// perform inv txfm on every row
for (r, buffer_slice) in buffer.chunks_exact_mut(4).enumerate() {
let mut temp_in: [i32; 4] = [0; 4];
for (val, transposed) in input[r..]
.iter()
.map(|a| i32::cast_from(*a))
.step_by(4)
.zip(temp_in.iter_mut())
{
*transposed = val >> 2;
}
av1_iwht4(&temp_in, buffer_slice);
}

// perform inv txfm on every col
for c in 0..4 {
let mut temp_in: [i32; 4] = [0; 4];
let mut temp_out: [i32; 4] = [0; 4];
for (val, transposed) in buffer[c..]
.iter()
.map(|a| i32::cast_from(*a))
.step_by(4)
.zip(temp_in.iter_mut())
{
*transposed = val;
}
av1_iwht4(&temp_in, &mut temp_out);
for (temp, out) in temp_out
.iter()
.zip(output.rows_iter_mut().map(|row| &mut row[c]).take(4))
{
let v = i32::cast_from(*out) + *temp;
*out = T::cast_from(v);
}
}
}

#[cold_for_target_arch("x86_64", "aarch64")]
pub fn inverse_transform_add<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize,
Expand All @@ -1686,6 +1647,7 @@ pub(crate) mod rust {
let mut buffer = vec![0i32; width * height].into_boxed_slice();
let rect_type = get_rect_tx_log_ratio(width, height);
let tx_types_1d = get_1d_tx_types(tx_type);
let lossless = tx_type == TxType::WHT_WHT;

// perform inv txfm on every row
let range = bd + 8;
Expand All @@ -1705,6 +1667,8 @@ pub(crate) mod rust {
{
let val = if rect_type.abs() == 1 {
round_shift(raw * INV_SQRT2, SQRT2_BITS)
} else if lossless {
raw >> 2

Check warning on line 1671 in src/transform/inverse.rs

View check run for this annotation

Codecov / codecov/patch

src/transform/inverse.rs#L1671

Added line #L1671 was not covered by tests
} else {
raw
};
Expand Down Expand Up @@ -1733,7 +1697,8 @@ pub(crate) mod rust {
.zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height))
{
let v: i32 = (*out).as_();
let v = clamp(v + round_shift(*temp, 4), 0, (1 << bd) - 1);
let r = if lossless { *temp } else { round_shift(*temp, 4) };
let v = clamp(v + r, 0, (1 << bd) - 1);
*out = T::cast_from(v);
}
}
Expand Down
Loading

0 comments on commit 05ec6f4

Please sign in to comment.