use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;
impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzles a vector of bytes according to a runtime index vector.
    ///
    /// Lane `i` of the result is `self[idxs[i]]` when `idxs[i] < N`, and `0`
    /// when the index is out of bounds.
    ///
    /// The implementation is selected at compile time from the target features
    /// enabled for this crate. When no arm matches the lane count and the
    /// enabled features, a portable per-lane loop is used instead.
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(target_arch = "aarch64")]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: each intrinsic arm is only compiled when its target feature is
        // enabled, and matching on `N` ensures the intrinsic's vector type has the
        // same size as `Simd<u8, N>`, which is what `transize`/`transize_raw` need.
        unsafe {
            match N {
                // The cfg mirrors the conditions under which `vtbl1_u8` is imported
                // above, so the arm can never reference a name that is not in scope.
                #[cfg(all(
                    any(
                        target_arch = "aarch64",
                        all(target_arch = "arm", target_feature = "v7")
                    ),
                    target_feature = "neon"
                ))]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                // `wasm` is only imported on wasm32, so gate on the architecture too.
                #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
                16 => transize(vqtbl1q_u8, self, idxs),
                // Used for every AVX2-capable build, including those that also
                // enable AVX-512 VBMI: `avx2_pshufb` already zeroes out-of-range
                // lanes itself, hence `transize_raw`.
                #[cfg(target_feature = "avx2")]
                32 => transize_raw(avx2_pshufb, self, idxs),
                // FIXME: there is deliberately no AVX-512 VBMI arm. A cfg of the form
                // `target_feature = "avx512vl,avx512vbmi"` never matches (each
                // feature must be tested separately), and a correct arm cannot
                // simply hand `_mm256_permutexvar_epi8` to `transize`: the
                // intrinsic takes the index vector as its first operand and does
                // not zero lanes based on the index's high bit, so out-of-range
                // lanes would need explicit masking.
                _ => {
                    // Portable fallback: lanes start at 0 and are only overwritten
                    // for in-bounds indices.
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        };
                    }
                    array.into()
                }
            }
        }
    }
}
/// Full-width byte shuffle for 32-lane vectors built from AVX2 operations.
///
/// Lane `i` of the result is `bytes[idxs[i]]` when `idxs[i] < 32`, and `0`
/// otherwise. It is assembled from two in-lane byte shuffles (one over a vector
/// holding the low half of `bytes` twice, one over the high half twice) that
/// are then blended by comparing the indices against 16 and 32.
///
/// # Safety
/// The function is compiled with the `avx2` target feature enabled, so the
/// caller must ensure the running CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
use crate::simd::SimdPartialOrd;
#[cfg(target_arch = "x86")]
use core::arch::x86;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as x86;
use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
// Thresholds for the blend: indices below `mid` (16) address the low half of
// `bytes`, indices below `high` (32) are in bounds at all.
let mid = Simd::splat(16u8);
let high = mid + mid;
// SAFETY: the caller guarantees AVX2 is available (see `# Safety`).
// NOTE(review): the interleaving of permutes and selects below may be
// deliberate for instruction scheduling; benchmark before reordering.
unsafe {
// Control 0x11 places the upper 128 bits of `bytes` in both halves.
let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
// The in-lane shuffle only uses the low 4 index bits, so with the upper half
// duplicated an index in 16..32 picks `bytes[index]`.
let hi_shuf = Simd::from(avx2_half_pshufb(
hihi, idxs.into(), ));
// Keep the high-half result for indices below 32; everything else becomes 0.
let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
// Control 0x00 places the lower 128 bits of `bytes` in both halves.
let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
// Indices below 16 take the low-half result, overriding the previous step.
let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
compose
}
}
/// Runs an architecture-specific two-operand shuffle `f` on `Simd<u8, N>`
/// operands by reinterpreting them as `f`'s native vector type `T`.
///
/// The indices are first passed through `zeroing_idxs`. The byte vector and
/// the adjusted indices are then bit-copied into `T`, handed to `f` in that
/// order, and the result is bit-copied back into a `Simd<u8, N>`.
///
/// # Safety
/// `T` and `Simd<u8, N>` must have the same size, since values are converted
/// with `mem::transmute_copy` in both directions. The caller must also uphold
/// whatever `f` itself requires.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let masked = zeroing_idxs(idxs);
    // SAFETY: the caller guarantees that `T` and `Simd<u8, N>` agree in size and
    // that `f` may be called.
    unsafe {
        let table: T = mem::transmute_copy(&bytes);
        let selectors: T = mem::transmute_copy(&masked);
        let shuffled: T = f(table, selectors);
        mem::transmute_copy(&shuffled)
    }
}
/// Adjusts swizzle indices for the current architecture before they are handed
/// to a native shuffle.
///
/// On x86 and x86_64 every index that is not below `N` is replaced with
/// `u8::MAX`; in-range indices are kept. On all other architectures the
/// indices are returned unchanged.
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let idxs = {
        use crate::simd::SimdPartialOrd;
        let limit: Simd<u8, N> = Simd::splat(N as u8);
        let out_of_range: Simd<u8, N> = Simd::splat(u8::MAX);
        let in_range = idxs.simd_lt(limit);
        in_range.select(idxs, out_of_range)
    };
    idxs
}
/// Like `transize`, but passes `idxs` to `f` untouched, with no index
/// adjustment beforehand.
///
/// Both operands are bit-copied into `f`'s native vector type `T`, `f` is
/// called with the byte vector first and the indices second, and the result is
/// bit-copied back into a `Simd<u8, N>`.
///
/// # Safety
/// `T` and `Simd<u8, N>` must have the same size, since values are converted
/// with `mem::transmute_copy` in both directions. The caller must also uphold
/// whatever `f` itself requires.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize_raw<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: the caller guarantees that `T` and `Simd<u8, N>` agree in size and
    // that `f` may be called.
    unsafe {
        let table: T = mem::transmute_copy(&bytes);
        let selectors: T = mem::transmute_copy(&idxs);
        let shuffled: T = f(table, selectors);
        mem::transmute_copy(&shuffled)
    }
}