use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    ///
    /// Lane `i` of the result is `self[idxs[i]]` when `idxs[i] < N`.
    /// Indices that are "out of bounds" select 0 instead.
    ///
    /// Note that the current implementation is selected during build-time
    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    ///
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        // Most of the imports below are unused for any given target, and the
        // `unsafe` block is unnecessary when only the scalar fallback remains.
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(target_arch = "aarch64")]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Every intrinsic is only named in an arm whose `cfg` guarantees
        // the required target feature, and each arm only runs when `N` matches
        // the intrinsic's vector width, so the `transize` size contract holds.
        unsafe {
            match N {
                // Gate on exactly the configurations that import `vtbl1_u8` above;
                // gating on "neon" alone would reference an unimported name on
                // any other target that reports the feature.
                #[cfg(all(
                    any(
                        target_arch = "aarch64",
                        all(target_arch = "arm", target_feature = "v7")
                    ),
                    target_feature = "neon"
                ))]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize_raw(avx2_pshufb, self, idxs),
                // `target_feature` takes a single feature name, so requiring two
                // features needs `all(...)`; a comma-joined string never matches.
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => {
                    // Unlike vpshufb, vpermb takes the index vector first, only looks
                    // at the low 5 bits of each index, and never zeroes a lane by
                    // itself. Build an explicit in-bounds mask and use the
                    // zero-masking form to get the OOB-is-0 semantics.
                    let vpermb_zeroing =
                        |bytes: x86::__m256i, idxs: x86::__m256i| -> x86::__m256i {
                            let in_bounds = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
                                idxs,
                                Simd::<u8, 32>::splat(32).into(),
                            );
                            x86::_mm256_maskz_permutexvar_epi8(in_bounds, idxs, bytes)
                        };
                    // The closure handles out-of-bounds indices itself, so skip
                    // the implicit `zeroing_idxs` pass.
                    transize_raw(vpermb_zeroing, self, idxs)
                }
                // Notable absence: avx512bw shuffle
                // If avx512bw is available, odds of avx512vbmi are good
                //
                // FIXME: initial AVX512VBMI variant didn't actually pass muster
                // (any revival needs the same index-first argument order and
                // explicit zero-masking as the 32-lane arm above)
                // #[cfg(target_feature = "avx512vbmi")]
                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
                _ => {
                    // Portable fallback: lanes start at 0 and are only overwritten
                    // when their index is in bounds.
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        };
                    }
                    array.into()
                }
            }
        }
    }
}

/// "vpshufb like it was meant to be" on AVX2
///
/// Selects bytes from all 32 lanes of `bytes` using `idxs`; any index that is
/// not below 32 yields 0 in the corresponding output lane.
///
/// # Safety
/// This requires AVX2 to work
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    // Thresholds for the two compose steps: 16 splits the low/high halves,
    // 32 (= 16 + 16) is the first out-of-bounds index.
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this

        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}

/// Sets up a call to an architecture-specific function, and in doing so
/// persuades rustc that everything is the correct size. Which it is.
///
/// This would not be needed if one could convince Rust that, by matching on
/// `N`, `N` is that value, and thus it would be valid to substitute e.g. 16.
///
/// The indices are passed through `zeroing_idxs` before `f` is invoked.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality:
/// `T` must have the same size as `Simd<u8, N>`, and the caller must uphold
/// whatever `f` itself requires.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let idxs = zeroing_idxs(idxs);
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe {
        // Reinterpret both operands as the vendor vector type...
        let table: T = mem::transmute_copy(&bytes);
        let selectors: T = mem::transmute_copy(&idxs);
        // ...run the vendor shuffle, and reinterpret the result back.
        let shuffled: T = f(table, selectors);
        mem::transmute_copy(&shuffled)
    }
}

/// Make indices that yield 0 for this architecture
///
/// On x86 and x86_64 every index that is not below `N` is replaced with
/// `u8::MAX`; on every other architecture the indices are returned unchanged.
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // On x86, make sure the top bit is set.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let idxs = {
        use crate::simd::SimdPartialOrd;
        let lane_count: Simd<u8, N> = Simd::splat(N as u8);
        let all_bits_set: Simd<u8, N> = Simd::splat(u8::MAX);
        let in_bounds = idxs.simd_lt(lane_count);
        in_bounds.select(idxs, all_bits_set)
    };
    // Simply do nothing on most architectures.
    idxs
}

/// As `transize` but with no implicit call to `zeroing_idxs`.
///
/// `f` receives the indices exactly as given, so it is responsible for any
/// out-of-bounds handling.
///
/// # Safety
/// `T` must have the same size as `Simd<u8, N>`, and the caller must uphold
/// whatever `f` itself requires.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize_raw<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe {
        // Reinterpret both operands as the vendor vector type...
        let table: T = mem::transmute_copy(&bytes);
        let selectors: T = mem::transmute_copy(&idxs);
        // ...run the vendor shuffle, and reinterpret the result back.
        let shuffled: T = f(table, selectors);
        mem::transmute_copy(&shuffled)
    }
}