33#ifndef AVX2_COMMON_FUNCTIONS_H
34#define AVX2_COMMON_FUNCTIONS_H
// Calling-convention / attribute helpers.
// NOTE(review): the matching #else / #endif lines of these conditionals are
// not present in this listing; only the "true" branches are visible.
//
// FIX_W32: on 32-bit MSVC builds (_WIN32 defined, _WIN64 not defined) this
// expands to __vectorcall.
// NOTE(review): presumably required so SIMD vectors can be passed by value
// on that target - confirm against the original header.
44#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
45 #define FIX_W32 __vectorcall
// FIX_UNUSED: on GCC, clang and MinGW this marks a function as possibly
// unused so that an unreferenced static function does not trigger a warning.
54#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__
55 #define FIX_UNUSED __attribute__((unused))
// FIX_NOINLINE: both of the above fixes combined into one function prefix.
60#define FIX_NOINLINE FIX_W32 FIX_UNUSED
// Read a 4x4 block of 16-bit coefficients from each of n_bufs buffers and
// rearrange it with the per-scan_mode shuffle/blend tables below, producing
// one 256-bit vector (16 x int16) per buffer.
//
//   coeffs      array of n_bufs pointers to int16 coefficient buffers
//   scan        offset table; scan[subpos] is the element offset of the
//               first row of the block inside each buffer
//   scan_mode   index into the 3-entry mask tables, so it must be 0..2
//   subpos      index into scan
//   width       distance between consecutive rows, in int16 elements
//   result_vecs output array, one __m256i written per buffer
//   n_bufs      number of buffers to process
//
// NOTE(review): this listing omits several original lines (opening brace,
// the "};" terminators of the tables, and the end of the last statement).
70static INLINE void scanord_read_vector(
const int16_t **__restrict coeffs,
const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs,
const int n_bufs)
// Byte shuffle applied to the lower two rows (rows 2 and 3) before blending.
// Entry 1 is the identity permutation.
73 const __m128i low128_shuffle_masks[3] = {
74 _mm_setr_epi8(10,11, 4, 5, 12,13, 0, 1, 6, 7, 14,15, 8, 9, 2, 3),
75 _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
76 _mm_setr_epi8( 4, 5, 6, 7, 0, 1, 2, 3, 12,13, 14,15, 8, 9, 10,11),
// Per-16-bit-lane select masks: lanes set to -1 are exchanged between the
// upper and lower vectors by the two blends below. Entry 1 exchanges nothing.
79 const __m128i blend_masks[3] = {
80 _mm_setr_epi16( 0, 0, 0, -1, 0, 0, -1, -1),
81 _mm_setr_epi16( 0, 0, 0, 0, 0, 0, 0, 0),
82 _mm_setr_epi16( 0, 0, -1, -1, 0, 0, -1, -1),
// Final byte shuffle for the blended upper vector. Entry 1 is the identity.
85 const __m128i invec_rearr_masks_upper[3] = {
86 _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 6, 7, 10,11, 4, 5, 12,13, 14,15),
87 _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
88 _mm_setr_epi8( 0, 1, 8, 9, 4, 5, 12,13, 2, 3, 10,11, 6, 7, 14,15),
// Final byte shuffle for the blended lower vector. Entry 1 is the identity.
91 const __m128i invec_rearr_masks_lower[3] = {
92 _mm_setr_epi8(12,13, 6, 7, 0, 1, 2, 3, 14,15, 4, 5, 8, 9, 10,11),
93 _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),
94 _mm_setr_epi8( 4, 5, 12,13, 0, 1, 8, 9, 6, 7, 14,15, 2, 3, 10,11),
// Element offsets of the four block rows; they are the same for every
// buffer, so they are computed once outside the loop.
97 const size_t row_offsets[4] = {
98 scan[subpos] + width * 0,
99 scan[subpos] + width * 1,
100 scan[subpos] + width * 2,
101 scan[subpos] + width * 3,
104 for (
int i = 0; i < n_bufs; i++) {
105 const int16_t *__restrict coeff = coeffs[i];
// "upper" holds rows 0 and 1 of the block, "lower" holds rows 2 and 3.
111 __m128d coeffs_d_upper;
112 __m128d coeffs_d_lower;
114 __m128i coeffs_upper;
115 __m128i coeffs_lower;
117 __m128i coeffs_rearr1_upper;
118 __m128i coeffs_rearr1_lower;
120 __m128i coeffs_rearr2_upper;
121 __m128i coeffs_rearr2_lower;
// Zero first so the loadl/loadh pairs below start from a defined value.
125 coeffs_d_upper = _mm_setzero_pd();
126 coeffs_d_lower = _mm_setzero_pd();
// The double-precision load intrinsics are used only to fetch 64 bits
// (four int16 values, i.e. one row) into each half of a 128-bit register;
// the data is reinterpreted as integers right after.
128 coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (
double *)(coeff + row_offsets[0]));
129 coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (
double *)(coeff + row_offsets[1]));
131 coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (
double *)(coeff + row_offsets[2]));
132 coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (
double *)(coeff + row_offsets[3]));
134 coeffs_upper = _mm_castpd_si128(coeffs_d_upper);
135 coeffs_lower = _mm_castpd_si128(coeffs_d_lower);
// Step 1: permute the lower rows so the lanes to be exchanged line up with
// their destination lanes in the upper vector.
137 coeffs_lower = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]);
// Step 2: exchange the masked lanes between the two vectors (same mask, with
// the operands swapped for the second blend).
139 coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]);
140 coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]);
// Step 3: put each 128-bit half into its final element order.
142 coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]);
143 coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]);
// The upper vector becomes the low 128 bits of the 256-bit result.
// NOTE(review): the remaining arguments of this call are not present in this
// listing; presumably coeffs_rearr2_lower is inserted as the high half.
147 result_vecs[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper),
// Body of get_first_last_nz_int16(ints, first, last); the signature line and
// braces are not part of this listing.
// Writes the index (0..15) of the first and of the last nonzero 16-bit lane
// of ints to *first and *last.
162 const __m256i zero = _mm256_setzero_si256();
// All-ones in every 16-bit lane that equals zero.
164 __m256i zeros = _mm256_cmpeq_epi16(ints, zero);
// One bit per byte; after inversion both bits of a lane are set exactly when
// that lane is nonzero, because the comparison above is 16 bits wide.
165 uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));
// Bit index / 2 == lane index. For an all-zero input both counts are 32, so
// *first becomes 16 and *last becomes (31 - 32) >> 1.
// NOTE(review): that is -1 only where >> on a negative int is an arithmetic
// shift (implementation-defined in C) - confirm for the supported compilers.
166 *first = ( (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
167 *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
// Body of hsum_8x32b(v): horizontal sum of eight 32-bit lanes.
// NOTE(review): the signature, the declaration of sum1 and the return
// statement are not part of this listing; presumably sum1 is initialized
// from v and sum9 is returned - confirm against the original header.
//
// Swap the two 128-bit halves and add: every lane now holds lane[i] plus the
// corresponding lane of the other half.
173 __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
174 __m256i sum3 = _mm256_add_epi32 (sum1, sum2);
// Swap the 64-bit halves inside each 128-bit lane and add.
175 __m256i sum4 = _mm256_shuffle_epi32 (sum3, _MM_SHUFFLE(1, 0, 3, 2));
176 __m256i sum5 = _mm256_add_epi32 (sum3, sum4);
// Swap adjacent 32-bit elements and add: every lane now holds the total.
177 __m256i sum6 = _mm256_shuffle_epi32 (sum5, _MM_SHUFFLE(2, 3, 0, 1));
178 __m256i sum7 = _mm256_add_epi32 (sum5, sum6);
// Extract the lowest 32-bit lane as a scalar.
180 __m128i sum8 = _mm256_castsi256_si128 (sum7);
181 int32_t sum9 = _mm_cvtsi128_si32 (sum8);
static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
Definition: avx2_common_functions.h:154
static int32_t hsum_8x32b(const __m256i v)
Definition: avx2_common_functions.h:170
#define FIX_NOINLINE
Definition: avx2_common_functions.h:60
static INLINE void scanord_read_vector(const int16_t **__restrict coeffs, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs, const int n_bufs)
Definition: avx2_common_functions.h:70
#define INLINE
Definition: global.h:240