97 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
98 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
101 #include <inttypes.h>
105 #ifdef LV_HAVE_GENERIC
110 unsigned int num_points)
116 for (
unsigned int number = 0; number < num_points; number++) {
117 *cPtr++ = (*aPtr++) / (*bPtr++);
124 #include <pmmintrin.h>
130 unsigned int num_points)
138 unsigned int number = 0;
139 const unsigned int quarterPoints = num_points / 4;
141 __m128 num01, num23, den01, den23, norm, result;
146 for (; number < quarterPoints; number++) {
147 num01 = _mm_loadu_ps((
float*)a);
148 den01 = _mm_loadu_ps((
float*)b);
153 num23 = _mm_loadu_ps((
float*)a);
154 den23 = _mm_loadu_ps((
float*)b);
160 den01 = _mm_unpacklo_ps(norm, norm);
161 den23 = _mm_unpackhi_ps(norm, norm);
163 result = _mm_div_ps(num01, den01);
164 _mm_storeu_ps((
float*)c, result);
166 result = _mm_div_ps(num23, den23);
167 _mm_storeu_ps((
float*)c, result);
172 for (; number < num_points; number++) {
183 #include <immintrin.h>
189 unsigned int num_points)
197 unsigned int number = 0;
198 const unsigned int quarterPoints = num_points / 4;
200 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
205 for (; number < quarterPoints; number++) {
206 num = _mm256_loadu_ps(
208 denum = _mm256_loadu_ps(
211 sq = _mm256_mul_ps(denum, denum);
212 mag_sq_un = _mm256_hadd_ps(
214 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
217 div = _mm256_div_ps(mul_conj, mag_sq);
219 _mm256_storeu_ps((
float*)c, div);
226 number = quarterPoints * 4;
228 for (; number < num_points; number++) {
229 *c++ = (*a++) / (*b++);
238 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
239 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
242 #include <inttypes.h>
247 #include <pmmintrin.h>
253 unsigned int num_points)
261 unsigned int number = 0;
262 const unsigned int quarterPoints = num_points / 4;
264 __m128 num01, num23, den01, den23, norm, result;
269 for (; number < quarterPoints; number++) {
270 num01 = _mm_load_ps((
float*)a);
271 den01 = _mm_load_ps((
float*)b);
276 num23 = _mm_load_ps((
float*)a);
277 den23 = _mm_load_ps((
float*)b);
284 den01 = _mm_unpacklo_ps(norm, norm);
285 den23 = _mm_unpackhi_ps(norm, norm);
287 result = _mm_div_ps(num01, den01);
288 _mm_store_ps((
float*)c, result);
290 result = _mm_div_ps(num23, den23);
291 _mm_store_ps((
float*)c, result);
296 for (; number < num_points; number++) {
306 #include <immintrin.h>
312 unsigned int num_points)
328 const unsigned int eigthPoints = num_points / 8;
330 __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
332 for (
unsigned int number = 0; number < eigthPoints; number++) {
334 num01 = _mm256_load_ps((
float*)a);
335 denum01 = _mm256_load_ps((
float*)b);
341 num23 = _mm256_load_ps((
float*)a);
342 denum23 = _mm256_load_ps((
float*)b);
347 complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
348 _mm256_mul_ps(denum23, denum23));
350 denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
351 denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
353 result0 = _mm256_div_ps(num01, denum01);
354 result1 = _mm256_div_ps(num23, denum23);
356 _mm256_store_ps((
float*)c, result0);
358 _mm256_store_ps((
float*)c, result1);
368 #include <arm_neon.h>
373 unsigned int num_points)
379 float32x4x2_t aVal, bVal, cVal;
380 float32x4_t bAbs, bAbsInv;
382 const unsigned int quarterPoints = num_points / 4;
383 unsigned int number = 0;
384 for (; number < quarterPoints; number++) {
385 aVal = vld2q_f32((
const float*)(aPtr));
386 bVal = vld2q_f32((
const float*)(bPtr));
392 bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
393 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
395 bAbsInv = vrecpeq_f32(bAbs);
396 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
397 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
399 cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
400 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
401 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
403 cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
404 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
405 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
407 vst2q_f32((
float*)(cPtr), cVal);
411 for (number = quarterPoints * 4; number < num_points; number++) {
412 *cPtr++ = (*aPtr++) / (*bPtr++);
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:250
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:107
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:370
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:186
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:309
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:127
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:76
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
float complex lv_32fc_t
Definition: volk_complex.h:74
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:38
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31