Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
volk_32fc_s32f_atan2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H

#include <math.h>

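/*
 * Usage sketch (added commentary, not part of the original header): callers
 * normally go through the VOLK dispatcher, which selects the fastest variant
 * below at runtime. A minimal example, assuming in[] is filled with complex
 * samples:
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 *   float* out = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
 *   // ... fill in[] with complex samples ...
 *   volk_32fc_s32f_atan2_32f(out, in, 1.f, N); // out[i] = atan2f(im, re) / 1.f
 *   volk_free(in);
 *   volk_free(out);
 */
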
#ifdef LV_HAVE_GENERIC
/* Portable reference implementation built on libm's atan2f. */
static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
                                                    const lv_32fc_t* inputVector,
                                                    const float normalizeFactor,
                                                    unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float real = *inPtr++;
        const float imag = *inPtr++;
        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC
#include <volk/volk_common.h>
/* Scalar kernel using the volk_atan2() polynomial approximation from
 * volk_common.h; also used by the SIMD kernels below for leftover points. */
static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
                                                       const lv_32fc_t* inputVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float x = *inPtr++;
        const float y = *inPtr++;
        *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */

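/*
 * Note (added commentary, not in the original header): all four AVX2 kernels
 * below share the same branchless algorithm. Each iteration loads eight
 * complex samples, deinterleaves them into real (x) and imaginary (y)
 * registers, and divides the smaller-magnitude component by the larger so
 * the polynomial argument lies in [-1, 1]. Lanes where |y| > |x| are fixed
 * up afterwards via atan(y/x) = copysign(pi/2, y/x) - atan(x/y), lanes with
 * x < 0 get a copysign(pi, y) quadrant correction, and 0/0 lanes are forced
 * to 0 so that atan2(0, 0) returns 0. The result is finally scaled by
 * 1/normalizeFactor.
 */
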
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        /* Deinterleave into real (x) and imaginary (y) parts. */
        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        /* Divide the smaller-magnitude component by the larger so |input| <= 1. */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        /* 0/0 produces NaN; force those lanes to 0 so atan2(0, 0) = 0. */
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        /* Swapped lanes: atan(y/x) = copysign(pi/2, y/x) - atan(x/y). */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* Quadrant correction: where x < 0, add copysign(pi, y). */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    /* Handle the remaining (num_points % 8) samples with the scalar kernel. */
    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
/* Same branchless algorithm as the FMA kernel above, using the plain AVX
 * arctan polynomial. */
static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for aligned */
#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */

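/*
 * The unaligned (_u_) kernels below mirror the aligned ones above; the only
 * difference is the use of unaligned loads/stores (loadu/storeu), so the
 * input and output buffers need not be 32-byte aligned.
 */
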
#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
/* Unaligned counterpart of volk_32fc_s32f_atan2_32f_a_avx2_fma. */
static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
/* Unaligned counterpart of volk_32fc_s32f_atan2_32f_a_avx2. */
static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */