61#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
62#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
69 const float normalizeFactor,
70 unsigned int num_points)
72 float* outPtr = outputVector;
73 const float* inPtr = (
float*)inputVector;
74 const float invNormalizeFactor = 1.f / normalizeFactor;
75 unsigned int number = 0;
76 for (; number < num_points; number++) {
77 const float real = *inPtr++;
78 const float imag = *inPtr++;
79 *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
88 const float normalizeFactor,
89 unsigned int num_points)
91 float* outPtr = outputVector;
92 const float* inPtr = (
float*)inputVector;
93 const float invNormalizeFactor = 1.f / normalizeFactor;
94 unsigned int number = 0;
95 for (; number < num_points; number++) {
96 const float x = *inPtr++;
97 const float y = *inPtr++;
98 *outPtr++ =
volk_atan2(y, x) * invNormalizeFactor;
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*
 * AVX2 + FMA kernel (aligned loads/stores): computes the angle of eight
 * complex samples per iteration and scales it by 1 / normalizeFactor.
 * Samples left over after the last full group of eight are handed to
 * volk_32fc_s32f_atan2_32f_polynomial().
 *
 * outputVector    - receives num_points scaled angles; written with
 *                   _mm256_store_ps, so it must be 32-byte aligned.
 * complexVector   - num_points complex samples; read with _mm256_load_ps,
 *                   so it must be 32-byte aligned.
 * normalizeFactor - every angle is divided by this value.
 * num_points      - number of complex samples to process.
 */
static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* out = outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        /* Eight complex samples = sixteen floats = two 256-bit loads. */
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        /* Where |y| > |x| feed x/y to the polynomial instead of y/x, so the
         * quotient's magnitude never exceeds one. */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        /* A NaN quotient (e.g. 0/0) is replaced by zero. */
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        /* Swapped lanes: angle = copysign(pi/2, quotient) - atan(quotient). */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* All-ones in every lane whose x has its sign bit set. */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        /* For those lanes add pi carrying the sign of y. */
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 samples. */
    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#if LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 kernel without FMA (aligned loads/stores): computes the angle of eight
 * complex samples per iteration and scales it by 1 / normalizeFactor.
 * Samples left over after the last full group of eight are handed to
 * volk_32fc_s32f_atan2_32f_polynomial().
 *
 * outputVector    - receives num_points scaled angles; written with
 *                   _mm256_store_ps, so it must be 32-byte aligned.
 * complexVector   - num_points complex samples; read with _mm256_load_ps,
 *                   so it must be 32-byte aligned.
 * normalizeFactor - every angle is divided by this value.
 * num_points      - number of complex samples to process.
 */
static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* out = outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        /* Eight complex samples = sixteen floats = two 256-bit loads. */
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        /* Where |y| > |x| feed x/y to the polynomial instead of y/x, so the
         * quotient's magnitude never exceeds one. */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        /* A NaN quotient (e.g. 0/0) is replaced by zero. */
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        /* Swapped lanes: angle = copysign(pi/2, quotient) - atan(quotient). */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* All-ones in every lane whose x has its sign bit set. */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        /* For those lanes add pi carrying the sign of y. */
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 samples. */
    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 */
224#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
225#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*
 * AVX2 + FMA kernel (unaligned loads/stores): computes the angle of eight
 * complex samples per iteration and scales it by 1 / normalizeFactor.
 * Samples left over after the last full group of eight are handed to
 * volk_32fc_s32f_atan2_32f_polynomial().
 *
 * outputVector    - receives num_points scaled angles; written with
 *                   _mm256_storeu_ps, so no alignment is required.
 * complexVector   - num_points complex samples; read with _mm256_loadu_ps,
 *                   so no alignment is required.
 * normalizeFactor - every angle is divided by this value.
 * num_points      - number of complex samples to process.
 */
static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* out = outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        /* Eight complex samples = sixteen floats = two 256-bit loads. */
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        /* Where |y| > |x| feed x/y to the polynomial instead of y/x, so the
         * quotient's magnitude never exceeds one. */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        /* A NaN quotient (e.g. 0/0) is replaced by zero. */
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        /* Swapped lanes: angle = copysign(pi/2, quotient) - atan(quotient). */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* All-ones in every lane whose x has its sign bit set. */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        /* For those lanes add pi carrying the sign of y. */
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 samples. */
    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#if LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 kernel without FMA (unaligned loads/stores): computes the angle of
 * eight complex samples per iteration and scales it by 1 / normalizeFactor.
 * Samples left over after the last full group of eight are handed to
 * volk_32fc_s32f_atan2_32f_polynomial().
 *
 * outputVector    - receives num_points scaled angles; written with
 *                   _mm256_storeu_ps, so no alignment is required.
 * complexVector   - num_points complex samples; read with _mm256_loadu_ps,
 *                   so no alignment is required.
 * normalizeFactor - every angle is divided by this value.
 * num_points      - number of complex samples to process.
 */
static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* out = outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        /* Eight complex samples = sixteen floats = two 256-bit loads. */
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        /* Where |y| > |x| feed x/y to the polynomial instead of y/x, so the
         * quotient's magnitude never exceeds one. */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        /* A NaN quotient (e.g. 0/0) is replaced by zero. */
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        /* Swapped lanes: angle = copysign(pi/2, quotient) - atan(quotient). */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* All-ones in every lane whose x has its sign bit set. */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        /* For those lanes add pi carrying the sign of y. */
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    /* Scalar tail for the remaining num_points % 8 samples. */
    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 */
static void volk_32fc_s32f_atan2_32f_polynomial(float *outputVector, const lv_32fc_t *inputVector, const float normalizeFactor, unsigned int num_points)
Definition volk_32fc_s32f_atan2_32f.h:86
static void volk_32fc_s32f_atan2_32f_generic(float *outputVector, const lv_32fc_t *inputVector, const float normalizeFactor, unsigned int num_points)
Definition volk_32fc_s32f_atan2_32f.h:67
static __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
Definition volk_avx2_fma_intrinsics.h:26
static __m256 _mm256_real(const __m256 z1, const __m256 z2)
Definition volk_avx2_intrinsics.h:21
static __m256 _mm256_imag(const __m256 z1, const __m256 z2)
Definition volk_avx2_intrinsics.h:28
static __m256 _m256_arctan_poly_avx(const __m256 x)
Definition volk_avx_intrinsics.h:27
static float volk_atan2(const float y, const float x)
Definition volk_common.h:215
float complex lv_32fc_t
Definition volk_complex.h:74