NFFT  3.3.1alpha
nfft.c
1 /*
2  * Copyright (c) 2002, 2016 Jens Keiner, Stefan Kunis, Daniel Potts
3  *
4  * This program is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation; either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 /* Nonequispaced FFT */
20 
21 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
22 
23 /* configure header */
24 #include "config.h"
25 
26 /* complex datatype (maybe) */
27 #ifdef HAVE_COMPLEX_H
28 #include<complex.h>
29 #endif
30 
31 /* NFFT headers */
32 #include "nfft3.h"
33 #include "infft.h"
34 
35 #ifdef _OPENMP
36 #include <omp.h>
37 #endif
38 
39 #ifdef OMP_ASSERT
40 #include <assert.h>
41 #endif
42 
43 #undef X
44 #define X(name) NFFT(name)
45 
47 static inline INT intprod(const INT *vec, const INT a, const INT d)
48 {
49  INT t, p;
50 
51  p = 1;
52  for (t = 0; t < d; t++)
53  p *= vec[t] - a;
54 
55  return p;
56 }
57 
58 /* handy shortcuts */
59 #define BASE(x) CEXP(x)
60 
75 static inline void sort0(const INT d, const INT *n, const INT m,
76  const INT local_x_num, const R *local_x, INT *ar_x)
77 {
78  INT u_j[d], i, j, help, rhigh;
79  INT *ar_x_temp;
80  INT nprod;
81 
82  for (i = 0; i < local_x_num; i++)
83  {
84  ar_x[2 * i] = 0;
85  ar_x[2 *i + 1] = i;
86  for (j = 0; j < d; j++)
87  {
88  help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
89  u_j[j] = (help % n[j] + n[j]) % n[j];
90 
91  ar_x[2 * i] += u_j[j];
92  if (j + 1 < d)
93  ar_x[2 * i] *= n[j + 1];
94  }
95  }
96 
97  for (j = 0, nprod = 1; j < d; j++)
98  nprod *= n[j];
99 
100  rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
101 
102  ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) * sizeof(INT));
103  Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
104 #ifdef OMP_ASSERT
105  for (i = 1; i < local_x_num; i++)
106  assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
107 #endif
108  Y(free)(ar_x_temp);
109 }
110 
119 static inline void sort(const X(plan) *ths)
120 {
121  if (ths->flags & NFFT_SORT_NODES)
122  sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
123 }
124 
145 void X(trafo_direct)(const X(plan) *ths)
146 {
147  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
148 
149  memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
150 
151  if (ths->d == 1)
152  {
153  /* specialize for univariate case, rationale: faster */
154  INT j;
155 #ifdef _OPENMP
156  #pragma omp parallel for default(shared) private(j)
157 #endif
158  for (j = 0; j < ths->M_total; j++)
159  {
160  INT k_L;
161  for (k_L = 0; k_L < ths->N_total; k_L++)
162  {
163  R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
164  f[j] += f_hat[k_L] * BASE(-II * omega);
165  }
166  }
167  }
168  else
169  {
170  /* multivariate case */
171  INT j;
172 #ifdef _OPENMP
173  #pragma omp parallel for default(shared) private(j)
174 #endif
175  for (j = 0; j < ths->M_total; j++)
176  {
177  R x[ths->d], omega, Omega[ths->d + 1];
178  INT t, t2, k_L, k[ths->d];
179  Omega[0] = K(0.0);
180  for (t = 0; t < ths->d; t++)
181  {
182  k[t] = -ths->N[t]/2;
183  x[t] = K2PI * ths->x[j * ths->d + t];
184  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
185  }
186  omega = Omega[ths->d];
187 
188  for (k_L = 0; k_L < ths->N_total; k_L++)
189  {
190  f[j] += f_hat[k_L] * BASE(-II * omega);
191  {
192  for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
193  k[t]-= ths->N[t]-1;
194 
195  k[t]++;
196 
197  for (t2 = t; t2 < ths->d; t2++)
198  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
199 
200  omega = Omega[ths->d];
201  }
202  }
203  }
204  }
205 }
206 
207 void X(adjoint_direct)(const X(plan) *ths)
208 {
209  C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
210 
211  memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
212 
213  if (ths->d == 1)
214  {
215  /* specialize for univariate case, rationale: faster */
216 #ifdef _OPENMP
217  INT k_L;
218  #pragma omp parallel for default(shared) private(k_L)
219  for (k_L = 0; k_L < ths->N_total; k_L++)
220  {
221  INT j;
222  for (j = 0; j < ths->M_total; j++)
223  {
224  R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
225  f_hat[k_L] += f[j] * BASE(II * omega);
226  }
227  }
228 #else
229  INT j;
230  for (j = 0; j < ths->M_total; j++)
231  {
232  INT k_L;
233  for (k_L = 0; k_L < ths->N_total; k_L++)
234  {
235  R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
236  f_hat[k_L] += f[j] * BASE(II * omega);
237  }
238  }
239 #endif
240  }
241  else
242  {
243  /* multivariate case */
244  INT j, k_L;
245 #ifdef _OPENMP
246  #pragma omp parallel for default(shared) private(j, k_L)
247  for (k_L = 0; k_L < ths->N_total; k_L++)
248  {
249  INT k[ths->d], k_temp, t;
250 
251  k_temp = k_L;
252 
253  for (t = ths->d - 1; t >= 0; t--)
254  {
255  k[t] = k_temp % ths->N[t] - ths->N[t]/2;
256  k_temp /= ths->N[t];
257  }
258 
259  for (j = 0; j < ths->M_total; j++)
260  {
261  R omega = K(0.0);
262  for (t = 0; t < ths->d; t++)
263  omega += k[t] * K2PI * ths->x[j * ths->d + t];
264  f_hat[k_L] += f[j] * BASE(II * omega);
265  }
266  }
267 #else
268  for (j = 0; j < ths->M_total; j++)
269  {
270  R x[ths->d], omega, Omega[ths->d+1];
271  INT t, t2, k[ths->d];
272  Omega[0] = K(0.0);
273  for (t = 0; t < ths->d; t++)
274  {
275  k[t] = -ths->N[t]/2;
276  x[t] = K2PI * ths->x[j * ths->d + t];
277  Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
278  }
279  omega = Omega[ths->d];
280  for (k_L = 0; k_L < ths->N_total; k_L++)
281  {
282  f_hat[k_L] += f[j] * BASE(II * omega);
283 
284  for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
285  k[t]-= ths->N[t]-1;
286 
287  k[t]++;
288 
289  for (t2 = t; t2 < ths->d; t2++)
290  Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
291 
292  omega = Omega[ths->d];
293  }
294  }
295 #endif
296  }
297 }
298 
324 static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op,
325  const INT act_dim)
326 {
327  const R xj = ths->x[j * ths->d + act_dim];
328  INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
329 
330  (*up) = c - (ths->m);
331  (*op) = c + 1 + (ths->m);
332 }
333 
334 static inline void uo2(INT *u, INT *o, const R x, const INT n, const INT m)
335 {
336  INT c = LRINT(FLOOR(x * (R)(n)));
337 
338  *u = (c - m + n) % n;
339  *o = (c + 1 + m + n) % n;
340 }
341 
342 #define MACRO_D_compute_A \
343 { \
344  g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
345 }
346 
347 #define MACRO_D_compute_T \
348 { \
349  f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
350 }
351 
352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
353 
354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
355 
356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
357 
358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
359 
360 #define MACRO_init_k_ks \
361 { \
362  for (t = ths->d-1; 0 <= t; t--) \
363  { \
364  kp[t] = k[t] = 0; \
365  ks[t] = ths->N[t]/2; \
366  } \
367  t++; \
368 }
369 
370 #define MACRO_update_c_phi_inv_k(which_one) \
371 { \
372  for (t2 = t; t2 < ths->d; t2++) \
373  { \
374  c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
375  ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
376  k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
377  } \
378 }
379 
380 #define MACRO_count_k_ks \
381 { \
382  for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
383  { \
384  kp[t] = k[t] = 0; \
385  ks[t]= ths->N[t]/2; \
386  } \
387 \
388  kp[t]++; k[t]++; ks[t]++; \
389  if(kp[t] == ths->N[t]/2) \
390  { \
391  k[t] = ths->n[t] - ths->N[t]/2; \
392  ks[t] = 0; \
393  } \
394 } \
395 
396 /* sub routines for the fast transforms matrix vector multiplication with D, D^T */
397 #define MACRO_D(which_one) \
398 static inline void D_serial_ ## which_one (X(plan) *ths) \
399 { \
400  C *f_hat, *g_hat; /* local copy */ \
401  R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \
402  INT t, t2; /* index dimensions */ \
403  INT k_L; /* plain index */ \
404  INT kp[ths->d]; /* multi index (simple) */ \
405  INT k[ths->d]; /* multi index in g_hat */ \
406  INT ks[ths->d]; /* multi index in f_hat, c_phi_inv*/ \
407  INT k_plain[ths->d+1]; /* postfix plain index */ \
408  INT ks_plain[ths->d+1]; /* postfix plain index */ \
409  \
410  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
411  MACRO_D_init_result_ ## which_one; \
412 \
413  c_phi_inv_k[0] = K(1.0); \
414  k_plain[0] = 0; \
415  ks_plain[0] = 0; \
416 \
417  MACRO_init_k_ks; \
418 \
419  if (ths->flags & PRE_PHI_HUT) \
420  { \
421  for (k_L = 0; k_L < ths->N_total; k_L++) \
422  { \
423  MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
424  MACRO_D_compute_ ## which_one; \
425  MACRO_count_k_ks; \
426  } \
427  } \
428  else \
429  { \
430  for (k_L = 0; k_L < ths->N_total; k_L++) \
431  { \
432  MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
433  MACRO_D_compute_ ## which_one; \
434  MACRO_count_k_ks; \
435  } \
436  } \
437 }
438 
439 #ifdef _OPENMP
440 static inline void D_openmp_A(X(plan) *ths)
441 {
442  C *f_hat, *g_hat;
443  INT k_L;
445  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
446  memset(g_hat, 0, ths->n_total * sizeof(C));
447 
448  if (ths->flags & PRE_PHI_HUT)
449  {
450  #pragma omp parallel for default(shared) private(k_L)
451  for (k_L = 0; k_L < ths->N_total; k_L++)
452  {
453  INT kp[ths->d]; //0..N-1
454  INT k[ths->d];
455  INT ks[ths->d];
456  R c_phi_inv_k_val = K(1.0);
457  INT k_plain_val = 0;
458  INT ks_plain_val = 0;
459  INT t;
460  INT k_temp = k_L;
461 
462  for (t = ths->d-1; t >= 0; t--)
463  {
464  kp[t] = k_temp % ths->N[t];
465  if (kp[t] >= ths->N[t]/2)
466  k[t] = ths->n[t] - ths->N[t] + kp[t];
467  else
468  k[t] = kp[t];
469  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
470  k_temp /= ths->N[t];
471  }
472 
473  for (t = 0; t < ths->d; t++)
474  {
475  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
476  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
477  k_plain_val = k_plain_val*ths->n[t] + k[t];
478  }
479 
480  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
481  } /* for(k_L) */
482  } /* if(PRE_PHI_HUT) */
483  else
484  {
485  #pragma omp parallel for default(shared) private(k_L)
486  for (k_L = 0; k_L < ths->N_total; k_L++)
487  {
488  INT kp[ths->d]; //0..N-1
489  INT k[ths->d];
490  INT ks[ths->d];
491  R c_phi_inv_k_val = K(1.0);
492  INT k_plain_val = 0;
493  INT ks_plain_val = 0;
494  INT t;
495  INT k_temp = k_L;
496 
497  for (t = ths->d-1; t >= 0; t--)
498  {
499  kp[t] = k_temp % ths->N[t];
500  if (kp[t] >= ths->N[t]/2)
501  k[t] = ths->n[t] - ths->N[t] + kp[t];
502  else
503  k[t] = kp[t];
504  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
505  k_temp /= ths->N[t];
506  }
507 
508  for (t = 0; t < ths->d; t++)
509  {
510  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
511  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
512  k_plain_val = k_plain_val*ths->n[t] + k[t];
513  }
514 
515  g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
516  } /* for(k_L) */
517  } /* else(PRE_PHI_HUT) */
518 }
519 #endif
520 
521 #ifndef _OPENMP
522 MACRO_D(A)
523 #endif
524 
525 static inline void D_A(X(plan) *ths)
526 {
527 #ifdef _OPENMP
528  D_openmp_A(ths);
529 #else
530  D_serial_A(ths);
531 #endif
532 }
533 
534 #ifdef _OPENMP
535 static void D_openmp_T(X(plan) *ths)
536 {
537  C *f_hat, *g_hat;
538  INT k_L;
540  f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
541  memset(f_hat, 0, ths->N_total * sizeof(C));
542 
543  if (ths->flags & PRE_PHI_HUT)
544  {
545  #pragma omp parallel for default(shared) private(k_L)
546  for (k_L = 0; k_L < ths->N_total; k_L++)
547  {
548  INT kp[ths->d]; //0..N-1
549  INT k[ths->d];
550  INT ks[ths->d];
551  R c_phi_inv_k_val = K(1.0);
552  INT k_plain_val = 0;
553  INT ks_plain_val = 0;
554  INT t;
555  INT k_temp = k_L;
556 
557  for (t = ths->d - 1; t >= 0; t--)
558  {
559  kp[t] = k_temp % ths->N[t];
560  if (kp[t] >= ths->N[t]/2)
561  k[t] = ths->n[t] - ths->N[t] + kp[t];
562  else
563  k[t] = kp[t];
564  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
565  k_temp /= ths->N[t];
566  }
567 
568  for (t = 0; t < ths->d; t++)
569  {
570  c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
571  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
572  k_plain_val = k_plain_val*ths->n[t] + k[t];
573  }
574 
575  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
576  } /* for(k_L) */
577  } /* if(PRE_PHI_HUT) */
578  else
579  {
580  #pragma omp parallel for default(shared) private(k_L)
581  for (k_L = 0; k_L < ths->N_total; k_L++)
582  {
583  INT kp[ths->d]; //0..N-1
584  INT k[ths->d];
585  INT ks[ths->d];
586  R c_phi_inv_k_val = K(1.0);
587  INT k_plain_val = 0;
588  INT ks_plain_val = 0;
589  INT t;
590  INT k_temp = k_L;
591 
592  for (t = ths->d-1; t >= 0; t--)
593  {
594  kp[t] = k_temp % ths->N[t];
595  if (kp[t] >= ths->N[t]/2)
596  k[t] = ths->n[t] - ths->N[t] + kp[t];
597  else
598  k[t] = kp[t];
599  ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
600  k_temp /= ths->N[t];
601  }
602 
603  for (t = 0; t < ths->d; t++)
604  {
605  c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
606  ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
607  k_plain_val = k_plain_val*ths->n[t] + k[t];
608  }
609 
610  f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
611  } /* for(k_L) */
612  } /* else(PRE_PHI_HUT) */
613 }
614 #endif
615 
616 #ifndef _OPENMP
617 MACRO_D(T)
618 #endif
619 
620 static void D_T(X(plan) *ths)
621 {
622 #ifdef _OPENMP
623  D_openmp_T(ths);
624 #else
625  D_serial_T(ths);
626 #endif
627 }
628 
629 /* sub routines for the fast transforms matrix vector multiplication with B, B^T */
630 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
631 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
632 
633 #define MACRO_B_PRE_FULL_PSI_compute_A \
634 { \
635  (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
636 }
637 
638 #define MACRO_B_PRE_FULL_PSI_compute_T \
639 { \
640  g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
641 }
642 
643 #define MACRO_B_compute_A \
644 { \
645  (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
646 }
647 
648 #define MACRO_B_compute_T \
649 { \
650  g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
651 }
652 
653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
654 
655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
656 
657 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
658  - ((R)l[t2])/((R)ths->n[t2]), t2)
659 
660 #define MACRO_init_uo_l_lj_t \
661 { \
662  for (t = ths->d-1; t >= 0; t--) \
663  { \
664  uo(ths,j,&u[t],&o[t],t); \
665  l[t] = u[t]; \
666  lj[t] = 0; \
667  } \
668  t++; \
669 }
670 
671 #define MACRO_update_phi_prod_ll_plain(which_one) { \
672  for (t2 = t; t2 < ths->d; t2++) \
673  { \
674  phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
675  ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \
676  } \
677 }
678 
679 #define MACRO_count_uo_l_lj_t \
680 { \
681  for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
682  { \
683  l[t] = u[t]; \
684  lj[t] = 0; \
685  } \
686  \
687  l[t]++; \
688  lj[t]++; \
689 }
690 
691 #define MACRO_B(which_one) \
692 static inline void B_serial_ ## which_one (X(plan) *ths) \
693 { \
694  INT lprod; /* 'regular bandwidth' of matrix B */ \
695  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
696  INT t, t2; /* index dimensions */ \
697  INT j; /* index nodes */ \
698  INT l_L, ix; /* index one row of B */ \
699  INT l[ths->d]; /* multi index u<=l<=o */ \
700  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
701  INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
702  R phi_prod[ths->d+1]; /* postfix product of PHI */ \
703  C *f, *g; /* local copy */ \
704  C *fj; /* local copy */ \
705  R y[ths->d]; \
706  R fg_psi[ths->d][2*ths->m+2]; \
707  R fg_exp_l[ths->d][2*ths->m+2]; \
708  INT l_fg,lj_fg; \
709  R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
710  R ip_w; \
711  INT ip_u; \
712  INT ip_s = ths->K/(ths->m+2); \
713  \
714  f = (C*)ths->f; g = (C*)ths->g; \
715  \
716  MACRO_B_init_result_ ## which_one; \
717  \
718  if (ths->flags & PRE_FULL_PSI) \
719  { \
720  for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
721  { \
722  for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
723  { \
724  MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
725  } \
726  } \
727  return; \
728  } \
729 \
730  phi_prod[0] = K(1.0); \
731  ll_plain[0] = 0; \
732 \
733  for (t = 0, lprod = 1; t < ths->d; t++) \
734  lprod *= (2 * ths->m + 2); \
735 \
736  if (ths->flags & PRE_PSI) \
737  { \
738  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
739  { \
740  MACRO_init_uo_l_lj_t; \
741  \
742  for (l_L = 0; l_L < lprod; l_L++) \
743  { \
744  MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
745  \
746  MACRO_B_compute_ ## which_one; \
747  \
748  MACRO_count_uo_l_lj_t; \
749  } /* for(l_L) */ \
750  } /* for(j) */ \
751  return; \
752  } /* if(PRE_PSI) */ \
753  \
754  if (ths->flags & PRE_FG_PSI) \
755  { \
756  for(t2 = 0; t2 < ths->d; t2++) \
757  { \
758  tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
759  tmpEXP2sq = tmpEXP2*tmpEXP2; \
760  tmp2 = K(1.0); \
761  tmp3 = K(1.0); \
762  fg_exp_l[t2][0] = K(1.0); \
763  for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
764  { \
765  tmp3 = tmp2*tmpEXP2; \
766  tmp2 *= tmpEXP2sq; \
767  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
768  } \
769  } \
770  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
771  { \
772  MACRO_init_uo_l_lj_t; \
773  \
774  for (t2 = 0; t2 < ths->d; t2++) \
775  { \
776  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
777  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
778  tmp1 = K(1.0); \
779  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
780  { \
781  tmp1 *= tmpEXP1; \
782  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
783  } \
784  } \
785  \
786  for (l_L= 0; l_L < lprod; l_L++) \
787  { \
788  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
789  \
790  MACRO_B_compute_ ## which_one; \
791  \
792  MACRO_count_uo_l_lj_t; \
793  } /* for(l_L) */ \
794  } /* for(j) */ \
795  return; \
796  } /* if(PRE_FG_PSI) */ \
797  \
798  if (ths->flags & FG_PSI) \
799  { \
800  for (t2 = 0; t2 < ths->d; t2++) \
801  { \
802  tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
803  tmpEXP2sq = tmpEXP2*tmpEXP2; \
804  tmp2 = K(1.0); \
805  tmp3 = K(1.0); \
806  fg_exp_l[t2][0] = K(1.0); \
807  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
808  { \
809  tmp3 = tmp2*tmpEXP2; \
810  tmp2 *= tmpEXP2sq; \
811  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
812  } \
813  } \
814  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
815  { \
816  MACRO_init_uo_l_lj_t; \
817  \
818  for (t2 = 0; t2 < ths->d; t2++) \
819  { \
820  fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
821  \
822  tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
823  /ths->b[t2]); \
824  tmp1 = K(1.0); \
825  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
826  { \
827  tmp1 *= tmpEXP1; \
828  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
829  } \
830  } \
831  \
832  for (l_L = 0; l_L < lprod; l_L++) \
833  { \
834  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
835  \
836  MACRO_B_compute_ ## which_one; \
837  \
838  MACRO_count_uo_l_lj_t; \
839  } /* for(l_L) */ \
840  } /* for(j) */ \
841  return; \
842  } /* if(FG_PSI) */ \
843  \
844  if (ths->flags & PRE_LIN_PSI) \
845  { \
846  for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
847  { \
848  MACRO_init_uo_l_lj_t; \
849  \
850  for (t2 = 0; t2 < ths->d; t2++) \
851  { \
852  y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
853  * ((R)(ths->K))) / (R)(ths->m + 2); \
854  ip_u = LRINT(FLOOR(y[t2])); \
855  ip_w = y[t2]-ip_u; \
856  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
857  { \
858  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
859  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
860  * (ip_w); \
861  } \
862  } \
863  \
864  for (l_L = 0; l_L < lprod; l_L++) \
865  { \
866  MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
867  \
868  MACRO_B_compute_ ## which_one; \
869  \
870  MACRO_count_uo_l_lj_t; \
871  } /* for(l_L) */ \
872  } /* for(j) */ \
873  return; \
874  } /* if(PRE_LIN_PSI) */ \
875  \
876  /* no precomputed psi at all */ \
877  for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
878  { \
879  MACRO_init_uo_l_lj_t; \
880  \
881  for (l_L = 0; l_L < lprod; l_L++) \
882  { \
883  MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
884  \
885  MACRO_B_compute_ ## which_one; \
886  \
887  MACRO_count_uo_l_lj_t; \
888  } /* for(l_L) */ \
889  } /* for(j) */ \
890 } /* nfft_B */ \
891 
892 #ifndef _OPENMP
893 MACRO_B(A)
894 #endif
895 
896 #ifdef _OPENMP
897 static inline void B_openmp_A (X(plan) *ths)
898 {
899  INT lprod; /* 'regular bandwidth' of matrix B */
900  INT k;
901 
902  memset(ths->f, 0, ths->M_total * sizeof(C));
903 
904  for (k = 0, lprod = 1; k < ths->d; k++)
905  lprod *= (2*ths->m+2);
906 
907  if (ths->flags & PRE_FULL_PSI)
908  {
909  #pragma omp parallel for default(shared) private(k)
910  for (k = 0; k < ths->M_total; k++)
911  {
912  INT l;
913  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
914  ths->f[j] = K(0.0);
915  for (l = 0; l < lprod; l++)
916  ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
917  }
918  return;
919  }
920 
921  if (ths->flags & PRE_PSI)
922  {
923  #pragma omp parallel for default(shared) private(k)
924  for (k = 0; k < ths->M_total; k++)
925  {
926  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
927  INT t, t2; /* index dimensions */
928  INT l_L; /* index one row of B */
929  INT l[ths->d]; /* multi index u<=l<=o */
930  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
931  INT ll_plain[ths->d+1]; /* postfix plain index in g */
932  R phi_prod[ths->d+1]; /* postfix product of PHI */
933  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
934 
935  phi_prod[0] = K(1.0);
936  ll_plain[0] = 0;
937 
938  MACRO_init_uo_l_lj_t;
939 
940  for (l_L = 0; l_L < lprod; l_L++)
941  {
942  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
943 
944  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
945 
946  MACRO_count_uo_l_lj_t;
947  } /* for(l_L) */
948  } /* for(j) */
949  return;
950  } /* if(PRE_PSI) */
951 
952  if (ths->flags & PRE_FG_PSI)
953  {
954  INT t, t2; /* index dimensions */
955  R fg_exp_l[ths->d][2*ths->m+2];
956 
957  for (t2 = 0; t2 < ths->d; t2++)
958  {
959  INT lj_fg;
960  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
961  R tmpEXP2sq = tmpEXP2*tmpEXP2;
962  R tmp2 = K(1.0);
963  R tmp3 = K(1.0);
964  fg_exp_l[t2][0] = K(1.0);
965  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
966  {
967  tmp3 = tmp2*tmpEXP2;
968  tmp2 *= tmpEXP2sq;
969  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
970  }
971  }
972 
973  #pragma omp parallel for default(shared) private(k,t,t2)
974  for (k = 0; k < ths->M_total; k++)
975  {
976  INT ll_plain[ths->d+1]; /* postfix plain index in g */
977  R phi_prod[ths->d+1]; /* postfix product of PHI */
978  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
979  INT l[ths->d]; /* multi index u<=l<=o */
980  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
981  R fg_psi[ths->d][2*ths->m+2];
982  R tmpEXP1, tmp1;
983  INT l_fg,lj_fg;
984  INT l_L;
985  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
986 
987  phi_prod[0] = K(1.0);
988  ll_plain[0] = 0;
989 
990  MACRO_init_uo_l_lj_t;
991 
992  for (t2 = 0; t2 < ths->d; t2++)
993  {
994  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
995  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
996  tmp1 = K(1.0);
997  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
998  {
999  tmp1 *= tmpEXP1;
1000  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1001  }
1002  }
1003 
1004  for (l_L= 0; l_L < lprod; l_L++)
1005  {
1006  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1007 
1008  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1009 
1010  MACRO_count_uo_l_lj_t;
1011  } /* for(l_L) */
1012  } /* for(j) */
1013  return;
1014  } /* if(PRE_FG_PSI) */
1015 
1016  if (ths->flags & FG_PSI)
1017  {
1018  INT t, t2; /* index dimensions */
1019  R fg_exp_l[ths->d][2*ths->m+2];
1020 
1021  sort(ths);
1022 
1023  for (t2 = 0; t2 < ths->d; t2++)
1024  {
1025  INT lj_fg;
1026  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1027  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1028  R tmp2 = K(1.0);
1029  R tmp3 = K(1.0);
1030  fg_exp_l[t2][0] = K(1.0);
1031  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1032  {
1033  tmp3 = tmp2*tmpEXP2;
1034  tmp2 *= tmpEXP2sq;
1035  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1036  }
1037  }
1038 
1039  #pragma omp parallel for default(shared) private(k,t,t2)
1040  for (k = 0; k < ths->M_total; k++)
1041  {
1042  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1043  R phi_prod[ths->d+1]; /* postfix product of PHI */
1044  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1045  INT l[ths->d]; /* multi index u<=l<=o */
1046  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1047  R fg_psi[ths->d][2*ths->m+2];
1048  R tmpEXP1, tmp1;
1049  INT l_fg,lj_fg;
1050  INT l_L;
1051  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1052 
1053  phi_prod[0] = K(1.0);
1054  ll_plain[0] = 0;
1055 
1056  MACRO_init_uo_l_lj_t;
1057 
1058  for (t2 = 0; t2 < ths->d; t2++)
1059  {
1060  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1061 
1062  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1063  /ths->b[t2]);
1064  tmp1 = K(1.0);
1065  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1066  {
1067  tmp1 *= tmpEXP1;
1068  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1069  }
1070  }
1071 
1072  for (l_L = 0; l_L < lprod; l_L++)
1073  {
1074  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1075 
1076  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1077 
1078  MACRO_count_uo_l_lj_t;
1079  } /* for(l_L) */
1080  } /* for(j) */
1081  return;
1082  } /* if(FG_PSI) */
1083 
1084  if (ths->flags & PRE_LIN_PSI)
1085  {
1086  sort(ths);
1087 
1088  #pragma omp parallel for default(shared) private(k)
1089  for (k = 0; k<ths->M_total; k++)
1090  {
1091  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1092  INT t, t2; /* index dimensions */
1093  INT l_L; /* index one row of B */
1094  INT l[ths->d]; /* multi index u<=l<=o */
1095  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1096  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1097  R phi_prod[ths->d+1]; /* postfix product of PHI */
1098  R y[ths->d];
1099  R fg_psi[ths->d][2*ths->m+2];
1100  INT l_fg,lj_fg;
1101  R ip_w;
1102  INT ip_u;
1103  INT ip_s = ths->K/(ths->m+2);
1104  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1105 
1106  phi_prod[0] = K(1.0);
1107  ll_plain[0] = 0;
1108 
1109  MACRO_init_uo_l_lj_t;
1110 
1111  for (t2 = 0; t2 < ths->d; t2++)
1112  {
1113  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1114  * ((R)ths->K))/(ths->m+2);
1115  ip_u = LRINT(FLOOR(y[t2]));
1116  ip_w = y[t2]-ip_u;
1117  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1118  {
1119  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1120  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1121  * (ip_w);
1122  }
1123  }
1124 
1125  for (l_L = 0; l_L < lprod; l_L++)
1126  {
1127  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1128 
1129  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1130 
1131  MACRO_count_uo_l_lj_t;
1132  } /* for(l_L) */
1133  } /* for(j) */
1134  return;
1135  } /* if(PRE_LIN_PSI) */
1136 
1137  /* no precomputed psi at all */
1138  sort(ths);
1139 
1140  #pragma omp parallel for default(shared) private(k)
1141  for (k = 0; k < ths->M_total; k++)
1142  {
1143  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1144  INT t, t2; /* index dimensions */
1145  INT l_L; /* index one row of B */
1146  INT l[ths->d]; /* multi index u<=l<=o */
1147  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1148  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1149  R phi_prod[ths->d+1]; /* postfix product of PHI */
1150  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1151 
1152  phi_prod[0] = K(1.0);
1153  ll_plain[0] = 0;
1154 
1155  MACRO_init_uo_l_lj_t;
1156 
1157  for (l_L = 0; l_L < lprod; l_L++)
1158  {
1159  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1160 
1161  ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1162 
1163  MACRO_count_uo_l_lj_t;
1164  } /* for(l_L) */
1165  } /* for(j) */
1166 }
1167 #endif
1168 
1169 static void B_A(X(plan) *ths)
1170 {
1171 #ifdef _OPENMP
1172  B_openmp_A(ths);
1173 #else
1174  B_serial_A(ths);
1175 #endif
1176 }
1177 
1178 #ifdef _OPENMP
1179 
1194 static inline INT index_x_binary_search(const INT *ar_x, const INT len, const INT key)
1195 {
1196  INT left = 0, right = len - 1;
1197 
1198  if (len == 1)
1199  return 0;
1200 
1201  while (left < right - 1)
1202  {
1203  INT i = (left + right) / 2;
1204  if (ar_x[2*i] >= key)
1205  right = i;
1206  else if (ar_x[2*i] < key)
1207  left = i;
1208  }
1209 
1210  if (ar_x[2*left] < key && left != len-1)
1211  return left+1;
1212 
1213  return left;
1214 }
1215 #endif
1216 
1217 #ifdef _OPENMP
1218 
1233 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1234  INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b, const INT d,
1235  const INT *n, const INT m)
1236 {
1237  const INT n0 = n[0];
1238  INT k;
1239  INT nthreads = omp_get_num_threads();
1240  INT nthreads_used = MIN(nthreads, n0);
1241  INT size_per_thread = n0 / nthreads_used;
1242  INT size_left = n0 - size_per_thread * nthreads_used;
1243  INT size_g[nthreads_used];
1244  INT offset_g[nthreads_used];
1245  INT my_id = omp_get_thread_num();
1246  INT n_prod_rest = 1;
1247 
1248  for (k = 1; k < d; k++)
1249  n_prod_rest *= n[k];
1250 
1251  *min_u_a = -1;
1252  *max_u_a = -1;
1253  *min_u_b = -1;
1254  *max_u_b = -1;
1255  *my_u0 = -1;
1256  *my_o0 = -1;
1257 
1258  if (my_id < nthreads_used)
1259  {
1260  const INT m22 = 2 * m + 2;
1261 
1262  offset_g[0] = 0;
1263  for (k = 0; k < nthreads_used; k++)
1264  {
1265  if (k > 0)
1266  offset_g[k] = offset_g[k-1] + size_g[k-1];
1267  size_g[k] = size_per_thread;
1268  if (size_left > 0)
1269  {
1270  size_g[k]++;
1271  size_left--;
1272  }
1273  }
1274 
1275  *my_u0 = offset_g[my_id];
1276  *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1277 
1278  if (nthreads_used > 1)
1279  {
1280  *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1281  *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1282  }
1283  else
1284  {
1285  *min_u_a = 0;
1286  *max_u_a = n_prod_rest * n0 - 1;
1287  }
1288 
1289  if (*min_u_a < 0)
1290  {
1291  *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1292  *max_u_b = n_prod_rest * n0 - 1;
1293  *min_u_a = 0;
1294  }
1295 
1296  if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1297  {
1298  *max_u_a = *max_u_b;
1299  *min_u_b = -1;
1300  *max_u_b = -1;
1301  }
1302 #ifdef OMP_ASSERT
1303  assert(*min_u_a <= *max_u_a);
1304  assert(*min_u_b <= *max_u_b);
1305  assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1306 #endif
1307  }
1308 }
1309 #endif
1310 
1319 static void nfft_adjoint_B_compute_full_psi(C *g, const INT *psi_index_g,
1320  const R *psi, const C *f, const INT M, const INT d, const INT *n,
1321  const INT m, const unsigned flags, const INT *index_x)
1322 {
1323  INT k;
1324  INT lprod;
1325 #ifdef _OPENMP
1326  INT lprod_m1;
1327 #endif
1328 #ifndef _OPENMP
1329  UNUSED(n);
1330 #endif
1331  {
1332  INT t;
1333  for(t = 0, lprod = 1; t < d; t++)
1334  lprod *= 2 * m + 2;
1335  }
1336 #ifdef _OPENMP
1337  lprod_m1 = lprod / (2 * m + 2);
1338 #endif
1339 
1340 #ifdef _OPENMP
1341  if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1342  {
1343  #pragma omp parallel private(k)
1344  {
1345  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1346  const INT *ar_x = index_x;
1347  INT n_prod_rest = 1;
1348 
1349  for (k = 1; k < d; k++)
1350  n_prod_rest *= n[k];
1351 
1352  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1353 
1354  if (min_u_a != -1)
1355  {
1356  k = index_x_binary_search(ar_x, M, min_u_a);
1357 #ifdef OMP_ASSERT
1358  assert(ar_x[2*k] >= min_u_a || k == M-1);
1359  if (k > 0)
1360  assert(ar_x[2*k-2] < min_u_a);
1361 #endif
1362  while (k < M)
1363  {
1364  INT l0, lrest;
1365  INT u_prod = ar_x[2*k];
1366  INT j = ar_x[2*k+1];
1367 
1368  if (u_prod < min_u_a || u_prod > max_u_a)
1369  break;
1370 
1371  for (l0 = 0; l0 < 2 * m + 2; l0++)
1372  {
1373  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1374 
1375  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1376  continue;
1377 
1378  for (lrest = 0; lrest < lprod_m1; lrest++)
1379  {
1380  const INT l = l0 * lprod_m1 + lrest;
1381  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1382  }
1383  }
1384 
1385  k++;
1386  }
1387  }
1388 
1389  if (min_u_b != -1)
1390  {
1391  k = index_x_binary_search(ar_x, M, min_u_b);
1392 #ifdef OMP_ASSERT
1393  assert(ar_x[2*k] >= min_u_b || k == M-1);
1394  if (k > 0)
1395  assert(ar_x[2*k-2] < min_u_b);
1396 #endif
1397  while (k < M)
1398  {
1399  INT l0, lrest;
1400  INT u_prod = ar_x[2*k];
1401  INT j = ar_x[2*k+1];
1402 
1403  if (u_prod < min_u_b || u_prod > max_u_b)
1404  break;
1405 
1406  for (l0 = 0; l0 < 2 * m + 2; l0++)
1407  {
1408  const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1409 
1410  if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1411  continue;
1412  for (lrest = 0; lrest < lprod_m1; lrest++)
1413  {
1414  const INT l = l0 * lprod_m1 + lrest;
1415  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1416  }
1417  }
1418 
1419  k++;
1420  }
1421  }
1422  } /* omp parallel */
1423  return;
1424  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
1425 #endif
1426 
1427 #ifdef _OPENMP
1428  #pragma omp parallel for default(shared) private(k)
1429 #endif
1430  for (k = 0; k < M; k++)
1431  {
1432  INT l;
1433  INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1434 
1435  for (l = 0; l < lprod; l++)
1436  {
1437 #ifdef _OPENMP
1438  C val = psi[j * lprod + l] * f[j];
1439  C *gref = g + psi_index_g[j * lprod + l];
1440  R *gref_real = (R*) gref;
1441 
1442  #pragma omp atomic
1443  gref_real[0] += CREAL(val);
1444 
1445  #pragma omp atomic
1446  gref_real[1] += CIMAG(val);
1447 #else
1448  g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1449 #endif
1450  }
1451  }
1452 }
1453 
1454 #ifndef _OPENMP
1455 MACRO_B(T)
1456 #endif
1457 
1458 #ifdef _OPENMP
1459 static inline void B_openmp_T(X(plan) *ths)
1460 {
1461  INT lprod; /* 'regular bandwidth' of matrix B */
1462  INT k;
1463 
1464  memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
1465 
1466  for (k = 0, lprod = 1; k < ths->d; k++)
1467  lprod *= (2*ths->m+2);
1468 
1469  if (ths->flags & PRE_FULL_PSI)
1470  {
1471  nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1472  ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1473  return;
1474  }
1475 
1476  if (ths->flags & PRE_PSI)
1477  {
1478  #pragma omp parallel for default(shared) private(k)
1479  for (k = 0; k < ths->M_total; k++)
1480  {
1481  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1482  INT t, t2; /* index dimensions */
1483  INT l_L; /* index one row of B */
1484  INT l[ths->d]; /* multi index u<=l<=o */
1485  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1486  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1487  R phi_prod[ths->d+1]; /* postfix product of PHI */
1488  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1489 
1490  phi_prod[0] = K(1.0);
1491  ll_plain[0] = 0;
1492 
1493  MACRO_init_uo_l_lj_t;
1494 
1495  for (l_L = 0; l_L < lprod; l_L++)
1496  {
1497  C *lhs;
1498  R *lhs_real;
1499  C val;
1500 
1501  MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1502 
1503  lhs = ths->g + ll_plain[ths->d];
1504  lhs_real = (R*)lhs;
1505  val = phi_prod[ths->d] * ths->f[j];
1506 
1507  #pragma omp atomic
1508  lhs_real[0] += CREAL(val);
1509 
1510  #pragma omp atomic
1511  lhs_real[1] += CIMAG(val);
1512 
1513  MACRO_count_uo_l_lj_t;
1514  } /* for(l_L) */
1515  } /* for(j) */
1516  return;
1517  } /* if(PRE_PSI) */
1518 
1519  if (ths->flags & PRE_FG_PSI)
1520  {
1521  INT t, t2; /* index dimensions */
1522  R fg_exp_l[ths->d][2*ths->m+2];
1523  for(t2 = 0; t2 < ths->d; t2++)
1524  {
1525  INT lj_fg;
1526  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1527  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1528  R tmp2 = K(1.0);
1529  R tmp3 = K(1.0);
1530  fg_exp_l[t2][0] = K(1.0);
1531  for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1532  {
1533  tmp3 = tmp2*tmpEXP2;
1534  tmp2 *= tmpEXP2sq;
1535  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1536  }
1537  }
1538 
1539  #pragma omp parallel for default(shared) private(k,t,t2)
1540  for (k = 0; k < ths->M_total; k++)
1541  {
1542  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1543  R phi_prod[ths->d+1]; /* postfix product of PHI */
1544  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1545  INT l[ths->d]; /* multi index u<=l<=o */
1546  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1547  R fg_psi[ths->d][2*ths->m+2];
1548  R tmpEXP1, tmp1;
1549  INT l_fg,lj_fg;
1550  INT l_L;
1551  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1552 
1553  phi_prod[0] = K(1.0);
1554  ll_plain[0] = 0;
1555 
1556  MACRO_init_uo_l_lj_t;
1557 
1558  for (t2 = 0; t2 < ths->d; t2++)
1559  {
1560  fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1561  tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1562  tmp1 = K(1.0);
1563  for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1564  {
1565  tmp1 *= tmpEXP1;
1566  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1567  }
1568  }
1569 
1570  for (l_L= 0; l_L < lprod; l_L++)
1571  {
1572  C *lhs;
1573  R *lhs_real;
1574  C val;
1575 
1576  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1577 
1578  lhs = ths->g + ll_plain[ths->d];
1579  lhs_real = (R*)lhs;
1580  val = phi_prod[ths->d] * ths->f[j];
1581 
1582  #pragma omp atomic
1583  lhs_real[0] += CREAL(val);
1584 
1585  #pragma omp atomic
1586  lhs_real[1] += CIMAG(val);
1587 
1588  MACRO_count_uo_l_lj_t;
1589  } /* for(l_L) */
1590  } /* for(j) */
1591  return;
1592  } /* if(PRE_FG_PSI) */
1593 
1594  if (ths->flags & FG_PSI)
1595  {
1596  INT t, t2; /* index dimensions */
1597  R fg_exp_l[ths->d][2*ths->m+2];
1598 
1599  sort(ths);
1600 
1601  for (t2 = 0; t2 < ths->d; t2++)
1602  {
1603  INT lj_fg;
1604  R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1605  R tmpEXP2sq = tmpEXP2*tmpEXP2;
1606  R tmp2 = K(1.0);
1607  R tmp3 = K(1.0);
1608  fg_exp_l[t2][0] = K(1.0);
1609  for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1610  {
1611  tmp3 = tmp2*tmpEXP2;
1612  tmp2 *= tmpEXP2sq;
1613  fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1614  }
1615  }
1616 
1617  #pragma omp parallel for default(shared) private(k,t,t2)
1618  for (k = 0; k < ths->M_total; k++)
1619  {
1620  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1621  R phi_prod[ths->d+1]; /* postfix product of PHI */
1622  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1623  INT l[ths->d]; /* multi index u<=l<=o */
1624  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1625  R fg_psi[ths->d][2*ths->m+2];
1626  R tmpEXP1, tmp1;
1627  INT l_fg,lj_fg;
1628  INT l_L;
1629  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1630 
1631  phi_prod[0] = K(1.0);
1632  ll_plain[0] = 0;
1633 
1634  MACRO_init_uo_l_lj_t;
1635 
1636  for (t2 = 0; t2 < ths->d; t2++)
1637  {
1638  fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1639 
1640  tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1641  /ths->b[t2]);
1642  tmp1 = K(1.0);
1643  for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1644  {
1645  tmp1 *= tmpEXP1;
1646  fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1647  }
1648  }
1649 
1650  for (l_L = 0; l_L < lprod; l_L++)
1651  {
1652  C *lhs;
1653  R *lhs_real;
1654  C val;
1655 
1656  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1657 
1658  lhs = ths->g + ll_plain[ths->d];
1659  lhs_real = (R*)lhs;
1660  val = phi_prod[ths->d] * ths->f[j];
1661 
1662  #pragma omp atomic
1663  lhs_real[0] += CREAL(val);
1664 
1665  #pragma omp atomic
1666  lhs_real[1] += CIMAG(val);
1667 
1668  MACRO_count_uo_l_lj_t;
1669  } /* for(l_L) */
1670  } /* for(j) */
1671  return;
1672  } /* if(FG_PSI) */
1673 
1674  if (ths->flags & PRE_LIN_PSI)
1675  {
1676  sort(ths);
1677 
1678  #pragma omp parallel for default(shared) private(k)
1679  for (k = 0; k<ths->M_total; k++)
1680  {
1681  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1682  INT t, t2; /* index dimensions */
1683  INT l_L; /* index one row of B */
1684  INT l[ths->d]; /* multi index u<=l<=o */
1685  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1686  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1687  R phi_prod[ths->d+1]; /* postfix product of PHI */
1688  R y[ths->d];
1689  R fg_psi[ths->d][2*ths->m+2];
1690  INT l_fg,lj_fg;
1691  R ip_w;
1692  INT ip_u;
1693  INT ip_s = ths->K/(ths->m+2);
1694  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1695 
1696  phi_prod[0] = K(1.0);
1697  ll_plain[0] = 0;
1698 
1699  MACRO_init_uo_l_lj_t;
1700 
1701  for (t2 = 0; t2 < ths->d; t2++)
1702  {
1703  y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1704  * ((R)ths->K))/(ths->m+2);
1705  ip_u = LRINT(FLOOR(y[t2]));
1706  ip_w = y[t2]-ip_u;
1707  for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1708  {
1709  fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1710  * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1711  * (ip_w);
1712  }
1713  }
1714 
1715  for (l_L = 0; l_L < lprod; l_L++)
1716  {
1717  C *lhs;
1718  R *lhs_real;
1719  C val;
1720 
1721  MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1722 
1723  lhs = ths->g + ll_plain[ths->d];
1724  lhs_real = (R*)lhs;
1725  val = phi_prod[ths->d] * ths->f[j];
1726 
1727  #pragma omp atomic
1728  lhs_real[0] += CREAL(val);
1729 
1730  #pragma omp atomic
1731  lhs_real[1] += CIMAG(val);
1732 
1733  MACRO_count_uo_l_lj_t;
1734  } /* for(l_L) */
1735  } /* for(j) */
1736  return;
1737  } /* if(PRE_LIN_PSI) */
1738 
1739  /* no precomputed psi at all */
1740  sort(ths);
1741 
1742  #pragma omp parallel for default(shared) private(k)
1743  for (k = 0; k < ths->M_total; k++)
1744  {
1745  INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
1746  INT t, t2; /* index dimensions */
1747  INT l_L; /* index one row of B */
1748  INT l[ths->d]; /* multi index u<=l<=o */
1749  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
1750  INT ll_plain[ths->d+1]; /* postfix plain index in g */
1751  R phi_prod[ths->d+1]; /* postfix product of PHI */
1752  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1753 
1754  phi_prod[0] = K(1.0);
1755  ll_plain[0] = 0;
1756 
1757  MACRO_init_uo_l_lj_t;
1758 
1759  for (l_L = 0; l_L < lprod; l_L++)
1760  {
1761  C *lhs;
1762  R *lhs_real;
1763  C val;
1764 
1765  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1766 
1767  lhs = ths->g + ll_plain[ths->d];
1768  lhs_real = (R*)lhs;
1769  val = phi_prod[ths->d] * ths->f[j];
1770 
1771  #pragma omp atomic
1772  lhs_real[0] += CREAL(val);
1773 
1774  #pragma omp atomic
1775  lhs_real[1] += CIMAG(val);
1776 
1777  MACRO_count_uo_l_lj_t;
1778  } /* for(l_L) */
1779  } /* for(j) */
1780 }
1781 #endif
1782 
1783 static void B_T(X(plan) *ths)
1784 {
1785 #ifdef _OPENMP
1786  B_openmp_T(ths);
1787 #else
1788  B_serial_T(ths);
1789 #endif
1790 }
1791 
1792 /* ## specialized version for d=1 ########################################### */
1793 
1794 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
1795 {
1796  const INT tmp2 = 2*m+2;
1797  INT l;
1798  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1799 
1800  fg_exp_b0 = EXP(K(-1.0)/b);
1801  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1802  fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1803 
1804  for (l = 1; l < tmp2; l++)
1805  {
1806  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1807  fg_exp_b1 *= fg_exp_b0_sq;
1808  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1809  }
1810 }
1811 
1812 
1813 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
1814  const R *xj, const INT n, const INT m)
1815 {
1816  INT u, o, l;
1817  const C *gj;
1818  const R *psij;
1819  psij = psij_const;
1820 
1821  uo2(&u, &o, *xj, n, m);
1822 
1823  if (u < o)
1824  {
1825  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1826  (*fj) += (*psij++) * (*gj++);
1827  }
1828  else
1829  {
1830  for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1831  (*fj) += (*psij++) * (*gj++);
1832  for (l = 0, gj = g; l <= o; l++)
1833  (*fj) += (*psij++) * (*gj++);
1834  }
1835 }
1836 
1837 #ifndef _OPENMP
1838 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,
1839  const R *psij_const, const R *xj, const INT n, const INT m)
1840 {
1841  INT u,o,l;
1842  C *gj;
1843  const R *psij;
1844  psij = psij_const;
1845 
1846  uo2(&u,&o,*xj, n, m);
1847 
1848  if (u < o)
1849  {
1850  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1851  (*gj++) += (*psij++) * (*fj);
1852  }
1853  else
1854  {
1855  for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1856  (*gj++) += (*psij++) * (*fj);
1857  for (l = 0, gj = g; l <= o; l++)
1858  (*gj++) += (*psij++) * (*fj);
1859  }
1860 }
1861 #endif
1862 
1863 #ifdef _OPENMP
1864 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */
1865 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,
1866  const R *psij_const, const R *xj, const INT n, const INT m)
1867 {
1868  INT u,o,l;
1869  C *gj;
1870  INT index_temp[2*m+2];
1871 
1872  uo2(&u,&o,*xj, n, m);
1873 
1874  for (l=0; l<=2*m+1; l++)
1875  index_temp[l] = (l+u)%n;
1876 
1877  for (l = 0, gj = g+u; l <= 2*m+1; l++)
1878  {
1879  INT i = index_temp[l];
1880  C *lhs = g+i;
1881  R *lhs_real = (R*)lhs;
1882  C val = psij_const[l] * f;
1883  #pragma omp atomic
1884  lhs_real[0] += CREAL(val);
1885 
1886  #pragma omp atomic
1887  lhs_real[1] += CIMAG(val);
1888  }
1889 }
1890 #endif
1891 
1892 #ifdef _OPENMP
1893 
1908 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,
1909  const R *psij_const, const R *xj, const INT n, const INT m,
1910  const INT my_u0, const INT my_o0)
1911 {
1912  INT ar_u,ar_o,l;
1913 
1914  uo2(&ar_u,&ar_o,*xj, n, m);
1915 
1916  if (ar_u < ar_o)
1917  {
1918  INT u = MAX(my_u0,ar_u);
1919  INT o = MIN(my_o0,ar_o);
1920  INT offset_psij = u-ar_u;
1921 #ifdef OMP_ASSERT
1922  assert(offset_psij >= 0);
1923  assert(o-u <= 2*m+1);
1924  assert(offset_psij+o-u <= 2*m+1);
1925 #endif
1926 
1927  for (l = 0; l <= o-u; l++)
1928  g[u+l] += psij_const[offset_psij+l] * f;
1929  }
1930  else
1931  {
1932  INT u = MAX(my_u0,ar_u);
1933  INT o = my_o0;
1934  INT offset_psij = u-ar_u;
1935 #ifdef OMP_ASSERT
1936  assert(offset_psij >= 0);
1937  assert(o-u <= 2*m+1);
1938  assert(offset_psij+o-u <= 2*m+1);
1939 #endif
1940 
1941  for (l = 0; l <= o-u; l++)
1942  g[u+l] += psij_const[offset_psij+l] * f;
1943 
1944  u = my_u0;
1945  o = MIN(my_o0,ar_o);
1946  offset_psij += my_u0-ar_u+n;
1947 
1948 #ifdef OMP_ASSERT
1949  if (u <= o)
1950  {
1951  assert(o-u <= 2*m+1);
1952  if (offset_psij+o-u > 2*m+1)
1953  {
1954  fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
1955  }
1956  assert(offset_psij+o-u <= 2*m+1);
1957  }
1958 #endif
1959  for (l = 0; l <= o-u; l++)
1960  g[u+l] += psij_const[offset_psij+l] * f;
1961  }
1962 }
1963 #endif
1964 
1965 static void nfft_trafo_1d_B(X(plan) *ths)
1966 {
1967  const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
1968  const C *g = (C*)ths->g;
1969 
1970  if (ths->flags & PRE_FULL_PSI)
1971  {
1972  INT k;
1973 #ifdef _OPENMP
1974  #pragma omp parallel for default(shared) private(k)
1975 #endif
1976  for (k = 0; k < M; k++)
1977  {
1978  INT l;
1979  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1980  ths->f[j] = K(0.0);
1981  for (l = 0; l < m2p2; l++)
1982  ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
1983  }
1984  return;
1985  } /* if(PRE_FULL_PSI) */
1986 
1987  if (ths->flags & PRE_PSI)
1988  {
1989  INT k;
1990 #ifdef _OPENMP
1991  #pragma omp parallel for default(shared) private(k)
1992 #endif
1993  for (k = 0; k < M; k++)
1994  {
1995  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1996  nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
1997  &ths->x[j], n, m);
1998  }
1999  return;
2000  } /* if(PRE_PSI) */
2001 
2002  if (ths->flags & PRE_FG_PSI)
2003  {
2004  INT k;
2005  R fg_exp_l[m2p2];
2006 
2007  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2008 
2009 #ifdef _OPENMP
2010  #pragma omp parallel for default(shared) private(k)
2011 #endif
2012  for (k = 0; k < M; k++)
2013  {
2014  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2015  const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2016  R fg_psij2 = K(1.0);
2017  R psij_const[m2p2];
2018  INT l;
2019 
2020  psij_const[0] = fg_psij0;
2021 
2022  for (l = 1; l < m2p2; l++)
2023  {
2024  fg_psij2 *= fg_psij1;
2025  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2026  }
2027 
2028  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2029  }
2030 
2031  return;
2032  } /* if(PRE_FG_PSI) */
2033 
2034  if (ths->flags & FG_PSI)
2035  {
2036  INT k;
2037  R fg_exp_l[m2p2];
2038 
2039  sort(ths);
2040 
2041  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2042 
2043 #ifdef _OPENMP
2044  #pragma omp parallel for default(shared) private(k)
2045 #endif
2046  for (k = 0; k < M; k++)
2047  {
2048  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2049  INT u, o, l;
2050  R fg_psij0, fg_psij1, fg_psij2;
2051  R psij_const[m2p2];
2052 
2053  uo(ths, (INT)j, &u, &o, (INT)0);
2054  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2055  fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2056  fg_psij2 = K(1.0);
2057 
2058  psij_const[0] = fg_psij0;
2059 
2060  for (l = 1; l < m2p2; l++)
2061  {
2062  fg_psij2 *= fg_psij1;
2063  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2064  }
2065 
2066  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2067  }
2068  return;
2069  } /* if(FG_PSI) */
2070 
2071  if (ths->flags & PRE_LIN_PSI)
2072  {
2073  const INT K = ths->K, ip_s = K / (m + 2);
2074  INT k;
2075 
2076  sort(ths);
2077 
2078 #ifdef _OPENMP
2079  #pragma omp parallel for default(shared) private(k)
2080 #endif
2081  for (k = 0; k < M; k++)
2082  {
2083  INT u, o, l;
2084  R ip_y, ip_w;
2085  INT ip_u;
2086  R psij_const[m2p2];
2087  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2088 
2089  uo(ths, (INT)j, &u, &o, (INT)0);
2090 
2091  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2092  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2093  ip_w = ip_y - (R)(ip_u);
2094 
2095  for (l = 0; l < m2p2; l++)
2096  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2097  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2098 
2099  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2100  }
2101  return;
2102  } /* if(PRE_LIN_PSI) */
2103  else
2104  {
2105  /* no precomputed psi at all */
2106  INT k;
2107 
2108  sort(ths);
2109 
2110 #ifdef _OPENMP
2111  #pragma omp parallel for default(shared) private(k)
2112 #endif
2113  for (k = 0; k < M; k++)
2114  {
2115  R psij_const[m2p2];
2116  INT u, o, l;
2117  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2118 
2119  uo(ths, (INT)j, &u, &o, (INT)0);
2120 
2121  for (l = 0; l < m2p2; l++)
2122  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2123 
2124  nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2125  }
2126  }
2127 }
2128 
2129 
2130 #ifdef OMP_ASSERT
2131 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2132 { \
2133  assert(ar_x[2*k] >= min_u_a || k == M-1); \
2134  if (k > 0) \
2135  assert(ar_x[2*k-2] < min_u_a); \
2136 }
2137 #else
2138 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2139 #endif
2140 
2141 #ifdef OMP_ASSERT
2142 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2143 { \
2144  assert(ar_x[2*k] >= min_u_b || k == M-1); \
2145  if (k > 0) \
2146  assert(ar_x[2*k-2] < min_u_b); \
2147 }
2148 #else
2149 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2150 #endif
2151 
2152 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2153 { \
2154  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2155  ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2156 }
2157 
2158 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2159 { \
2160  R psij_const[2 * m + 2]; \
2161  INT u, o, l; \
2162  R fg_psij0 = ths->psi[2 * j]; \
2163  R fg_psij1 = ths->psi[2 * j + 1]; \
2164  R fg_psij2 = K(1.0); \
2165  \
2166  psij_const[0] = fg_psij0; \
2167  for (l = 1; l <= 2 * m + 1; l++) \
2168  { \
2169  fg_psij2 *= fg_psij1; \
2170  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2171  } \
2172  \
2173  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2174  ths->x + j, n, m, my_u0, my_o0); \
2175 }
2176 
2177 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2178 { \
2179  R psij_const[2 * m + 2]; \
2180  R fg_psij0, fg_psij1, fg_psij2; \
2181  INT u, o, l; \
2182  \
2183  uo(ths, j, &u, &o, (INT)0); \
2184  fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \
2185  fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2186  fg_psij2 = K(1.0); \
2187  psij_const[0] = fg_psij0; \
2188  for (l = 1; l <= 2 * m + 1; l++) \
2189  { \
2190  fg_psij2 *= fg_psij1; \
2191  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2192  } \
2193  \
2194  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2195  ths->x + j, n, m, my_u0, my_o0); \
2196 }
2197 
2198 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2199 { \
2200  R psij_const[2 * m + 2]; \
2201  INT ip_u; \
2202  R ip_y, ip_w; \
2203  INT u, o, l; \
2204  \
2205  uo(ths, j, &u, &o, (INT)0); \
2206  \
2207  ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2208  ip_u = LRINT(FLOOR(ip_y)); \
2209  ip_w = ip_y - ip_u; \
2210  for (l = 0; l < 2 * m + 2; l++) \
2211  psij_const[l] \
2212  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2213  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2214  \
2215  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2216  ths->x + j, n, m, my_u0, my_o0); \
2217 }
2218 
2219 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2220 { \
2221  R psij_const[2 * m + 2]; \
2222  INT u, o, l; \
2223  \
2224  uo(ths, j, &u, &o, (INT)0); \
2225  \
2226  for (l = 0; l <= 2 * m + 1; l++) \
2227  psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \
2228  \
2229  nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2230  ths->x + j, n, m, my_u0, my_o0); \
2231 }
2232 
2233 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2234 { \
2235  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2236  { \
2237  _Pragma("omp parallel private(k)") \
2238  { \
2239  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2240  INT *ar_x = ths->index_x; \
2241  \
2242  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2243  &min_u_b, &max_u_b, 1, &n, m); \
2244  \
2245  if (min_u_a != -1) \
2246  { \
2247  k = index_x_binary_search(ar_x, M, min_u_a); \
2248  \
2249  MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2250  \
2251  while (k < M) \
2252  { \
2253  INT u_prod = ar_x[2*k]; \
2254  INT j = ar_x[2*k+1]; \
2255  \
2256  if (u_prod < min_u_a || u_prod > max_u_a) \
2257  break; \
2258  \
2259  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2260  \
2261  k++; \
2262  } \
2263  } \
2264  \
2265  if (min_u_b != -1) \
2266  { \
2267  k = index_x_binary_search(ar_x, M, min_u_b); \
2268  \
2269  MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2270  \
2271  while (k < M) \
2272  { \
2273  INT u_prod = ar_x[2*k]; \
2274  INT j = ar_x[2*k+1]; \
2275  \
2276  if (u_prod < min_u_b || u_prod > max_u_b) \
2277  break; \
2278  \
2279  MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2280  \
2281  k++; \
2282  } \
2283  } \
2284  } /* omp parallel */ \
2285  return; \
2286  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
2287 }
2288 
2289 static void nfft_adjoint_1d_B(X(plan) *ths)
2290 {
2291  const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2292  INT k;
2293  C *g = (C*)ths->g;
2294 
2295  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
2296 
2297  if (ths->flags & PRE_FULL_PSI)
2298  {
2299  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2300  (INT)1, ths->n, m, ths->flags, ths->index_x);
2301  return;
2302  } /* if(PRE_FULL_PSI) */
2303 
2304  if (ths->flags & PRE_PSI)
2305  {
2306 #ifdef _OPENMP
2307  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2308 #endif
2309 
2310 #ifdef _OPENMP
2311  #pragma omp parallel for default(shared) private(k)
2312 #endif
2313  for (k = 0; k < M; k++)
2314  {
2315  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2316 #ifdef _OPENMP
2317  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2318 #else
2319  nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2320 #endif
2321  }
2322 
2323  return;
2324  } /* if(PRE_PSI) */
2325 
2326  if (ths->flags & PRE_FG_PSI)
2327  {
2328  R fg_exp_l[2 * m + 2];
2329 
2330  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2331 
2332 #ifdef _OPENMP
2333  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2334 #endif
2335 
2336 
2337 #ifdef _OPENMP
2338  #pragma omp parallel for default(shared) private(k)
2339 #endif
2340  for (k = 0; k < M; k++)
2341  {
2342  R psij_const[2 * m + 2];
2343  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2344  INT l;
2345  R fg_psij0 = ths->psi[2 * j];
2346  R fg_psij1 = ths->psi[2 * j + 1];
2347  R fg_psij2 = K(1.0);
2348 
2349  psij_const[0] = fg_psij0;
2350  for (l = 1; l <= 2 * m + 1; l++)
2351  {
2352  fg_psij2 *= fg_psij1;
2353  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2354  }
2355 
2356 #ifdef _OPENMP
2357  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2358 #else
2359  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2360 #endif
2361  }
2362 
2363  return;
2364  } /* if(PRE_FG_PSI) */
2365 
2366  if (ths->flags & FG_PSI)
2367  {
2368  R fg_exp_l[2 * m + 2];
2369 
2370  nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2371 
2372  sort(ths);
2373 
2374 #ifdef _OPENMP
2375  MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2376 #endif
2377 
2378 #ifdef _OPENMP
2379  #pragma omp parallel for default(shared) private(k)
2380 #endif
2381  for (k = 0; k < M; k++)
2382  {
2383  INT u,o,l;
2384  R psij_const[2 * m + 2];
2385  R fg_psij0, fg_psij1, fg_psij2;
2386  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2387 
2388  uo(ths, j, &u, &o, (INT)0);
2389  fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2390  fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2391  fg_psij2 = K(1.0);
2392  psij_const[0] = fg_psij0;
2393  for (l = 1; l <= 2 * m + 1; l++)
2394  {
2395  fg_psij2 *= fg_psij1;
2396  psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2397  }
2398 
2399 #ifdef _OPENMP
2400  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2401 #else
2402  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2403 #endif
2404  }
2405 
2406  return;
2407  } /* if(FG_PSI) */
2408 
2409  if (ths->flags & PRE_LIN_PSI)
2410  {
2411  const INT K = ths->K;
2412  const INT ip_s = K / (m + 2);
2413 
2414  sort(ths);
2415 
2416 #ifdef _OPENMP
2417  MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2418 #endif
2419 
2420 #ifdef _OPENMP
2421  #pragma omp parallel for default(shared) private(k)
2422 #endif
2423  for (k = 0; k < M; k++)
2424  {
2425  INT u,o,l;
2426  INT ip_u;
2427  R ip_y, ip_w;
2428  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2429  R psij_const[2 * m + 2];
2430 
2431  uo(ths, j, &u, &o, (INT)0);
2432 
2433  ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2434  ip_u = (INT)(LRINT(FLOOR(ip_y)));
2435  ip_w = ip_y - (R)(ip_u);
2436  for (l = 0; l < 2 * m + 2; l++)
2437  psij_const[l]
2438  = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2439  + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2440 
2441 #ifdef _OPENMP
2442  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2443 #else
2444  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2445 #endif
2446  }
2447  return;
2448  } /* if(PRE_LIN_PSI) */
2449 
2450  /* no precomputed psi at all */
2451  sort(ths);
2452 
2453 #ifdef _OPENMP
2454  MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2455 #endif
2456 
2457 #ifdef _OPENMP
2458  #pragma omp parallel for default(shared) private(k)
2459 #endif
2460  for (k = 0; k < M; k++)
2461  {
2462  INT u,o,l;
2463  R psij_const[2 * m + 2];
2464  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2465 
2466  uo(ths, j, &u, &o, (INT)0);
2467 
2468  for (l = 0; l <= 2 * m + 1; l++)
2469  psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2470 
2471 #ifdef _OPENMP
2472  nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2473 #else
2474  nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2475 #endif
2476  }
2477 }
2478 
2479 void X(trafo_1d)(X(plan) *ths)
2480 {
2481  const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2482  C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2483 
2484  ths->g_hat = ths->g1;
2485  ths->g = ths->g2;
2486 
2487  {
2488  C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2489  R *c_phi_inv1, *c_phi_inv2;
2490 
2491  TIC(0)
2492 #ifdef _OPENMP
2493  {
2494  INT k;
2495  #pragma omp parallel for default(shared) private(k)
2496  for (k = 0; k < ths->n_total; k++)
2497  ths->g_hat[k] = 0.0;
2498  }
2499 #else
2500  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
2501 #endif
2502  if(ths->flags & PRE_PHI_HUT)
2503  {
2504  INT k;
2505  c_phi_inv1 = ths->c_phi_inv[0];
2506  c_phi_inv2 = &ths->c_phi_inv[0][N2];
2507 
2508 #ifdef _OPENMP
2509  #pragma omp parallel for default(shared) private(k)
2510 #endif
2511  for (k = 0; k < N2; k++)
2512  {
2513  g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2514  g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2515  }
2516  }
2517  else
2518  {
2519  INT k;
2520 #ifdef _OPENMP
2521  #pragma omp parallel for default(shared) private(k)
2522 #endif
2523  for (k = 0; k < N2; k++)
2524  {
2525  g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2526  g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2527  }
2528  }
2529  TOC(0)
2530 
2531  TIC_FFTW(1)
2532  FFTW(execute)(ths->my_fftw_plan1);
2533  TOC_FFTW(1);
2534 
2535  TIC(2);
2536  nfft_trafo_1d_B(ths);
2537  TOC(2);
2538  }
2539 }
2540 
2541 void X(adjoint_1d)(X(plan) *ths)
2542 {
2543  INT n,N;
2544  C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2545  R *c_phi_inv1, *c_phi_inv2;
2546 
2547  N=ths->N[0];
2548  n=ths->n[0];
2549 
2550  ths->g_hat=ths->g1;
2551  ths->g=ths->g2;
2552 
2553  f_hat1=(C*)ths->f_hat;
2554  f_hat2=(C*)&ths->f_hat[N/2];
2555  g_hat1=(C*)&ths->g_hat[n-N/2];
2556  g_hat2=(C*)ths->g_hat;
2557 
2558  TIC(2)
2559  nfft_adjoint_1d_B(ths);
2560  TOC(2)
2561 
2562  TIC_FFTW(1)
2563  FFTW(execute)(ths->my_fftw_plan2);
2564  TOC_FFTW(1);
2565 
2566  TIC(0)
2567  if(ths->flags & PRE_PHI_HUT)
2568  {
2569  INT k;
2570  c_phi_inv1=ths->c_phi_inv[0];
2571  c_phi_inv2=&ths->c_phi_inv[0][N/2];
2572 
2573 #ifdef _OPENMP
2574  #pragma omp parallel for default(shared) private(k)
2575 #endif
2576  for (k = 0; k < N/2; k++)
2577  {
2578  f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2579  f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2580  }
2581  }
2582  else
2583  {
2584  INT k;
2585 
2586 #ifdef _OPENMP
2587  #pragma omp parallel for default(shared) private(k)
2588 #endif
2589  for (k = 0; k < N/2; k++)
2590  {
2591  f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2592  f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2593  }
2594  }
2595  TOC(0)
2596 }
2597 
2598 
2599 /* ################################################ SPECIFIC VERSIONS FOR d=2 */
2600 
2601 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
2602 {
2603  INT l;
2604  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2605 
2606  fg_exp_b0 = EXP(K(-1.0)/b);
2607  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2608  fg_exp_b1 = K(1.0);
2609  fg_exp_b2 = K(1.0);
2610  fg_exp_l[0] = K(1.0);
2611  for(l=1; l <= 2*m+1; l++)
2612  {
2613  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2614  fg_exp_b1 *= fg_exp_b0_sq;
2615  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2616  }
2617 }
2618 
2619 static void nfft_trafo_2d_compute(C *fj, const C *g, const R *psij_const0,
2620  const R *psij_const1, const R *xj0, const R *xj1, const INT n0,
2621  const INT n1, const INT m)
2622 {
2623  INT u0,o0,l0,u1,o1,l1;
2624  const C *gj;
2625  const R *psij0,*psij1;
2626 
2627  psij0=psij_const0;
2628  psij1=psij_const1;
2629 
2630  uo2(&u0,&o0,*xj0, n0, m);
2631  uo2(&u1,&o1,*xj1, n1, m);
2632 
2633  *fj=0;
2634 
2635  if (u0 < o0)
2636  if(u1 < o1)
2637  for(l0=0; l0<=2*m+1; l0++,psij0++)
2638  {
2639  psij1=psij_const1;
2640  gj=g+(u0+l0)*n1+u1;
2641  for(l1=0; l1<=2*m+1; l1++)
2642  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2643  }
2644  else
2645  for(l0=0; l0<=2*m+1; l0++,psij0++)
2646  {
2647  psij1=psij_const1;
2648  gj=g+(u0+l0)*n1+u1;
2649  for(l1=0; l1<2*m+1-o1; l1++)
2650  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2651  gj=g+(u0+l0)*n1;
2652  for(l1=0; l1<=o1; l1++)
2653  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2654  }
2655  else
2656  if(u1<o1)
2657  {
2658  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2659  {
2660  psij1=psij_const1;
2661  gj=g+(u0+l0)*n1+u1;
2662  for(l1=0; l1<=2*m+1; l1++)
2663  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2664  }
2665  for(l0=0; l0<=o0; l0++,psij0++)
2666  {
2667  psij1=psij_const1;
2668  gj=g+l0*n1+u1;
2669  for(l1=0; l1<=2*m+1; l1++)
2670  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2671  }
2672  }
2673  else
2674  {
2675  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2676  {
2677  psij1=psij_const1;
2678  gj=g+(u0+l0)*n1+u1;
2679  for(l1=0; l1<2*m+1-o1; l1++)
2680  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2681  gj=g+(u0+l0)*n1;
2682  for(l1=0; l1<=o1; l1++)
2683  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2684  }
2685  for(l0=0; l0<=o0; l0++,psij0++)
2686  {
2687  psij1=psij_const1;
2688  gj=g+l0*n1+u1;
2689  for(l1=0; l1<2*m+1-o1; l1++)
2690  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2691  gj=g+l0*n1;
2692  for(l1=0; l1<=o1; l1++)
2693  (*fj) += (*psij0) * (*psij1++) * (*gj++);
2694  }
2695  }
2696 }
2697 
2698 #ifdef _OPENMP
2699 /* adjoint NFFT two-dimensional case with OpenMP atomic operations */
2700 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
2701  const R *psij_const0, const R *psij_const1, const R *xj0,
2702  const R *xj1, const INT n0, const INT n1, const INT m)
2703 {
2704  INT u0,o0,l0,u1,o1,l1;
2705  const INT lprod = (2*m+2) * (2*m+2);
2706 
2707  INT index_temp0[2*m+2];
2708  INT index_temp1[2*m+2];
2709 
2710  uo2(&u0,&o0,*xj0, n0, m);
2711  uo2(&u1,&o1,*xj1, n1, m);
2712 
2713  for (l0=0; l0<=2*m+1; l0++)
2714  index_temp0[l0] = (u0+l0)%n0;
2715 
2716  for (l1=0; l1<=2*m+1; l1++)
2717  index_temp1[l1] = (u1+l1)%n1;
2718 
2719  for(l0=0; l0<=2*m+1; l0++)
2720  {
2721  for(l1=0; l1<=2*m+1; l1++)
2722  {
2723  INT i = index_temp0[l0] * n1 + index_temp1[l1];
2724  C *lhs = g+i;
2725  R *lhs_real = (R*)lhs;
2726  C val = psij_const0[l0] * psij_const1[l1] * f;
2727 
2728  #pragma omp atomic
2729  lhs_real[0] += CREAL(val);
2730 
2731  #pragma omp atomic
2732  lhs_real[1] += CIMAG(val);
2733  }
2734  }
2735 }
2736 #endif
2737 
2738 #ifdef _OPENMP
2739 
2757 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
2758  const R *psij_const0, const R *psij_const1, const R *xj0,
2759  const R *xj1, const INT n0, const INT n1, const INT m,
2760  const INT my_u0, const INT my_o0)
2761 {
2762  INT ar_u0,ar_o0,l0,u1,o1,l1;
2763  const INT lprod = (2*m+2) * (2*m+2);
2764  INT index_temp1[2*m+2];
2765 
2766  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2767  uo2(&u1,&o1,*xj1, n1, m);
2768 
2769  for (l1 = 0; l1 <= 2*m+1; l1++)
2770  index_temp1[l1] = (u1+l1)%n1;
2771 
2772  if(ar_u0 < ar_o0)
2773  {
2774  INT u0 = MAX(my_u0,ar_u0);
2775  INT o0 = MIN(my_o0,ar_o0);
2776  INT offset_psij = u0-ar_u0;
2777 #ifdef OMP_ASSERT
2778  assert(offset_psij >= 0);
2779  assert(o0-u0 <= 2*m+1);
2780  assert(offset_psij+o0-u0 <= 2*m+1);
2781 #endif
2782 
2783  for (l0 = 0; l0 <= o0-u0; l0++)
2784  {
2785  INT i0 = (u0+l0) * n1;
2786  const C val0 = psij_const0[offset_psij+l0];
2787 
2788  for(l1=0; l1<=2*m+1; l1++)
2789  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2790  }
2791  }
2792  else
2793  {
2794  INT u0 = MAX(my_u0,ar_u0);
2795  INT o0 = my_o0;
2796  INT offset_psij = u0-ar_u0;
2797 #ifdef OMP_ASSERT
2798  assert(offset_psij >= 0);
2799  assert(o0-u0 <= 2*m+1);
2800  assert(offset_psij+o0-u0 <= 2*m+1);
2801 #endif
2802 
2803  for (l0 = 0; l0 <= o0-u0; l0++)
2804  {
2805  INT i0 = (u0+l0) * n1;
2806  const C val0 = psij_const0[offset_psij+l0];
2807 
2808  for(l1=0; l1<=2*m+1; l1++)
2809  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2810  }
2811 
2812  u0 = my_u0;
2813  o0 = MIN(my_o0,ar_o0);
2814  offset_psij += my_u0-ar_u0+n0;
2815 
2816 #ifdef OMP_ASSERT
2817  if (u0<=o0)
2818  {
2819  assert(o0-u0 <= 2*m+1);
2820  assert(offset_psij+o0-u0 <= 2*m+1);
2821  }
2822 #endif
2823 
2824  for (l0 = 0; l0 <= o0-u0; l0++)
2825  {
2826  INT i0 = (u0+l0) * n1;
2827  const C val0 = psij_const0[offset_psij+l0];
2828 
2829  for(l1=0; l1<=2*m+1; l1++)
2830  g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2831  }
2832  }
2833 }
2834 #endif
2835 
2836 #ifndef _OPENMP
2837 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
2838  const R *psij_const0, const R *psij_const1, const R *xj0,
2839  const R *xj1, const INT n0, const INT n1, const INT m)
2840 {
2841  INT u0,o0,l0,u1,o1,l1;
2842  C *gj;
2843  const R *psij0,*psij1;
2844 
2845  psij0=psij_const0;
2846  psij1=psij_const1;
2847 
2848  uo2(&u0,&o0,*xj0, n0, m);
2849  uo2(&u1,&o1,*xj1, n1, m);
2850 
2851  if(u0<o0)
2852  if(u1<o1)
2853  for(l0=0; l0<=2*m+1; l0++,psij0++)
2854  {
2855  psij1=psij_const1;
2856  gj=g+(u0+l0)*n1+u1;
2857  for(l1=0; l1<=2*m+1; l1++)
2858  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2859  }
2860  else
2861  for(l0=0; l0<=2*m+1; l0++,psij0++)
2862  {
2863  psij1=psij_const1;
2864  gj=g+(u0+l0)*n1+u1;
2865  for(l1=0; l1<2*m+1-o1; l1++)
2866  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2867  gj=g+(u0+l0)*n1;
2868  for(l1=0; l1<=o1; l1++)
2869  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2870  }
2871  else
2872  if(u1<o1)
2873  {
2874  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2875  {
2876  psij1=psij_const1;
2877  gj=g+(u0+l0)*n1+u1;
2878  for(l1=0; l1<=2*m+1; l1++)
2879  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2880  }
2881  for(l0=0; l0<=o0; l0++,psij0++)
2882  {
2883  psij1=psij_const1;
2884  gj=g+l0*n1+u1;
2885  for(l1=0; l1<=2*m+1; l1++)
2886  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2887  }
2888  }
2889  else
2890  {
2891  for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2892  {
2893  psij1=psij_const1;
2894  gj=g+(u0+l0)*n1+u1;
2895  for(l1=0; l1<2*m+1-o1; l1++)
2896  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2897  gj=g+(u0+l0)*n1;
2898  for(l1=0; l1<=o1; l1++)
2899  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2900  }
2901  for(l0=0; l0<=o0; l0++,psij0++)
2902  {
2903  psij1=psij_const1;
2904  gj=g+l0*n1+u1;
2905  for(l1=0; l1<2*m+1-o1; l1++)
2906  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2907  gj=g+l0*n1;
2908  for(l1=0; l1<=o1; l1++)
2909  (*gj++) += (*psij0) * (*psij1++) * (*fj);
2910  }
2911  }
2912 }
2913 #endif
2914 
2915 static void nfft_trafo_2d_B(X(plan) *ths)
2916 {
2917  const C *g = (C*)ths->g;
2918  const INT n0 = ths->n[0];
2919  const INT n1 = ths->n[1];
2920  const INT M = ths->M_total;
2921  const INT m = ths->m;
2922 
2923  INT k;
2924 
2925  if(ths->flags & PRE_FULL_PSI)
2926  {
2927  const INT lprod = (2*m+2) * (2*m+2);
2928 #ifdef _OPENMP
2929  #pragma omp parallel for default(shared) private(k)
2930 #endif
2931  for (k = 0; k < M; k++)
2932  {
2933  INT l;
2934  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2935  ths->f[j] = K(0.0);
2936  for (l = 0; l < lprod; l++)
2937  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
2938  }
2939  return;
2940  } /* if(PRE_FULL_PSI) */
2941 
2942  if(ths->flags & PRE_PSI)
2943  {
2944 #ifdef _OPENMP
2945  #pragma omp parallel for default(shared) private(k)
2946 #endif
2947  for (k = 0; k < M; k++)
2948  {
2949  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2950  nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2951  }
2952 
2953  return;
2954  } /* if(PRE_PSI) */
2955 
2956  if(ths->flags & PRE_FG_PSI)
2957  {
2958  R fg_exp_l[2*(2*m+2)];
2959 
2960  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2961  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
2962 
2963 #ifdef _OPENMP
2964  #pragma omp parallel for default(shared) private(k)
2965 #endif
2966  for (k = 0; k < M; k++)
2967  {
2968  R psij_const[2*(2*m+2)];
2969  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2970  INT l;
2971  R fg_psij0 = ths->psi[2*j*2];
2972  R fg_psij1 = ths->psi[2*j*2+1];
2973  R fg_psij2 = K(1.0);
2974 
2975  psij_const[0] = fg_psij0;
2976  for (l = 1; l <= 2*m+1; l++)
2977  {
2978  fg_psij2 *= fg_psij1;
2979  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
2980  }
2981 
2982  fg_psij0 = ths->psi[2*(j*2+1)];
2983  fg_psij1 = ths->psi[2*(j*2+1)+1];
2984  fg_psij2 = K(1.0);
2985  psij_const[2*m+2] = fg_psij0;
2986  for (l = 1; l <= 2*m+1; l++)
2987  {
2988  fg_psij2 *= fg_psij1;
2989  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
2990  }
2991 
2992  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2993  }
2994 
2995  return;
2996  } /* if(PRE_FG_PSI) */
2997 
2998  if(ths->flags & FG_PSI)
2999  {
3000  R fg_exp_l[2*(2*m+2)];
3001 
3002  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3003  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3004 
3005  sort(ths);
3006 
3007 #ifdef _OPENMP
3008  #pragma omp parallel for default(shared) private(k)
3009 #endif
3010  for (k = 0; k < M; k++)
3011  {
3012  INT u, o, l;
3013  R fg_psij0, fg_psij1, fg_psij2;
3014  R psij_const[2*(2*m+2)];
3015  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3016 
3017  uo(ths, j, &u, &o, (INT)0);
3018  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3019  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3020  fg_psij2 = K(1.0);
3021  psij_const[0] = fg_psij0;
3022  for (l = 1; l <= 2*m+1; l++)
3023  {
3024  fg_psij2 *= fg_psij1;
3025  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3026  }
3027 
3028  uo(ths,j,&u,&o, (INT)1);
3029  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3030  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3031  fg_psij2 = K(1.0);
3032  psij_const[2*m+2] = fg_psij0;
3033  for(l=1; l<=2*m+1; l++)
3034  {
3035  fg_psij2 *= fg_psij1;
3036  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3037  }
3038 
3039  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3040  }
3041 
3042  return;
3043  } /* if(FG_PSI) */
3044 
3045  if(ths->flags & PRE_LIN_PSI)
3046  {
3047  const INT K = ths->K, ip_s = K / (m + 2);
3048 
3049  sort(ths);
3050 
3051 #ifdef _OPENMP
3052  #pragma omp parallel for default(shared) private(k)
3053 #endif
3054  for (k = 0; k < M; k++)
3055  {
3056  INT u, o, l;
3057  R ip_y, ip_w;
3058  INT ip_u;
3059  R psij_const[2*(2*m+2)];
3060  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3061 
3062  uo(ths,j,&u,&o,(INT)0);
3063  ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3064  ip_u = (INT)LRINT(FLOOR(ip_y));
3065  ip_w = ip_y - (R)(ip_u);
3066  for (l = 0; l < 2*m+2; l++)
3067  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3068 
3069  uo(ths,j,&u,&o,(INT)1);
3070  ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3071  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3072  ip_w = ip_y - (R)(ip_u);
3073  for (l = 0; l < 2*m+2; l++)
3074  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3075 
3076  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3077  }
3078  return;
3079  } /* if(PRE_LIN_PSI) */
3080 
3081  /* no precomputed psi at all */
3082 
3083  sort(ths);
3084 
3085 #ifdef _OPENMP
3086  #pragma omp parallel for default(shared) private(k)
3087 #endif
3088  for (k = 0; k < M; k++)
3089  {
3090  R psij_const[2*(2*m+2)];
3091  INT u, o, l;
3092  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3093 
3094  uo(ths,j,&u,&o,(INT)0);
3095  for (l = 0; l <= 2*m+1; l++)
3096  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3097 
3098  uo(ths,j,&u,&o,(INT)1);
3099  for (l = 0; l <= 2*m+1; l++)
3100  psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3101 
3102  nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3103  }
3104 }
3105 
3106 #ifdef OMP_ASSERT
3107 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3108 { \
3109  assert(ar_x[2*k] >= min_u_a || k == M-1); \
3110  if (k > 0) \
3111  assert(ar_x[2*k-2] < min_u_a); \
3112 }
3113 #else
3114 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3115 #endif
3116 
3117 #ifdef OMP_ASSERT
3118 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3119 { \
3120  assert(ar_x[2*k] >= min_u_b || k == M-1); \
3121  if (k > 0) \
3122  assert(ar_x[2*k-2] < min_u_b); \
3123 }
3124 #else
3125 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3126 #endif
3127 
3128 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3129  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3130  ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3131  ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3132 
3133 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3134 { \
3135  R psij_const[2*(2*m+2)]; \
3136  INT u, o, l; \
3137  R fg_psij0 = ths->psi[2*j*2]; \
3138  R fg_psij1 = ths->psi[2*j*2+1]; \
3139  R fg_psij2 = K(1.0); \
3140  \
3141  psij_const[0] = fg_psij0; \
3142  for(l=1; l<=2*m+1; l++) \
3143  { \
3144  fg_psij2 *= fg_psij1; \
3145  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3146  } \
3147  \
3148  fg_psij0 = ths->psi[2*(j*2+1)]; \
3149  fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3150  fg_psij2 = K(1.0); \
3151  psij_const[2*m+2] = fg_psij0; \
3152  for(l=1; l<=2*m+1; l++) \
3153  { \
3154  fg_psij2 *= fg_psij1; \
3155  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3156  } \
3157  \
3158  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3159  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3160  n0, n1, m, my_u0, my_o0); \
3161 }
3162 
3163 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3164 { \
3165  R psij_const[2*(2*m+2)]; \
3166  R fg_psij0, fg_psij1, fg_psij2; \
3167  INT u, o, l; \
3168  \
3169  uo(ths,j,&u,&o,(INT)0); \
3170  fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \
3171  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3172  fg_psij2 = K(1.0); \
3173  psij_const[0] = fg_psij0; \
3174  for(l=1; l<=2*m+1; l++) \
3175  { \
3176  fg_psij2 *= fg_psij1; \
3177  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3178  } \
3179  \
3180  uo(ths,j,&u,&o,(INT)1); \
3181  fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \
3182  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3183  fg_psij2 = K(1.0); \
3184  psij_const[2*m+2] = fg_psij0; \
3185  for(l=1; l<=2*m+1; l++) \
3186  { \
3187  fg_psij2 *= fg_psij1; \
3188  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3189  } \
3190  \
3191  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3192  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3193  n0, n1, m, my_u0, my_o0); \
3194 }
3195 
3196 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3197 { \
3198  R psij_const[2*(2*m+2)]; \
3199  INT u, o, l; \
3200  INT ip_u; \
3201  R ip_y, ip_w; \
3202  \
3203  uo(ths,j,&u,&o,(INT)0); \
3204  ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3205  ip_u = LRINT(FLOOR(ip_y)); \
3206  ip_w = ip_y-ip_u; \
3207  for(l=0; l < 2*m+2; l++) \
3208  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3209  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3210  \
3211  uo(ths,j,&u,&o,(INT)1); \
3212  ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3213  ip_u = LRINT(FLOOR(ip_y)); \
3214  ip_w = ip_y-ip_u; \
3215  for(l=0; l < 2*m+2; l++) \
3216  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3217  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3218  \
3219  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3220  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3221  n0, n1, m, my_u0, my_o0); \
3222 }
3223 
3224 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3225 { \
3226  R psij_const[2*(2*m+2)]; \
3227  INT u, o, l; \
3228  \
3229  uo(ths,j,&u,&o,(INT)0); \
3230  for(l=0;l<=2*m+1;l++) \
3231  psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \
3232  \
3233  uo(ths,j,&u,&o,(INT)1); \
3234  for(l=0;l<=2*m+1;l++) \
3235  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3236  \
3237  nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3238  psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3239  n0, n1, m, my_u0, my_o0); \
3240 }
3241 
3242 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3243 { \
3244  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3245  { \
3246  _Pragma("omp parallel private(k)") \
3247  { \
3248  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3249  INT *ar_x = ths->index_x; \
3250  \
3251  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3252  &min_u_b, &max_u_b, 2, ths->n, m); \
3253  \
3254  if (min_u_a != -1) \
3255  { \
3256  k = index_x_binary_search(ar_x, M, min_u_a); \
3257  \
3258  MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3259  \
3260  while (k < M) \
3261  { \
3262  INT u_prod = ar_x[2*k]; \
3263  INT j = ar_x[2*k+1]; \
3264  \
3265  if (u_prod < min_u_a || u_prod > max_u_a) \
3266  break; \
3267  \
3268  MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3269  \
3270  k++; \
3271  } \
3272  } \
3273  \
3274  if (min_u_b != -1) \
3275  { \
3276  INT k = index_x_binary_search(ar_x, M, min_u_b); \
3277  \
3278  MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3279  \
3280  while (k < M) \
3281  { \
3282  INT u_prod = ar_x[2*k]; \
3283  INT j = ar_x[2*k+1]; \
3284  \
3285  if (u_prod < min_u_b || u_prod > max_u_b) \
3286  break; \
3287  \
3288  MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3289  \
3290  k++; \
3291  } \
3292  } \
3293  } /* omp parallel */ \
3294  return; \
3295  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
3296 }
3297 
3298 
3299 static void nfft_adjoint_2d_B(X(plan) *ths)
3300 {
3301  const INT n0 = ths->n[0];
3302  const INT n1 = ths->n[1];
3303  const INT M = ths->M_total;
3304  const INT m = ths->m;
3305  C* g = (C*) ths->g;
3306  INT k;
3307 
3308  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
3309 
3310  if(ths->flags & PRE_FULL_PSI)
3311  {
3312  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3313  (INT)2, ths->n, m, ths->flags, ths->index_x);
3314  return;
3315  } /* if(PRE_FULL_PSI) */
3316 
3317  if(ths->flags & PRE_PSI)
3318  {
3319 #ifdef _OPENMP
3320  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3321 #endif
3322 
3323 #ifdef _OPENMP
3324  #pragma omp parallel for default(shared) private(k)
3325 #endif
3326  for (k = 0; k < M; k++)
3327  {
3328  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3329 #ifdef _OPENMP
3330  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3331 #else
3332  nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3333 #endif
3334  }
3335  return;
3336  } /* if(PRE_PSI) */
3337 
3338  if(ths->flags & PRE_FG_PSI)
3339  {
3340  R fg_exp_l[2*(2*m+2)];
3341 
3342  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3343  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3344 
3345 #ifdef _OPENMP
3346  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3347 #endif
3348 
3349 
3350 #ifdef _OPENMP
3351  #pragma omp parallel for default(shared) private(k)
3352 #endif
3353  for (k = 0; k < M; k++)
3354  {
3355  R psij_const[2*(2*m+2)];
3356  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3357  INT l;
3358  R fg_psij0 = ths->psi[2*j*2];
3359  R fg_psij1 = ths->psi[2*j*2+1];
3360  R fg_psij2 = K(1.0);
3361 
3362  psij_const[0] = fg_psij0;
3363  for(l=1; l<=2*m+1; l++)
3364  {
3365  fg_psij2 *= fg_psij1;
3366  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3367  }
3368 
3369  fg_psij0 = ths->psi[2*(j*2+1)];
3370  fg_psij1 = ths->psi[2*(j*2+1)+1];
3371  fg_psij2 = K(1.0);
3372  psij_const[2*m+2] = fg_psij0;
3373  for(l=1; l<=2*m+1; l++)
3374  {
3375  fg_psij2 *= fg_psij1;
3376  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3377  }
3378 
3379 #ifdef _OPENMP
3380  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3381 #else
3382  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3383 #endif
3384  }
3385 
3386  return;
3387  } /* if(PRE_FG_PSI) */
3388 
3389  if(ths->flags & FG_PSI)
3390  {
3391  R fg_exp_l[2*(2*m+2)];
3392 
3393  nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3394  nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3395 
3396  sort(ths);
3397 
3398 #ifdef _OPENMP
3399  MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3400 #endif
3401 
3402 #ifdef _OPENMP
3403  #pragma omp parallel for default(shared) private(k)
3404 #endif
3405  for (k = 0; k < M; k++)
3406  {
3407  INT u, o, l;
3408  R fg_psij0, fg_psij1, fg_psij2;
3409  R psij_const[2*(2*m+2)];
3410  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3411 
3412  uo(ths,j,&u,&o,(INT)0);
3413  fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3414  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3415  fg_psij2 = K(1.0);
3416  psij_const[0] = fg_psij0;
3417  for(l=1; l<=2*m+1; l++)
3418  {
3419  fg_psij2 *= fg_psij1;
3420  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3421  }
3422 
3423  uo(ths,j,&u,&o,(INT)1);
3424  fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3425  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3426  fg_psij2 = K(1.0);
3427  psij_const[2*m+2] = fg_psij0;
3428  for(l=1; l<=2*m+1; l++)
3429  {
3430  fg_psij2 *= fg_psij1;
3431  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3432  }
3433 
3434 #ifdef _OPENMP
3435  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3436 #else
3437  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3438 #endif
3439  }
3440 
3441  return;
3442  } /* if(FG_PSI) */
3443 
3444  if(ths->flags & PRE_LIN_PSI)
3445  {
3446  const INT K = ths->K;
3447  const INT ip_s = K / (m + 2);
3448 
3449  sort(ths);
3450 
3451 #ifdef _OPENMP
3452  MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3453 #endif
3454 
3455 #ifdef _OPENMP
3456  #pragma omp parallel for default(shared) private(k)
3457 #endif
3458  for (k = 0; k < M; k++)
3459  {
3460  INT u,o,l;
3461  INT ip_u;
3462  R ip_y, ip_w;
3463  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3464  R psij_const[2*(2*m+2)];
3465 
3466  uo(ths,j,&u,&o,(INT)0);
3467  ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3468  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3469  ip_w = ip_y - (R)(ip_u);
3470  for(l=0; l < 2*m+2; l++)
3471  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3472  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3473 
3474  uo(ths,j,&u,&o,(INT)1);
3475  ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3476  ip_u = (INT)(LRINT(FLOOR(ip_y)));
3477  ip_w = ip_y - (R)(ip_u);
3478  for(l=0; l < 2*m+2; l++)
3479  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3480  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3481 
3482 #ifdef _OPENMP
3483  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3484 #else
3485  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3486 #endif
3487  }
3488  return;
3489  } /* if(PRE_LIN_PSI) */
3490 
3491  /* no precomputed psi at all */
3492  sort(ths);
3493 
3494 #ifdef _OPENMP
3495  MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3496 #endif
3497 
3498 #ifdef _OPENMP
3499  #pragma omp parallel for default(shared) private(k)
3500 #endif
3501  for (k = 0; k < M; k++)
3502  {
3503  INT u,o,l;
3504  R psij_const[2*(2*m+2)];
3505  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3506 
3507  uo(ths,j,&u,&o,(INT)0);
3508  for(l=0;l<=2*m+1;l++)
3509  psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3510 
3511  uo(ths,j,&u,&o,(INT)1);
3512  for(l=0;l<=2*m+1;l++)
3513  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3514 
3515 #ifdef _OPENMP
3516  nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3517 #else
3518  nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3519 #endif
3520  }
3521 }
3522 
3523 
3524 void X(trafo_2d)(X(plan) *ths)
3525 {
3526  INT k0,k1,n0,n1,N0,N1;
3527  C *g_hat,*f_hat;
3528  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3529  R ck01, ck02, ck11, ck12;
3530  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3531 
3532  ths->g_hat=ths->g1;
3533  ths->g=ths->g2;
3534 
3535  N0=ths->N[0];
3536  N1=ths->N[1];
3537  n0=ths->n[0];
3538  n1=ths->n[1];
3539 
3540  f_hat=(C*)ths->f_hat;
3541  g_hat=(C*)ths->g_hat;
3542 
3543  TIC(0)
3544 #ifdef _OPENMP
3545  #pragma omp parallel for default(shared) private(k0)
3546  for (k0 = 0; k0 < ths->n_total; k0++)
3547  ths->g_hat[k0] = 0.0;
3548 #else
3549  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
3550 #endif
3551  if(ths->flags & PRE_PHI_HUT)
3552  {
3553  c_phi_inv01=ths->c_phi_inv[0];
3554  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3555 
3556 #ifdef _OPENMP
3557  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3558 #endif
3559  for(k0=0;k0<N0/2;k0++)
3560  {
3561  ck01=c_phi_inv01[k0];
3562  ck02=c_phi_inv02[k0];
3563 
3564  c_phi_inv11=ths->c_phi_inv[1];
3565  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3566 
3567  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3568  f_hat11=f_hat + k0*N1;
3569  g_hat21=g_hat + k0*n1+n1-(N1/2);
3570  f_hat21=f_hat + ((N0/2)+k0)*N1;
3571  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3572  f_hat12=f_hat + k0*N1+(N1/2);
3573  g_hat22=g_hat + k0*n1;
3574  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3575 
3576  for(k1=0;k1<N1/2;k1++)
3577  {
3578  ck11=c_phi_inv11[k1];
3579  ck12=c_phi_inv12[k1];
3580 
3581  g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3582  g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3583  g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3584  g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3585  }
3586  }
3587  }
3588  else
3589 #ifdef _OPENMP
3590  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3591 #endif
3592  for(k0=0;k0<N0/2;k0++)
3593  {
3594  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3595  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3596  for(k1=0;k1<N1/2;k1++)
3597  {
3598  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3599  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3600  g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3601  g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3602  g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3603  g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3604  }
3605  }
3606 
3607  TOC(0)
3608 
3609  TIC_FFTW(1)
3610  FFTW(execute)(ths->my_fftw_plan1);
3611  TOC_FFTW(1);
3612 
3613  TIC(2);
3614  nfft_trafo_2d_B(ths);
3615  TOC(2);
3616 }
3617 
3618 void X(adjoint_2d)(X(plan) *ths)
3619 {
3620  INT k0,k1,n0,n1,N0,N1;
3621  C *g_hat,*f_hat;
3622  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3623  R ck01, ck02, ck11, ck12;
3624  C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3625 
3626  ths->g_hat=ths->g1;
3627  ths->g=ths->g2;
3628 
3629  N0=ths->N[0];
3630  N1=ths->N[1];
3631  n0=ths->n[0];
3632  n1=ths->n[1];
3633 
3634  f_hat=(C*)ths->f_hat;
3635  g_hat=(C*)ths->g_hat;
3636 
3637  TIC(2);
3638  nfft_adjoint_2d_B(ths);
3639  TOC(2);
3640 
3641  TIC_FFTW(1)
3642  FFTW(execute)(ths->my_fftw_plan2);
3643  TOC_FFTW(1);
3644 
3645  TIC(0)
3646  if(ths->flags & PRE_PHI_HUT)
3647  {
3648  c_phi_inv01=ths->c_phi_inv[0];
3649  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3650 
3651 #ifdef _OPENMP
3652  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3653 #endif
3654  for(k0=0;k0<N0/2;k0++)
3655  {
3656  ck01=c_phi_inv01[k0];
3657  ck02=c_phi_inv02[k0];
3658 
3659  c_phi_inv11=ths->c_phi_inv[1];
3660  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3661 
3662  g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3663  f_hat11=f_hat + k0*N1;
3664  g_hat21=g_hat + k0*n1+n1-(N1/2);
3665  f_hat21=f_hat + ((N0/2)+k0)*N1;
3666  g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3667  f_hat12=f_hat + k0*N1+(N1/2);
3668  g_hat22=g_hat + k0*n1;
3669  f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3670 
3671  for(k1=0;k1<N1/2;k1++)
3672  {
3673  ck11=c_phi_inv11[k1];
3674  ck12=c_phi_inv12[k1];
3675 
3676  f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3677  f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3678  f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3679  f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3680  }
3681  }
3682  }
3683  else
3684 #ifdef _OPENMP
3685  #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3686 #endif
3687  for(k0=0;k0<N0/2;k0++)
3688  {
3689  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3690  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3691  for(k1=0;k1<N1/2;k1++)
3692  {
3693  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3694  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3695  f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3696  f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3697  f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3698  f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3699  }
3700  }
3701  TOC(0)
3702 }
3703 
3704 /* ################################################ SPECIFIC VERSIONS FOR d=3 */
3705 
3706 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
3707 {
3708  INT l;
3709  R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3710 
3711  fg_exp_b0 = EXP(-K(1.0) / b);
3712  fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3713  fg_exp_b1 = K(1.0);
3714  fg_exp_b2 = K(1.0);
3715  fg_exp_l[0] = K(1.0);
3716  for(l=1; l <= 2*m+1; l++)
3717  {
3718  fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3719  fg_exp_b1 *= fg_exp_b0_sq;
3720  fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3721  }
3722 }
3723 
3724 static void nfft_trafo_3d_compute(C *fj, const C *g, const R *psij_const0,
3725  const R *psij_const1, const R *psij_const2, const R *xj0, const R *xj1,
3726  const R *xj2, const INT n0, const INT n1, const INT n2, const INT m)
3727 {
3728  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
3729  const C *gj;
3730  const R *psij0, *psij1, *psij2;
3731 
3732  psij0 = psij_const0;
3733  psij1 = psij_const1;
3734  psij2 = psij_const2;
3735 
3736  uo2(&u0, &o0, *xj0, n0, m);
3737  uo2(&u1, &o1, *xj1, n1, m);
3738  uo2(&u2, &o2, *xj2, n2, m);
3739 
3740  *fj = 0;
3741 
3742  if (u0 < o0)
3743  if (u1 < o1)
3744  if (u2 < o2)
3745  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3746  {
3747  psij1 = psij_const1;
3748  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3749  {
3750  psij2 = psij_const2;
3751  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3752  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3753  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3754  }
3755  }
3756  else
3757  /* asserts (u2>o2)*/
3758  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3759  {
3760  psij1 = psij_const1;
3761  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3762  {
3763  psij2 = psij_const2;
3764  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3765  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3766  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3767  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3768  for (l2 = 0; l2 <= o2; l2++)
3769  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3770  }
3771  }
3772  else /* asserts (u1>o1)*/
3773  if (u2 < o2)
3774  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3775  {
3776  psij1 = psij_const1;
3777  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3778  {
3779  psij2 = psij_const2;
3780  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3781  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3782  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3783  }
3784  for (l1 = 0; l1 <= o1; l1++, psij1++)
3785  {
3786  psij2 = psij_const2;
3787  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3788  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3789  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3790  }
3791  }
3792  else/* asserts (u2>o2) */
3793  {
3794  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3795  {
3796  psij1 = psij_const1;
3797  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3798  {
3799  psij2 = psij_const2;
3800  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3801  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3802  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3803  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3804  for (l2 = 0; l2 <= o2; l2++)
3805  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3806  }
3807  for (l1 = 0; l1 <= o1; l1++, psij1++)
3808  {
3809  psij2 = psij_const2;
3810  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3811  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3812  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3813  gj = g + ((u0 + l0) * n1 + l1) * n2;
3814  for (l2 = 0; l2 <= o2; l2++)
3815  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3816  }
3817  }
3818  }
3819  else /* asserts (u0>o0) */
3820  if (u1 < o1)
3821  if (u2 < o2)
3822  {
3823  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3824  {
3825  psij1 = psij_const1;
3826  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3827  {
3828  psij2 = psij_const2;
3829  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3830  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3831  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3832  }
3833  }
3834 
3835  for (l0 = 0; l0 <= o0; l0++, psij0++)
3836  {
3837  psij1 = psij_const1;
3838  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3839  {
3840  psij2 = psij_const2;
3841  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3842  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3843  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3844  }
3845  }
3846  } else/* asserts (u2>o2) */
3847  {
3848  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3849  {
3850  psij1 = psij_const1;
3851  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3852  {
3853  psij2 = psij_const2;
3854  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3855  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3856  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3857  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3858  for (l2 = 0; l2 <= o2; l2++)
3859  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3860  }
3861  }
3862 
3863  for (l0 = 0; l0 <= o0; l0++, psij0++)
3864  {
3865  psij1 = psij_const1;
3866  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3867  {
3868  psij2 = psij_const2;
3869  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3870  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3871  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3872  gj = g + (l0 * n1 + (u1 + l1)) * n2;
3873  for (l2 = 0; l2 <= o2; l2++)
3874  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3875  }
3876  }
3877  }
3878  else /* asserts (u1>o1) */
3879  if (u2 < o2)
3880  {
3881  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3882  {
3883  psij1 = psij_const1;
3884  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3885  {
3886  psij2 = psij_const2;
3887  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3888  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3889  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3890  }
3891  for (l1 = 0; l1 <= o1; l1++, psij1++)
3892  {
3893  psij2 = psij_const2;
3894  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3895  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3896  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3897  }
3898  }
3899  for (l0 = 0; l0 <= o0; l0++, psij0++)
3900  {
3901  psij1 = psij_const1;
3902  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3903  {
3904  psij2 = psij_const2;
3905  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3906  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3907  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3908  }
3909  for (l1 = 0; l1 <= o1; l1++, psij1++)
3910  {
3911  psij2 = psij_const2;
3912  gj = g + (l0 * n1 + l1) * n2 + u2;
3913  for (l2 = 0; l2 <= 2 * m + 1; l2++)
3914  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3915  }
3916  }
3917  } else/* asserts (u2>o2) */
3918  {
3919  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3920  {
3921  psij1 = psij_const1;
3922  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3923  {
3924  psij2 = psij_const2;
3925  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3926  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3927  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3928  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3929  for (l2 = 0; l2 <= o2; l2++)
3930  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3931  }
3932  for (l1 = 0; l1 <= o1; l1++, psij1++)
3933  {
3934  psij2 = psij_const2;
3935  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3936  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3937  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3938  gj = g + ((u0 + l0) * n1 + l1) * n2;
3939  for (l2 = 0; l2 <= o2; l2++)
3940  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3941  }
3942  }
3943 
3944  for (l0 = 0; l0 <= o0; l0++, psij0++)
3945  {
3946  psij1 = psij_const1;
3947  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3948  {
3949  psij2 = psij_const2;
3950  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3951  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3952  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3953  gj = g + (l0 * n1 + (u1 + l1)) * n2;
3954  for (l2 = 0; l2 <= o2; l2++)
3955  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3956  }
3957  for (l1 = 0; l1 <= o1; l1++, psij1++)
3958  {
3959  psij2 = psij_const2;
3960  gj = g + (l0 * n1 + l1) * n2 + u2;
3961  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3962  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3963  gj = g + (l0 * n1 + l1) * n2;
3964  for (l2 = 0; l2 <= o2; l2++)
3965  (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3966  }
3967  }
3968  }
3969 }
3970 
3971 #ifdef _OPENMP
3972 
3993 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
3994  const R *psij_const0, const R *psij_const1, const R *psij_const2,
3995  const R *xj0, const R *xj1, const R *xj2,
3996  const INT n0, const INT n1, const INT n2, const INT m,
3997  const INT my_u0, const INT my_o0)
3998 {
3999  INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4000  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4001 
4002  INT index_temp1[2*m+2];
4003  INT index_temp2[2*m+2];
4004 
4005  uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4006  uo2(&u1,&o1,*xj1, n1, m);
4007  uo2(&u2,&o2,*xj2, n2, m);
4008 
4009  for (l1=0; l1<=2*m+1; l1++)
4010  index_temp1[l1] = (u1+l1)%n1;
4011 
4012  for (l2=0; l2<=2*m+1; l2++)
4013  index_temp2[l2] = (u2+l2)%n2;
4014 
4015  if(ar_u0<ar_o0)
4016  {
4017  INT u0 = MAX(my_u0,ar_u0);
4018  INT o0 = MIN(my_o0,ar_o0);
4019  INT offset_psij = u0-ar_u0;
4020 #ifdef OMP_ASSERT
4021  assert(offset_psij >= 0);
4022  assert(o0-u0 <= 2*m+1);
4023  assert(offset_psij+o0-u0 <= 2*m+1);
4024 #endif
4025 
4026  for (l0 = 0; l0 <= o0-u0; l0++)
4027  {
4028  const INT i0 = (u0+l0) * n1;
4029  const C val0 = psij_const0[offset_psij+l0];
4030 
4031  for(l1=0; l1<=2*m+1; l1++)
4032  {
4033  const INT i1 = (i0 + index_temp1[l1]) * n2;
4034  const C val1 = psij_const1[l1];
4035 
4036  for(l2=0; l2<=2*m+1; l2++)
4037  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4038  }
4039  }
4040  }
4041  else
4042  {
4043  INT u0 = MAX(my_u0,ar_u0);
4044  INT o0 = my_o0;
4045  INT offset_psij = u0-ar_u0;
4046 #ifdef OMP_ASSERT
4047  assert(offset_psij >= 0);
4048  assert(o0-u0 <= 2*m+1);
4049  assert(offset_psij+o0-u0 <= 2*m+1);
4050 #endif
4051 
4052  for (l0 = 0; l0 <= o0-u0; l0++)
4053  {
4054  INT i0 = (u0+l0) * n1;
4055  const C val0 = psij_const0[offset_psij+l0];
4056 
4057  for(l1=0; l1<=2*m+1; l1++)
4058  {
4059  const INT i1 = (i0 + index_temp1[l1]) * n2;
4060  const C val1 = psij_const1[l1];
4061 
4062  for(l2=0; l2<=2*m+1; l2++)
4063  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4064  }
4065  }
4066 
4067  u0 = my_u0;
4068  o0 = MIN(my_o0,ar_o0);
4069  offset_psij += my_u0-ar_u0+n0;
4070 
4071 #ifdef OMP_ASSERT
4072  if (u0<=o0)
4073  {
4074  assert(o0-u0 <= 2*m+1);
4075  assert(offset_psij+o0-u0 <= 2*m+1);
4076  }
4077 #endif
4078  for (l0 = 0; l0 <= o0-u0; l0++)
4079  {
4080  INT i0 = (u0+l0) * n1;
4081  const C val0 = psij_const0[offset_psij+l0];
4082 
4083  for(l1=0; l1<=2*m+1; l1++)
4084  {
4085  const INT i1 = (i0 + index_temp1[l1]) * n2;
4086  const C val1 = psij_const1[l1];
4087 
4088  for(l2=0; l2<=2*m+1; l2++)
4089  g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4090  }
4091  }
4092  }
4093 }
4094 #endif
4095 
4096 #ifdef _OPENMP
4097 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */
4098 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
4099  const R *psij_const0, const R *psij_const1, const R *psij_const2,
4100  const R *xj0, const R *xj1, const R *xj2,
4101  const INT n0, const INT n1, const INT n2, const INT m)
4102 {
4103  INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4104  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4105 
4106  INT index_temp0[2*m+2];
4107  INT index_temp1[2*m+2];
4108  INT index_temp2[2*m+2];
4109 
4110  uo2(&u0,&o0,*xj0, n0, m);
4111  uo2(&u1,&o1,*xj1, n1, m);
4112  uo2(&u2,&o2,*xj2, n2, m);
4113 
4114  for (l0=0; l0<=2*m+1; l0++)
4115  index_temp0[l0] = (u0+l0)%n0;
4116 
4117  for (l1=0; l1<=2*m+1; l1++)
4118  index_temp1[l1] = (u1+l1)%n1;
4119 
4120  for (l2=0; l2<=2*m+1; l2++)
4121  index_temp2[l2] = (u2+l2)%n2;
4122 
4123  for(l0=0; l0<=2*m+1; l0++)
4124  {
4125  for(l1=0; l1<=2*m+1; l1++)
4126  {
4127  for(l2=0; l2<=2*m+1; l2++)
4128  {
4129  INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4130  C *lhs = g+i;
4131  R *lhs_real = (R*)lhs;
4132  C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4133 
4134 #pragma omp atomic
4135  lhs_real[0] += CREAL(val);
4136 
4137 #pragma omp atomic
4138  lhs_real[1] += CIMAG(val);
4139  }
4140  }
4141  }
4142 }
4143 #endif
4144 
4145 #ifndef _OPENMP
4146 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
4147  const R *psij_const0, const R *psij_const1, const R *psij_const2, const R *xj0,
4148  const R *xj1, const R *xj2, const INT n0, const INT n1, const INT n2,
4149  const INT m)
4150 {
4151  INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4152  C *gj;
4153  const R *psij0, *psij1, *psij2;
4154 
4155  psij0 = psij_const0;
4156  psij1 = psij_const1;
4157  psij2 = psij_const2;
4158 
4159  uo2(&u0, &o0, *xj0, n0, m);
4160  uo2(&u1, &o1, *xj1, n1, m);
4161  uo2(&u2, &o2, *xj2, n2, m);
4162 
4163  if (u0 < o0)
4164  if (u1 < o1)
4165  if (u2 < o2)
4166  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4167  {
4168  psij1 = psij_const1;
4169  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4170  {
4171  psij2 = psij_const2;
4172  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4173  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4174  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4175  }
4176  }
4177  else
4178  /* asserts (u2>o2)*/
4179  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4180  {
4181  psij1 = psij_const1;
4182  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4183  {
4184  psij2 = psij_const2;
4185  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4186  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4187  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4188  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4189  for (l2 = 0; l2 <= o2; l2++)
4190  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4191  }
4192  }
4193  else /* asserts (u1>o1)*/
4194  if (u2 < o2)
4195  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4196  {
4197  psij1 = psij_const1;
4198  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4199  {
4200  psij2 = psij_const2;
4201  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4202  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4203  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4204  }
4205  for (l1 = 0; l1 <= o1; l1++, psij1++)
4206  {
4207  psij2 = psij_const2;
4208  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4209  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4210  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4211  }
4212  }
4213  else/* asserts (u2>o2) */
4214  {
4215  for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4216  {
4217  psij1 = psij_const1;
4218  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4219  {
4220  psij2 = psij_const2;
4221  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4222  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4223  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4224  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4225  for (l2 = 0; l2 <= o2; l2++)
4226  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4227  }
4228  for (l1 = 0; l1 <= o1; l1++, psij1++)
4229  {
4230  psij2 = psij_const2;
4231  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4232  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4233  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4234  gj = g + ((u0 + l0) * n1 + l1) * n2;
4235  for (l2 = 0; l2 <= o2; l2++)
4236  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4237  }
4238  }
4239  }
4240  else /* asserts (u0>o0) */
4241  if (u1 < o1)
4242  if (u2 < o2)
4243  {
4244  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4245  {
4246  psij1 = psij_const1;
4247  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4248  {
4249  psij2 = psij_const2;
4250  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4251  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4252  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4253  }
4254  }
4255 
4256  for (l0 = 0; l0 <= o0; l0++, psij0++)
4257  {
4258  psij1 = psij_const1;
4259  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4260  {
4261  psij2 = psij_const2;
4262  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4263  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4264  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4265  }
4266  }
4267  } else/* asserts (u2>o2) */
4268  {
4269  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4270  {
4271  psij1 = psij_const1;
4272  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4273  {
4274  psij2 = psij_const2;
4275  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4276  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4277  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4278  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4279  for (l2 = 0; l2 <= o2; l2++)
4280  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4281  }
4282  }
4283 
4284  for (l0 = 0; l0 <= o0; l0++, psij0++)
4285  {
4286  psij1 = psij_const1;
4287  for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4288  {
4289  psij2 = psij_const2;
4290  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4291  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4292  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4293  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4294  for (l2 = 0; l2 <= o2; l2++)
4295  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4296  }
4297  }
4298  }
4299  else /* asserts (u1>o1) */
4300  if (u2 < o2)
4301  {
4302  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4303  {
4304  psij1 = psij_const1;
4305  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4306  {
4307  psij2 = psij_const2;
4308  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4309  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4310  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4311  }
4312  for (l1 = 0; l1 <= o1; l1++, psij1++)
4313  {
4314  psij2 = psij_const2;
4315  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4316  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4317  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4318  }
4319  }
4320  for (l0 = 0; l0 <= o0; l0++, psij0++)
4321  {
4322  psij1 = psij_const1;
4323  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4324  {
4325  psij2 = psij_const2;
4326  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4327  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4328  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4329  }
4330  for (l1 = 0; l1 <= o1; l1++, psij1++)
4331  {
4332  psij2 = psij_const2;
4333  gj = g + (l0 * n1 + l1) * n2 + u2;
4334  for (l2 = 0; l2 <= 2 * m + 1; l2++)
4335  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4336  }
4337  }
4338  } else/* asserts (u2>o2) */
4339  {
4340  for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4341  {
4342  psij1 = psij_const1;
4343  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4344  {
4345  psij2 = psij_const2;
4346  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4347  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4348  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4349  gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4350  for (l2 = 0; l2 <= o2; l2++)
4351  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4352  }
4353  for (l1 = 0; l1 <= o1; l1++, psij1++)
4354  {
4355  psij2 = psij_const2;
4356  gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4357  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4358  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4359  gj = g + ((u0 + l0) * n1 + l1) * n2;
4360  for (l2 = 0; l2 <= o2; l2++)
4361  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4362  }
4363  }
4364 
4365  for (l0 = 0; l0 <= o0; l0++, psij0++)
4366  {
4367  psij1 = psij_const1;
4368  for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4369  {
4370  psij2 = psij_const2;
4371  gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4372  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4373  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4374  gj = g + (l0 * n1 + (u1 + l1)) * n2;
4375  for (l2 = 0; l2 <= o2; l2++)
4376  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4377  }
4378  for (l1 = 0; l1 <= o1; l1++, psij1++)
4379  {
4380  psij2 = psij_const2;
4381  gj = g + (l0 * n1 + l1) * n2 + u2;
4382  for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4383  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4384  gj = g + (l0 * n1 + l1) * n2;
4385  for (l2 = 0; l2 <= o2; l2++)
4386  (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4387  }
4388  }
4389  }
4390 }
4391 #endif
4392 
4393 static void nfft_trafo_3d_B(X(plan) *ths)
4394 {
4395  const INT n0 = ths->n[0];
4396  const INT n1 = ths->n[1];
4397  const INT n2 = ths->n[2];
4398  const INT M = ths->M_total;
4399  const INT m = ths->m;
4400 
4401  const C* g = (C*) ths->g;
4402 
4403  INT k;
4404 
4405  if(ths->flags & PRE_FULL_PSI)
4406  {
4407  const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4408 #ifdef _OPENMP
4409  #pragma omp parallel for default(shared) private(k)
4410 #endif
4411  for (k = 0; k < M; k++)
4412  {
4413  INT l;
4414  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4415  ths->f[j] = K(0.0);
4416  for (l = 0; l < lprod; l++)
4417  ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4418  }
4419  return;
4420  } /* if(PRE_FULL_PSI) */
4421 
4422  if(ths->flags & PRE_PSI)
4423  {
4424 #ifdef _OPENMP
4425  #pragma omp parallel for default(shared) private(k)
4426 #endif
4427  for (k = 0; k < M; k++)
4428  {
4429  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4430  nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4431  }
4432  return;
4433  } /* if(PRE_PSI) */
4434 
4435  if(ths->flags & PRE_FG_PSI)
4436  {
4437  R fg_exp_l[3*(2*m+2)];
4438 
4439  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4440  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4441  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4442 
4443 #ifdef _OPENMP
4444  #pragma omp parallel for default(shared) private(k)
4445 #endif
4446  for (k = 0; k < M; k++)
4447  {
4448  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4449  INT l;
4450  R psij_const[3*(2*m+2)];
4451  R fg_psij0 = ths->psi[2*j*3];
4452  R fg_psij1 = ths->psi[2*j*3+1];
4453  R fg_psij2 = K(1.0);
4454 
4455  psij_const[0] = fg_psij0;
4456  for(l=1; l<=2*m+1; l++)
4457  {
4458  fg_psij2 *= fg_psij1;
4459  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4460  }
4461 
4462  fg_psij0 = ths->psi[2*(j*3+1)];
4463  fg_psij1 = ths->psi[2*(j*3+1)+1];
4464  fg_psij2 = K(1.0);
4465  psij_const[2*m+2] = fg_psij0;
4466  for(l=1; l<=2*m+1; l++)
4467  {
4468  fg_psij2 *= fg_psij1;
4469  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4470  }
4471 
4472  fg_psij0 = ths->psi[2*(j*3+2)];
4473  fg_psij1 = ths->psi[2*(j*3+2)+1];
4474  fg_psij2 = K(1.0);
4475  psij_const[2*(2*m+2)] = fg_psij0;
4476  for(l=1; l<=2*m+1; l++)
4477  {
4478  fg_psij2 *= fg_psij1;
4479  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4480  }
4481 
4482  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4483  }
4484 
4485  return;
4486  } /* if(PRE_FG_PSI) */
4487 
4488  if(ths->flags & FG_PSI)
4489  {
4490  R fg_exp_l[3*(2*m+2)];
4491 
4492  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4493  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4494  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4495 
4496  sort(ths);
4497 
4498 #ifdef _OPENMP
4499  #pragma omp parallel for default(shared) private(k)
4500 #endif
4501  for (k = 0; k < M; k++)
4502  {
4503  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4504  INT u, o, l;
4505  R psij_const[3*(2*m+2)];
4506  R fg_psij0, fg_psij1, fg_psij2;
4507 
4508  uo(ths,j,&u,&o,(INT)0);
4509  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4510  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4511  fg_psij2 = K(1.0);
4512  psij_const[0] = fg_psij0;
4513  for(l=1; l<=2*m+1; l++)
4514  {
4515  fg_psij2 *= fg_psij1;
4516  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4517  }
4518 
4519  uo(ths,j,&u,&o,(INT)1);
4520  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4521  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4522  fg_psij2 = K(1.0);
4523  psij_const[2*m+2] = fg_psij0;
4524  for(l=1; l<=2*m+1; l++)
4525  {
4526  fg_psij2 *= fg_psij1;
4527  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4528  }
4529 
4530  uo(ths,j,&u,&o,(INT)2);
4531  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4532  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4533  fg_psij2 = K(1.0);
4534  psij_const[2*(2*m+2)] = fg_psij0;
4535  for(l=1; l<=2*m+1; l++)
4536  {
4537  fg_psij2 *= fg_psij1;
4538  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4539  }
4540 
4541  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4542  }
4543 
4544  return;
4545  } /* if(FG_PSI) */
4546 
4547  if(ths->flags & PRE_LIN_PSI)
4548  {
4549  const INT K = ths->K, ip_s = K / (m + 2);
4550 
4551  sort(ths);
4552 
4553 #ifdef _OPENMP
4554  #pragma omp parallel for default(shared) private(k)
4555 #endif
4556  for (k = 0; k < M; k++)
4557  {
4558  INT u, o, l;
4559  R ip_y, ip_w;
4560  INT ip_u;
4561  R psij_const[3*(2*m+2)];
4562  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4563 
4564  uo(ths,j,&u,&o,(INT)0);
4565  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4566  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4567  ip_w = ip_y - (R)(ip_u);
4568  for(l=0; l < 2*m+2; l++)
4569  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4570  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4571 
4572  uo(ths,j,&u,&o,(INT)1);
4573  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4574  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4575  ip_w = ip_y - (R)(ip_u);
4576  for(l=0; l < 2*m+2; l++)
4577  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4578  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4579 
4580  uo(ths,j,&u,&o,(INT)2);
4581  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4582  ip_u = (INT)(LRINT(FLOOR(ip_y)));
4583  ip_w = ip_y - (R)(ip_u);
4584  for(l=0; l < 2*m+2; l++)
4585  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4586  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4587 
4588  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4589  }
4590  return;
4591  } /* if(PRE_LIN_PSI) */
4592 
4593  /* no precomputed psi at all */
4594 
4595  sort(ths);
4596 
4597 #ifdef _OPENMP
4598  #pragma omp parallel for default(shared) private(k)
4599 #endif
4600  for (k = 0; k < M; k++)
4601  {
4602  R psij_const[3*(2*m+2)];
4603  INT u, o, l;
4604  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4605 
4606  uo(ths,j,&u,&o,(INT)0);
4607  for(l=0;l<=2*m+1;l++)
4608  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4609 
4610  uo(ths,j,&u,&o,(INT)1);
4611  for(l=0;l<=2*m+1;l++)
4612  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4613 
4614  uo(ths,j,&u,&o,(INT)2);
4615  for(l=0;l<=2*m+1;l++)
4616  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4617 
4618  nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4619  }
4620 }
4621 
4622 #ifdef OMP_ASSERT
4623 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4624 { \
4625  assert(ar_x[2*k] >= min_u_a || k == M-1); \
4626  if (k > 0) \
4627  assert(ar_x[2*k-2] < min_u_a); \
4628 }
4629 #else
4630 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4631 #endif
4632 
4633 #ifdef OMP_ASSERT
4634 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4635 { \
4636  assert(ar_x[2*k] >= min_u_b || k == M-1); \
4637  if (k > 0) \
4638  assert(ar_x[2*k-2] < min_u_b); \
4639 }
4640 #else
4641 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4642 #endif
4643 
4644 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4645  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4646  ths->psi+j*3*(2*m+2), \
4647  ths->psi+(j*3+1)*(2*m+2), \
4648  ths->psi+(j*3+2)*(2*m+2), \
4649  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4650  n0, n1, n2, m, my_u0, my_o0);
4651 
4652 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4653 { \
4654  INT u, o, l; \
4655  R psij_const[3*(2*m+2)]; \
4656  R fg_psij0 = ths->psi[2*j*3]; \
4657  R fg_psij1 = ths->psi[2*j*3+1]; \
4658  R fg_psij2 = K(1.0); \
4659  \
4660  psij_const[0] = fg_psij0; \
4661  for(l=1; l<=2*m+1; l++) \
4662  { \
4663  fg_psij2 *= fg_psij1; \
4664  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4665  } \
4666  \
4667  fg_psij0 = ths->psi[2*(j*3+1)]; \
4668  fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4669  fg_psij2 = K(1.0); \
4670  psij_const[2*m+2] = fg_psij0; \
4671  for(l=1; l<=2*m+1; l++) \
4672  { \
4673  fg_psij2 *= fg_psij1; \
4674  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4675  } \
4676  \
4677  fg_psij0 = ths->psi[2*(j*3+2)]; \
4678  fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4679  fg_psij2 = K(1.0); \
4680  psij_const[2*(2*m+2)] = fg_psij0; \
4681  for(l=1; l<=2*m+1; l++) \
4682  { \
4683  fg_psij2 *= fg_psij1; \
4684  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4685  } \
4686  \
4687  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4688  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4689  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4690  n0, n1, n2, m, my_u0, my_o0); \
4691 }
4692 
4693 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4694 { \
4695  INT u, o, l; \
4696  R psij_const[3*(2*m+2)]; \
4697  R fg_psij0, fg_psij1, fg_psij2; \
4698  \
4699  uo(ths,j,&u,&o,(INT)0); \
4700  fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \
4701  fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4702  fg_psij2 = K(1.0); \
4703  psij_const[0] = fg_psij0; \
4704  for(l=1; l<=2*m+1; l++) \
4705  { \
4706  fg_psij2 *= fg_psij1; \
4707  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4708  } \
4709  \
4710  uo(ths,j,&u,&o,(INT)1); \
4711  fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \
4712  fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4713  fg_psij2 = K(1.0); \
4714  psij_const[2*m+2] = fg_psij0; \
4715  for(l=1; l<=2*m+1; l++) \
4716  { \
4717  fg_psij2 *= fg_psij1; \
4718  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4719  } \
4720  \
4721  uo(ths,j,&u,&o,(INT)2); \
4722  fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \
4723  fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4724  fg_psij2 = K(1.0); \
4725  psij_const[2*(2*m+2)] = fg_psij0; \
4726  for(l=1; l<=2*m+1; l++) \
4727  { \
4728  fg_psij2 *= fg_psij1; \
4729  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4730  } \
4731  \
4732  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4733  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4734  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4735  n0, n1, n2, m, my_u0, my_o0); \
4736 }
4737 
4738 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4739 { \
4740  INT u, o, l; \
4741  R psij_const[3*(2*m+2)]; \
4742  INT ip_u; \
4743  R ip_y, ip_w; \
4744  \
4745  uo(ths,j,&u,&o,(INT)0); \
4746  ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4747  ip_u = LRINT(FLOOR(ip_y)); \
4748  ip_w = ip_y-ip_u; \
4749  for(l=0; l < 2*m+2; l++) \
4750  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4751  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4752  \
4753  uo(ths,j,&u,&o,(INT)1); \
4754  ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4755  ip_u = LRINT(FLOOR(ip_y)); \
4756  ip_w = ip_y-ip_u; \
4757  for(l=0; l < 2*m+2; l++) \
4758  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4759  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4760  \
4761  uo(ths,j,&u,&o,(INT)2); \
4762  ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4763  ip_u = LRINT(FLOOR(ip_y)); \
4764  ip_w = ip_y-ip_u; \
4765  for(l=0; l < 2*m+2; l++) \
4766  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4767  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4768  \
4769  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4770  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4771  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4772  n0, n1, n2, m, my_u0, my_o0); \
4773 }
4774 
4775 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4776 { \
4777  INT u, o, l; \
4778  R psij_const[3*(2*m+2)]; \
4779  \
4780  uo(ths,j,&u,&o,(INT)0); \
4781  for(l=0;l<=2*m+1;l++) \
4782  psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \
4783  \
4784  uo(ths,j,&u,&o,(INT)1); \
4785  for(l=0;l<=2*m+1;l++) \
4786  psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4787  \
4788  uo(ths,j,&u,&o,(INT)2); \
4789  for(l=0;l<=2*m+1;l++) \
4790  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4791  \
4792  nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4793  psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4794  ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4795  n0, n1, n2, m, my_u0, my_o0); \
4796 }
4797 
4798 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4799 { \
4800  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4801  { \
4802  _Pragma("omp parallel private(k)") \
4803  { \
4804  INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4805  INT *ar_x = ths->index_x; \
4806  \
4807  nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4808  &min_u_b, &max_u_b, 3, ths->n, m); \
4809  \
4810  if (min_u_a != -1) \
4811  { \
4812  k = index_x_binary_search(ar_x, M, min_u_a); \
4813  \
4814  MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4815  \
4816  while (k < M) \
4817  { \
4818  INT u_prod = ar_x[2*k]; \
4819  INT j = ar_x[2*k+1]; \
4820  \
4821  if (u_prod < min_u_a || u_prod > max_u_a) \
4822  break; \
4823  \
4824  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4825  \
4826  k++; \
4827  } \
4828  } \
4829  \
4830  if (min_u_b != -1) \
4831  { \
4832  INT k = index_x_binary_search(ar_x, M, min_u_b); \
4833  \
4834  MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4835  \
4836  while (k < M) \
4837  { \
4838  INT u_prod = ar_x[2*k]; \
4839  INT j = ar_x[2*k+1]; \
4840  \
4841  if (u_prod < min_u_b || u_prod > max_u_b) \
4842  break; \
4843  \
4844  MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4845  \
4846  k++; \
4847  } \
4848  } \
4849  } /* omp parallel */ \
4850  return; \
4851  } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
4852 }
4853 
4854 static void nfft_adjoint_3d_B(X(plan) *ths)
4855 {
4856  INT k;
4857  const INT n0 = ths->n[0];
4858  const INT n1 = ths->n[1];
4859  const INT n2 = ths->n[2];
4860  const INT M = ths->M_total;
4861  const INT m = ths->m;
4862 
4863  C* g = (C*) ths->g;
4864 
4865  memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
4866 
4867  if(ths->flags & PRE_FULL_PSI)
4868  {
4869  nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
4870  (INT)3, ths->n, m, ths->flags, ths->index_x);
4871  return;
4872  } /* if(PRE_FULL_PSI) */
4873 
4874  if(ths->flags & PRE_PSI)
4875  {
4876 #ifdef _OPENMP
4877  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
4878 #endif
4879 
4880 #ifdef _OPENMP
4881  #pragma omp parallel for default(shared) private(k)
4882 #endif
4883  for (k = 0; k < M; k++)
4884  {
4885  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4886 #ifdef _OPENMP
4887  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4888 #else
4889  nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4890 #endif
4891  }
4892  return;
4893  } /* if(PRE_PSI) */
4894 
4895  if(ths->flags & PRE_FG_PSI)
4896  {
4897  R fg_exp_l[3*(2*m+2)];
4898 
4899  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4900  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4901  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4902 
4903 #ifdef _OPENMP
4904  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
4905 #endif
4906 
4907 #ifdef _OPENMP
4908  #pragma omp parallel for default(shared) private(k)
4909 #endif
4910  for (k = 0; k < M; k++)
4911  {
4912  R psij_const[3*(2*m+2)];
4913  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4914  INT l;
4915  R fg_psij0 = ths->psi[2*j*3];
4916  R fg_psij1 = ths->psi[2*j*3+1];
4917  R fg_psij2 = K(1.0);
4918 
4919  psij_const[0] = fg_psij0;
4920  for(l=1; l<=2*m+1; l++)
4921  {
4922  fg_psij2 *= fg_psij1;
4923  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4924  }
4925 
4926  fg_psij0 = ths->psi[2*(j*3+1)];
4927  fg_psij1 = ths->psi[2*(j*3+1)+1];
4928  fg_psij2 = K(1.0);
4929  psij_const[2*m+2] = fg_psij0;
4930  for(l=1; l<=2*m+1; l++)
4931  {
4932  fg_psij2 *= fg_psij1;
4933  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4934  }
4935 
4936  fg_psij0 = ths->psi[2*(j*3+2)];
4937  fg_psij1 = ths->psi[2*(j*3+2)+1];
4938  fg_psij2 = K(1.0);
4939  psij_const[2*(2*m+2)] = fg_psij0;
4940  for(l=1; l<=2*m+1; l++)
4941  {
4942  fg_psij2 *= fg_psij1;
4943  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4944  }
4945 
4946 #ifdef _OPENMP
4947  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4948 #else
4949  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4950 #endif
4951  }
4952 
4953  return;
4954  } /* if(PRE_FG_PSI) */
4955 
4956  if(ths->flags & FG_PSI)
4957  {
4958  R fg_exp_l[3*(2*m+2)];
4959 
4960  nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4961  nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4962  nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4963 
4964  sort(ths);
4965 
4966 #ifdef _OPENMP
4967  MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
4968 #endif
4969 
4970 #ifdef _OPENMP
4971  #pragma omp parallel for default(shared) private(k)
4972 #endif
4973  for (k = 0; k < M; k++)
4974  {
4975  INT u,o,l;
4976  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4977  R psij_const[3*(2*m+2)];
4978  R fg_psij0, fg_psij1, fg_psij2;
4979 
4980  uo(ths,j,&u,&o,(INT)0);
4981  fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4982  fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
4983  fg_psij2 = K(1.0);
4984  psij_const[0] = fg_psij0;
4985  for(l=1; l<=2*m+1; l++)
4986  {
4987  fg_psij2 *= fg_psij1;
4988  psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4989  }
4990 
4991  uo(ths,j,&u,&o,(INT)1);
4992  fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4993  fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
4994  fg_psij2 = K(1.0);
4995  psij_const[2*m+2] = fg_psij0;
4996  for(l=1; l<=2*m+1; l++)
4997  {
4998  fg_psij2 *= fg_psij1;
4999  psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5000  }
5001 
5002  uo(ths,j,&u,&o,(INT)2);
5003  fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5004  fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5005  fg_psij2 = K(1.0);
5006  psij_const[2*(2*m+2)] = fg_psij0;
5007  for(l=1; l<=2*m+1; l++)
5008  {
5009  fg_psij2 *= fg_psij1;
5010  psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5011  }
5012 
5013 #ifdef _OPENMP
5014  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5015 #else
5016  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5017 #endif
5018  }
5019 
5020  return;
5021  } /* if(FG_PSI) */
5022 
5023  if(ths->flags & PRE_LIN_PSI)
5024  {
5025  const INT K = ths->K;
5026  const INT ip_s = K / (m + 2);
5027 
5028  sort(ths);
5029 
5030 #ifdef _OPENMP
5031  MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5032 #endif
5033 
5034 #ifdef _OPENMP
5035  #pragma omp parallel for default(shared) private(k)
5036 #endif
5037  for (k = 0; k < M; k++)
5038  {
5039  INT u,o,l;
5040  INT ip_u;
5041  R ip_y, ip_w;
5042  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5043  R psij_const[3*(2*m+2)];
5044 
5045  uo(ths,j,&u,&o,(INT)0);
5046  ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5047  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5048  ip_w = ip_y - (R)(ip_u);
5049  for(l=0; l < 2*m+2; l++)
5050  psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5051  ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5052 
5053  uo(ths,j,&u,&o,(INT)1);
5054  ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5055  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5056  ip_w = ip_y - (R)(ip_u);
5057  for(l=0; l < 2*m+2; l++)
5058  psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5059  ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5060 
5061  uo(ths,j,&u,&o,(INT)2);
5062  ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5063  ip_u = (INT)(LRINT(FLOOR(ip_y)));
5064  ip_w = ip_y - (R)(ip_u);
5065  for(l=0; l < 2*m+2; l++)
5066  psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5067  ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5068 
5069 #ifdef _OPENMP
5070  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5071 #else
5072  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5073 #endif
5074  }
5075  return;
5076  } /* if(PRE_LIN_PSI) */
5077 
5078  /* no precomputed psi at all */
5079  sort(ths);
5080 
5081 #ifdef _OPENMP
5082  MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5083 #endif
5084 
5085 #ifdef _OPENMP
5086  #pragma omp parallel for default(shared) private(k)
5087 #endif
5088  for (k = 0; k < M; k++)
5089  {
5090  INT u,o,l;
5091  R psij_const[3*(2*m+2)];
5092  INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5093 
5094  uo(ths,j,&u,&o,(INT)0);
5095  for(l=0;l<=2*m+1;l++)
5096  psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5097 
5098  uo(ths,j,&u,&o,(INT)1);
5099  for(l=0;l<=2*m+1;l++)
5100  psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5101 
5102  uo(ths,j,&u,&o,(INT)2);
5103  for(l=0;l<=2*m+1;l++)
5104  psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5105 
5106 #ifdef _OPENMP
5107  nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5108 #else
5109  nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5110 #endif
5111  }
5112 }
5113 
5114 
5115 void X(trafo_3d)(X(plan) *ths)
5116 {
5117  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5118  C *g_hat,*f_hat;
5119  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5120  R ck01, ck02, ck11, ck12, ck21, ck22;
5121  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5122  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5123 
5124  ths->g_hat=ths->g1;
5125  ths->g=ths->g2;
5126 
5127  N0=ths->N[0];
5128  N1=ths->N[1];
5129  N2=ths->N[2];
5130  n0=ths->n[0];
5131  n1=ths->n[1];
5132  n2=ths->n[2];
5133 
5134  f_hat=(C*)ths->f_hat;
5135  g_hat=(C*)ths->g_hat;
5136 
5137  TIC(0)
5138 #ifdef _OPENMP
5139  #pragma omp parallel for default(shared) private(k0)
5140  for (k0 = 0; k0 < ths->n_total; k0++)
5141  ths->g_hat[k0] = 0.0;
5142 #else
5143  memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
5144 #endif
5145 
5146  if(ths->flags & PRE_PHI_HUT)
5147  {
5148  c_phi_inv01=ths->c_phi_inv[0];
5149  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5150 
5151 #ifdef _OPENMP
5152  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5153 #endif
5154  for(k0=0;k0<N0/2;k0++)
5155  {
5156  ck01=c_phi_inv01[k0];
5157  ck02=c_phi_inv02[k0];
5158  c_phi_inv11=ths->c_phi_inv[1];
5159  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5160 
5161  for(k1=0;k1<N1/2;k1++)
5162  {
5163  ck11=c_phi_inv11[k1];
5164  ck12=c_phi_inv12[k1];
5165  c_phi_inv21=ths->c_phi_inv[2];
5166  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5167 
5168  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5169  f_hat111=f_hat + (k0*N1+k1)*N2;
5170  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5171  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5172  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5173  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5174  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5175  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5176 
5177  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5178  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5179  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5180  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5181  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5182  f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5183  g_hat222=g_hat + (k0*n1+k1)*n2;
5184  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5185 
5186  for(k2=0;k2<N2/2;k2++)
5187  {
5188  ck21=c_phi_inv21[k2];
5189  ck22=c_phi_inv22[k2];
5190 
5191  g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5192  g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5193  g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5194  g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5195 
5196  g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5197  g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5198  g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5199  g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5200  }
5201  }
5202  }
5203  }
5204  else
5205 #ifdef _OPENMP
5206  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5207 #endif
5208  for(k0=0;k0<N0/2;k0++)
5209  {
5210  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5211  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5212  for(k1=0;k1<N1/2;k1++)
5213  {
5214  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5215  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5216 
5217  for(k2=0;k2<N2/2;k2++)
5218  {
5219  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5220  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5221 
5222  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5223  g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5224  g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5225  g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5226 
5227  g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5228  g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5229  g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5230  g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5231  }
5232  }
5233  }
5234 
5235  TOC(0)
5236 
5237  TIC_FFTW(1)
5238  FFTW(execute)(ths->my_fftw_plan1);
5239  TOC_FFTW(1);
5240 
5241  TIC(2);
5242  nfft_trafo_3d_B(ths);
5243  TOC(2);
5244 }
5245 
5246 void X(adjoint_3d)(X(plan) *ths)
5247 {
5248  INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5249  C *g_hat,*f_hat;
5250  R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5251  R ck01, ck02, ck11, ck12, ck21, ck22;
5252  C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5253  C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5254 
5255  ths->g_hat=ths->g1;
5256  ths->g=ths->g2;
5257 
5258  N0=ths->N[0];
5259  N1=ths->N[1];
5260  N2=ths->N[2];
5261  n0=ths->n[0];
5262  n1=ths->n[1];
5263  n2=ths->n[2];
5264 
5265  f_hat=(C*)ths->f_hat;
5266  g_hat=(C*)ths->g_hat;
5267 
5268  TIC(2);
5269  nfft_adjoint_3d_B(ths);
5270  TOC(2);
5271 
5272  TIC_FFTW(1)
5273  FFTW(execute)(ths->my_fftw_plan2);
5274  TOC_FFTW(1);
5275 
5276  TIC(0)
5277  if(ths->flags & PRE_PHI_HUT)
5278  {
5279  c_phi_inv01=ths->c_phi_inv[0];
5280  c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5281 
5282 #ifdef _OPENMP
5283  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5284 #endif
5285  for(k0=0;k0<N0/2;k0++)
5286  {
5287  ck01=c_phi_inv01[k0];
5288  ck02=c_phi_inv02[k0];
5289  c_phi_inv11=ths->c_phi_inv[1];
5290  c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5291 
5292  for(k1=0;k1<N1/2;k1++)
5293  {
5294  ck11=c_phi_inv11[k1];
5295  ck12=c_phi_inv12[k1];
5296  c_phi_inv21=ths->c_phi_inv[2];
5297  c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5298 
5299  g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5300  f_hat111=f_hat + (k0*N1+k1)*N2;
5301  g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5302  f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5303  g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5304  f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5305  g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5306  f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5307 
5308  g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5309  f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5310  g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5311  f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5312  g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5313  f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5314  g_hat222=g_hat + (k0*n1+k1)*n2;
5315  f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5316 
5317  for(k2=0;k2<N2/2;k2++)
5318  {
5319  ck21=c_phi_inv21[k2];
5320  ck22=c_phi_inv22[k2];
5321 
5322  f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5323  f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5324  f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5325  f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5326 
5327  f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5328  f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5329  f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5330  f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5331  }
5332  }
5333  }
5334  }
5335  else
5336 #ifdef _OPENMP
5337  #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5338 #endif
5339  for(k0=0;k0<N0/2;k0++)
5340  {
5341  ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5342  ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5343  for(k1=0;k1<N1/2;k1++)
5344  {
5345  ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5346  ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5347 
5348  for(k2=0;k2<N2/2;k2++)
5349  {
5350  ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5351  ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5352 
5353  f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5354  f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5355  f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5356  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5357 
5358  f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5359  f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5360  f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5361  f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5362  }
5363  }
5364  }
5365 
5366  TOC(0)
5367 }
5368 
5371 void X(trafo)(X(plan) *ths)
5372 {
5373  switch(ths->d)
5374  {
5375  case 1: X(trafo_1d)(ths); break;
5376  case 2: X(trafo_2d)(ths); break;
5377  case 3: X(trafo_3d)(ths); break;
5378  default:
5379  {
5380  /* use ths->my_fftw_plan1 */
5381  ths->g_hat = ths->g1;
5382  ths->g = ths->g2;
5383 
5387  TIC(0)
5388  D_A(ths);
5389  TOC(0)
5390 
5395  TIC_FFTW(1)
5396  FFTW(execute)(ths->my_fftw_plan1);
5397  TOC_FFTW(1)
5398 
5402  TIC(2)
5403  B_A(ths);
5404  TOC(2)
5405  }
5406  }
5407 } /* nfft_trafo */
5408 
5409 void X(adjoint)(X(plan) *ths)
5410 {
5411  switch(ths->d)
5412  {
5413  case 1: X(adjoint_1d)(ths); break;
5414  case 2: X(adjoint_2d)(ths); break;
5415  case 3: X(adjoint_3d)(ths); break;
5416  default:
5417  {
5418  /* use ths->my_fftw_plan2 */
5419  ths->g_hat=ths->g1;
5420  ths->g=ths->g2;
5421 
5425  TIC(2)
5426  B_T(ths);
5427  TOC(2)
5428 
5433  TIC_FFTW(1)
5434  FFTW(execute)(ths->my_fftw_plan2);
5435  TOC_FFTW(1)
5436 
5440  TIC(0)
5441  D_T(ths);
5442  TOC(0)
5443  }
5444  }
5445 } /* nfft_adjoint */
5446 
5447 
5450 static void precompute_phi_hut(X(plan) *ths)
5451 {
5452  INT ks[ths->d]; /* index over all frequencies */
5453  INT t; /* index over all dimensions */
5454 
5455  ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*));
5456 
5457  for (t = 0; t < ths->d; t++)
5458  {
5459  ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) * sizeof(R));
5460 
5461  for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5462  {
5463  ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5464  }
5465  }
5466 } /* nfft_phi_hut */
5467 
5472 void X(precompute_lin_psi)(X(plan) *ths)
5473 {
5474  INT t;
5475  INT j;
5476  R step;
5478  for (t=0; t<ths->d; t++)
5479  {
5480  step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5481  for(j = 0;j <= ths->K; j++)
5482  {
5483  ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5484  } /* for(j) */
5485  } /* for(t) */
5486 }
5487 
5488 void X(precompute_fg_psi)(X(plan) *ths)
5489 {
5490  INT t;
5491  INT u, o;
5493  sort(ths);
5494 
5495  for (t=0; t<ths->d; t++)
5496  {
5497  INT j;
5498 #ifdef _OPENMP
5499  #pragma omp parallel for default(shared) private(j,u,o)
5500 #endif
5501  for (j = 0; j < ths->M_total; j++)
5502  {
5503  uo(ths,j,&u,&o,t);
5504 
5505  ths->psi[2*(j*ths->d+t)]=
5506  (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5507 
5508  ths->psi[2*(j*ths->d+t)+1]=
5509  EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5510  } /* for(j) */
5511  }
5512  /* for(t) */
5513 } /* nfft_precompute_fg_psi */
5514 
5515 void X(precompute_psi)(X(plan) *ths)
5516 {
5517  INT t; /* index over all dimensions */
5518  INT l; /* index u<=l<=o */
5519  INT lj; /* index 0<=lj<u+o+1 */
5520  INT u, o; /* depends on x_j */
5521 
5522  sort(ths);
5523 
5524  for (t=0; t<ths->d; t++)
5525  {
5526  INT j;
5527 #ifdef _OPENMP
5528  #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5529 #endif
5530  for (j = 0; j < ths->M_total; j++)
5531  {
5532  uo(ths,j,&u,&o,t);
5533 
5534  for(l = u, lj = 0; l <= o; l++, lj++)
5535  ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5536  (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5537  } /* for(j) */
5538  }
5539  /* for(t) */
5540 } /* nfft_precompute_psi */
5541 
5542 #ifdef _OPENMP
5543 static void nfft_precompute_full_psi_omp(X(plan) *ths)
5544 {
5545  INT j;
5546  INT lprod;
5548  {
5549  INT t;
5550  for(t=0,lprod = 1; t<ths->d; t++)
5551  lprod *= 2*ths->m+2;
5552  }
5553 
5554  #pragma omp parallel for default(shared) private(j)
5555  for(j=0; j<ths->M_total; j++)
5556  {
5557  INT t,t2;
5558  INT l_L;
5559  INT l[ths->d];
5560  INT lj[ths->d];
5561  INT ll_plain[ths->d+1];
5563  INT u[ths->d], o[ths->d];
5565  R phi_prod[ths->d+1];
5566  INT ix = j*lprod;
5567 
5568  phi_prod[0]=1;
5569  ll_plain[0]=0;
5570 
5571  MACRO_init_uo_l_lj_t;
5572 
5573  for(l_L=0; l_L<lprod; l_L++, ix++)
5574  {
5575  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5576 
5577  ths->psi_index_g[ix]=ll_plain[ths->d];
5578  ths->psi[ix]=phi_prod[ths->d];
5579 
5580  MACRO_count_uo_l_lj_t;
5581  } /* for(l_L) */
5582 
5583  ths->psi_index_f[j]=lprod;
5584  } /* for(j) */
5585 }
5586 #endif
5587 
5588 void X(precompute_full_psi)(X(plan) *ths)
5589 {
5590 #ifdef _OPENMP
5591  sort(ths);
5592 
5593  nfft_precompute_full_psi_omp(ths);
5594 #else
5595  INT t, t2; /* index over all dimensions */
5596  INT j; /* index over all nodes */
5597  INT l_L; /* plain index 0 <= l_L < lprod */
5598  INT l[ths->d]; /* multi index u<=l<=o */
5599  INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
5600  INT ll_plain[ths->d+1]; /* postfix plain index */
5601  INT lprod; /* 'bandwidth' of matrix B */
5602  INT u[ths->d], o[ths->d]; /* depends on x_j */
5603 
5604  R phi_prod[ths->d+1];
5605 
5606  INT ix, ix_old;
5607 
5608  sort(ths);
5609 
5610  phi_prod[0] = K(1.0);
5611  ll_plain[0] = 0;
5612 
5613  for (t = 0, lprod = 1; t < ths->d; t++)
5614  lprod *= 2 * ths->m + 2;
5615 
5616  for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5617  {
5618  MACRO_init_uo_l_lj_t;
5619 
5620  for (l_L = 0; l_L < lprod; l_L++, ix++)
5621  {
5622  MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5623 
5624  ths->psi_index_g[ix] = ll_plain[ths->d];
5625  ths->psi[ix] = phi_prod[ths->d];
5626 
5627  MACRO_count_uo_l_lj_t;
5628  } /* for(l_L) */
5629 
5630  ths->psi_index_f[j] = ix - ix_old;
5631  ix_old = ix;
5632  } /* for(j) */
5633 #endif
5634 }
5635 
5636 void X(precompute_one_psi)(X(plan) *ths)
5637 {
5638  if(ths->flags & PRE_LIN_PSI)
5639  X(precompute_lin_psi)(ths);
5640  if(ths->flags & PRE_FG_PSI)
5641  X(precompute_fg_psi)(ths);
5642  if(ths->flags & PRE_PSI)
5643  X(precompute_psi)(ths);
5644  if(ths->flags & PRE_FULL_PSI)
5645  X(precompute_full_psi)(ths);
5646 }
5647 
5648 static void init_help(X(plan) *ths)
5649 {
5650  INT t; /* index over all dimensions */
5651  INT lprod; /* 'bandwidth' of matrix B */
5652 
5653  if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5654  ths->flags |= NFFT_SORT_NODES;
5655 
5656  ths->N_total = intprod(ths->N, 0, ths->d);
5657  ths->n_total = intprod(ths->n, 0, ths->d);
5658 
5659  ths->sigma = (R*) Y(malloc)((size_t)(ths->d) * sizeof(R));
5660 
5661  for(t = 0;t < ths->d; t++)
5662  ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5663 
5664  WINDOW_HELP_INIT;
5665 
5666  if(ths->flags & MALLOC_X)
5667  ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R));
5668 
5669  if(ths->flags & MALLOC_F_HAT)
5670  ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) * sizeof(C));
5671 
5672  if(ths->flags & MALLOC_F)
5673  ths->f = (C*)Y(malloc)((size_t)(ths->M_total) * sizeof(C));
5674 
5675  if(ths->flags & PRE_PHI_HUT)
5676  precompute_phi_hut(ths);
5677 
5678  if (ths->flags & PRE_LIN_PSI)
5679  {
5680  if (ths->K == 0)
5681  {
5682  ths->K = Y(m2K)(ths->m);
5683  }
5684  ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) * sizeof(R));
5685  }
5686 
5687  if(ths->flags & PRE_FG_PSI)
5688  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R));
5689 
5690  if(ths->flags & PRE_PSI)
5691  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) * sizeof(R));
5692 
5693  if(ths->flags & PRE_FULL_PSI)
5694  {
5695  for (t = 0, lprod = 1; t < ths->d; t++)
5696  lprod *= 2 * ths->m + 2;
5697 
5698  ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R));
5699 
5700  ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT));
5701  ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT));
5702  }
5703 
5704  if(ths->flags & FFTW_INIT)
5705  {
5706 #ifdef _OPENMP
5707  INT nthreads = Y(get_num_threads)();
5708 #endif
5709 
5710  ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) * sizeof(C));
5711 
5712  if(ths->flags & FFT_OUT_OF_PLACE)
5713  ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) * sizeof(C));
5714  else
5715  ths->g2 = ths->g1;
5716 
5717 #ifdef _OPENMP
5718 #pragma omp critical (nfft_omp_critical_fftw_plan)
5719 {
5720  FFTW(plan_with_nthreads)(nthreads);
5721 #endif
5722  {
5723  int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int));
5724 
5725  for (t = 0; t < ths->d; t++)
5726  _n[t] = (int)(ths->n[t]);
5727 
5728  ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
5729  ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
5730  Y(free)(_n);
5731  }
5732 #ifdef _OPENMP
5733 }
5734 #endif
5735  }
5736 
5737  if(ths->flags & NFFT_SORT_NODES)
5738  ths->index_x = (INT*) Y(malloc)(sizeof(INT) * 2U * (size_t)(ths->M_total));
5739  else
5740  ths->index_x = NULL;
5741 
5742  ths->mv_trafo = (void (*) (void* ))X(trafo);
5743  ths->mv_adjoint = (void (*) (void* ))X(adjoint);
5744 }
5745 
5746 void X(init)(X(plan) *ths, int d, int *N, int M_total)
5747 {
5748  INT t; /* index over all dimensions */
5749 
5750  ths->d = (INT)d;
5751 
5752  ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
5753 
5754  for (t = 0; t < d; t++)
5755  ths->N[t] = (INT)N[t];
5756 
5757  ths->M_total = (INT)M_total;
5758 
5759  ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
5760 
5761  for (t = 0; t < d; t++)
5762  ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
5763 
5764  ths->m = WINDOW_HELP_ESTIMATE_m;
5765 
5766  if (d > 1)
5767  {
5768 #ifdef _OPENMP
5769  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5770  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
5771  NFFT_OMP_BLOCKWISE_ADJOINT;
5772 #else
5773  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5774  FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
5775 #endif
5776  }
5777  else
5778  ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5779  FFTW_INIT | FFT_OUT_OF_PLACE;
5780 
5781  ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5782 
5783  ths->K = 0;
5784  init_help(ths);
5785 }
5786 
5787 void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m,
5788  unsigned flags, unsigned fftw_flags)
5789 {
5790  INT t; /* index over all dimensions */
5791 
5792  ths->d = (INT)d;
5793  ths->M_total = (INT)M_total;
5794  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5795 
5796  for (t = 0; t < d; t++)
5797  ths->N[t] = (INT)N[t];
5798 
5799  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5800 
5801  for (t = 0; t < d; t++)
5802  ths->n[t] = (INT)n[t];
5803 
5804  ths->m = (INT)m;
5805 
5806  ths->flags = flags;
5807  ths->fftw_flags = fftw_flags;
5808 
5809  ths->K = 0;
5810  init_help(ths);
5811 }
5812 
5813 void X(init_lin)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, int K,
5814  unsigned flags, unsigned fftw_flags)
5815 {
5816  INT t; /* index over all dimensions */
5817 
5818  ths->d = (INT)d;
5819  ths->M_total = (INT)M_total;
5820  ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5821 
5822  for (t = 0; t < d; t++)
5823  ths->N[t] = (INT)N[t];
5824 
5825  ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
5826 
5827  for (t = 0; t < d; t++)
5828  ths->n[t] = (INT)n[t];
5829 
5830  ths->m = (INT)m;
5831 
5832  ths->flags = flags;
5833  ths->fftw_flags = fftw_flags;
5834 
5835  ths->K = K;
5836  init_help(ths);
5837 }
5838 
5839 void X(init_1d)(X(plan) *ths, int N1, int M_total)
5840 {
5841  int N[1];
5842 
5843  N[0] = N1;
5844 
5845  X(init)(ths, 1, N, M_total);
5846 }
5847 
5848 void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total)
5849 {
5850  int N[2];
5851 
5852  N[0] = N1;
5853  N[1] = N2;
5854  X(init)(ths, 2, N, M_total);
5855 }
5856 
5857 void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total)
5858 {
5859  int N[3];
5860 
5861  N[0] = N1;
5862  N[1] = N2;
5863  N[2] = N3;
5864  X(init)(ths, 3, N, M_total);
5865 }
5866 
5867 const char* X(check)(X(plan) *ths)
5868 {
5869  INT j;
5870 
5871  if (!ths->f)
5872  return "Member f not initialized.";
5873 
5874  if (!ths->x)
5875  return "Member x not initialized.";
5876 
5877  if (!ths->f_hat)
5878  return "Member f_hat not initialized.";
5879 
5880  if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
5881  return "Number of nodes too small to use PRE_LIN_PSI.";
5882 
5883  for (j = 0; j < ths->M_total * ths->d; j++)
5884  {
5885  if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
5886  {
5887  return "ths->x out of range [-0.5,0.5)";
5888  }
5889  }
5890 
5891  for (j = 0; j < ths->d; j++)
5892  {
5893  if (ths->sigma[j] <= 1)
5894  return "Oversampling factor too small";
5895 
5896  if(ths->N[j] <= ths->m)
5897  return "Polynomial degree N is smaller than cut-off m";
5898 
5899  if(ths->N[j]%2 == 1)
5900  return "polynomial degree N has to be even";
5901  }
5902  return 0;
5903 }
5904 
5905 void X(finalize)(X(plan) *ths)
5906 {
5907  INT t; /* index over dimensions */
5908 
5909  if(ths->flags & NFFT_SORT_NODES)
5910  Y(free)(ths->index_x);
5911 
5912  if(ths->flags & FFTW_INIT)
5913  {
5914 #ifdef _OPENMP
5915  #pragma omp critical (nfft_omp_critical_fftw_plan)
5916 #endif
5917  FFTW(destroy_plan)(ths->my_fftw_plan2);
5918 #ifdef _OPENMP
5919  #pragma omp critical (nfft_omp_critical_fftw_plan)
5920 #endif
5921  FFTW(destroy_plan)(ths->my_fftw_plan1);
5922 
5923  if(ths->flags & FFT_OUT_OF_PLACE)
5924  Y(free)(ths->g2);
5925 
5926  Y(free)(ths->g1);
5927  }
5928 
5929  if(ths->flags & PRE_FULL_PSI)
5930  {
5931  Y(free)(ths->psi_index_g);
5932  Y(free)(ths->psi_index_f);
5933  Y(free)(ths->psi);
5934  }
5935 
5936  if(ths->flags & PRE_PSI)
5937  Y(free)(ths->psi);
5938 
5939  if(ths->flags & PRE_FG_PSI)
5940  Y(free)(ths->psi);
5941 
5942  if(ths->flags & PRE_LIN_PSI)
5943  Y(free)(ths->psi);
5944 
5945  if(ths->flags & PRE_PHI_HUT)
5946  {
5947  for (t = 0; t < ths->d; t++)
5948  Y(free)(ths->c_phi_inv[t]);
5949  Y(free)(ths->c_phi_inv);
5950  }
5951 
5952  if(ths->flags & MALLOC_F)
5953  Y(free)(ths->f);
5954 
5955  if(ths->flags & MALLOC_F_HAT)
5956  Y(free)(ths->f_hat);
5957 
5958  if(ths->flags & MALLOC_X)
5959  Y(free)(ths->x);
5960 
5961  WINDOW_HELP_FINALIZE;
5962 
5963  Y(free)(ths->sigma);
5964  Y(free)(ths->n);
5965  Y(free)(ths->N);
5966 }
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
Definition: infft.h:1400
#define X(name)
Include header for C99 complex datatype.
Definition: fastsum.h:51
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Definition: infft.h:1368