Why is my fft function not working as expected?

Trying to take the fft of 2D array along one direction (x for example and not y) so it's a 1D FFT of 2D array. I have done this successfully before using Eigen library, but here I am not quite sure how to implement "fftw_plan_many_dft" correctly.

Example code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
static const int nx = 8;
static const int ny = 8;
static const int ncomp = 2;
static const int nyk = ny/2 + 1;
using namespace std;

class MyFftwClass {
public:
    MyFftwClass(void);
    ~MyFftwClass(void);
    void r2cexecute(double rArr[], double cArr[][ncomp]);

private:
    static fftw_plan s_plan; // <-- shared by all instances
    double *m_buffer_in;
    fftw_complex *m_buffer_out;
    
};

class MyiFftwClass { //inverse
public:
    MyiFftwClass(void);
    ~MyiFftwClass(void);
    void c2rexecute(double cArr[][ncomp],double rArr[]);

private:
    static fftw_plan s_plan1; // <-- shared by all instances
	fftw_complex *m_buffer_in1;
    double *m_buffer_out1;
};

MyFftwClass r2c;
MyiFftwClass c2r; 
int main(){	
double *uFun; 
	uFun = (double*) fftw_malloc((nx*(ny+1))*sizeof(double));

    for(int i = 0; i< nx; i++){
		for(int j = 0; j< ny+1; j++){
            uFun[j + (ny+1)*i] =//some real function
        }
    }
 fftw_complex *uFunk;
uFunk = (fftw_complex*) fftw_malloc((nx*nyk) * sizeof(fftw_complex));
memset(uFunk, 42, (nx*nyk)* sizeof(fftw_complex));

r2c.r2cexecute(uFun,uFunk); //take forward FFT
 double *Out; 
Out = (double*) fftw_malloc((nx*(ny+1))*sizeof(double));
 memset(Out, 42, (nx*(ny+1)) * sizeof(double)); 

c2r.c2rexecute(uFunk,Out);
}
MyiFftwClass::MyiFftwClass(void)
{
    m_buffer_in1 = fftw_alloc_complex((nx*nyk));
    m_buffer_out1 = fftw_alloc_real((nx*ny)); 
    if (!(m_buffer_in1 && m_buffer_out1))
    {
        throw new std::runtime_error("Failed to allocate memory!");
    }

    if (!s_plan1)
    {
        s_plan1 = fftw_plan_many_dft_c2r(1,&nx, (nyk),m_buffer_in1,nullptr,(nyk),1,m_buffer_out1,nullptr, (ny+1),1, FFTW_MEASURE);
        
        if (!s_plan1)
        {
            throw new std::runtime_error("Failed to create plan!");
        }
    }

}
MyiFftwClass::~MyiFftwClass(void)
{

    fftw_free(m_buffer_in1);
    fftw_free(m_buffer_out1);
}

void MyiFftwClass::c2rexecute(double cArr[][ncomp],double rArr[]) //void MyFftwClass::execute(double *const in, fftw_complex *const out)
{
	memcpy(m_buffer_in1,cArr, sizeof(fftw_complex) * (nx*nyk));
    fftw_execute_dft_c2r(s_plan1, m_buffer_in1, m_buffer_out1); //instead of fftw_excute(plan)
	memcpy(rArr, m_buffer_out1,sizeof(double) * (nx*ny));
 //renormalize 
for(int i = 0; i < nx; i++){
		for( int j = 0; j < ny+1; j++){
			rArr[j + (ny+1)*i] = rArr[j + (ny+1)*i] * 1.0 / (nx);			
		}
	}
}

fftw_plan MyiFftwClass::s_plan1 = NULL;

MyFftwClass::MyFftwClass(void)
{
    m_buffer_in = fftw_alloc_real((nx*ny)); 
    m_buffer_out = fftw_alloc_complex((nx*nyk));
    if (!(m_buffer_in && m_buffer_out))
    {
        throw new std::runtime_error("Failed to allocate memory!");
    }
	
   if (!s_plan)
    {
	
        s_plan = fftw_plan_many_dft_r2c(1,&nx, (ny+1),m_buffer_in,nullptr,(ny+1),1,m_buffer_out,nullptr, (ny+1),1, FFTW_MEASURE);
   	
        if (!s_plan)
        {
            throw new std::runtime_error("Failed to create plan!");
        }
    }

    
}

MyFftwClass::~MyFftwClass(void)
{
    fftw_free(m_buffer_in);
    fftw_free(m_buffer_out);
}

void MyFftwClass::r2cexecute(double rArr[], double cArr[][ncomp]) MyFftwClass::execute(double *const in, fftw_complex *const out)
{

    memcpy(m_buffer_in, rArr,  sizeof(double) * (nx*ny));
    fftw_execute_dft_r2c(s_plan, m_buffer_in, m_buffer_out); 
    memcpy(cArr, m_buffer_out, sizeof(fftw_complex) * (nx*nyk));
}

fftw_plan MyFftwClass::s_plan = NULL;


This doesn't return the correct output, Out and uFun are not matching (which they should). What am I doing wrong here?
Last edited on
I think that by convention Mi,j is the element at row i, column j in the matrix M.
1. Which is the column index? Is it j or i?
2. What about storage order? Is it row-major or column major?

TBH I'm inclined to just use x, y everywhere: x is columns, y is rows. Cartesian coordinates. With the arrays in row major order, so adjacent elements on the same row are next to each other in memory.

Let me know if this helps or your set up is different.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <cmath>
#include <iomanip>
#include <complex>
#include <iostream>

#include <fftw3.h>

static const int nx = 8;
static const int ny = 8;
static const int nyk = ny / 2 + 1;

static constexpr int width = 8;

using namespace std;

// Access the element at column x, row y in an array.  xy are cartesian
// coordinates.  
int xy(int x, int y, int number_of_cols) { return x + y * number_of_cols; }
void print(double* m, int width, int height)
{
  std::cout << std::fixed << std::setprecision(2);
  for (int y = 0; y < height; ++y)
  {
    for (int x = 0; x < width; ++x)
      std::cout << std::setw(6) << m[xy(x, y, width)] << ' ';
    std::cout << '\n';
  }

  std::cout << '\n';
}

void print(fftw_complex* m, int width, int height)
{
  std::cout << std::fixed << std::setprecision(2);
  for (int y = 0; y < height; ++y)
  {
    for (int x = 0; x < width; ++x)
      std::cout << std::setw(6) << std::complex{m[xy(x, y, width)][0], m[xy(x, y, width)][1]} <<  ' ';
    std::cout << '\n';
  }

  std::cout << '\n';
}

int main(){	
  double* uFun        = fftw_alloc_real(nx * (ny + 1));
  fftw_complex* uFunk = fftw_alloc_complex(nx * (ny + 1));
  double* Out         = fftw_alloc_real(nx * (ny + 1));
  
  // This plan computes independent one-dimensional DFTs of each row of the
  // array.
  static fftw_plan r2c_plan = fftw_plan_many_dft_r2c(
    1,                          // int rank,
    &nx,                        // const int *n,
    ny + 1,                     // int howmany,
    uFun,                       // double *in,
    nullptr,                    // const int *inembed,
    1,                          // int istride, // adjacent elements within a row are adjacent.
    nx,                         // int idist,   // adjacent elements within a column are separate
    uFunk,                      // fftw_complex *out,
    nullptr,                    // const int *onembed,
    1,                          // int ostride, 
    nx,                         // int odist,
    FFTW_ESTIMATE               // unsigned flags
    );
  
  static fftw_plan c2r_plan = fftw_plan_many_dft_c2r(
    1,                          // int rank,
    &nx,                        // const int *n,
    ny + 1,                     // int howmany,
    uFunk,                      // double *in,
    nullptr,                    // const int *inembed,
    1,                          // int istride, // adjacent elements within a row are adjacent.
    nx,                         // int idist,   // adjacent elements within a column are separate
    Out,                        // fftw_complex *out,
    nullptr,                    // const int *onembed,
    1,                          // int ostride, 
    nx,                         // int odist,
    FFTW_ESTIMATE               // unsigned flags
    );

  for(int y = 0; y < ny + 1; y++) 
    for(int x = 0; x < nx; x++)
    {
      double const XXxy = +std::sin(x * 2.0 * 3.14159 / nx);
      double const YYxy = -std::cos(y * 3.14159 / ny) - 0.5;
      uFun[xy(x, y, nx)] = YYxy * YYxy * XXxy;
    }
  
  print(uFun, nx, ny+1);
  fftw_execute_dft_r2c(r2c_plan, uFun, uFunk);
  print(uFunk, nx, ny+1);
  fftw_execute_dft_c2r(c2r_plan, uFunk, Out);

  // renormalize:
  for(int y = 0; y < ny + 1; y++) 
    for(int x = 0; x < nx; x++)
      Out[xy(x, y, nx)] /= nx;
  
  print(Out, nx, ny+1);
  
  // Good practice: free stuff in reverse order of initialization
  fftw_free(Out);
  fftw_free(uFunk);
  fftw_free(uFun);
}


The output is here:

  0.00   1.59   2.25   1.59   0.00  -1.59  -2.25  -1.59
  0.00   1.43   2.03   1.43   0.00  -1.43  -2.03  -1.43
  0.00   1.03   1.46   1.03   0.00  -1.03  -1.46  -1.03
  0.00   0.55   0.78   0.55   0.00  -0.55  -0.78  -0.55
  0.00   0.18   0.25   0.18   0.00  -0.18  -0.25  -0.18
  0.00   0.01   0.01   0.01   0.00  -0.01  -0.01  -0.01
  0.00   0.03   0.04   0.03   0.00  -0.03  -0.04  -0.03
  0.00   0.13   0.18   0.13   0.00  -0.13  -0.18  -0.13
  0.00   0.18   0.25   0.18   0.00  -0.18  -0.25  -0.18

(0.00,0.00) (-0.00,-9.00) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-8.11) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-5.83) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-3.12) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-1.00) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-0.06) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-0.17) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-0.72) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)
(0.00,0.00) (-0.00,-1.00) (0.00,-0.00) (0.00,-0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00) (0.00,0.00)

  0.00   1.59   2.25   1.59   0.00  -1.59  -2.25  -1.59
  0.00   1.43   2.03   1.43   0.00  -1.43  -2.03  -1.43
 -0.00   1.03   1.46   1.03   0.00  -1.03  -1.46  -1.03
  0.00   0.55   0.78   0.55   0.00  -0.55  -0.78  -0.55
 -0.00   0.18   0.25   0.18   0.00  -0.18  -0.25  -0.18
  0.00   0.01   0.01   0.01   0.00  -0.01  -0.01  -0.01
 -0.00   0.03   0.04   0.03   0.00  -0.03  -0.04  -0.03
  0.00   0.13   0.18   0.13   0.00  -0.13  -0.18  -0.13
  0.00   0.18   0.25   0.18   0.00  -0.18  -0.25  -0.18
Last edited on
@mbozzi
Thanks a lot, your understanding of fftw always amazes me! Your example works perfectly, but I guess for sake of consistency, I'd like to stick to my example code so it helps me see where my mistakes at. I see that your plans are set up differently and so if I change my plans accordingly:

c2r plan
 
s_plan1 = fftw_plan_many_dft_c2r(1,&nx, (ny+1),m_buffer_in1,nullptr,1,(nx),m_buffer_out1,nullptr,1, (nx), FFTW_ESTIMATE);


r2c plan:
 
s_plan = fftw_plan_many_dft_r2c(1,&nx, (ny+1),m_buffer_in,nullptr,1,(nx),m_buffer_out,nullptr,1, (nx), FFTW_ESTIMATE);

Now this returns "half" of the correct results, I think "half" the number of columns+1 (like nyk). The way I am printing my results is nowhere near your sophisticated function but I am doing something like:
1
2
3
4
5
6
7
8
9
std::ofstream file3("Out.txt");
    if (file3.is_open())
    {
        for (int i = 0; i < nx; ++i) {
        		for (int j = 0; j < ny+1; ++j) {
                    file3 << Out[j + (ny+1)*i] << '\n'; 
                }
        }
    }



0    8.88178e-16  0         4.44089e-16  0           0         0.931981  -29.7959
0   -0.931981    -1.31802  -0.931981   4.44089e-16   0         2.25       1.78e-106
0   -3.18198     -4.5      -3.18198   -4.996e-16     0        -32.432     1.78253e-106
0   -5.43198    -7.68198  -5.43198    -6.47085e-16 -7.29594   -2.79594    1.78253e-106
0   -6.36396    -9        -6.36396        0         11.25      18.932     1.782e-106
0   -5.43198    -7.68198  -5.43198        0        -11.7959    17.3878    1.78253e-106
0   -3.18198    -4.5      -3.18198        0         8.61396    3.56802    1.78e-106
0    -0.931981   -1.31802  -0.931981      0        -3.56802   0.931981   1.78e-106
-8.88178e-16   4.44089e-16 -4.44e-16 -1.90484e-17     0       -0.386039  -11.7959  1.78e-106



The correct output looks like:

 0    0          0           0           0             -0         -0         -0
-0   -0.931981  -1.31802    -0.931981   -1.61411e-16    0.931981   1.31802    0.931981
-0   -3.18198   -4.5        -3.18198    -5.51091e-16    3.18198    4.5        3.18198
-0   -5.43198   -7.68198    -5.43198    -9.40771e-16    5.43198    7.68198    5.43198
-0   -6.36396   -9          -6.36396    -1.10218e-15    6.36396    9          6.36396
-0   -5.43198   -7.68198    -5.43198    -9.40771e-16    5.43198    7.68198    5.43198 
-0   -3.18198   -4.5        -3.18198    -5.51091e-16    3.18198    4.5        3.18198
-0   -0.931981  -1.31802    -0.931981   -1.61411e-16    0.931981   1.31802    0.931981
-0   -0         -0          -0          -0              0          0          0

I think that the meaning of indices and and array dimensions are inconsistent in the original post. Or if there is a convention, I couldn't discern it.

Could you please answer these?
- Which is the row index into the matrix -- i, or j?
- Which is the number of rows in the matrix -- is it ny+1 or nx?
- Should the matrix use row- or column-major storage order?

For now (to avoid waiting for a reply), I decided:
The row index is i, the column index is j, uFun has ny + 1 rows and nx columns, and every matrix uses row-major storage order. The code is in the next post.
Last edited on
(Don't miss the post above this one.)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#include <cmath>
#include <iomanip>
#include <complex>
#include <cstring>
#include <iostream>

#include <fftw3.h>

static const int nx = 8;
static const int ny = 8;
static const int ncomp = 2;
static const int nyk = ny/2 + 1;
static const int nxk = nx/2 + 1;
using namespace std;

// NOTE(mbozzi): Non-static member data was removed from these classes because
// it is only used in the constructor.
class MyFftwClass {
public:
  MyFftwClass(void);

  // ~MyFftwClass(void);
  void r2cexecute(double rArr[], double cArr[][ncomp]);

private:
  static fftw_plan s_plan; // <-- shared by all instances
  // double *m_buffer_in;
  // fftw_complex *m_buffer_out;
};

class MyiFftwClass { //inverse
public:
  MyiFftwClass(void);
  // ~MyiFftwClass(void);
  void c2rexecute(double cArr[][ncomp],double rArr[]);
private:
  static fftw_plan s_plan1; // <-- shared by all instances
  // fftw_complex *m_buffer_in1;
  // double *m_buffer_out1;
};

MyFftwClass r2c;
MyiFftwClass c2r;

// Assumes row-major storage where i = row index, j = column index.
int ij(int i, int j, int number_of_cols) { return j + i * number_of_cols; }

void print(double* m, int width, int height)
{
  std::cout << std::fixed << std::setprecision(2);
  for (int i = 0; i < height; ++i)
  {
    for (int j = 0; j < width; ++j)
      std::cout << std::setw(6) << m[ij(i, j, width)] << ' ';
    std::cout << '\n';
  }

  std::cout << '\n';
}

void print(fftw_complex* m, int width, int height)
{
  std::cout << std::fixed << std::setprecision(2);
  for (int i = 0; i < height; ++i)
  {
    for (int j = 0; j < width; ++j)
      std::cout << std::setw(6) << std::complex{m[ij(i, j, width)][0], m[ij(i, j, width)][1]} <<  ' ';
    std::cout << '\n';
  }

  std::cout << '\n';
}

int main()
{
  double *uFun;
  uFun = (double*) fftw_malloc((nx*(ny+1))*sizeof(double));

  // for(int i = 0; i< nx; i++){
  //   for(int j = 0; j< ny+1; j++){
  //     uFun[j + (ny+1)*i] =//some real function
  //       }
  // }
  // NOTE(mbozzi): Using the same test function as earlier
  for(int i = 0; i < ny + 1; i++)
    for(int j = 0; j < nx; j++)
    {
      double const XX = +std::sin(j * 2.0 * 3.14159 / nx);
      double const YY = -std::cos(i * 3.14159 / ny) - 0.5;
      uFun[ij(i, j, nx)] = YY * YY * XX;
    }

  print(uFun, nx, ny + 1);

  // fftw_complex *uFunk;
  // uFunk = (fftw_complex*) fftw_malloc((nx*nyk) * sizeof(fftw_complex));
  // memset(uFunk, 42, (nx*nyk)* sizeof(fftw_complex));
  fftw_complex *uFunk;
  uFunk = (fftw_complex*) fftw_malloc((nxk * (ny + 1)) * sizeof(fftw_complex));
  memset(uFunk, 42, (nxk*(ny + 1))* sizeof(fftw_complex));

  r2c.r2cexecute(uFun,uFunk); //take forward FFT

  print(uFunk, nxk, ny + 1);

  double *Out;
  Out = (double*) fftw_malloc((nx*(ny+1))*sizeof(double));
  memset(Out, 42, (nx*(ny+1)) * sizeof(double));

  c2r.c2rexecute(uFunk,Out);

  print(Out, nx, ny + 1);
}

MyFftwClass::MyFftwClass(void)
{
  // m_buffer_in = fftw_alloc_real((nx*ny));
  // m_buffer_out = fftw_alloc_complex((nx*nyk));
  double* buffer_in = fftw_alloc_real(nx * (ny + 1));
  fftw_complex* buffer_out = fftw_alloc_complex(nxk * (ny + 1));
  if (!(buffer_in && buffer_out))
  {
    // throw new std::runtime_error("Failed to allocate memory!");
    throw std::runtime_error("Failed to allocate memory!");
  }

  if (!s_plan)
  {
    // s_plan = fftw_plan_many_dft_r2c(1,&nx, (ny+1),m_buffer_in,nullptr,(ny+1),1,m_buffer_out,nullptr, (ny+1),1, FFTW_MEASURE);
    s_plan = fftw_plan_many_dft_r2c(
      1,           // int rank,
      &nx,         // const int *n,
      ny + 1,      // int howmany,
      buffer_in,   // double *in,
      nullptr,     // const int *inembed,
      1,           // int istride, // adjacent elements within a row are adjacent.
      nx,          // int idist,   // adjacent elements within a column are separate
      buffer_out,  // fftw_complex *out,
      nullptr,     // const int *onembed,
      1,           // int ostride,
      nxk,         // int odist,
      FFTW_MEASURE // unsigned flags
      );

    if (!s_plan)
    {
      // throw new std::runtime_error("Failed to create plan!");
      throw std::runtime_error("Failed to create plan!");
    }
  }

  fftw_free(buffer_out);
  fftw_free(buffer_in);
}

// MyFftwClass::~MyFftwClass(void)
// {
//   fftw_free(m_buffer_in);
//   fftw_free(m_buffer_out);
// }

// void MyFftwClass::r2cexecute(double rArr[], double cArr[][ncomp]) MyFftwClass::execute(double *const in, fftw_complex *const out)
// {
//     memcpy(m_buffer_in, rArr,  sizeof(double) * (nx*ny));
//     fftw_execute_dft_r2c(s_plan, m_buffer_in, m_buffer_out);
//     memcpy(cArr, m_buffer_out, sizeof(fftw_complex) * (nx*nyk));
// }
void MyFftwClass::r2cexecute(double *const rArr, fftw_complex *const cArr)
{
  // NOTE(mbozzi): Use cArr anr rArr directly, instead of using
  // m_buffer_in/m_buffer_out followed by copying their contents.
  fftw_execute_dft_r2c(s_plan, rArr, cArr);
}


MyiFftwClass::MyiFftwClass(void)
{
  // m_buffer_in1 = fftw_alloc_complex((nx*nyk));
  // m_buffer_out1 = fftw_alloc_real((nx*ny));
  fftw_complex* buffer_in1 = fftw_alloc_complex(nxk * (ny + 1));
  double* buffer_out1 = fftw_alloc_real(nx*(ny + 1));

  if (!(buffer_in1 && buffer_out1))
  {
    // NOTE(mbozzi): prefer to "throw" instead of "throw new".  It's a
    // particularly bad idea in the case of out-of-memory error, because "new"
    // itself allocates memory.
    //
    // If you did "throw new", and care about error recovery, you ought to catch
    // the pointer and delete it:
    //   try { throw new std::runtime_error(""); } catch(std::runtime_error* pe) { delete pe; }
    // Else the new exception object will not be freed (a small memory leak).

    // NOTE(mbozzi): Consider calling std::abort instead of throwing exceptions.

    // throw new std::runtime_error("Failed to allocate memory!");
    throw std::runtime_error("Failed to allocate memory!");
  }

  if (!s_plan1)
  {
    // s_plan1 = fftw_plan_many_dft_c2r(1,&nx, (nyk),m_buffer_in1,nullptr,(nyk),1,m_buffer_out1,nullptr, (ny+1),1, FFTW_MEASURE);
    s_plan1 = fftw_plan_many_dft_c2r(
      1,           // int rank,
      &nx,         // const int *n,
      ny + 1,      // int howmany,
      buffer_in1,  // double *in,
      nullptr,     // const int *inembed,
      1,           // int istride, // adjacent elements within a row are adjacent.
      nxk,         // int idist,   // adjacent elements within a column are separate
      buffer_out1, // fftw_complex *out,
      nullptr,     // const int *onembed,
      1,           // int ostride,
      nx,          // int odist,
      FFTW_MEASURE // unsigned flags
      );
    if (!s_plan1)
    {
      // throw new std::runtime_error("Failed to create plan!");
      throw std::runtime_error("Failed to create plan!");
    }
  }

  fftw_free(buffer_out1);
  fftw_free(buffer_in1);
}

// MyiFftwClass::~MyiFftwClass(void)
// {
//   fftw_free(m_buffer_in1);
//   fftw_free(m_buffer_out1);
// }

//void MyiFftwClass::c2rexecute(double cArr[][ncomp],double rArr[]) void MyFftwClass::execute(double *const in, fftw_complex *const out)
// {
//   memcpy(m_buffer_in1,cArr, sizeof(fftw_complex) * (nx*nyk));
//   fftw_execute_dft_c2r(s_plan1, m_buffer_in1, m_buffer_out1); //instead of fftw_excute(plan)
//   memcpy(rArr, m_buffer_out1,sizeof(double) * (nx*ny));
//   //renormalize
//   for(int i = 0; i < nx; i++){
//     for( int j = 0; j < ny+1; j++){
//       //rArr[j + (ny+1)*i] = rArr[j + (ny+1)*i] * 1.0 / (nx);
//     }
//   }
// }
void MyiFftwClass::c2rexecute(double cArr[][ncomp],double rArr[])
{
  fftw_execute_dft_c2r(s_plan1, cArr, rArr); // instead of fftw_excute(plan)
  // renormalize
  for (int i = 0; i < ny + 1; ++i)
    for (int j = 0; j < nx; ++j)
      rArr[ij(i, j, nx)] /= nx;
}

fftw_plan MyiFftwClass::s_plan1 = NULL;
fftw_plan MyFftwClass::s_plan = NULL;
@mbozzi

Could you please answer these?
- Which is the row index into the matrix -- i, or j?
- Which is the number of rows in the matrix -- is it ny+1 or nx?
- Should the matrix use row- or column-major storage order?

I think what is causing me a lot of issues is jumping b/w MATLAB and C++ where MATLAB is a column-major layout by default and C/C++ is not. So, instead my question should be is there a preference between
row- or column-major storage order
? I am using OpenMP (eventually) to make things faster so should I choose a storage order based on that as well?

I am inclined to stay consistent with MATLAB for simplicity especially for comparison b/w results/outputs. But, I think I would like to be smart with my choice and not go with the easier option.
I think what is causing me a lot of issues is jumping b/w MATLAB and C++ where MATLAB is a column-major layout by default and C/C++ is not. So, instead my question should be is there a preference?
Not really.
I am inclined to stay consistent with MATLAB for simplicity especially for comparison b/w results/outputs. But, I think I would like to be smart with my choice and not go with the easier option. [...] I am using OpenMP (eventually) to make things faster so should I choose a storage order based on that as well?
I'm still new to OpenMP, but I don't think it cares at all about storage order. Column-major is a good choice.

That being said your choice of storage order shouldn't affect the output of your C++ program. Consider the matrix
| 1.0, 2.0 |
| 3.0, 4.0 |
It's represented by both the row-major array [1.0, 2.0, 3.0, 4.0] and the column-major array [1.0, 3.0, 2.0, 4.0]. Both arrays represent the same matrix. No matter which one your program is using, you should print out the original matrix every time.

To switch the above program to column-major storage order:
1. Replace ij with this (make sure to remove ij).
1
2
// Assumes column-major storage where i = row index, j = column index.
int ij_col_major(int i, int j, int number_of_rows) { return i + j * number_of_rows; }
Notice the third parameter changed from number_of_cols to number_of_rows. By removing ij from the program the compiler will force us to visit everywhere it is used, so we can adjust the third argument.

Go through all the calls to ij and change them to ij_col_major. Adjust the third argument to the height of the matrix instead of the width. For example
rArr[ij(i, j, nx)] /= nx;
should be changed into
rArr[ij_col_major(i, j, ny + 1)] /= nx;

2. Adjust the calls to FFTW as follows. The only changes are in idist, istride, odist & ostride:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
s_plan = fftw_plan_many_dft_r2c(
      1,           // int rank,
      &nx,         // const int *n,
      ny + 1,      // int howmany,
      buffer_in,   // double *in,
      nullptr,     // const int *inembed,
      ny + 1,      // int istride, // adjacent elements within a row vector are separate in memory
      1,           // int idist,   // adjacent elements within a column vector are adjacent in memory
      buffer_out,  // fftw_complex *out,
      nullptr,     // const int *onembed,
      ny + 1,      // int ostride,
      1,           // int odist,
      FFTW_MEASURE // unsigned flags
      );
    // ...
s_plan1 = fftw_plan_many_dft_c2r(
      1,           // int rank,
      &nx,         // const int *n,
      ny + 1,      // int howmany,
      buffer_in1,  // double *in,
      nullptr,     // const int *inembed,
      ny + 1,      // int istride, // adjacent elements within a row vector are separate in memory
      1,           // int idist,   // adjacent elements within a column vector are adjacent in memory
      buffer_out1, // fftw_complex *out,
      nullptr,     // const int *onembed,
      ny + 1,      // int ostride,
      1,           // int odist,
      FFTW_MEASURE // unsigned flags
      );


I think that's it.
Last edited on
@mbozzi

Thanks a lot! I think this answers my question and gets me the output I want. I am still struggling with indexing as I write in C/C++ but this is perfect!
Hi

Just my 2 cents worth*:

In the mid 1980's I saw some code that did a whole lot stuff with matrices, and used i and j as the indexes. The problem was confusion/interchange between i and j because they looked so similar. There is something written about that in CppCoreGuideLines.


* I freely admit that I am an amateur C++ coder, and don't know anything about the fftw library.
it makes no difference whether you store your matrix row or column major conceptually. BUT C++ WILL STORE IT ROW MAJOR at the bytes level off in your ram. You MUST understand that how you have it conceptually/logically stored for your thinking about it and design is one thing, but c++ only does it the row major way.

so if you store it column major, you are really just, in the c++ world, storing it transposed**. That is fine, as long as you understand that!

What does it mean or matter? If you store the 1-d matrix data** in c++ way, row major, it means that iterating over 1 column is expensive, because its skipping # of columns each iteration, and possibly having memory paging to do that (worst case, for large matrix, every single iteration!).
If you store it column major, it just reverses: now you changed it so that iterating over rows is now expensive, exactly the same as above, you are skipping entries and possibly jumping too far in memory.

You need to know WHICH way you have it in the actual c++ memory layout and design you code to iterate it, as much as is possible, so that its going sequentially in the c++ memory (so across rows in row major, or across columns in column major) and not the other way. Its simple, but its a pretty big deal to get this right from the design phase forward.

** I assume you will store it in 1d blocks. If you store it scattered to the 4 winds using 2d and pointer techniques, then it really does not matter, as it will have performance hits regardless of what you do. However, ... if you are multi threading it all, and you scatter it, but each thread/cpu is only dealing with 1 row or column in a solid 1d block, you don't lose as much there. Its when one single CPU is hopping memory like mad that you get penalized. I suspect a careful ** design or vector of vectors design would lose a lot less from the issue if you managed it very carefully.
Last edited on
Registered users can post here. Sign in or register to post.