Vector of struct vs. two 1D vectors

Some suggestions:

1. Avoid making local variables static. The compiler has to ensure they're initialized exactly once in a thread-safe way, which costs a guard check on every call.

2. Avoid passing tiny objects like double by reference. As a rule of thumb, pass-by-value is preferable to pass-by-reference if copying the object costs 3 * sizeof(void*) bytes or less; on a 64-bit machine a pointer is 8 bytes wide. See https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#Rf-in

3. Since there's only one set of constants, there's no point in passing them in as function parameters. Make them globally accessible and refer to them directly.

4. Give file-local functions internal linkage by marking them static. This may or may not help the compiler optimize. (A sketch illustrating all four points follows this list.)
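
A minimal sketch of the four points (the function and constants here are hypothetical, not from the reviewed program):

// Before: static local, tiny types passed by reference, a constant passed
// as a parameter, external linkage.
double scale(const double& x, const double& factor) {
    static const double offset{ 1.0 };  // thread-safe init guard runs on every call
    return factor * x + offset;
}

// After: constants at namespace scope, doubles passed by value,
// static for internal linkage.
const double factor{ 2.0 };
const double offset{ 1.0 };

static double scale(double x) {
    return factor * x + offset;
}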

Applying these suggestions to your program decreases the execution time from 154 seconds to 114 seconds:

#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <fstream>
#include <chrono>

// Hyperparameters.
const double L{ 10 };
const int N_iter{ 20000 };
const int n_meas{ 10 };
const int n_part{ 1000 };
const double beta{ 300 };
const double gamma{ 0.05 };
const double kappa{ 1. / n_part };
const double sigma_2{ 1 };
const double stepsize{ 0.1 };
const double stepsize_half{ stepsize / 2.0 };

const int randomseed{ 1 };

struct coordinate {
  double x{ 0 };
  double y{ 0 };
};


static double compute_sq_distances(coordinate pos1,  coordinate pos2,
  double& dx, double& dy) {

  // box volume = [-L,L]^n

  const double two_L = 2 * L;
  dx = pos1.x - pos2.x;
  dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

  dy = pos1.y - pos2.y;
  dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

  return dx * dx + dy * dy;

}

static double force_term(double& dist2) {

  double pref{ -kappa / (2 * sigma_2) };
  double expo;
  expo = exp(-dist2 / (2 * sigma_2));

  return pref * expo;

}


static void compute_force(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) {

  double dx, dy;
  double dist2;
  int n_part = forces.size();
  double force;

  for (int i = 0; i < n_part; ++i) {
    forces[i].x = forces[i].y = 0;
  }

  for (int i = 0; i < n_part; ++i) {
    for (int j = i + 1; j < n_part; ++j) {
      dist2 = compute_sq_distances(positions[i], positions[j], dx, dy);
      force = force_term(dist2);
      forces[i].x += force * dx;
      forces[i].y += force * dy;
      forces[j].x += -force * dx;
      forces[j].y += -force * dy;
    }
  }

  return;
}


static void B_step(const std::vector <coordinate>& positions, std::vector <coordinate>& velocities, std::vector <coordinate>& forces) {

  const int n_part = positions.size();

  // Update velocities.
  for (int i = 0; i < n_part; ++i) {
    velocities[i].x += stepsize_half * forces[i].x;
    velocities[i].y += stepsize_half * forces[i].y;
  }

  return;

}

static void A_step(std::vector <coordinate>& positions, const std::vector <coordinate>& velocities) {

  int n_part = positions.size();
  for (int i = 0; i < n_part; ++i) {
    positions[i].x += stepsize_half * velocities[i].x;
    positions[i].y += stepsize_half * velocities[i].y;
  }

  return;
}

static void O_step(std::vector <coordinate>& velocities, std::mt19937& RNG) {

  std::normal_distribution<> normal{ 0,1 };
  const int n_part = velocities.size();
  const double a = exp(-gamma * stepsize);
  const double pref = sqrt(1 / beta * (1 - a * a));

  for (int i = 0; i < n_part; ++i) {
    velocities[i].x = a * velocities[i].x + pref * normal(RNG);
    velocities[i].y = a * velocities[i].y + pref * normal(RNG);
  }

  return;

}

static void apply_periodic_boundaries(std::vector <coordinate>& positions) {

  const int n_part = positions.size();
  const double two_L = 2 * L;
  double x, y;

  for (int i = 0; i < n_part; ++i) {
    x = positions[i].x;
    positions[i].x = x > L ? x - two_L : (x < -L ? x + two_L : x);
    y = positions[i].y;
    positions[i].y = y > L ? y - two_L : (y < -L ? y + two_L : y);
  }

  return;

}



int main(int argc, char* argv[]) {
  // Prepare simulation.
  // Prepare RNG.
  std::mt19937 twister;

  std::seed_seq seq{ 1,20,3200,403,5 * randomseed + 1,12000,73667,9474 + randomseed,19151 - randomseed };
  std::vector < std::uint32_t > seeds(1);
  seq.generate(seeds.begin(), seeds.end());
  twister.seed(seeds.at(0));

  // Set positions.
  std::uniform_real_distribution<double> box_uniform(-L, L);
  std::vector <coordinate> positions(n_part);
  for (int i = 0; i < n_part; ++i) {
    positions[i].x = box_uniform(twister);
    positions[i].y = box_uniform(twister);
  }

  // Set forces.
  std::vector <coordinate> forces(n_part);
  compute_force(forces, positions);

  // Set velocities.
  std::vector <coordinate> velocities(n_part);

  // Main loop.

  auto t1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i <= N_iter; ++i) {

    B_step(positions, velocities, forces);
    A_step(positions, velocities);
    apply_periodic_boundaries(positions);
    O_step(velocities, twister);
    A_step(positions, velocities);
    apply_periodic_boundaries(positions);
    compute_force(forces, positions);
    B_step(positions, velocities, forces);

    if (i % 1000 == 0) std::cout << "Iteration " << i << " done!\n";

  }

  auto t2 = std::chrono::high_resolution_clock::now();
  auto ms_int = std::chrono::duration_cast <std::chrono::seconds> (t2 - t1);
  std::cout << "Execution took " << ms_int.count() << " seconds!\n";

  return 0;

}
The new one runs a lot more iterations than before. I got 95 seconds on the above version.

Cut 2 seconds off consistently with a minor change:

static double force_term(const double& dist2)
{
    return (-kappa / (2 * sigma_2)) * exp(-dist2 / (2 * sigma_2));
}
Some calculations can be moved to global scope since they are const. Consider:

#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <chrono>
#include <algorithm>

// Hyperparameters.
constexpr double L { 10 };
constexpr unsigned N_iter { 10'000 };
constexpr unsigned n_part { 1'000 };
constexpr double beta { 300.0 };
constexpr double gamma { 0.05 };
constexpr double kappa { 1. / n_part };
constexpr double sigma_2 { 1 };
constexpr double stepsize { 0.1 };
constexpr double stepsize_half { stepsize / 2.0 };
constexpr double two_L { 2 * L };
constexpr double sigma22 { 2 * sigma_2 };
constexpr double pref { -kappa / sigma22 };
constexpr int randomseed { 1 };

const double a { std::exp(-gamma * stepsize) };
const double pref_O { std::sqrt(1 / beta * (1 - a * a)) };

std::normal_distribution<> normal{ 0, 1 };

struct coordinate {
    double x { };
    double y { };
};

static inline double compute_sq_distances(const coordinate& pos1, const coordinate& pos2, double& dx, double& dy) {
    dx = pos1.x - pos2.x;
    dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

    dy = pos1.y - pos2.y;
    dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

    return dx * dx + dy * dy;
}

static inline double force_term(double dist2) {
    return pref * std::exp(-dist2 / sigma22);
}

static void compute_force(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) {
    double dx, dy;

    std::ranges::fill(forces, coordinate {});

    for (unsigned i {}; i < n_part; ++i)
        for (unsigned j { i + 1 }; j < n_part; ++j) {
            const auto force { force_term(compute_sq_distances(positions[i], positions[j], dx, dy)) };

            forces[i].x += force * dx;
            forces[i].y += force * dy;
            forces[j].x += -force * dx;
            forces[j].y += -force * dy;
        }
}

static void B_step(std::vector <coordinate>& velocities, std::vector <coordinate>& forces) {
    for (unsigned i {}; i < n_part; ++i) {
        velocities[i].x += stepsize_half * forces[i].x;
        velocities[i].y += stepsize_half * forces[i].y;
    }
}

static void A_step(std::vector <coordinate>& positions, const std::vector <coordinate>& velocities) {
    for (unsigned i {}; i < n_part; ++i) {
        positions[i].x += stepsize_half * velocities[i].x;
        positions[i].y += stepsize_half * velocities[i].y;
    }
}

static void O_step(std::vector <coordinate>& velocities, std::mt19937& RNG) {
    for (unsigned i {}; i < n_part; ++i) {
        velocities[i].x = a * velocities[i].x + pref_O * normal(RNG);
        velocities[i].y = a * velocities[i].y + pref_O * normal(RNG);
    }
}

static void apply_periodic_boundaries(std::vector <coordinate>& positions) {
    double x, y;

    for (unsigned i {}; i < n_part; ++i) {
        x = positions[i].x;
        positions[i].x = x > L ? x - two_L : (x < -L ? x + two_L : x);
        y = positions[i].y;
        positions[i].y = y > L ? y - two_L : (y < -L ? y + two_L : y);
    }
}

int main() {
  // Prepare simulation.
  // Prepare RNG.
    const std::seed_seq seq{ 1, 20, 3200, 403, 5 * randomseed + 1, 12000, 73667, 9474 + randomseed, 19151 - randomseed };

    std::vector<std::uint32_t> seeds(1);
    std::mt19937 twister;

    seq.generate(seeds.begin(), seeds.end());
    twister.seed(seeds.at(0));

    // Set positions.
    std::uniform_real_distribution<double> box_uniform(-L, L);
    std::vector<coordinate> positions(n_part);
    std::vector<coordinate> forces(n_part);
    std::vector<coordinate> velocities(n_part);

    std::ranges::generate(positions, [&] { return coordinate{ box_uniform(twister), box_uniform(twister) }; });  // generate, not fill: each particle needs its own random draw
    compute_force(forces, positions);

    const auto t1 { std::chrono::high_resolution_clock::now() };

    // Main loop.
    for (unsigned i {}; i <= N_iter; ++i) {
        B_step(velocities, forces);
        A_step(positions, velocities);
        apply_periodic_boundaries(positions);
        O_step(velocities, twister);
        A_step(positions, velocities);
        apply_periodic_boundaries(positions);
        compute_force(forces, positions);
        B_step(velocities, forces);

        if (i > 0 && i % 1000 == 0)
            std::cout << "Iteration " << i << " done!\n";
    }

    const auto t2 { std::chrono::high_resolution_clock::now() };
    const auto ms_int { std::chrono::duration_cast <std::chrono::seconds> (t2 - t1) };

    std::cout << "Execution took " << ms_int.count() << " seconds!\n";
}


which on my laptop now takes 86 seconds (as opposed to 98 and 112 seconds).
That took 37 seconds on mine, but it's not from the latest version, which increased the iteration end point to 20,000.

I struggle with that one; the compiler is supposed to recognize constants even when they're not marked as such, but it only does so sometimes. I can't find any pattern, so I guess when it matters, do as you did and force the issue.

As a side note, "gamma" is triggering my compiler as already defined.

Not getting much out of it now... 2/3 of the remaining time is in force_term().

This helps mine; does it help yours?

coordinate tmp;
for (unsigned i {}; i < n_part; ++i)
{
    tmp = positions[i];
    for (unsigned j { i + 1 }; j < n_part; ++j) {
        const auto force { force_term(compute_sq_distances(tmp, positions[j], dx, dy)) };
Yes. And this helps even more. It brings my time down to 81 seconds.

static void compute_force(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) {
    double dx, dy;

    std::ranges::fill(forces, coordinate {});

    for (unsigned i {}; i < n_part; ++i) {
        const auto posi { positions[i] };
        auto& frci { forces[i] };

        for (unsigned j { i + 1 }; j < n_part; ++j) {
            const auto force { force_term(compute_sq_distances(posi, positions[j], dx, dy)) };
            const auto frcdx { force * dx };
            const auto frcdy { force * dy };

            frci.x += frcdx;
            frci.y += frcdy;
            forces[j].x -= frcdx;
            forces[j].y -= frcdy;
        }
    }
}

as a side note "gamma" is triggering my compiler as already defined.


Not with VS. I get a clean compile with L4 (highest) warnings. There's no gamma defined as part of standard C++. There's lgamma and tgamma (both defined in cmath) but no gamma. Has the compiler you're using defined a gamma() function in cmath?
Yes, it's in the Cygwin g++ <cmath>:

extern double gamma (double);

$ g++ -o gamma_test gamma_test.cpp 
$ ./gamma_test 1.0
gamma( 1 ) = 1

But the output on Ubuntu gcc v4.6.3 is 0!
Well, 1 == 0! (zero factorial), so it seems consistent to me!

Glibc has a gamma() function that is equivalent to lgamma(3) and computes the natural logarithm of the Gamma function; that's why gamma(1) = ln Γ(1) = ln 1 = 0 there.
How on earth did somebody think that "gamma" should mean "log of gamma"?

Anyway, yeah it's dumb historical compiler stuff. Wrapping the constants in their own namespace could make it more portable.
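
For example, something along these lines (the namespace name is arbitrary):

namespace params {
    constexpr double gamma{ 0.05 };  // no longer collides with glibc's ::gamma
    constexpr double beta{ 300.0 };
}

// Referred to as params::gamma, params::beta, and so on.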

Yes, that looks like jargon that crept out of its domain. The gamma I expect is essentially just the factorial (Γ(n) = (n-1)!), which is a dumb name too... just provide a factorial under that name; anyone needing 'gamma' knows what to do, and everyone else needing a factorial also knows what to do. And naming something after a Greek letter is just asking for collision troubles.
Thank you all for your many answers!
I came back to this now and would like to parallelize it using OpenMP.

mbozzi earlier proposed to turn the nested loop in the force routine
for (int i = 0; i < n_part; ++i)
  for (int j = i + 1; j < n_part; ++j)
  {
    // ...
  }

into a single loop via
for (int x = 0; x < (n_part - 1) * n_part / 2; ++x)
{
  int i = x / n_part;
  int j = x % n_part;

  if (i > j)
  {
    i = n_part - i - 2;
    j = n_part - j - 1;
  }

  // ...
}

I think there is an error here, as it does not give the correct (i, j) pairs. It also gives pairs of the form (i, i), and some pairs are missing entirely.
Example:
#include <iostream>
#include <cmath>

int main(){

    int n_part = 5;

    for (int x = 0; x < (n_part - 1) * n_part / 2; ++x)
    {
         int i = x / n_part;
         int j = x % n_part;

         if (i > j)
         {
              i = n_part - i - 2;
              j = n_part - j - 1;
         }
         std::cout<<"i="<<i<<" j="<<j<<std::endl;

    }

}

yields
i=0 j=0
i=0 j=1
i=0 j=2
i=0 j=3
i=0 j=4
i=2 j=4
i=1 j=1
i=1 j=2
i=1 j=3
i=1 j=4


I found a way to obtain the proper pairs, using
for (int k = 0; k < n_part * (n_part - 1) / 2; ++k) {
    int j = floor((1 + sqrt(1 + 8 * k)) / 2);
    int i = k - j * (j - 1) / 2;

    std::cout << "i=" << i << " j=" << j << std::endl;
}
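
For reference, a short derivation of the inversion (not spelled out above): the loop enumerates the pairs with 0 <= i < j in the order (0,1), (0,2), (1,2), (0,3), ..., so k = j*(j-1)/2 + i. For a fixed j, k runs from j*(j-1)/2 up to j*(j+1)/2 - 1, and solving j*(j-1)/2 <= k for the largest integer j gives j = floor((1 + sqrt(1 + 8*k)) / 2), after which i = k - j*(j-1)/2.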

but this makes use of floating-point arithmetic, and if I use it in the force routine proposed by mbozzi, the code becomes much slower.
struct intermediate_result { double dx, dy, force; };
const long unsigned int k_max {n_part * (n_part - 1) / 2};
std::vector<intermediate_result> intermediate_results(k_max);
static void compute_force_naive2(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) 
{
  constexpr double two_L  {2 * L};
  constexpr double two_sigma2 {2*sigma_2};
  constexpr double pref  {-kappa / two_sigma2}; 

  for (int i = 0; i < n_part; ++i) {
    forces[i].x = 0;
    forces[i].y = 0;
  }


  #pragma omp parallel for num_threads(1)
  for (int k = 0; k < k_max; ++k)
  {
    int j = floor((1 + sqrt(1 + 8*k))/2);
    int i = k - j*(j - 1)/2;

    double dx = positions[i].x - positions[j].x;
    if (dx > +L) dx -= two_L;
    else if (dx < -L) dx += two_L;

    double dy = positions[i].y - positions[j].y;
    if (dy > +L) dy -= two_L;
    else if (dy < -L) dy += two_L;

    intermediate_results[k].force = pref * exp(-(dx * dx + dy * dy) / two_sigma2);
    intermediate_results[k].dx = dx;
    intermediate_results[k].dy = dy;

  }
  
   for (int k = 0; k < k_max; ++k)
   {
     int j = floor((1 + sqrt(1 + 8*k))/2);
     int i = k - j*(j - 1)/2;

     forces[i].x += +intermediate_results[k].force * intermediate_results[k].dx;
     forces[i].y += +intermediate_results[k].force * intermediate_results[k].dy;

     forces[j].x += -intermediate_results[k].force * intermediate_results[k].dx;
     forces[j].y += -intermediate_results[k].force * intermediate_results[k].dy;
   }
}
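
One way to keep the fused single loop while avoiding the per-iteration sqrt would be to precompute the (i, j) pairs once, since they never change between calls. A sketch (untested here; the pair_ij struct is made up for illustration, and the table costs about 4 MB for n_part = 1000):

struct pair_ij { int i, j; };

// Built once at startup; thereafter the k-loop just reads pairs[k].i and pairs[k].j.
static const std::vector<pair_ij> pairs = [] {
    std::vector<pair_ij> p;
    p.reserve(n_part * (n_part - 1) / 2);
    for (int i = 0; i < (int)n_part; ++i)
        for (int j = i + 1; j < (int)n_part; ++j)
            p.push_back({ i, j });
    return p;
}();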


It comes down to parallelizing the following nested loop efficiently. The tricky part is that each iteration writes to both forces[i] and forces[j], so a naive parallel-for over i would have different threads racing on the same forces[j] entries:

for (int i = 0; i < N; ++i) {
    for (int j = i + 1; j < N; ++j) {

        auto F = compute_force(i, j);  // schematic: the pair force on particle i from j

        forces[i] += F;
        forces[j] -= F;

    }
}


I thought it would be a standard task in computational physics, but Google did not yield an obvious solution, nor did ChatGPT.
The > should be >=:
https://coliru.stacked-crooked.com/a/6243f79cac2b7fb3

#include <iostream>

int main() {
    int n_part = 5;

    for (int x = 0; x < (n_part - 1) * n_part / 2; ++x)
    {
         int i = x / n_part;
         int j = x % n_part;

         if (i >= j)
         {
              i = n_part - i - 2;
              j = n_part - j - 1;
         }
         std::cout<<"i="<<i<<" j="<<j<<std::endl;

    }
}
It appears that OpenMP can do the heavy lifting internally:

static std::vector<std::vector<coordinate>> forces_for_all_tasks(
  omp_get_max_threads(), std::vector<coordinate>(n_part, coordinate{ .x = 0.0, .y = 0.0 }));
static void compute_force_naive4(std::vector<coordinate>& forces, const std::vector<coordinate>& positions)
{
  for (auto& forces_for_specific_task : forces_for_all_tasks)
    std::fill(forces_for_specific_task.begin(), forces_for_specific_task.end(), coordinate{ .x = 0.0, .y = 0.0 });

#pragma omp parallel for
  for (int i = 0; i < n_part; ++i) {
    for (int j = i + 1; j < n_part; ++j) {

      double dx = positions[i].x - positions[j].x;
      dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

      double dy = positions[i].y - positions[j].y;
      dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

      double dist2 = dx * dx + dy * dy;

      double expo = exp(-dist2 / (2 * sigma_2));

      double force = pref * expo;

      std::vector<coordinate>& forces_for_this_task = forces_for_all_tasks[omp_get_thread_num()];
      forces_for_this_task[i].x += force * dx;
      forces_for_this_task[i].y += force * dy;
      forces_for_this_task[j].x += -force * dx;
      forces_for_this_task[j].y += -force * dy;
    }
  }
  // Sum all of the task-specific forces into the output parameter.
  std::fill(forces.begin(), forces.end(), coordinate{ .x = 0.0, .y = 0.0 });
  for (auto const& forces_for_specific_task : forces_for_all_tasks)
    for (int i = 0; i < n_part; ++i)
    {
      forces[i].x += forces_for_specific_task[i].x;
      forces[i].y += forces_for_specific_task[i].y;
    }
}

Thanks mbozzi.

In your version 4, isn't the workload divided unevenly across the threads without the loop fusion? I assume the outer loop iterations will just be distributed equally across the threads?
In your version 4, isn't the workload divided unevenly across the threads without the loop fusion? I assume the outer loop iterations will just be distributed equally across the threads?

You're right and v4 is 30% slower than v3.

Here's a test program showing that some threads get assigned more work:
#include <iostream>
#include <omp.h>

int main() {
  int n_part = 10;
#pragma omp parallel for
  for (int i = 0; i < n_part; ++i)
  {
    for (int j = i + 1; j < n_part; ++j)
    {
      int n = omp_get_thread_num();
#pragma omp critical
      {
        std::cout << n << '\n';
      }
    }
  }
}


Try V4 but say #pragma omp parallel for schedule(dynamic) instead:

https://www.inf.ufsc.br/~bosco.sobral/ensino/ine5645/OpenMP_Dynamic_Scheduling.pdf
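
With static scheduling, each thread gets a fixed contiguous block of i values, but earlier rows have longer inner loops, so the first threads end up with most of the work. schedule(dynamic) hands iterations to threads as they finish, which balances the triangular workload. The change is one clause:

#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < n_part; ++i) {
    for (int j = i + 1; j < n_part; ++j) {
      // ... pair (i, j) work exactly as in V4 ...
    }
  }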
Thanks mbozzi, V4 with dynamic scheduling seems to be the king!

Here are some runtime measurements (in seconds; I changed the iteration and particle counts):

threads   V4 static   V4 dynamic   V3
1         39          44           58
2         30          22           150 !
3         22          15           91
4         17          11           21


It scales perfectly lol
Great! Regarding V3, those giant numbers must indicate a problem. It seems likely that V3 is wrong, but I can't find the problem in my code.

You may be able to improve a little more on v4 dynamic by splitting up the tasks just once following the approach discussed here:
https://doi.org/10.31449/inf.v45i4.3130

It seems to be better or worse depending on the compiler, but for me with the latest MSVC it was 1.04x (4%) faster than schedule(dynamic). That paper more-or-less answers the question regarding how to efficiently parallelize your triangular loop, though it's an approximation only and my implementation of it sucks:
namespace v7
{
  // Small helper used below; defined here so the snippet is self-contained.
  static constexpr double squared(double x) { return x * x; }

  static void sanity_check_task_endpoints(std::vector<int> const& task_endpoints, int iterations)
  {
    // Holds the number of inner loop iterations completed by each task.
    int const n_tasks = static_cast<int>(task_endpoints.size()) - 1;

    std::vector<int> measured_iteration_counts(n_tasks);
    std::vector<int> computed_iteration_counts(n_tasks);

    for (int t = 0; t < n_tasks; ++t)
    {
      int const start_row = task_endpoints.at(t);
      int const end_row = task_endpoints.at(t + 1);
      int const initial_iterations = iterations - ((n_part - start_row) * ((n_part - start_row) - 1) / 2);
      int const final_iterations = iterations - ((n_part - end_row) * ((n_part - end_row) - 1) / 2);

      computed_iteration_counts.at(t) = final_iterations - initial_iterations;

      // Actually run the inner loop to ensure it runs the correct number of times.
      for (int i = task_endpoints.at(t); i < task_endpoints.at(t + 1); ++i)
        for (int j = i + 1; j < n_part; ++j)
          measured_iteration_counts.at(t)++;
    }

    int const actual_sum_of_measured_iteration_counts = std::accumulate(
      measured_iteration_counts.begin(), measured_iteration_counts.end(), 0);
    int const expected_sum_of_measured_iteration_counts = iterations;

    assert(expected_sum_of_measured_iteration_counts == actual_sum_of_measured_iteration_counts);

    for (int t = 0; t < n_tasks; ++t)
      assert(measured_iteration_counts.at(t) == computed_iteration_counts.at(t));
  }

  // Straightforward implementation of Ádám Pintér and Sándor Szénási's algorithm 
  // here:
  //   https://doi.org/10.31449/inf.v45i4.3130
  static std::vector<int> compute_task_endpoints(int n_tasks, int n_iterations)
  {
    std::vector<int> result(n_tasks + 1);
    result[0] = 0;

    double constexpr a = -1;
    double constexpr b = 2 * n_part + 1;

    double k_previous = 0.0;
    for (int task = 1; task <= n_tasks; ++task)
    {
      double const c = ((n_tasks * (n_part - k_previous) * (n_part - k_previous + 1) -
        (n_part * (n_part + 1))) / n_tasks) - squared(n_part) - n_part;
      double const k = (-b + sqrt(squared(b) - 4 * a * c)) / (2.0 * a);
      double const i = floor(k);

      result[task] = static_cast<int>(i);
      k_previous = k;
    }

    sanity_check_task_endpoints(result, n_iterations);

    return result;
  }

  static auto const n_tasks = omp_get_max_threads();
  static auto const task_endpoints = compute_task_endpoints(n_tasks, ((n_part - 1) * n_part) / 2);

  static void compute_force_naive(std::vector<coordinate>& forces, const std::vector<coordinate>& positions)
  {
    static std::vector<std::vector<coordinate>> forces_for_all_tasks(
      omp_get_max_threads(), std::vector<coordinate>(n_part, coordinate{ .x = 0.0, .y = 0.0 }));
    for (auto& forces_for_specific_task : forces_for_all_tasks)
      std::fill(forces_for_specific_task.begin(), forces_for_specific_task.end(), coordinate{ .x = 0.0, .y = 0.0 });

    #pragma omp parallel
    {
      int const task_num = omp_get_thread_num();
      int const task_begin = task_endpoints[task_num];
      int const task_end = task_endpoints[task_num + 1];

      for (int i = task_begin; i < task_end; ++i) {
        for (int j = i + 1; j < n_part; ++j) {

          double dx = positions[i].x - positions[j].x;
          dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

          double dy = positions[i].y - positions[j].y;
          dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

          double dist2 = dx * dx + dy * dy;
          double expo = exp(-dist2 / (2 * sigma_2));
          double force = pref * expo;

          std::vector<coordinate>& forces_for_this_task = forces_for_all_tasks[task_num];
          forces_for_this_task[i].x += force * dx;
          forces_for_this_task[i].y += force * dy;
          forces_for_this_task[j].x += -force * dx;
          forces_for_this_task[j].y += -force * dy;
        }
      }
    };

    std::fill(forces.begin(), forces.end(), coordinate{ .x = 0.0, .y = 0.0 });
    for (auto const& forces_for_task : forces_for_all_tasks)
      for (int i = 0; i < n_part; ++i)
      {
        forces[i].x += forces_for_task[i].x;
        forces[i].y += forces_for_task[i].y;
      }
  }
}

If I recall correctly, .at() is slower than [], so if .at() isn't necessary, use []. It's minor, but speed is speed. It could have been an old implementation issue too, but maybe not: .at() validates the index on every access, while [] doesn't.
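
A quick illustration of the difference (illustrative only):

#include <iostream>
#include <stdexcept>
#include <vector>

int main() {
    std::vector<int> v(3);
    // v[5] = 1;  // operator[] does no bounds check: undefined behavior,
    //            // but also no per-access cost.
    try {
        v.at(5) = 1;  // .at() validates the index...
    } catch (const std::out_of_range& e) {
        std::cout << e.what() << '\n';  // ...and throws on failure.
    }
}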


