Vector of struct vs. two 1D vectors

Some suggestions:

1. Avoid making local variables static. The compiler has to ensure they're initialized exactly once in a thread-safe way, which costs a guard check on every call.

2. Avoid passing tiny objects like double by reference. As a rule of thumb, pass-by-value is preferable to pass-by-reference if copying the object costs 3 * sizeof(void*) bytes or less; on a 64-bit machine a pointer is 8 bytes wide. See https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#Rf-in

3. Since there's only one set of constants, there's no point in passing them in as function parameters. Make them globally accessible and refer to them directly.

4. Give file-local functions internal linkage by marking them static. This may or may not help the compiler optimize. (A sketch illustrating all four points follows this list.)
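
A minimal sketch of the four points (the function and constants here are hypothetical, not from the reviewed program):

// Before: static local, tiny types passed by reference, a constant passed
// as a parameter, external linkage.
double scale(const double& x, const double& factor) {
    static const double offset{ 1.0 };  // thread-safe init guard runs on every call
    return factor * x + offset;
}

// After: constants at namespace scope, doubles passed by value,
// static for internal linkage.
const double factor{ 2.0 };
const double offset{ 1.0 };

static double scale(double x) {
    return factor * x + offset;
}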

Applying these suggestions to your program decreases the execution time from 154 seconds to 114 seconds:

#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <fstream>
#include <chrono>

// Hyperparameters.
const double L{ 10 };
const int N_iter{ 20000 };
const int n_meas{ 10 };
const int n_part{ 1000 };
const double beta{ 300 };
const double gamma{ 0.05 };
const double kappa{ 1. / n_part };
const double sigma_2{ 1 };
const double stepsize{ 0.1 };
const double stepsize_half{ stepsize / 2.0 };

const int randomseed{ 1 };

struct coordinate {
  double x{ 0 };
  double y{ 0 };
};


static double compute_sq_distances(coordinate pos1,  coordinate pos2,
  double& dx, double& dy) {

  // box volume = [-L,L]^n

  const double two_L = 2 * L;
  dx = pos1.x - pos2.x;
  dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

  dy = pos1.y - pos2.y;
  dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

  return dx * dx + dy * dy;

}

static double force_term(double& dist2) {

  double pref{ -kappa / (2 * sigma_2) };
  double expo;
  expo = exp(-dist2 / (2 * sigma_2));

  return pref * expo;

}


static void compute_force(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) {

  double dx, dy;
  double dist2;
  int n_part = forces.size();
  double force;

  for (int i = 0; i < n_part; ++i) {
    forces[i].x = forces[i].y = 0;
  }

  for (int i = 0; i < n_part; ++i) {
    for (int j = i + 1; j < n_part; ++j) {
      dist2 = compute_sq_distances(positions[i], positions[j], dx, dy);
      force = force_term(dist2);
      forces[i].x += force * dx;
      forces[i].y += force * dy;
      forces[j].x += -force * dx;
      forces[j].y += -force * dy;
    }
  }

  return;
}


static void B_step(const std::vector <coordinate>& positions, std::vector <coordinate>& velocities, std::vector <coordinate>& forces) {

  const int n_part = positions.size();

  // Update velocities.
  for (int i = 0; i < n_part; ++i) {
    velocities[i].x += stepsize_half * forces[i].x;
    velocities[i].y += stepsize_half * forces[i].y;
  }

  return;

}

static void A_step(std::vector <coordinate>& positions, const std::vector <coordinate>& velocities) {

  int n_part = positions.size();
  for (int i = 0; i < n_part; ++i) {
    positions[i].x += stepsize_half * velocities[i].x;
    positions[i].y += stepsize_half * velocities[i].y;
  }

  return;
}

static void O_step(std::vector <coordinate>& velocities, std::mt19937& RNG) {

  std::normal_distribution<> normal{ 0,1 };
  const int n_part = velocities.size();
  const double a = exp(-gamma * stepsize);
  const double pref = sqrt(1 / beta * (1 - a * a));

  for (int i = 0; i < n_part; ++i) {
    velocities[i].x = a * velocities[i].x + pref * normal(RNG);
    velocities[i].y = a * velocities[i].y + pref * normal(RNG);
  }

  return;

}

static void apply_periodic_boundaries(std::vector <coordinate>& positions) {

  const int n_part = positions.size();
  const double two_L = 2 * L;
  double x, y;

  for (int i = 0; i < n_part; ++i) {
    x = positions[i].x;
    positions[i].x = x > L ? x - two_L : (x < -L ? x + two_L : x);
    y = positions[i].y;
    positions[i].y = y > L ? y - two_L : (y < -L ? y + two_L : y);
  }

  return;

}



int main(int argc, char* argv[]) {
  // Prepare simulation.
  // Prepare RNG.
  std::mt19937 twister;

  std::seed_seq seq{ 1,20,3200,403,5 * randomseed + 1,12000,73667,9474 + randomseed,19151 - randomseed };
  std::vector < std::uint32_t > seeds(1);
  seq.generate(seeds.begin(), seeds.end());
  twister.seed(seeds.at(0));

  // Set positions.
  std::uniform_real_distribution<double> box_uniform(-L, L);
  std::vector <coordinate> positions(n_part);
  for (int i = 0; i < n_part; ++i) {
    positions[i].x = box_uniform(twister);
    positions[i].y = box_uniform(twister);
  }

  // Set forces.
  std::vector <coordinate> forces(n_part);
  compute_force(forces, positions);

  // Set velocities.
  std::vector <coordinate> velocities(n_part);

  // Main loop.

  auto t1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i <= N_iter; ++i) {

    B_step(positions, velocities, forces);
    A_step(positions, velocities);
    apply_periodic_boundaries(positions);
    O_step(velocities, twister);
    A_step(positions, velocities);
    apply_periodic_boundaries(positions);
    compute_force(forces, positions);
    B_step(positions, velocities, forces);

    if (i % 1000 == 0) std::cout << "Iteration " << i << " done!\n";

  }

  auto t2 = std::chrono::high_resolution_clock::now();
  auto ms_int = std::chrono::duration_cast <std::chrono::seconds> (t2 - t1);
  std::cout << "Execution took " << ms_int.count() << " seconds!\n";

  return 0;

}
The new one runs a lot more iterations than before. I got 95 seconds on the above version.

Cut 2 seconds off consistently with a minor change:

static double force_term(const double& dist2)
{
    return (-kappa / (2 * sigma_2)) * exp(-dist2 / (2 * sigma_2));
}
Some calculations can be moved to global scope since they are const. Consider:

#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <chrono>
#include <algorithm>

// Hyperparameters.
constexpr double L { 10 };
constexpr unsigned N_iter { 10'000 };
constexpr unsigned n_part { 1'000 };
constexpr double beta { 300.0 };
constexpr double gamma { 0.05 };
constexpr double kappa { 1. / n_part };
constexpr double sigma_2 { 1 };
constexpr double stepsize { 0.1 };
constexpr double stepsize_half { stepsize / 2.0 };
constexpr double two_L { 2 * L };
constexpr double sigma22 { 2 * sigma_2 };
constexpr double pref { -kappa / sigma22 };
constexpr int randomseed { 1 };

const double a { std::exp(-gamma * stepsize) };
const double pref_O { std::sqrt(1 / beta * (1 - a * a)) };

std::normal_distribution<> normal{ 0, 1 };

struct coordinate {
    double x { };
    double y { };
};

static inline double compute_sq_distances(const coordinate& pos1, const coordinate& pos2, double& dx, double& dy) {
    dx = pos1.x - pos2.x;
    dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

    dy = pos1.y - pos2.y;
    dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

    return dx * dx + dy * dy;
}

static inline double force_term(double dist2) {
    return pref * std::exp(-dist2 / sigma22);
}

static void compute_force(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) {
    double dx, dy;

    std::ranges::fill(forces, coordinate {});

    for (unsigned i {}; i < n_part; ++i)
        for (unsigned j { i + 1 }; j < n_part; ++j) {
            const auto force { force_term(compute_sq_distances(positions[i], positions[j], dx, dy)) };

            forces[i].x += force * dx;
            forces[i].y += force * dy;
            forces[j].x += -force * dx;
            forces[j].y += -force * dy;
        }
}

static void B_step(std::vector <coordinate>& velocities, std::vector <coordinate>& forces) {
    for (unsigned i {}; i < n_part; ++i) {
        velocities[i].x += stepsize_half * forces[i].x;
        velocities[i].y += stepsize_half * forces[i].y;
    }
}

static void A_step(std::vector <coordinate>& positions, const std::vector <coordinate>& velocities) {
    for (unsigned i {}; i < n_part; ++i) {
        positions[i].x += stepsize_half * velocities[i].x;
        positions[i].y += stepsize_half * velocities[i].y;
    }
}

static void O_step(std::vector <coordinate>& velocities, std::mt19937& RNG) {
    for (unsigned i {}; i < n_part; ++i) {
        velocities[i].x = a * velocities[i].x + pref_O * normal(RNG);
        velocities[i].y = a * velocities[i].y + pref_O * normal(RNG);
    }
}

static void apply_periodic_boundaries(std::vector <coordinate>& positions) {
    double x, y;

    for (unsigned i {}; i < n_part; ++i) {
        x = positions[i].x;
        positions[i].x = x > L ? x - two_L : (x < -L ? x + two_L : x);
        y = positions[i].y;
        positions[i].y = y > L ? y - two_L : (y < -L ? y + two_L : y);
    }
}

int main() {
  // Prepare simulation.
  // Prepare RNG.
    const std::seed_seq seq{ 1, 20, 3200, 403, 5 * randomseed + 1, 12000, 73667, 9474 + randomseed, 19151 - randomseed };

    std::vector<std::uint32_t> seeds(1);
    std::mt19937 twister;

    seq.generate(seeds.begin(), seeds.end());
    twister.seed(seeds.at(0));

    // Set positions.
    std::uniform_real_distribution<double> box_uniform(-L, L);
    std::vector<coordinate> positions(n_part);
    std::vector<coordinate> forces(n_part);
    std::vector<coordinate> velocities(n_part);

    std::ranges::generate(positions, [&] { return coordinate{ box_uniform(twister), box_uniform(twister) }; });  // generate, not fill: each particle needs its own random draw
    compute_force(forces, positions);

    const auto t1 { std::chrono::high_resolution_clock::now() };

    // Main loop.
    for (unsigned i {}; i <= N_iter; ++i) {
        B_step(velocities, forces);
        A_step(positions, velocities);
        apply_periodic_boundaries(positions);
        O_step(velocities, twister);
        A_step(positions, velocities);
        apply_periodic_boundaries(positions);
        compute_force(forces, positions);
        B_step(velocities, forces);

        if (i > 0 && i % 1000 == 0)
            std::cout << "Iteration " << i << " done!\n";
    }

    const auto t2 { std::chrono::high_resolution_clock::now() };
    const auto ms_int { std::chrono::duration_cast <std::chrono::seconds> (t2 - t1) };

    std::cout << "Execution took " << ms_int.count() << " seconds!\n";
}


which on my laptop now takes 86 seconds (as opposed to 98 and 112 seconds).
That took 37 seconds on mine, but it's not from the latest version, which increased the iteration end point to 20,000.

I struggle with that one; the compiler is supposed to recognize constants even when they're not marked as such, but it only does so sometimes. I can't find any pattern, so I guess when it matters, do as you did and force the issue.

As a side note, "gamma" is triggering my compiler as already defined.

Not getting much out of it now... 2/3 of the remaining time is in force_term().

This helps mine; does it help yours?

coordinate tmp;
for (unsigned i {}; i < n_part; ++i)
{
    tmp = positions[i];
    for (unsigned j { i + 1 }; j < n_part; ++j) {
        const auto force { force_term(compute_sq_distances(tmp, positions[j], dx, dy)) };
Yes. And this helps even more. It brings my time down to 81 seconds.

static void compute_force(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) {
    double dx, dy;

    std::ranges::fill(forces, coordinate {});

    for (unsigned i {}; i < n_part; ++i) {
        const auto posi { positions[i] };
        auto& frci { forces[i] };

        for (unsigned j { i + 1 }; j < n_part; ++j) {
            const auto force { force_term(compute_sq_distances(posi, positions[j], dx, dy)) };
            const auto frcdx { force * dx };
            const auto frcdy { force * dy };

            frci.x += frcdx;
            frci.y += frcdy;
            forces[j].x -= frcdx;
            forces[j].y -= frcdy;
        }
    }
}

as a side note "gamma" is triggering my compiler as already defined.


Not with VS. I get a clean compile with L4 (highest) warnings. There's no gamma defined as part of standard C++. There's lgamma and tgamma (both defined in cmath) but no gamma. Has the compiler you're using defined a gamma() function in cmath?
Yes, it's in the Cygwin g++ <cmath>:

extern double gamma (double);

$ g++ -o gamma_test gamma_test.cpp 
$ ./gamma_test 1.0
gamma( 1 ) = 1

But the output on Ubuntu gcc v4.6.3 is 0!
Well, 1 == 0! (zero factorial), so it seems consistent to me!

Glibc has a gamma() function that is equivalent to lgamma(3) and computes the natural logarithm of the Gamma function; that's why gamma(1) = ln Γ(1) = ln 1 = 0 there.
How on earth did somebody think that "gamma" should mean "log of gamma"?

Anyway, yeah it's dumb historical compiler stuff. Wrapping the constants in their own namespace could make it more portable.
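
For example, something along these lines (the namespace name is arbitrary):

namespace params {
    constexpr double gamma{ 0.05 };  // no longer collides with glibc's ::gamma
    constexpr double beta{ 300.0 };
}

// Referred to as params::gamma, params::beta, and so on.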

Yes, that looks like jargon that crept out of its domain. The gamma I expect is essentially just the factorial (Γ(n) = (n-1)!), which is a dumb name too... just provide a factorial under that name; anyone needing 'gamma' knows what to do, and everyone else needing a factorial also knows what to do. And naming something after a Greek letter is just asking for collision troubles.
Thank you all for your many answers!
I came back to this now and would like to parallelize it using OpenMP.

mbozzi earlier proposed to turn the nested loop in the force routine
for (int i = 0; i < n_part; ++i)
  for (int j = i + 1; j < n_part; ++j)
  {
    // ...
  }

into a single loop via
for (int x = 0; x < (n_part - 1) * n_part / 2; ++x)
{
  int i = x / n_part;
  int j = x % n_part;

  if (i > j)
  {
    i = n_part - i - 2;
    j = n_part - j - 1;
  }

  // ...
}

I think there is an error here, as it does not give the correct (i, j) pairs. It also gives pairs of the form (i, i), and some pairs are missing entirely.
Example:
#include <iostream>
#include <cmath>

int main(){

    int n_part = 5;

    for (int x = 0; x < (n_part - 1) * n_part / 2; ++x)
    {
         int i = x / n_part;
         int j = x % n_part;

         if (i > j)
         {
              i = n_part - i - 2;
              j = n_part - j - 1;
         }
         std::cout<<"i="<<i<<" j="<<j<<std::endl;

    }

}

yields
i=0 j=0
i=0 j=1
i=0 j=2
i=0 j=3
i=0 j=4
i=2 j=4
i=1 j=1
i=1 j=2
i=1 j=3
i=1 j=4


I found a way to obtain the proper pairs, using
for (int k = 0; k < n_part * (n_part - 1) / 2; ++k) {
    int j = floor((1 + sqrt(1 + 8 * k)) / 2);
    int i = k - j * (j - 1) / 2;

    std::cout << "i=" << i << " j=" << j << std::endl;
}
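
For reference, a short derivation of the inversion (not spelled out above): the loop enumerates the pairs with 0 <= i < j in the order (0,1), (0,2), (1,2), (0,3), ..., so k = j*(j-1)/2 + i. For a fixed j, k runs from j*(j-1)/2 up to j*(j+1)/2 - 1, and solving j*(j-1)/2 <= k for the largest integer j gives j = floor((1 + sqrt(1 + 8*k)) / 2), after which i = k - j*(j-1)/2.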

but this makes use of floating-point arithmetic, and if I use it in the force routine proposed by mbozzi, the code becomes much slower.
struct intermediate_result { double dx, dy, force; };
const long unsigned int k_max {n_part * (n_part - 1) / 2};
std::vector<intermediate_result> intermediate_results(k_max);
static void compute_force_naive2(std::vector<coordinate>& forces, const std::vector<coordinate>& positions) 
{
  constexpr double two_L  {2 * L};
  constexpr double two_sigma2 {2*sigma_2};
  constexpr double pref  {-kappa / two_sigma2}; 

  for (int i = 0; i < n_part; ++i) {
    forces[i].x = 0;
    forces[i].y = 0;
  }


  #pragma omp parallel for num_threads(1)
  for (int k = 0; k < k_max; ++k)
  {
    int j = floor((1 + sqrt(1 + 8*k))/2);
    int i = k - j*(j - 1)/2;

    double dx = positions[i].x - positions[j].x;
    if (dx > +L) dx -= two_L;
    else if (dx < -L) dx += two_L;

    double dy = positions[i].y - positions[j].y;
    if (dy > +L) dy -= two_L;
    else if (dy < -L) dy += two_L;

    intermediate_results[k].force = pref * exp(-(dx * dx + dy * dy) / two_sigma2);
    intermediate_results[k].dx = dx;
    intermediate_results[k].dy = dy;

  }
  
   for (int k = 0; k < k_max; ++k)
   {
     int j = floor((1 + sqrt(1 + 8*k))/2);
     int i = k - j*(j - 1)/2;

     forces[i].x += +intermediate_results[k].force * intermediate_results[k].dx;
     forces[i].y += +intermediate_results[k].force * intermediate_results[k].dy;

     forces[j].x += -intermediate_results[k].force * intermediate_results[k].dx;
     forces[j].y += -intermediate_results[k].force * intermediate_results[k].dy;
   }
}
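
One way to keep the fused single loop while avoiding the per-iteration sqrt would be to precompute the (i, j) pairs once, since they never change between calls. A sketch (untested here; the pair_ij struct is made up for illustration, and the table costs about 4 MB for n_part = 1000):

struct pair_ij { int i, j; };

// Built once at startup; thereafter the k-loop just reads pairs[k].i and pairs[k].j.
static const std::vector<pair_ij> pairs = [] {
    std::vector<pair_ij> p;
    p.reserve(n_part * (n_part - 1) / 2);
    for (int i = 0; i < (int)n_part; ++i)
        for (int j = i + 1; j < (int)n_part; ++j)
            p.push_back({ i, j });
    return p;
}();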


It comes down to parallelizing the following nested loop efficiently. The tricky part is that each iteration writes to both forces[i] and forces[j], so a naive parallel-for over i would have different threads racing on the same forces[j] entries:

for (int i = 0; i < N; ++i) {
    for (int j = i + 1; j < N; ++j) {

        auto F = compute_force(i, j);  // schematic: the pair force on particle i from j

        forces[i] += F;
        forces[j] -= F;

    }
}


I thought it would be a standard task in computational physics, but Google did not yield an obvious solution, nor did ChatGPT.
The > should be >=:
https://coliru.stacked-crooked.com/a/6243f79cac2b7fb3

#include <iostream>

int main() {
    int n_part = 5;

    for (int x = 0; x < (n_part - 1) * n_part / 2; ++x)
    {
         int i = x / n_part;
         int j = x % n_part;

         if (i >= j)
         {
              i = n_part - i - 2;
              j = n_part - j - 1;
         }
         std::cout<<"i="<<i<<" j="<<j<<std::endl;

    }
}
It appears that OpenMP can do the heavy lifting internally:

static std::vector<std::vector<coordinate>> forces_for_all_tasks(
  omp_get_max_threads(), std::vector<coordinate>(n_part, coordinate{ .x = 0.0, .y = 0.0 }));
static void compute_force_naive4(std::vector<coordinate>& forces, const std::vector<coordinate>& positions)
{
  for (auto& forces_for_specific_task : forces_for_all_tasks)
    std::fill(forces_for_specific_task.begin(), forces_for_specific_task.end(), coordinate{ .x = 0.0, .y = 0.0 });

#pragma omp parallel for
  for (int i = 0; i < n_part; ++i) {
    for (int j = i + 1; j < n_part; ++j) {

      double dx = positions[i].x - positions[j].x;
      dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

      double dy = positions[i].y - positions[j].y;
      dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

      double dist2 = dx * dx + dy * dy;

      double expo = exp(-dist2 / (2 * sigma_2));

      double force = pref * expo;

      std::vector<coordinate>& forces_for_this_task = forces_for_all_tasks[omp_get_thread_num()];
      forces_for_this_task[i].x += force * dx;
      forces_for_this_task[i].y += force * dy;
      forces_for_this_task[j].x += -force * dx;
      forces_for_this_task[j].y += -force * dy;
    }
  }
  // Sum all of the task-specific forces into the output parameter.
  std::fill(forces.begin(), forces.end(), coordinate{ .x = 0.0, .y = 0.0 });
  for (auto const& forces_for_specific_task : forces_for_all_tasks)
    for (int i = 0; i < n_part; ++i)
    {
      forces[i].x += forces_for_specific_task[i].x;
      forces[i].y += forces_for_specific_task[i].y;
    }
}

Thanks mbozzi.

In your version 4, isn't the workload divided unevenly across the threads without the loop fusion? I assume the outer loop iterations will just be distributed equally across the threads?
In your version 4, isn't the workload divided unevenly across the threads without the loop fusion? I assume the outer loop iterations will just be distributed equally across the threads?

You're right and v4 is 30% slower than v3.

Here's a test program showing that some threads get assigned more work:
#include <iostream>
#include <omp.h>

int main() {
  int n_part = 10;
#pragma omp parallel for
  for (int i = 0; i < n_part; ++i)
  {
    for (int j = i + 1; j < n_part; ++j)
    {
      int n = omp_get_thread_num();
#pragma omp critical
      {
        std::cout << n << '\n';
      }
    }
  }
}


Try V4 but say #pragma omp parallel for schedule(dynamic) instead:

https://www.inf.ufsc.br/~bosco.sobral/ensino/ine5645/OpenMP_Dynamic_Scheduling.pdf
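
With static scheduling, each thread gets a fixed contiguous block of i values, but earlier rows have longer inner loops, so the first threads end up with most of the work. schedule(dynamic) hands iterations to threads as they finish, which balances the triangular workload. The change is one clause:

#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < n_part; ++i) {
    for (int j = i + 1; j < n_part; ++j) {
      // ... pair (i, j) work exactly as in V4 ...
    }
  }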
Thanks mbozzi, V4 with dynamic scheduling seems to be the king!

Here are some runtime measurements (in seconds; I changed the iteration and particle counts):

threads   V4 static   V4 dynamic   V3
1         39          44           58
2         30          22           150 !
3         22          15           91
4         17          11           21


It scales perfectly lol
Great! Regarding V3, those giant numbers must indicate a problem. It seems likely that V3 is wrong, but I can't find the problem in my code.

You may be able to improve a little more on v4 dynamic by splitting up the tasks just once following the approach discussed here:
https://doi.org/10.31449/inf.v45i4.3130

It seems to be better or worse depending on the compiler, but for me with the latest MSVC it was 1.04x (4%) faster than schedule(dynamic). That paper more-or-less answers the question regarding how to efficiently parallelize your triangular loop, though it's an approximation only and my implementation of it sucks:
namespace v7
{
  // Small helper used below; defined here so the snippet is self-contained.
  static constexpr double squared(double x) { return x * x; }

  static void sanity_check_task_endpoints(std::vector<int> const& task_endpoints, int iterations)
  {
    // Holds the number of inner loop iterations completed by each task.
    int const n_tasks = static_cast<int>(task_endpoints.size()) - 1;

    std::vector<int> measured_iteration_counts(n_tasks);
    std::vector<int> computed_iteration_counts(n_tasks);

    for (int t = 0; t < n_tasks; ++t)
    {
      int const start_row = task_endpoints.at(t);
      int const end_row = task_endpoints.at(t + 1);
      int const initial_iterations = iterations - ((n_part - start_row) * ((n_part - start_row) - 1) / 2);
      int const final_iterations = iterations - ((n_part - end_row) * ((n_part - end_row) - 1) / 2);

      computed_iteration_counts.at(t) = final_iterations - initial_iterations;

      // Actually run the inner loop to ensure it runs the correct number of times.
      for (int i = task_endpoints.at(t); i < task_endpoints.at(t + 1); ++i)
        for (int j = i + 1; j < n_part; ++j)
          measured_iteration_counts.at(t)++;
    }

    int const actual_sum_of_measured_iteration_counts = std::accumulate(
      measured_iteration_counts.begin(), measured_iteration_counts.end(), 0);
    int const expected_sum_of_measured_iteration_counts = iterations;

    assert(expected_sum_of_measured_iteration_counts == actual_sum_of_measured_iteration_counts);

    for (int t = 0; t < n_tasks; ++t)
      assert(measured_iteration_counts.at(t) == computed_iteration_counts.at(t));
  }

  // Straightforward implementation of Ádám Pintér and Sándor Szénási's algorithm 
  // here:
  //   https://doi.org/10.31449/inf.v45i4.3130
  static std::vector<int> compute_task_endpoints(int n_tasks, int n_iterations)
  {
    std::vector<int> result(n_tasks + 1);
    result[0] = 0;

    double constexpr a = -1;
    double constexpr b = 2 * n_part + 1;

    double k_previous = 0.0;
    for (int task = 1; task <= n_tasks; ++task)
    {
      double const c = ((n_tasks * (n_part - k_previous) * (n_part - k_previous + 1) -
        (n_part * (n_part + 1))) / n_tasks) - squared(n_part) - n_part;
      double const k = (-b + sqrt(squared(b) - 4 * a * c)) / (2.0 * a);
      double const i = floor(k);

      result[task] = static_cast<int>(i);
      k_previous = k;
    }

    sanity_check_task_endpoints(result, n_iterations);

    return result;
  }

  static auto const n_tasks = omp_get_max_threads();
  static auto const task_endpoints = compute_task_endpoints(n_tasks, ((n_part - 1) * n_part) / 2);

  static void compute_force_naive(std::vector<coordinate>& forces, const std::vector<coordinate>& positions)
  {
    static std::vector<std::vector<coordinate>> forces_for_all_tasks(
      omp_get_max_threads(), std::vector<coordinate>(n_part, coordinate{ .x = 0.0, .y = 0.0 }));
    for (auto& forces_for_specific_task : forces_for_all_tasks)
      std::fill(forces_for_specific_task.begin(), forces_for_specific_task.end(), coordinate{ .x = 0.0, .y = 0.0 });

    #pragma omp parallel
    {
      int const task_num = omp_get_thread_num();
      int const task_begin = task_endpoints[task_num];
      int const task_end = task_endpoints[task_num + 1];

      for (int i = task_begin; i < task_end; ++i) {
        for (int j = i + 1; j < n_part; ++j) {

          double dx = positions[i].x - positions[j].x;
          dx = dx > L ? dx - two_L : (dx < -L ? dx + two_L : dx);

          double dy = positions[i].y - positions[j].y;
          dy = dy > L ? dy - two_L : (dy < -L ? dy + two_L : dy);

          double dist2 = dx * dx + dy * dy;
          double expo = exp(-dist2 / (2 * sigma_2));
          double force = pref * expo;

          std::vector<coordinate>& forces_for_this_task = forces_for_all_tasks[task_num];
          forces_for_this_task[i].x += force * dx;
          forces_for_this_task[i].y += force * dy;
          forces_for_this_task[j].x += -force * dx;
          forces_for_this_task[j].y += -force * dy;
        }
      }
    };

    std::fill(forces.begin(), forces.end(), coordinate{ .x = 0.0, .y = 0.0 });
    for (auto const& forces_for_task : forces_for_all_tasks)
      for (int i = 0; i < n_part; ++i)
      {
        forces[i].x += forces_for_task[i].x;
        forces[i].y += forces_for_task[i].y;
      }
  }
}

If I recall correctly, .at() is slower than [], so if .at() isn't necessary, use []. It's minor, but speed is speed. It could have been an old implementation issue too, but maybe not: .at() validates the index on every access, while [] doesn't.
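
A quick illustration of the difference (illustrative only):

#include <iostream>
#include <stdexcept>
#include <vector>

int main() {
    std::vector<int> v(3);
    // v[5] = 1;  // operator[] does no bounds check: undefined behavior,
    //            // but also no per-access cost.
    try {
        v.at(5) = 1;  // .at() validates the index...
    } catch (const std::out_of_range& e) {
        std::cout << e.what() << '\n';  // ...and throws on failure.
    }
}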


