#include <iostream>
#include <vector>
#include <mpi.h>
using namespace std;

double my_dotproduct( const vector<double> &x, const vector<double> &y );

int main( int argc, char *argv[] )
{
    int num_procs, myrank;
    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &num_procs );
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );
    bool root = ( myrank == 0 );

    const int N = 14;
    vector<double> a, b;
    if ( root )                 // Only the root processor knows the contents of a and b ...
    {
        a.resize( N ); b.resize( N );
        for ( int i = 0; i < N; i++ )
        {
            a[i] = i;
            b[i] = 10;
        }
    }

    double result = my_dotproduct( a, b );   // ... but all processors call the routine and receive the result
    if ( root ) cout << "Result is " << result << '\n';

    MPI_Finalize();
}
//=======================================================================
double my_dotproduct( const vector<double> &x, const vector<double> &y )
{
    int myrank, num_procs;
    MPI_Comm_size( MPI_COMM_WORLD, &num_procs );
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );

    int n = x.size();
    MPI_Bcast( &n, 1, MPI_INT, 0, MPI_COMM_WORLD );   // only the root processor knows the correct size

    int low      = n / num_procs;                     // minimum number of data points for each processor
    int leftOver = n % num_procs;                     // extra points assigned to the first few processors
    int local_n  = low + ( myrank < leftOver );

    vector<int> scounts( num_procs );
    vector<int> displs( num_procs, 0 );
    for ( int i = 0; i < num_procs; i++ ) scounts[i] = low + ( i < leftOver );
    for ( int i = 1; i < num_procs; i++ ) displs[i] = displs[i-1] + scounts[i-1];

    // Scatter data to all processors (unequal distribution, so MPI_Scatterv)
    vector<double> local_x( local_n ), local_y( local_n );
    MPI_Scatterv( x.data(), scounts.data(), displs.data(), MPI_DOUBLE,
                  local_x.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD );
    MPI_Scatterv( y.data(), scounts.data(), displs.data(), MPI_DOUBLE,
                  local_y.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD );

    // Calculate the LOCAL dot product on each processor
    double local_sum = 0.0;
    for ( int i = 0; i < local_n; i++ ) local_sum += local_x[i] * local_y[i];

    // Reduction operation; the result is broadcast to all processors using MPI_Allreduce
    double result;
    MPI_Allreduce( &local_sum, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
    return result;
}
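
The block-partitioning arithmetic inside my_dotproduct (low, leftOver, scounts, displs) can be checked without launching MPI at all. Below is a minimal, MPI-free sketch of that same calculation; the process counts it loops over are arbitrary values chosen for illustration.

#include <iostream>
#include <vector>
using namespace std;

int main()
{
    const int n = 14;                     // same N as the driver above
    int proc_counts[] = { 1, 2, 3, 4 };   // hypothetical numbers of processors
    for ( int num_procs : proc_counts )
    {
        int low      = n / num_procs;     // minimum block size per processor
        int leftOver = n % num_procs;     // extra points go to the first few processors
        vector<int> scounts( num_procs ), displs( num_procs, 0 );
        for ( int i = 0; i < num_procs; i++ ) scounts[i] = low + ( i < leftOver );
        for ( int i = 1; i < num_procs; i++ ) displs[i] = displs[i-1] + scounts[i-1];

        cout << "num_procs = " << num_procs << ": counts =";
        for ( int c : scounts ) cout << ' ' << c;
        cout << "  displs =";
        for ( int d : displs ) cout << ' ' << d;
        cout << '\n';
    }
}

For num_procs = 4 this prints counts 4 4 3 3 with displacements 0 4 8 11, which sum back to N = 14. The full MPI program itself should report "Result is 910" on the root processor, since the dot product is 10 * (0 + 1 + ... + 13) = 910.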