Mpi send and receive

I am a beginner in MPI programing and I am trying to do a matrix-vector multiplication (Ax=b). I divided A matrix into two matrix A1 and A2. I need to calculate Ax=b such a way that the process number 1 does the A1 * x multiplication and gives C1 and process number 2 does the A2 * x multiplication and gives C2 and at the end the sum of C1 and C2 will be wrapped up in C. when I run the code through cmd it stops working and I don't know what is the problem. I would be really grateful if you could help me to find out what is the problem in code, here is my code,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#define _CRT_SECURE_NO_WARNINGS
#include<iostream>
#include<fstream>
#include<vector>
#include<iterator>
#include<sstream>
#include<string>
#include<cstdlib>
#include<cmath>
#include<stdio.h>
#include<conio.h>
#include<algorithm>
#include<ctime> 
#include<iomanip>
# include <mpi.h>
# include <time.h>
#include<assert.h>

using namespace std;

void Initialise(int **res, int rows, int cols);
void Multiply(int **res, int **A, int **B, int aRows, int aCols, int bRows, int bCols);
void timestamp();
const static int tag = 1;



//**********************************************************
/*              |3  2  5|
    matrix A=   |4  3  1|
                |2  4  2|

   matrix A is divided into two matrix A1,A2

				|1  2  3|
	matrix A1=  |3  2  1|
				|1  2  0|

				|2  0  2|
	matrix A2=  |1  1  0|
				|1  2  2|


				| 2 |
	vector x=   | 1 |
				| 3 |


				| 23 |
	RHS =       | 14 |
				| 14 |

//*********************************************************
*/

int main(int argc,char **argv)
{

//****************** MPI *********************************
	int  id, p;
	double wtime;


	MPI_Status status;

	int aRows = 3;
	int aCols = 3;
	int bRows = 3;
	int bCols = 1;

	int** A = new int*[aRows];
	for (int i = 0; i < aRows; i++)
	{
		A[i] = new int[aCols];
	}
	int** A1 = new int*[aRows];
	for (int i = 0; i < aRows; i++)
	{
		A1[i] = new int[aCols];
	}
	int** A2 = new int*[aRows];
	for (int i = 0; i < aRows; i++)
	{
		A2[i] = new int[aCols];
	}

	int** B = new int*[bRows];
	for (int i = 0; i < bRows; i++)
	{
		B[i] = new int[bCols];
	}
	//***************************************

	int** C;
	C = new int*[aRows];
	for (int i = 0; i < aRows; i++)
	{
		C[i] = new int[bCols];
	}
	//************************************
	int** C1;
	C1 = new int*[aRows];
	for (int i = 0; i < aRows; i++)
	{
		C1[i] = new int[bCols];
	}
	//************************************
	int** C2;
	C2 = new int*[aRows];
	for (int i = 0; i < aRows; i++)
	{
		C2[i] = new int[bCols];
	}



// p -> no. of processes
// id -> process id

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &id);

	MPI_Comm_size(MPI_COMM_WORLD, &p);
	cout << p << endl;
	char processor_name[MPI_MAX_PROCESSOR_NAME];
	int name_len;
	MPI_Get_processor_name(processor_name, &name_len);


	

	if (id == 0)// master
	{
		wtime = MPI_Wtime();



		//***************************************
		A[0][0] = 3;
		A[0][1] = 2;
		A[0][2] = 5;
		A[1][0] = 4;
		A[1][1] = 3;
		A[1][2] = 1;
		A[2][0] = 2;
		A[2][1] = 4;
		A[2][2] = 2;
		B[0][0] = 2;
		B[1][0] = 1;
		B[2][0] = 3;
		//**************************************
		A1[0][0] = 1;
		A1[0][1] = 2;
		A1[0][2] = 3;
		A1[1][0] = 3;
		A1[1][1] = 2;
		A1[1][2] = 1;
		A1[2][0] = 1;
		A1[2][1] = 2;
		A1[2][2] = 0;
		//**************************************
		A2[0][0] = 2;
		A2[0][1] = 0;
		A2[0][2] = 2;
		A2[1][0] = 1;
		A2[1][1] = 1;
		A2[1][2] = 0;
		A2[2][0] = 1;
		A2[2][1] = 2;
		A2[2][2] = 2;
		//*************************************
		B[0][0] = 2;
		B[1][0] = 1;
		B[2][0] = 3;

		//***********************************
		Multiply(C, A, B, aRows, aCols, bRows, bCols);

		for (int i = 0; i < aRows; i++)
		{
			for (int j = 0; j < bCols; j++)
			{
				std::cout << C[i][j] << ' ';
			}
			std::cout << '\n';
		}


		MPI_Send(&A1, aRows*aCols, MPI_INT, 1, 9, MPI_COMM_WORLD);
		MPI_Send(&B , bRows*bCols, MPI_INT, 1, 10, MPI_COMM_WORLD);

		MPI_Send(&A2, aRows*aCols, MPI_INT, 2, 11, MPI_COMM_WORLD);
		MPI_Send(&B , bRows*bCols, MPI_INT, 2, 12, MPI_COMM_WORLD);

	}

	for (id = 1; id < 3; id++)
	{
		if (id == 1)
		{


			MPI_Recv(&A1, aRows*aCols, MPI_INT, 0, 9, MPI_COMM_WORLD, &status);
			printf("receive data:%d", A1);
			MPI_Recv(&B, aRows*aCols, MPI_INT, 0, 10, MPI_COMM_WORLD, &status);
			printf("receive data:%d", B);

			Multiply(C1, A1, B, aRows, aCols, bRows, bCols);

			for (int i = 0; i < aRows; i++)
			{
				for (int j = 0; j < bCols; j++)
				{
					cout << C1[i][j] << endl;

				}
			}
		}

		//***************** receiver buffer *****************************
		else if (id == 2)
		{

			MPI_Recv(&A2, aRows*aCols, MPI_INT, 0, 11, MPI_COMM_WORLD, &status);
			printf("receive data:%d", A2);
			MPI_Recv(&B, aRows*aCols, MPI_INT, 0, 12, MPI_COMM_WORLD, &status);
			printf("receive data:%d", B);

			//**************************************************************
			MPI_Status status;
			Multiply(C2, A2, B, aRows, aCols, bRows, bCols);
			for (int i = 0; i < aRows; i++)
			{
				for (int j = 0; j < bCols; j++)
				{
					cout << C2[i][j] << endl;
				}

			}
		}
	}

	MPI_Finalize();
	//return 0;

	for (int i = 0; i < aRows; i++)
	{
		for (int j = 0; j < bCols; j++)
		{
			C[i][j] = C1[i][j] + C2[i][j];
		}

	}

	for (int i = 0; i < aRows; i++)
	{
		for (int j = 0; j < bCols; j++)
		{
			cout << C2[i][j] << endl;
		}

	}

}

void Multiply(int **res, int **A, int **B, int aRows, int aCols, int bRows, int bCols)
{
	if (aCols != bRows)
		return;

	for (int i = 0; i < aRows; i++)
	{
		for (int j = 0; j < bCols; j++)
		{
			res[i][j] = 0;
			for (int k = 0; k < aCols; k++)
			{
				res[i][j] += A[i][k] * B[k][j];
			}
		}
	}
}

void Initialise(int **res, int rows, int cols)
{
	for (int i = 0; i < rows; i++)
	{
		for (int j = 0; j < cols; j++)
		{
			res[i][j] = 0;
		}
	}
}


void timestamp()
{
# define TIME_SIZE 40

	static char time_buffer[TIME_SIZE];
	const struct std::tm *tm_ptr;
	std::time_t now;

	now = std::time(NULL);
	tm_ptr = std::localtime(&now);

	std::strftime(time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm_ptr);

	std::cout << time_buffer << "\n";

	return;
# undef TIME_SIZE
}

Last edited on
(1) Put code in code tags. Otherwise it is unreadable.

(2) Start with something simple - like sending a small 1-d array from one processor to another.

Your code as it stands is unsalvageable.
What does unslavageable means?
It means it's so broken that it's preferable to rewrite it from scratch rather than attempting to fix it.

By the way, dictionaries exist.
Last edited on
If you want a cheap and nasty version for TWO processors you can try this.

For convenience, I've flattened the arrays to 1-d, and used vectors rather than new/delete. (You can get at the data buffer with the .data() member function).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#include <iostream>
#include <vector>
#include <numeric>
#include "mpi.h"
using namespace std;

int main( int argc, char* argv[] )
{
   int rank, nproc;
   MPI_Status stat;
   int nums[3];
   int rows, cols, r0, r1;

   // Initialise MPI
   MPI_Init( &argc, &argv );
   MPI_Comm_size( MPI_COMM_WORLD, &nproc );
   MPI_Comm_rank( MPI_COMM_WORLD, &rank  );

   int tag = 1;    // Not crucial, but useful to increment for debugging a crash

   if ( rank == 0 )                                   // Root processor
   {
      rows = 4;
      vector<double> A =                              // "Flattened" array (4 x 5 matrix)
                   { 1, 2, 3, 4, 5,
                     6, 7, 8, 9, 10,
                     11, 12, 13, 14, 15,
                     16, 17, 18, 19, 20 };
      vector<double> B = { 10, 20, 30, 40, 50 };      // RHS (5-element vector)
      vector<double> result( rows );

      // Do half the rows on the root and half on the other processor
      r0 = rows / 2;   r1 = rows - r0;   cols = B.size();
      nums[0] = r0;   nums[1] = r1;   nums[2] = cols;

      // Send data to other processor
      MPI_Send( nums, 3, MPI_INT, 1, tag++, MPI_COMM_WORLD );
      MPI_Send( A.data() + r0 * cols, r1 * cols, MPI_DOUBLE, 1, tag++, MPI_COMM_WORLD );
      MPI_Send( B.data(), cols, MPI_DOUBLE, 1, tag++, MPI_COMM_WORLD );

      // Do multiplies for the first r0 rows
      for ( int i = 0; i < r0; i++ ) result[i] = inner_product( A.begin() + i * cols, A.begin() + (i+1) * cols, B.begin(), 0.0 );

      // Receive results back for the last r1 rows from the other processor
      MPI_Recv( result.data() + r0, r1, MPI_DOUBLE, 1, tag++, MPI_COMM_WORLD, &stat );

      // print results
      for ( int j = 0; j < rows; j++ ) cout << result[j] << '\n';
   }

   else                                               // Processor 1

   {
      // Receive data; need sizes first
      MPI_Recv( nums, 3, MPI_INT, 0, tag++, MPI_COMM_WORLD, &stat );
      r0 = nums[0];   r1 = nums[1];   cols = nums[2];
      vector<double> A(r1*cols), B(cols), result(r1);
      MPI_Recv( A.data(), r1 * cols, MPI_DOUBLE, 0, tag++, MPI_COMM_WORLD, &stat );
      MPI_Recv( B.data(), cols, MPI_DOUBLE, 0, tag++, MPI_COMM_WORLD, &stat );

      // Do multiplies for the last r1 rows
      for ( int i = 0; i < r1; i++ ) result[i] = inner_product( A.begin() + i * cols, A.begin() + (i+1) * cols, B.begin(), 0.0 );

      // Send back data
      MPI_Send( result.data(), r1, MPI_DOUBLE, 0, tag++, MPI_COMM_WORLD );
   }

   MPI_Finalize();
}


With Microsoft MPI (yes, really!) and g++:
Batch file to compile:
set OPT1="C:\Program Files (x86)\Microsoft SDKs\MPI\Include"
set OPT2="C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64\msmpi.lib"
g++ -I%OPT1% -o mult.exe mult.cpp %OPT2%


"C:\Program Files\Microsoft MPI\bin"\mpiexec -n 2 mult.exe
550
1300
2050
2800
Last edited on
Many thanks for your helpful answer,
I just don't understand this part
A.data() + r0 * cols, r1 * cols in the second send_mpi. Could you please help me with more detailed here?
I am trying to arrange it so that:
(a) root (processor 0) is the only processor that knows all the matrix;
(b) processor 0 deals with the first r0 rows of the multiply and processor 1 deals with the last r1 rows.

If the array is flattened (i.e. written out sequentially) then the first element that root must send to processor 1 is element index r0*cols (remember that arrays count from 0). A.data() is a pointer to the start of the array, so A.data()+r0*cols will point to the required element. r1 rows of cols elements then means sending r1*cols elements. This is the MPI_Send on line 38.

At the receiving end, processor 1 doesn't need to know about the whole array, so it can receive the data straight into the start of its A buffer; i.e. the pointer A.data() on line 58.
Last edited on
Registered users can post here. Sign in or register to post.