Problem Sorting a Really Long Array

Pages: 12
I have a problem which seems difficult to solve.

This is my code for a program which sorts a list of words and then removes the duplicates. The program uses a bubble sort approach for sorting and it only sorts by the first couple letters or each line.

The input is a text file called "input.txt" which is 2.14 million lines long and has one word in lower case on each line.

I am using Visual C++ .Net 2003 as my compiler.

The code compiles fine. The code also does exactly what I want for short input files (up to about 200 lines of input). However, it gives me an error when running if the input is longer than that. Do you have any ideas on how to fix the code so that I can process the input file that is 2.14 million lines long?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#include "stdafx.h"
#include <stdio.h> 
#include <stdlib.h> 
#include <ctime>
#include <cstdlib>
#include <string>
#include <sstream>
#include <windows.h>
#include <fstream>
#include <iostream>
#using <mscorlib.dll>
 
using namespace System;
using namespace System::IO;
using namespace std;
string FileArray[6000001];
string FileArray2[6000001];
string FileArray3[6000001];
char DataLine[10001];
int ReadMax;
int ReadCount;
int NumberOfWords = 0;
int ReadData ()
{
//
ReadMax = 6000000;
string tinput;
string line;
 
ifstream datafile ("input.txt");
ReadCount = 0;
if (datafile.is_open())
{
while (! datafile.eof() )
{

getline(datafile, line, '\n');
FileArray[ReadCount] = line;
//cout << FileArray[ReadCount] << endl;
ReadCount++;
}
datafile.close();
}
else cout << "Unable to open file"; 
ReadMax = ReadCount;
//cout << "ReadMax is " << ReadMax << endl;
//
return(0);
}
void ClearDataLine()
{

for(int i = 0; i < 1000; i++)
{
DataLine[i] = 0;
}

}
void DeleteDuplicates()
{
long counter = 0;
long counter2 = 0;
while(counter < (ReadMax-1))
{
while(FileArray2[counter] == FileArray2[(counter+1)])
{
counter++;
}
FileArray3[counter2] = FileArray2[counter];
counter2++;
counter++;
}
 
 
}
 
void ProcessData ()
{
string currentword;
string nextword;
string swapstring;
long counter = 0;
int counter2 = 0;
int endchar = 0;
int counter3 =0;
int lettercounter = 0;
int sortcount = 0;
bool changemade = 1;
bool finishedloop = 0;

while(changemade == 1)
{
changemade = 0;
counter = 0;
while(counter < (ReadMax-1))
{

//read the word on the current line
counter2 = 0;
finishedloop = 0;
while(finishedloop == 0)
{
if((counter2+1) == FileArray[counter].length())
{

DataLine[lettercounter] = FileArray[counter].at(counter2);
//cout << DataLine[lettercounter];
lettercounter++;


//cout << FileArray2[NumberOfWords];
//cout << endl;

finishedloop = 1;
lettercounter = 0;
}
else if(FileArray[counter].at(counter2) == ' ')
{
//do nothing
}
else if(FileArray[counter].at(counter2) == '-')
{
//do nothing
}
else
{
DataLine[lettercounter] = FileArray[counter].at(counter2);
//cout << DataLine[lettercounter];
lettercounter++;
}
/*
if(counter2 == 9999)
{
//cout << "too many characters" << endl;
finishedloop = 1;
}
*/
counter2++;

}
char* pointchar = DataLine;
currentword = pointchar;
ClearDataLine();
//read the word on the next line
counter2 = 0;
finishedloop = 0;
while(finishedloop == 0)
{
if((counter2+1) == FileArray[counter+1].length())
{

DataLine[lettercounter] = FileArray[counter+1].at(counter2);
//cout << DataLine[lettercounter];
lettercounter++;


//cout << FileArray2[NumberOfWords];
//cout << endl;

finishedloop = 1;
lettercounter = 0;
}
else
{
DataLine[lettercounter] = FileArray[counter+1].at(counter2);
//cout << DataLine[lettercounter];
lettercounter++;
}

if(counter2 == 9999)
{
//cout << "too many characters" << endl;
finishedloop = 1;
}

counter2++;

}
pointchar = DataLine;
nextword = pointchar;
ClearDataLine();
//if the first letter is earlier in the alphabet, switch the words
if(nextword.at(0) < currentword.at(0))
{
swapstring = currentword;
currentword = nextword;
nextword = swapstring;
changemade = 1;
}

if(nextword.at(0) == currentword.at(0))
{

if((nextword.length() == 1) && (currentword.length() > 1))
{
swapstring = currentword;
currentword = nextword;
nextword = swapstring;
changemade = 1;
}




if((nextword.length() > 1) && (currentword.length() > 1))
{
if(nextword.at(1) < currentword.at(1))
{
swapstring = currentword;
currentword = nextword;
nextword = swapstring;
changemade = 1;
} 



if(nextword.at(1) == currentword.at(1))
{
if((nextword.length()>2) && (currentword.length() >2))
{
if(nextword.at(2) < currentword.at(2))
{
swapstring = currentword;
currentword = nextword;
nextword = swapstring;
changemade = 1;
}
} 

if((nextword.length() == 2) && (currentword.length() > 2))
{
swapstring = currentword;
currentword = nextword;
nextword = swapstring;
changemade = 1;
} 

 
}
}
 

}



FileArray[counter] = currentword;
FileArray[counter+1] = nextword;

FileArray2[counter] = currentword;
counter++;
}
sortcount++;
cout << sortcount << endl;
}


}
 
 
 
 
 
int WriteData ()
{
//
string tinput;
string line;
ofstream outputfile ("wordlist.txt");
ReadCount = 0;
if (outputfile.is_open())
{
while (ReadCount < (ReadMax-1))
{
outputfile << FileArray3[ReadCount] << "\n";
//cout << FileArray2[ReadCount] << endl;
ReadCount++;
}
outputfile.close();
}
else cout << "Unable to open file"; 

//
return(0);
}
int _tmain()
{
string menuinput;
cout << "Reading input file..." << endl;
ReadData();
cout << "Read complete." << endl;
//cin >> menuinput;
cout << "Sorting words..." << endl;
ProcessData();
cout << endl << "Sorting complete." << endl;
//cin >> menuinput;
cout << "Deleting duplicate words..." << endl;
DeleteDuplicates();
cout << "Finished deleting duplicate words." << endl;
cout << "Writing output file..." << endl;
WriteData();
cout << "Write complete." << endl;

//cin >> menuinput;
return 0;
}


I would really appreciate any help that you can give me.

Thanks,

mnutsch
You can't allocate such large arrays on the stack.
http://www.cplusplus.com/doc/tutorial/dynamic/


Bubble sort takes O(n2) time to sort n elements, which means that to sort 2,140,000 elements, you'll have to perform 4,579,600,000,000 comparisons (some implementations can lower the number to 2,289,801,070,000 comparisons). A computer capable of doing a billion comparisons per second will take about an hour to complete the program.
(In reality, it will take even longer, since swapping is expensive [compared to comparing] and bubble sort performs a lot of swaps.)
Last edited on
Ok.. so what is the solution for the large array problem? Wouldn't a dynamic array of the same size run into the same issue?

I picked the bubble sort approach because it was simple and intuitive. I figured that when the program is ready, I can just let it run overnight and process the large input file.
I really wish you guys posting these large programs woud take 5 seconds to properly indent within whatever text editor you use. Typically there is a beautify option with any good code editor. Do you realize how hard it is to read what you posted?
If I understood correctly you need an output file with Sorted Non Duplicate words in it.
what helios stated is correct you can not create such large arrays in the stack, you need to think a logic in which you don't need to store all these words(lines) in many arrays.
A program along the following lines may help you, I have not tested this for large file since I don't have one. Hope this gives you some idea to change your program.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#pragma warning (disable : 4786)
#include<iostream>
#include<string>
#include<fstream>
#include<vector>
#include<algorithm>
using namespace std;

vector<string> noDupStr; // vector to store the non duplicate words

void Add(vector<string> &strVect,string word) // Add non duplicate word to the vector
{
	vector<string>::iterator start= strVect.begin();
	vector<string>::iterator end= strVect.end();
	bool dup=false;
	for(vector<string>::iterator it=start; it !=end; it++)
	{
		if(word==(*it) )
		{
			dup=true;
			break;
		}
	}
	if(!dup)
		strVect.push_back(word);
	//return strVect;
}

int main ()
{
	string line;
	ifstream f;
	f.open("test1.txt");
	if(!f.is_open())
	{
		cout << "error in opening file " << endl;
		return 1;
	}
	while(f.good())
	{
		getline(f,line);
		Add(noDupStr,line); //reference

	}
	f.close();

	sort(noDupStr.begin(),noDupStr.end());

	// then you write it

	// I have not tested for a large file since I don't have one

	return 0;

}

Last edited on
There's a 23GB wordlist somewhere on the internet called "bigfuckoffwordlist.rar". You could test some of the 43 half-GB ish text files in there.

If they aren't big enough, a shell (.sh) script is included called merge.sh to merge them into a 23-ish GB file.

If you are running windows, move them all into a directory, create a .bat file in the same directory, and copy this code into it:
Bare in mind that I forget what the files are called, I think it's WordList00 through WordList43. And that's if you can even find the file.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
copy WordList00.txt+WordList01.txt+WordList02.txt+WordList03.txt+WordList04.txt+WordList05.txt+WordList06.txt+WordList07.txt+WordList08.txt+WordList09.txt Words0.txt

copy WordList10.txt+WordList11.txt+WordList12.txt+WordList13.txt+WordList14.txt+WordList15.txt+WordList16.txt+WordList17.txt+WordList18.txt+WordList19.txt Words1.txt

copy WordList20.txt+WordList21.txt+WordList22.txt+WordList23.txt+WordList24.txt+WordList25.txt+WordList26.txt+WordList27.txt+WordList28.txt+WordList29.txt Words2.txt

copy WordList30.txt+WordList31.txt+WordList32.txt+WordList33.txt+WordList34.txt+WordList35.txt+WordList36.txt+WordList37.txt+WordList38.txt+WordList39.txt Words3.txt

copy WordList40.txt+WordList41.txt+WordList42.txt+WordList43.txt Words4.txt

copy Words0.txt+Words1.txt+Words2.txt+Words3.txt+Words4.txt Words.txt

del Words0.txt
del Words1.txt
del Words2.txt
del Words3.txt
del Words4.txt

del WordList00.txt
del WordList01.txt
del WordList02.txt
del WordList03.txt
del WordList04.txt
del WordList05.txt
del WordList06.txt
del WordList07.txt
del WordList08.txt
del WordList09.txt
del WordList10.txt
del WordList11.txt
del WordList12.txt
del WordList13.txt
del WordList14.txt
del WordList15.txt
del WordList16.txt
del WordList17.txt
del WordList18.txt
del WordList19.txt
del WordList20.txt
del WordList21.txt
del WordList22.txt
del WordList23.txt
del WordList24.txt
del WordList25.txt
del WordList26.txt
del WordList27.txt
del WordList28.txt
del WordList29.txt
del WordList30.txt
del WordList31.txt
del WordList32.txt
del WordList33.txt
del WordList34.txt
del WordList35.txt
del WordList36.txt
del WordList37.txt
del WordList38.txt
del WordList39.txt
del WordList40.txt
del WordList41.txt
del WordList42.txt
del WordList43.txt

Which will merge them into 5 files, (Words0.txt, etc.), merge the 5 files into 1 file (Words.txt) and then delete all the other files.

It'll take a long time though.
Last edited on
Did you perform any google / thread searches yet? There is tons of info on sorting on the net and lots of free code to download so that you can try out each mechanism. Did you even try to just use the std::sort or did you just assume that std::sort won't do the job?
http://www.dreamincode.net/forums/showtopic14799.htm
http://www.softpanorama.org/Algorithms/sorting.shtml

The advantage of using heap memory is that it is more dynamic. A program probably has a bigger heap than stack. Of course it depends on your compiler and linker settings. Some compilers allow you to specify stack/heap sizes. I have always had trouble when dealing with large stack arrays and I don't know how to catch an exception if a stack array ends up being too big. I just think that it is easier to use dynamic memory for large arrays since you can catch the bad_alloc exception and easily increase the allowable heap for a program.

I also like anilpanicker's idea to read the file one line at a time. if you eliminate the dups before sorting your array can be much smaller, although I'm not sure about the specifics in that example. Every time you add a string to the vector the entire thing gets returned by value. I don't think that is a good idea. Just pass the vector to the add function by reference. Also, use the std::find algorithm instead of manually searching. Below, I have created a mod to the Add function.

1
2
3
4
5
6
7
8
9
10
void Add(vector<string>& strVect, string& word) // Add non duplicate word to the vector
{
	vector<string>::iterator start= strVect.begin();
	vector<string>::iterator end= strVect.end();              
                // alternatively, if find returns end iterator, it isn't a dup so add it.
               if(std::find(start, end, word) == end)
               {
                   strVect.push_back(word);
                }
}
Last edited on
I agree you don't need to return the vector, reference is better
Thanks for all the advice. I was able to come up with some code which I think will be able to sort the list.

It goes really slow compared to the previous code, so I am going to have to look at using a faster sort method. I think that the speed issue is also a side effect of the program constantly reading and writing from the hard drive.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#include "stdafx.h"
#include <stdio.h> 
#include <stdlib.h> 
#include <ctime>
#include <cstdlib>
#include <string>
#include <sstream>
#include <windows.h>
#include <fstream>
#include <iostream>
#using <mscorlib.dll>

using namespace System;
using namespace System::IO;
using namespace std;

char DataLine[10001];
int ReadMax;
int ReadCount;
int NumberOfWords = 0;
string currentword;
string nextword;
string swapstring;
long counter = 0;
int counter2 = 0;
int endchar = 0;
int counter3 =0;
int lettercounter = 0;
int sortcount = 0;
bool changemade = 1;
bool finishedloop = 0;
string tinput;
string line;
int timessorted = 0;


void ClearDataLine()
{
	for(int i = 0; i < 1000; i++)
	{
		DataLine[i] = 0;
	}
}

 
void WriteData()
{
	ofstream outputfile ("temp.txt", ios::app);
	if (outputfile.is_open())
	{
		outputfile << currentword << "\n";
		outputfile.close();
	}
	else cout << "Unable to open file"; 

}

void WriteOutputData()
{
	ofstream outputfile ("output.txt", ios::app);
	if (outputfile.is_open())
	{
		outputfile << currentword << "\n";
		outputfile.close();
	}
	else cout << "Unable to open file"; 

}

 

void DeleteDuplicates()
{
	ifstream datafile ("wordlist.txt");
	sortcount = 0;
	if (datafile.is_open())
	{
		//get the first line of the file
		getline(datafile, line, '\n');
		currentword = line;
		//loop until the end of the file
		while (! datafile.eof() )
		{
			getline(datafile, line, '\n');
			nextword = line;
			if(currentword == nextword)
			{
				//do nothing if there is a duplicate word
			}
			else 
			{
				//otherwise write the word to the temporary file
				WriteOutputData();
			}
			currentword = nextword;
			sortcount++;
			cout << sortcount << endl;
		}
		datafile.close();
	}
	else 
	{
		cout << "Unable to open file"; 
	}
}

 

 

void CheckWordPosition()
{
	//if the first letter is earlier in the alphabet, switch the words
	if(nextword.at(0) < currentword.at(0))
	{
		swapstring = currentword;
		currentword = nextword;
		nextword = swapstring;
		changemade = 1;
	}
	if(nextword.at(0) == currentword.at(0))
	{
		if((nextword.length() == 1) && (currentword.length() > 1))
		{
			swapstring = currentword;
			currentword = nextword;
			nextword = swapstring;
			changemade = 1;
		}
		if((nextword.length() > 1) && (currentword.length() > 1))
		{

			if(nextword.at(1) < currentword.at(1))

			{
				swapstring = currentword;
				currentword = nextword;
				nextword = swapstring;
				changemade = 1;
			} 

			if(nextword.at(1) == currentword.at(1))
			{
				if((nextword.length()>2) && (currentword.length() >2))
				{
					if(nextword.at(2) < currentword.at(2))

					{
						swapstring = currentword;
						currentword = nextword;
						nextword = swapstring;
						changemade = 1;
					}
				} 

				if((nextword.length() == 2) && (currentword.length() > 2))
				{
					swapstring = currentword;
					currentword = nextword;
					nextword = swapstring;
					changemade = 1;
				} 
			}
		}
	}

}

 

void RemoveSpaces()
{
	//clear the current word
	counter2 = 0;
	finishedloop = 0;
	while(finishedloop == 0)
	{
		if((counter2+1) == currentword.length())
		{
			DataLine[lettercounter] = currentword.at(counter2);
			//cout << DataLine[lettercounter];
			finishedloop = 1;
			lettercounter = 0;
		}
		else if(currentword.at(counter2) == ' ')
		{
			//do nothing if a space is found
		}
		else if(currentword.at(counter2) == '-')
		{
			//do nothing if a dash is found
		}
		else
		{
			DataLine[lettercounter] = currentword.at(counter2);
			//cout << DataLine[lettercounter];
			lettercounter++;
		}
		counter2++;
	}
	char* pointchar = DataLine;
	currentword = pointchar;
	ClearDataLine();

	//clear the next word
	counter2 = 0;
	finishedloop = 0;
	while(finishedloop == 0)
	{
		if((counter2+1) == nextword.length())
		{
			DataLine[lettercounter] = nextword.at(counter2);
			//cout << DataLine[lettercounter];
			lettercounter++;
			finishedloop = 1;
			lettercounter = 0;
		}
		else if(nextword.at(counter2) == ' ')
		{
			//do nothing if a space is found
		}
		else if(nextword.at(counter2) == '-')
		{
			//do nothing if a dash is found
		}
		else
		{
			DataLine[lettercounter] = nextword.at(counter2);
			//cout << DataLine[lettercounter];
			lettercounter++;
		}
		counter2++;
	}
	pointchar = DataLine;
	nextword = pointchar;
	ClearDataLine();

}
 
void RenameTempList()
{
	remove("wordlist.txt"); //delete the old list
	rename("temp.txt","wordlist.txt"); //rename the temporary list as wordlist.txt
	remove("temp.txt"); //delete the old list
}

void RenameOutputList()
{
	remove("wordlist.txt"); //delete the old list
	rename("output.txt","wordlist.txt"); //rename the temporary list as wordlist.txt
	remove("output.txt"); //delete the old list
}

void ProcessData ()
{
	changemade = 1;
	while(changemade == 1) //loop as long as a word is moved in the file
	{
		cout << "This is pass number " << timessorted << " through the list." << endl;
		changemade = 0;
		sortcount = 0;
		//read wordlist.txt
		ifstream datafile ("wordlist.txt");
		if (datafile.is_open())
		{
			getline(datafile, line, '\n');
			sortcount++;
			//cout << "This is wordlist line # " << sortcount << endl; //show me that the program is working
			//cout << "Read the word " << line << endl;
			currentword = line;
			while (!datafile.eof() )
			{
				getline(datafile, line, '\n');
				sortcount++;
				//cout << "This is wordlist line # " << sortcount << endl; //show me that the program is working
				//cout << "Read the word " << line << endl;
				if(line.length() == 0)
				{
					//do nothing if a blank line is found
				}
				else
				{
					nextword = line; // read a line from wordlist.txt
					//RemoveSpaces(); //remove spaces and dashes from the word
					//cout << "Checking word position..." << endl;
					CheckWordPosition(); //swap the current word with the next word if they are out of order
					//cout << "Finished checking word position." << endl;
					//cout << "Writing word to temp file..." << endl;
					WriteData(); //write the current word to temporary.txt
					//cout << "Finished writing word to temp file." << endl;
					currentword = nextword; //progress to the next word
				}
				
			}
			datafile.close();
			WriteData(); //write the last word to temporary.txt
			//cout << "Renaming the temp list..." << endl;
			RenameTempList();
			//cout << "Finished renaming the temp list." << endl;
		}
		else 
		{
			cout << "Unable to open file"; 
		}
		
		timessorted++;
	}
	remove("temp.txt"); //delete the old list
}

 
int _tmain()
{
	string menuinput;
	cout << "Processing data..." << endl;
	ProcessData();
	cout << endl << "Processing complete." << endl;
	cout << "Deleting duplicate words..." << endl;
	DeleteDuplicates();
	cout << "Finished deleting duplicate words." << endl;
	RenameOutputList();
	return 0;

}


I guess that I am going to have to go back to the drawing board on this code unless someone out there has a supercomputer that I can borrow. I let the program run overnight on my PC, but the program was only able to make 40 passes through the input file.

Assuming that it would only take 1,070,000 passes to sort the input, then it would take 214,000 hours or 8,917 days to sort the data.

I know that there are supposedly faster sorting algorithms out there, but I am at loss as to how to implement them in a situation where I cannot load the entire input into memory.

Is there a way to change the configuration on my PC so that I can load longer arrays into the stack?

I will also try anilpanicker's suggested code and see if I can get that to work.
Is there a way to change the configuration on my PC so that I can load longer arrays into the stack?
No! Dynamic memory!

int *reallyLongArray=new int[100*1000*1000];

The stack is of a fixed size (1 or 2 MiB, I believe). The heap consists of most of a computer's free memory.

EDIT: And use a better sorting algorithm, for Christ's sake.
http://en.wikipedia.org/wiki/Sorting_algorithm#Comparison_of_algorithms
Quick sort is not too hard to implement, but it may be better to just use std::sort().
Last edited on
I really have no idea what your second example is trying to do. You are making this a lot harder than it needs to be. Your initial requirements seem very simple yet you have a lot of functions that make no sense to me such as RenameOutputList, RenameTempList, RemoveSpaces, and clearDataLine. If the file is a bunch of words, why do you need to remove spaces from anything? Are we dealing with an array of words or not?

I can only continue to help if you take some of the advice into consideration. If you continue to go it alone then I don't know what to tell you.

If you follow our advice and read one word at a time you may consider keeping the list sorted as you build it instead of waiting until the end to sort it. If you do it that way you can use lower_bound to avoid inserting duplicates into your array. If there are a lot of duplicates that can save you a tremendous amount of time. When lower bound returns, you'll have to check for equality before actually inserting into the array. This is useful because it avoids all of the copying that would be done by any sorting algorithm. I'd also consider using std::deque instead of std::vector. Deque is better here because contiguous memory isn't really needed. vector could cause you problems due to the potential for memory reallocation as it grows. The only trouble with that is that deque doesn't have a reserve member function for preallocating memory.
http://cplusplus.com/reference/algorithm/lower_bound/

Another option is using a linked list which provides a lot of built in functions and it has its own sort algorithm as a member of the container. You could just sort the list and then use list::unique to get rid of duplicates.
http://cplusplus.com/reference/stl/list/

The point is that there are many options that are already built into the language. Use them! They were designed to be efficient. If you choose the right tool for the job, you should have your program up and running in no time.
Helios - I'll try using dynamic arrays with my first example and let you know if it works. I will also look at the other sorting methods. Thanks!

Kempofighter - My second example is an attempt to read the file one line at a time as suggested by Anilpanicker and seconded by you. The functions RenameOutputList, RenameTempList solve a problem that I ran into. That problem is that you can't edit a file at the same time you are reading it line by line. The solution I came up with was to write data to a temporary file each time the sort passes through the input file and then replace the input file with the temporary file. The function clearDataLine is necessary to avoid situations where leftover letters at the end of long words get transplanted onto shorter words. The input is a list of words. However there are some words that have spaces or dashes in them which I don't want to be there (for example "a la carte" is technically three words - and french! -, but I don't want it to be processed as three separate words).

The idea of removing the duplicates before sorting the list makes alot of sense. I think that I can do that pretty easily by adding a function which will create a list of words and ignore any duplicates before sorting the list.
Last edited on
write data to a temporary file each time the sort passes through the input file and then replace the input file with the temporary file
Oh, wow. I'd estimate that you could have left that thing running for a decade without finishing.

I think that I can do that pretty easily by adding a function which will create a list of words and ignore any duplicates before sorting the list.
The way you're thinking it, that alone will take you two and a quarter trillion steps (worst case). If you're going to disallow duplicates, perhaps you should use an std::set, which also automatically sorts its elements.
Oh, wow. I'd estimate that you could have left that thing running for a decade without finishing.


Yea. My estimate was just under 25 years.

The way you're thinking it, that alone will take you two and a quarter trillion steps (worst case). If you're going to disallow duplicates, perhaps you should use an std::set, which also automatically sorts its elements.


Here are the steps that I am thinking:

While there are still lines from the input file to read
{
Read a line (word) from the input
Check an output file to see if the word is already in the output file (i.e. read the output file)
If the word is not already in the output file then add the word to the output file
}

The equation for the maximum number of steps is probably:
((1/2)*n*(n+1))+2n+1

Since there are about 2,140,000 lines of input then this alone should take 2,289,805,350,000 steps if there are really 2.14 million different words.

I think that there are only about 70,000 different words, so it is probably more like 74,904,315,000 steps.


Check an output file to see if the word is already in the output file (i.e. read the output file)
So, what will you be doing for the next 25 years? My guess is making sure the computer stays on.

((1/2)*n*(n+1))+2n+1
It's (n^2+n)/2, the result of which for n=2.14E6 is 2,289,801,070,000.
Here is the code that I wrote to remove the duplicates from the list before sorting it:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

#include "stdafx.h"
#include <string>
#include <fstream>

#using <mscorlib.dll>

using namespace System;
using namespace System::IO;
using namespace std;

string currentword;
int sortcount = 0;
string tinput;
string line;
bool wordinpreplist = 0;


void CheckPrepList()
{
	//open the input file
	ifstream datafile ("output.txt");
	if (datafile.is_open())
	{
		//loop until the end of the file
		while (! datafile.eof() )
		{
			getline(datafile, line, '\n');
			if(currentword == line)
			{
                wordinpreplist = 1;
			}
		}
        datafile.close();
	}
	else 
	{
		cout << "Unable to open file"; 
	}

}


void WriteOutputData()
{
	ofstream outputfile ("output.txt", ios::app);
	if (outputfile.is_open())
	{
		outputfile << currentword << "\n";
		outputfile.close();
	}
	else cout << "Unable to open file"; 

}


void PrepareList()
{
	//open the input file
	ifstream datafile ("wordlist.txt");
	sortcount = 0;
	if (datafile.is_open())
	{
		
		//loop until the end of the file
		while (! datafile.eof() )
		{
			getline(datafile, line, '\n');
			currentword = line;
			wordinpreplist = 0;
			CheckPrepList();
			if(wordinpreplist == 1)
			{
				//do nothing if the word is already in the prep list
			}
			else 
			{
				//otherwise write the word to the output file
				WriteOutputData();
			}
			sortcount++;
			cout << sortcount << endl;
		}
		datafile.close();
	}
	else 
	{
		cout << "Unable to open file"; 
	}


}
 


 
int _tmain()
{
	string menuinput;
	cout << "Prepping the list..." << endl;
	PrepareList();
	cout << endl << "Preparing the list is complete." << endl;
		
	cin >> menuinput;
	return 0;
}

Maybe I didn't emphasize enough the drawbacks of performing growing lineal searches on unsorted data using a disk as the main memory, particularly when compared to known good and fast algorithms.
Do what you want, you're just ignoring advice, at this point.
Last edited on
Maybe I didn't emphasize enough the drawbacks of performing growing lineal searches on unsorted data using a disk as the main memory, particularly when compared to known good and fast algorithms.
Do what you want, you're just ignoring advice, at this point.


I tried your suggestion of using dynamic arrays, but that didn't work. It compiled fine, but the program gave the same error when I tried to run it with an input that is too large.
Why didn't you ask help with than instead of moving on to a less-than-optimal method on your own?

How did you write it and what sort of error was it?
Pages: 12