Problem Sorting a Really Long Array

Pages: 12
How did you write it and what sort of error was it?


Here is the code that I used. It is basically the original code that I posted in the first post, except I replaced the static arrays with dynamic arrays. I haven't changed the sort method from a bubble sort approach yet, but that isn't necessarily the problem at this point.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

#include "stdafx.h"
#include <string>
#include <fstream>

#using <mscorlib.dll>

using namespace System;
using namespace System::IO;
using namespace std;
string *FileArray=new string[2500000];
string *FileArray2=new string[2500000];
string *FileArray3=new string[2500000];
char DataLine[1001];
int ReadMax;
int ReadCount;
int NumberOfWords = 0;


int ReadData ()
{
	//
	string tinput;
	string line;
	 
	ifstream datafile ("input.txt");
	ReadCount = 0;
	if (datafile.is_open())
	{
		while (! datafile.eof() )
		{

			getline(datafile, line, '\n');
			FileArray[ReadCount] = line;
			//cout << FileArray[ReadCount] << endl;
			ReadCount++;
		}
		datafile.close();
	}
	else cout << "Unable to open file"; 
	ReadMax = ReadCount;
	//cout << "ReadMax is " << ReadMax << endl;
	//
	return(0);
}


void ClearDataLine()
{

	for(int i = 0; i < 100; i++)
	{
		DataLine[i] = 0;
	}

}


void DeleteDuplicates()
{
	long counter = 0;
	long counter2 = 0;
	while(counter < (ReadMax-1))
	{
		while(FileArray2[counter] == FileArray2[(counter+1)])
		{
			counter++;
		}
		FileArray3[counter2] = FileArray2[counter];
		counter2++;
		counter++;
	}
 
 
}
 
void ProcessData ()
{
	string currentword;
	string nextword;
	string swapstring;
	long counter = 0;
	int counter2 = 0;
	int endchar = 0;
	int counter3 =0;
	int lettercounter = 0;
	int sortcount = 0;
	bool changemade = 1;
	bool finishedloop = 0;

	while(changemade == 1)
	{
		changemade = 0;
		counter = 0;
		while(counter < (ReadMax-1))
		{

			//read the word on the current line
			counter2 = 0;
			finishedloop = 0;
			while(finishedloop == 0)
			{
				if((counter2+1) == FileArray[counter].length())
				{

					DataLine[lettercounter] = FileArray[counter].at(counter2);
					//cout << DataLine[lettercounter];
					lettercounter++;


					//cout << FileArray2[NumberOfWords];
					//cout << endl;

					finishedloop = 1;
					lettercounter = 0;
				}
				else if(FileArray[counter].at(counter2) == ' ')
				{
					//do nothing
				}
				else if(FileArray[counter].at(counter2) == '-')
				{
					//do nothing
				}
				else
				{
					DataLine[lettercounter] = FileArray[counter].at(counter2);
					//cout << DataLine[lettercounter];
					lettercounter++;
				}
				counter2++;

			}
			char* pointchar = DataLine;
			currentword = pointchar;
			ClearDataLine();
			//read the word on the next line
			counter2 = 0;
			finishedloop = 0;
			while(finishedloop == 0)
			{
				if((counter2+1) == FileArray[counter+1].length())
				{
					DataLine[lettercounter] = FileArray[counter+1].at(counter2);
					//cout << DataLine[lettercounter];
					lettercounter++;

					//cout << FileArray2[NumberOfWords];
					//cout << endl;
					finishedloop = 1;
					lettercounter = 0;
				}
				else
				{
					DataLine[lettercounter] = FileArray[counter+1].at(counter2);
					//cout << DataLine[lettercounter];
					lettercounter++;
				}
				if(counter2 == 9999)
				{
					//cout << "too many characters" << endl;
					finishedloop = 1;
				}
				counter2++;
			}
			pointchar = DataLine;
			nextword = pointchar;
			ClearDataLine();
			//if the first letter is earlier in the alphabet, switch the words
			if(nextword.at(0) < currentword.at(0))
			{
				swapstring = currentword;
				currentword = nextword;
				nextword = swapstring;
				changemade = 1;
			}
			if(nextword.at(0) == currentword.at(0))
			{
				if((nextword.length() == 1) && (currentword.length() > 1))
				{
					swapstring = currentword;
					currentword = nextword;
					nextword = swapstring;
					changemade = 1;
				}
				if((nextword.length() > 1) && (currentword.length() > 1))
				{
					if(nextword.at(1) < currentword.at(1))
					{
						swapstring = currentword;
						currentword = nextword;
						nextword = swapstring;
						changemade = 1;
					} 
					if(nextword.at(1) == currentword.at(1))
					{
						if((nextword.length()>2) && (currentword.length() >2))
						{
							if(nextword.at(2) < currentword.at(2))
							{
								swapstring = currentword;
								currentword = nextword;
								nextword = swapstring;
								changemade = 1;
							}
						} 
						if((nextword.length() == 2) && (currentword.length() > 2))
						{
							swapstring = currentword;
							currentword = nextword;
							nextword = swapstring;
							changemade = 1;
						} 
					}
				}
			}
			FileArray[counter] = currentword;
			FileArray[counter+1] = nextword;
			FileArray2[counter] = currentword;
			counter++;
		}
		sortcount++;
		cout << sortcount << endl;
	}


}
 
 
 
 
 
int WriteData ()
{
	//
	string tinput;
	string line;
	ofstream outputfile ("wordlist.txt");
	ReadCount = 0;
	if (outputfile.is_open())
	{
		while (ReadCount < (ReadMax-1))
		{
			outputfile << FileArray3[ReadCount] << "\n";
			//cout << FileArray2[ReadCount] << endl;
			ReadCount++;
			}
		outputfile.close();
	}
	else cout << "Unable to open file"; 

return(0);
}
int _tmain()
{
	string menuinput;
	cout << "Reading input file..." << endl;
	ReadData();
	cout << "Read complete." << endl;
	//cin >> menuinput;
	cout << "Sorting words..." << endl;
	ProcessData();
	cout << endl << "Sorting complete." << endl;
	//cin >> menuinput;
	cout << "Deleting duplicate words..." << endl;
	DeleteDuplicates();
	cout << "Finished deleting duplicate words." << endl;
	cout << "Writing output file..." << endl;
	WriteData();
	cout << "Write complete." << endl;

	//cin >> menuinput;
	return 0;
}


As with the static string arrays it works fine for short inputs, but throws an exception if I run it with too large of an input.

The exception message is:

"An unhandled exception of type 'System.Runtime.InteropServices.SEHException' occurred in Finalize Word List.exe

Additional information: External component has thrown an exception."

This is where the debugger shows it stopping in "string.cpp":

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
// string -- template string support functions
#include <istream>
_STD_BEGIN


_CRTIMP2 void _String_base::_Xlen() const
	{	// report a length_error
	_THROW(length_error, "string too long");
	}

_CRTIMP2 void _String_base::_Xran() const
	{	// report an out_of_range error
	_THROW(out_of_range, "invalid string position");
	}
_STD_END

/*
 * Copyright (c) 1992-2002 by P.J. Plauger.  ALL RIGHTS RESERVED.
 * Consult your license regarding permissions and restrictions.
 V3.13:0009 */


First question:
System.Runtime.InteropServices.SEHException
Why, oh, why, are you using CLR?

File>New...>Project>Project types: Visual C++/Win32>Templates: Win32 Console Application

Second question: Why do you need three arrays, each of 2.5 million elements, for a grand total of 57.22 MiB (each string object uses 8 bytes, not counting the stuff that's allocated separately)?
You should need no more than one. If you need more than one, you're doing it wrong.
I'm not really sure what CLR is. I am using a Console Application template.

I initially stayed away from using a vector, because I haven't used them before. However I tried Anilpanicker's vector code and it is kicking the heck out of anything that I have come up with.

I have been timing it and it is processing 30,000 lines of input per minute. I expect it to finish processing the data in just over an hour.

Here is the code:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#include<vector>
#include<algorithm>

using namespace System;
using namespace System::IO;
using namespace std;
char DataLine[1001];
int ReadMax;
int ReadCount;
int NumberOfWords = 0;

vector<string> noDupStr; // vector to store the non duplicate words

void Add(vector<string> &strVect,string word) // Add non duplicate word to the vector
{
	vector<string>::iterator start= strVect.begin();
	vector<string>::iterator end= strVect.end();
	bool dup=false;
	for(vector<string>::iterator it=start; it !=end; it++)
	{
		if(word==(*it) )
		{
			dup=true;
			break;
		}
	}
	if(!dup)
		strVect.push_back(word);
}
 
int WriteData ()
{
	fstream out( "output.txt",ios::out| ios::trunc );
	ostream_iterator<string> os(out," ");
	copy(noDupStr.begin(), noDupStr.end(), os);
	out.close();
return(0);
}


int main ()
{
	string line;
	ReadCount = 0;
	cout << "Reading the input..." << endl;
	ifstream f;
	f.open("input.txt");
	if(!f.is_open())
	{
		cout << "error in opening file " << endl;
		return 1;
	}
	cout << "Sorting the input..." << endl;
	while(f.good())
	{
		getline(f,line);
		Add(noDupStr,line); //reference
		ReadCount++;
		if((ReadCount % 1000) == 0)
		cout << ReadCount << endl;
	}
	f.close();
	ReadMax = ReadCount;
	sort(noDupStr.begin(),noDupStr.end());
	cout << "Writing the output..." << endl;
	WriteData();
	return 0;
}


It's still too slow, in my book.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <iostream>
#include <fstream>
#include <string>
#include <set>

int main(){
	std::ifstream ifile("input.txt");
	std::set<std::string> set;
	unsigned a=0;
	while(ifile.good()){
		std::string line;
		std::getline(ifile,line);
		set.insert(line);
		//I hate explicit comparisons to zero.
		if(!(++a%1000))
			std::cout <<a<<std::endl;
	}
	std::ofstream ofile("output.txt");
	for (std::set<std::string>::iterator i=set.begin();i!=set.end();set++)
		ofile <<*i<<std::endl;
	return 0;
}

You could have solved this hours ago if you had researched the terms I dropped earlier.
Last edited on
You could have solved this hours ago if you had researched the terms I dropped earlier.


At least it didn't take 25 years...
helios, thak you, that was fantastic.
In essence
1) remove duplicate before add/sort
2)sort while adding
gives the optimum performance.
and std::set is the natural choice.
Topic archived. No new replies allowed.
Pages: 12