finding duplicates

hi all

I'm working on a project that searches a text file for valid email addresses, identifies and removes duplicate addresses, and then writes the unique addresses to a separate file.

My problem is removing the duplicate addresses. Here's what I have so far; any help would be greatly appreciated, as I can't seem to find a fitting answer in any previous posts. Cheers!

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#include <algorithm>
#include <deque>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;

// Reports whether c may appear inside an email address as recognised by
// this program: an ASCII letter, an ASCII digit, or one of '.', '-', '+'.
// Every other character (including '@' and '_') yields false.
bool isValidEmailCharacter(char c)
{
  // Alphanumeric ranges are tested first with early returns.
  if ('A' <= c && c <= 'Z') return true;
  if ('a' <= c && c <= 'z') return true;
  if ('0' <= c && c <= '9') return true;

  // The only punctuation accepted.
  switch (c)
  {
    case '.':
    case '-':
    case '+':
      return true;
    default:
      return false;
  } // switch
} // isValidEmailCharacter

// Reports whether s is absent from the list of collected addresses.
//
// email: addresses collected so far (not modified).
// s:     candidate address to look for.
//
// Returns true when no stored element equals s (so s can be appended without
// creating a duplicate), false when s is already present. An empty list
// therefore yields true. The comparison is an exact, case-sensitive match.
//
// The earlier version returned true as soon as it met any element that
// differed from s, which is the opposite of the intended test.
bool noDuplicate(deque<string>& email, string s)
{
  // find() returns end() only when no element compares equal to s.
  return find(email.begin(), email.end(), s) == email.end();
} // noDuplicate

// Writes every collected address to standard output, one per line, in the
// order the addresses are stored. Nothing is printed for an empty list.
void printEmails(deque<string>& email)
{
  deque<string>::const_iterator entry = email.begin();
  const deque<string>::const_iterator stop = email.end();
  while (entry != stop)
  {
    cout << *entry << endl;
    ++entry;
  } // while
} // printEmails

int main()
{
  deque<string> email; // email list
  string input; // name of input file
  string output; // name of output file
  string lineFromFile; // line of input file
  string defaultInput = "fileContainingEmails.txt";
  string defaultOutput = "copyPasteMyEmails.txt";
  ifstream fin;
  ofstream fout;
  int i; // iterator
  
  cout << "This program was made to find any email addresses present in a user-designated \ntext file, extract them and post them in the output file of the user's choice.\n" << endl;
  cout << "Enter input filename [default: fileContainingEmails.txt]: ";
  getline(cin, input);
  if (input == "")
    input = defaultInput;

  cout << "Enter output filename [default: copyPasteMyEmails.txt]: ";
  getline(cin, output);
  if (output == "" && input == "")
    output = defaultOutput;
  else if (output == "" && input != "")
    output = input;

  cout << "Input file: " << input << endl;
  cout << "Output file: " << output << endl; 

  fin.open(input.c_str());
  if (!fin.good()) throw "I/O error";  
  
  while (fin.good()) // search input file
  {
	getline(fin, lineFromFile);
	for (i = 0; i < lineFromFile.length(); i++)
	{
	  bool hasDot = false;
	  if (lineFromFile[i] == '@' && i && i - 1 > 0 && i + 1 < lineFromFile.length())
	  {
	    int s;
	    for (s = i - 1; s > 0; s--) // searches start of email for valid characters
	    {
		  if (isValidEmailCharacter(lineFromFile[s]) == false)
		  {
		    s++;
		    break;
		  }
	    } // for
	   int e;
	   for (e = i + 1; e <= lineFromFile.length(); e++) // searches end of email for valid characters
	    {
		  if (lineFromFile[e] == '.') hasDot = true;
		  if (isValidEmailCharacter(lineFromFile[e]) == false) break;
	    } // for
		i = e + 1;
	    if(hasDot) // dot validation
		{
		  string s = lineFromFile.substr(s, e - s);
          if(noDuplicate) email.push_back(lineFromFile.substr(s, e - s));
		} // if
      } // if
	} // for
  } // while
  fin.close();

  
  printEmails(email);
  cout << endl;
  if (email.size() > 0)
  {
    cout << email.size() << " email address(es) were found, and copied to the file: " << output << endl;
	
    fout.open(output.c_str()); // write email addresses to text file
    for (i = 0; i < email.size(); i++)
    {
      fout << email[i] << "; ";
    } // for
	fout.close();
  } // if
  else
    cout << "Sorry, no email addresses were found in the file: " << input << endl;

  return 0;
} // main 
90
91
92
for (e = i + 1; e <= lineFromFile.length(); e++) 
	    {
		  if (lineFromFile[e] == '.') //out of bounds 


The logic in noDuplicate is wrong. If you find an element that is equal to the one you are testing, then that element is already in the list, i.e. it is a duplicate. The function should report "no duplicate" only when no element in the list matches.
Last edited on
All you need to do is convert each email address to lowercase, then add them into a set<string> container. This container will only keep unique entries. Once you've loaded them all, write the contents of the set out to a file :)
Topic archived. No new replies allowed.