Vector duplicates removal

Hello Everyone.

Could anyone please help me quickly find duplicates in a vector of over 60000 records. It takes a loooootttttt of time ;'(. Here's the code.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>
#include <sstream>

struct MyPred
{
	std::string x;
	std::string y;

	// add default values
	MyPred(const std::string& x = "", const std::string& y = ""): x(x), y(y) {}

	bool operator==(const MyPred& p) const
	{
		return x == p.x && y == p.y;
	}

	bool operator<(const MyPred& p) const
	{
		if(x < p.x) return true;
		if(x > p.x) return false;
		if(y < p.y) return true;
		if(y > p.y) return false;
		return false;
	}
};


int main()
{
	std::vector<MyPred>* vPred = new std::vector<MyPred>;
	MyPred pred;

	for(int i = 0; i < 60000; i++)
	{
		std::stringstream ss1;
		ss1 << i;
		pred.x = ss1.str();
		std::stringstream ss2;
		ss2 << i+1;
		pred.y = ss2.str();
		vPred->push_back(pred);
	}

	for(int i = 10; i < 25; i++)
	{
		std::stringstream ss1;
		ss1 << i;
		pred.x = ss1.str();
		std::stringstream ss2;
		ss2 << i+1;
		pred.y = ss2.str();
		vPred->push_back(pred);
	}

	for(int i = 35000; i < 35005; i++)
	{
		std::stringstream ss1;
		ss1 << i;
		pred.x = ss1.str();
		std::stringstream ss2;
		ss2 << i+1;
		pred.y = ss2.str();
		vPred->push_back(pred);
	}

	/*vPred->push_back(MyPred("a", "a"));
	vPred->push_back(MyPred("a", "b"));
	vPred->push_back(MyPred("a", "a"));
	vPred->push_back(MyPred("b", "a"));
	vPred->push_back(MyPred("a", "b"));
	vPred->push_back(MyPred("a", "c"));
	vPred->push_back(MyPred("b", "b"));
	vPred->push_back(MyPred("a", "a"));
	vPred->push_back(MyPred("a", "a"));*/

	// The values need to be in order for equal_range() to work
	std::sort(vPred->begin(), vPred->end());

	std::vector<MyPred> uPred; // values that were always unique
	std::vector<MyPred> dPred; // values that were duplicated

	std::pair<std::vector<MyPred>::iterator, std::vector<MyPred>::iterator> ret;

	for(std::vector<MyPred>::iterator i = vPred->begin(); i != vPred->end(); i = ret.second)
	{
		ret = std::equal_range(i, vPred->end(), *i);
		
		if(ret.second - ret.first != 1) // duplicates
		{
				for(std::vector<MyPred>::iterator j = ret.first; j != ret.second; ++j)
				{
					dPred.push_back(*i); //put each duplicate onto a new vector
				}
		}
		else if(ret.second - ret.first == 1)
		{
			uPred.push_back(*i);
		}
	}

	std::cout << "vPred: Sorted input\n";
	for(std::vector<MyPred>::iterator i = vPred->begin(); i != vPred->end(); ++i)
	{
		std::cout << "[" << i->x << ", " << i->y << "]" << '\n';
	}

	std::cout << "dPred: Only the values that were duplicated\n";
	for(std::vector<MyPred>::iterator i = dPred.begin(); i != dPred.end(); ++i)
	{
		std::cout << "[" << i->x << ", " << i->y << "]" << '\n';
	}

	std::cout << "uPred: Only the values that were unique\n";
	for(std::vector<MyPred>::iterator i = uPred.begin(); i != uPred.end(); ++i)
	{
		std::cout << "[" << i->x << ", " << i->y << "]" << '\n';
	}

	delete vPred;
}
Hello heidiK,

try using std::sort and std::unique.
Topic archived. No new replies allowed.