Frequency of Words in 2D array

Hi, I am in an intro programming class where we use qt creator and program in C/C++.

My current assignment is to read in passwords from a document into a 2D char Array (A total of 12233 passwords) and then manipulate it a bunch of different ways. The first way is to find the 20 most common passwords (upper case and lower case ignored) for this step. I was able to load in the file into an array and then sort it using a quicksort algorithm. Now my question is how in the world do I start counting the frequency of the words? I've seen a lot of examples using wordmaps and vectors, but we have not learned those and nor do I fully understand them.

My initial idea was to create two more separate arrays, one will be an array of the words much like the one I already have, but it will check for duplicates. The other would be an int array where the elements correspond to a word in the char array, and gets+1 added to it every time it shows up. If the next word was already in the char array, I would add one to the corresponding index in the int array and not add it to the char array. If its a new word, I would add it to the char array and then add a new element to the int array starting at 1. But i'm not sure even how to get started on this idea, and then how to sort it and find the top 20 after. This is my current code at the moment, which just opens the file and loads in all the passwords (12233 passwords, all of length 9 including the null) into a 2D char array and then sorts them alphabetically.
We are supposed to use functions like strcmp, strcpy and what not to compare strings.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#include <iostream>
#include <cstring>
using namespace std;

const int NUMBER_OF_WORDS = 12233;  // number of words in input file
const int MAX_LINE_LENGTH = 9;  // 8 + 1 for NULL



void displayArray( char theWords[NUMBER_OF_WORDS][ MAX_LINE_LENGTH],   // Array to store all the words
                   int start, int end)
{
    for( int i=start; i<end; i++) {
        printf("%s\n", theWords[i]);
    }

    printf("\n");
}//end displayArray(...)


// Swap two array elements.  Used in sorting.
void swap(
    char theWords[NUMBER_OF_WORDS][ MAX_LINE_LENGTH],   // Array to store all the words
    int i, int j)
{
    char temp[ MAX_LINE_LENGTH];
    strcpy( temp, theWords[i]);     // store temporarily
    strcpy( theWords[i], theWords[j]);
    strcpy( theWords[j], temp);
}


int partition( char theWords[NUMBER_OF_WORDS][ MAX_LINE_LENGTH],   // Array to store all the words
       int left,     // index of left-most element
       int right)    // index of right-most element
{
    // To find index of middle do it the funky way shown below to avoid integer overflow
    int pivotIndex = left + (right-left) / 2;
    char pivotValue[ MAX_LINE_LENGTH];
    strcpy( pivotValue, theWords[ pivotIndex]);
    int storeIndex = left;    // destination on the left for first swap
    // Number of elements in subarray is: right-left+1

    // Put pivot value out of the way at far right of array, to be swapped back
    // into place at the end.
    swap( theWords, pivotIndex, right);

    // Go through the current range being considered of array elements,
    // moving each element into the left part of the partition if its value
    // is less than the pivot value.
    for( int i=left; i<right; i++) {
        if( strcmp(theWords[i], pivotValue) < 0) {
            swap( theWords, i, storeIndex);
            storeIndex++;
        }
    }
    // storeIndex is now the index of the location just to the right of the last
    // element that is < pivot, so its value >= pivot.  Swap the pivot back
    // into this spot.
    swap( theWords, storeIndex, right);    // Move pivot to its final place
    return storeIndex;
} // end partition()


void quickSort(
        char theWords[NUMBER_OF_WORDS][ MAX_LINE_LENGTH],   // Array to store all the words
        int first, int last)
{
    int pivotIndex;

    if( first < last) {
        // Partition around a pivot and return index of pivot
        pivotIndex = partition( theWords, first, last);
        // recursively sort the portion to the left of the pivot
        quickSort(theWords, first, pivotIndex - 1);
        // recursively sort the portion to the RIGHT of the pivot
        quickSort(theWords, pivotIndex + 1, last);
    }
}// end quickSort()


int main()
{
    char fileName[] = "words.txt";  // Make a C string (array of char) to store filename
    FILE *pInputFile;               // file pointer
    char inputLine[ 9];            // space for an input line
    char theWords[NUMBER_OF_WORDS][ MAX_LINE_LENGTH];   // Array to store all the words

    // Associate the file pointer with the actual file name and try to open it
    pInputFile = fopen(fileName, "r");   // open with "r" for "read" access
    // verify that file open worked
    if (pInputFile == NULL) {
        printf("Can't open %s. Verify it is in correct location\n", fileName);
        exit(-1);
    }

    // read from file and echo output.  Note how leading spaces from input are ignored
    int numberOfWords = 0;   // count how many words there are
    while( fscanf(pInputFile, "%s", inputLine) != EOF) {
        strcpy( theWords[ numberOfWords], inputLine);   // copy word into array        
        numberOfWords++;
    }
    fclose( pInputFile);         // close the input file

    // Go through array and print out all the words
    printf("Before sorting, array is: \n");
    displayArray( theWords, 0, NUMBER_OF_WORDS);

    quickSort( theWords, 0, NUMBER_OF_WORDS);

    //printf("After sorting, array is: \n");
    //displayArray( theWords, 0, NUMBER_OF_WORDS);

    return 0;
}//end main()
Last edited on
closed account (E3h7X9L8)
lol i misread and i thought you gotta find 20 most alike passwords like : test1,t3est ,3test etc then i realised they gotta be identical...

> brute force algorithm

> make a second 1D array for storing encounters for every word

> loop every word with all other words, and if you find a duplicate increment one in the second array in the position of the number you currently looping then remove the duplicate from the original array

>after this make a [20][9] size 2D char array for top 20 duplicates and a 4th array to store number of encounters

>use linear search to find the highest counter in the second array then use the position in which you found the highest counter to take the element from the first array and asign it to the third array which stores top 20 duplicates, and store the counter from second array to the 4th array

>repeat this 19 more times

>output third and fourth array

I know sounds hard , but it will work, not sure if you understood very well, my english is still pretty bad, thats all just from the top of my head, so i didnt took time to design a better algorithm

Last edited on
UNIX is your friend:
system("sort words.txt | uniq -c | sort -n | tail -20");

Assuming that you need to do it in code, try this (and I apologize for the old compiler syntax):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#include <iostream>
#include <map>
#include <vector>
#include <set>

using std::string;
using std::cin;
using std::cout;
using std::set;

struct WordsAndCounts {
    string word;
    int count;
    WordsAndCounts() : word(), count(0) {}

    // sorted by descending count and then by word
    bool operator < (const WordsAndCounts &right) const {
	if (count > right.count) return true;
	if (count < right.count) return false;
	// counts are equal
	return word < right.word;
    }
};

int main()
{
    std::map<string,unsigned> word2Count;
    string word;
    // Read words and count up the occurences
    while (cin >> word) {
	   ++word2Count[word];
    }

    // Now put the words/counts into a set, sorted by count:
    set<WordsAndCounts> count2Word;
    WordsAndCounts dummy;

    for (std::map<string,unsigned>::iterator iter=word2Count.begin();
	 iter != word2Count.end();
	 ++iter) {
	dummy.word = iter->first;
	dummy.count = iter->second;
	count2Word.insert(dummy);
    }

    // Now print the top 20
    cout << "Word\tCount\n";
    int i=0;
    for (set<WordsAndCounts>::iterator iter = count2Word.begin();
	 iter != count2Word.end();
	 ++iter) {
	if (i++ >= 20) break;	// already did 20
	cout << iter->word << '\t' << iter->count << '\n';
    }
}
Topic archived. No new replies allowed.