1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
// Forward declarations for functions used by main().
int compare(string a, string b); //returns how similar two strings are, as a percentage; it orders the pair by length and delegates to bigger()
int bigger(string a, string b); //compare() calls this with the longer string as the first argument
int blacklist(string[], int size, string line); //main() copies a log line to the output only when this returns non-zero
void firstPass(void); //run once at the very start of main(), before the log is scanned
int blacklistF(string[], int size, string line); // NOTE(review): not called in this part of the file -- presumably the first-pass counterpart of blacklist(); confirm
//global variables
const unsigned int percent = 30; //minimum compare() score for a log line to count as 'similar' to a stored candidate
const unsigned int numofrep = 50; //a candidate line counted more than this many times is blacklisted and removed
string blacklisted[10000];//final blacklisted lines; filled by main() and passed to blacklist()
// Candidate lines collected by main() while scanning the log. Kept at file
// scope; the array holds ten million std::string objects.
string potentials[10000000]= {""};
// Hit counters that main() maintains alongside potentials; zero-initialised.
int unsigned amounts[10000000] = {0};
int main(){
cout<< "going through first pass with user defined blacklist:\n";
firstPass();
cout<< endl;
//ifstream variables
fstream log; //original log file
//ifstream altlog2; //for reading to see if next line should be killed or not
//ofstream variables
//ofstream altlog1; //loads lines from ifstream into this, but without ALL repeats including the ones that will be kept
fstream altlog;
fstream altlog2;
//other variables
string temp, temp2, potential,line;
unsigned int count1 = 0,count2 = 0, count3 = 0, numoflines = 0, totalBlacklisted = 0, crap = 0;
bool found = 0;
log.open("log1.log");
altlog.open("altlog.log", fstream::in | fstream::out | ios::trunc);
altlog2.open("altlog.log");
//generate blacklist file
//********************************************************************************************
cout<< "generating blacklist, this may anywhere between 1 and 10 minutes\ndepending on the log file...\n";
unsigned int numofpotentials = 0;
unsigned int count4 = 0;
while(!log.eof()){
++numoflines;
//cout<< numoflines << "\n";
getline(log,temp);
//altlog << temp << endl;
//altlog.flush();
count1 = 0;
while(count4 < numofpotentials){
if(compare(temp, potentials[count4])>= percent){
found = true;
++amounts[count1];
}
++count1;
++count4;
}
if(!found){
++numofpotentials;
potentials[numofpotentials] = temp;
++amounts[count1];
}
found = false;
count4 = 0;
if((numoflines % 1000) == 0)
cout<<numoflines/1000<< "\n";
}
unsigned int blacklistnums[10000];
//i've obtained the repeats, now it's time to decide which ones to keep in
//***************************
cout<< "omitting repeats with more than " << numofrep << "\n";
altlog.clear();
altlog.seekg(0);
for(int i = 0; i < numoflines-1; ++i){
//getline(altlog, potential);
if(amounts[i+1] > numofrep){
blacklisted[count2] = potentials[i+1];
blacklistnums[count2] = amounts[i+1];
++count2;
}
}
//now that i've done that, it's time to write the new log file
cout<< "generating new log file\n";
altlog2.clear();
altlog2.seekg(0);
log.clear();
log.seekg(0);
altlog2 << "the following lines have been automatically removed from this log file:\n";
for(int i = 0; i < count2; ++i){
altlog2<< blacklisted[i] << "\n" << blacklistnums[i] << "\n";
}
altlog2 << "\n\n\n\n\n\n\n\n\n\n";
while(getline(log, line)){
if(blacklist(blacklisted, count2, line)){
altlog2 << line << endl;
}
}
altlog.close();
altlog2.close();
log.close();
remove("log1.log");
rename("altlog.log","newlog.log");
return 0;
}
// Scores the similarity of two strings by delegating to bigger(), which is
// always given the longer string first. When both have the same length, b is
// passed first.
int compare(string a, string b){
    if(a.size() > b.size()){
        return bigger(a, b);
    }
    return bigger(b, a);
}
|