1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
|
// --------------TOKENIZE PHRASES INTO MATRICES--------------
// SYNTAX FOR FILE (each line): word,word,...>word,word,word,...
// Re-open the chosen file so parsing restarts from its first line.
selected_dir_stream.close();
selected_dir_stream.open(dir_chosen);
// Parsing scratch. Stack-allocated std::string objects: the original code
// heap-allocated these with raw `new` and never freed them (memory leak).
std::string container[2];   // container[0] = source half, container[1] = target half
std::string line;           // one raw line from the file
std::string token;          // one comma-separated synonym
bool bad_segment = false;
unsigned bad_segments = 0, units_accepted = 0;
for (size_t i = 0; i < number_of_lines; i++) {
    bad_segment = false;
    for (int y = 0; y < 2; y++)
        container[y].clear();
    // Read the next line and split it on the '>' separating the two languages.
    std::getline(selected_dir_stream, line);
    std::istringstream language_split(line);
    for (int z = 0; z < 2; z++) {
        // clear() resets eofbit left by the previous extraction so the
        // failbit test below reflects only this extraction's outcome.
        language_split.clear();
        std::getline(language_split, container[z], '>');
        // A line with no '>' yields an empty second extraction -> failbit.
        // Improperly formatted lines are counted and omitted.
        if (language_split.fail()) {
            bad_segment = true;
            bad_segments++;
            break;
        }
    }
    // Fewer vocabulary units this way, but all of them are valid.
    if (!bad_segment) {
        // One nested vector per accepted unit, in both matrices.
        source_words.push_back(std::vector<std::string>());
        target_words.push_back(std::vector<std::string>());
        // First-language synonyms, comma-separated.
        std::istringstream synonym_source_split(container[0]);
        while (std::getline(synonym_source_split, token, ','))
            source_words.at(units_accepted).push_back(token);
        // Second-language synonyms, comma-separated.
        std::istringstream synonym_target_split(container[1]);
        while (std::getline(synonym_target_split, token, ','))
            target_words.at(units_accepted).push_back(token);
        units_accepted++;
    }
}
|