1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
//Loop through all fna files in the designated target directory and count kmers. If a kmer OR its reverse
//complement is present in the refseq it's ignored. All others are added to an unordered map.
//Later we will loop back through the unordered map and eliminate redundant k-mers.
map<uint64_t, boost::dynamic_bitset<> > parse_target_genomes( const vector<string>& file_list,
const unordered_set<uint64_t>& refseq_set ) {
const int num_genomes = file_list.size();
map<uint64_t, boost::dynamic_bitset<> > kmer_directory;
int position_in_bitstring = num_genomes - 1;
string last_kmer(kmer_length, ' ');
string current_kmer_string(kmer_length, ' ');
uint64_t current_kmer_bitset;
uint64_t reverse_comp;
bool failure_code = false;
char current_char;
int last_empty_kmer_position = 0;
ifstream current_fna;
for(auto itr = file_list.begin(); itr!=file_list.end(); itr++) {
cout << (*itr) << " in progress..." << endl;
current_fna.open( (*itr) );
while( current_fna.get(current_char) ){
//If current char is '>', we're reading a header. Reset the current_kmer_string to whitespace (to discard
//the last-read kmer) and ignore all characters up until the next '\n'.
if( current_char == '>' ){
current_fna.ignore(5000, '\n');
current_kmer_string.assign(kmer_length, ' ');
last_empty_kmer_position = 0;
}
//If current char is '\n', we should skip it.
//If current char is anything else, we should read it into our kmer.
else if(current_char != '\n') {
//If there is any whitespace left in current_kmer_string, we don't have a full kmer yet. Add the current_char to the kmer.
if(last_empty_kmer_position < kmer_length ){
current_kmer_string[last_empty_kmer_position] = current_char;
last_empty_kmer_position += 1;
}
//Otherwise, we have an existing kmer. Convert it to the new kmer by combining the last kmer_length - 1 chars with current_char.
else{
last_kmer = current_kmer_string;
current_kmer_string = last_kmer.substr(1, kmer_length - 1) + current_char;
}
//Check again whether we now have a full kmer and if so translate it to a binary representation.
if(last_empty_kmer_position == kmer_length ){
current_kmer_bitset = kmer_to_bitset(current_kmer_string, failure_code);
//If we got a failure code when trying to convert the kmer, it probably contained an n.
//So just skip it. Any kmer containing an 'n' value is not useful to us. That's just a place in
//the sequence where information is missing.
if(failure_code == true)
failure_code = false;
//Otherwise, check whether kmer or its reverse complement are in the refeseq set. If in
//refseq set, ignore, otherwise...
else{
reverse_comp = kmer_to_reverse_complement(current_kmer_string, failure_code);
//There is no situation where we should get a failure code on the reverse complement that we did NOT
//get on the forward string, but just in case...
failure_code = false;
if (refseq_set.count(current_kmer_bitset) == 0 &&
refseq_set.count(reverse_comp) == 0 ){
//Ok, so this kmer is NOT in the refseq. But is it in the map we're building? Need to check
//reverse complement as well of course. If neither is in map, add just the kmer (not its reverse
//complement, do not need to store separately) plus a bitstring to indicate which genomes it is found
//in (we do not count # occurrences, simply presence or absence).
if (kmer_directory.count(current_kmer_bitset) != 0 )
kmer_directory[current_kmer_bitset][position_in_bitstring] = 1;
else if (kmer_directory.count(reverse_comp) != 0 )
kmer_directory[reverse_comp][position_in_bitstring] = 1;
else{
kmer_directory[current_kmer_bitset] = boost::dynamic_bitset<> (num_genomes, 0); //Problem line here!!! <-----------
kmer_directory[current_kmer_bitset][position_in_bitstring] = 1;
}
}
}
}
}
}
current_fna.close();
//Update the bitstring position so we increment the correct position of the bitstring next time.
position_in_bitstring--;
}
return kmer_directory;
}
|