> don't you think the complexity is increased by a factor of n, assuming n is the number of patterns?
std::regex_search()
returns one match.
To find all matches, we need to call
std::regex_search()
repeatedly till no more matches are found.
The task can be simplified by using a regex iterator; it iterates over the sequence of all matches.
> Are you sure it would compile since you use iterators instead of const_iterators
text
is a
const std::string
,
text.begin()
returns a const_iterator
With
auto begin = text.begin() ;
the type of
begin
is
std::string::const_iterator
But yes, the code would be clearer if it were written as
auto begin = text.cbegin() ;
Incidentally, just noticed that there is a typo in:
const std::string& text = "a bb aaa cccc bbbbb aaaaaa ccccccc" ;
Should have been:
const std::string text = "a bb aaa cccc bbbbb aaaaaa ccccccc" ;
> i just wonder why the class match_results doesn't supply a member function telling which pattern is matched
> why there is no trivial way to return the index of the matched capture group?
It is trivial to write such a function ourselves:
1 2 3 4 5 6 7 8
|
// Returns the index (counting from 1) of the first capture group that took
// part in the match. Yields std::string::npos when `match` is not ready or
// when none of its capture groups is marked as matched.
std::size_t index_of_matched_subexpression( const std::smatch& match )
{
    if( !match.ready() ) return std::string::npos ;

    // group 0 is the whole match, so the scan starts at group 1
    std::size_t group = 1 ;
    while( group < match.size() && !match[group].matched ) ++group ;

    return group < match.size() ? group : std::string::npos ;
}
|
For instance, using
std::regex_search()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
|
#include <regex>
#include <string>
#include <iostream>
// Offset from the start of `text` at which the overall match held in `match`
// begins.
//
//   text          the string that was searched
//   search_begin  iterator into `text` at which the search producing `match`
//                 was started
//   match         result object filled in by that search
//
// Returns std::string::npos when `match` holds no successful match, i.e. it
// was never passed to a search, or the last search found nothing.
std::string::size_type start_position_of_match( const std::string& text,
                                                std::string::const_iterator search_begin,
                                                const std::smatch& match )
{
    // ready() stays true after an unsuccessful search; empty() is what
    // distinguishes "searched and failed" from "searched and found"
    if( !match.ready() || match.empty() ) return std::string::npos ;

    // match.position() is added to the offset of the search start to obtain
    // an offset into the whole of `text`
    const auto search_offset = search_begin - text.begin() ;
    return static_cast<std::string::size_type>( search_offset + match.position() ) ;
}
// Reports which capture group produced the match: the index (counting from 1)
// of the first sub-expression flagged as matched, or std::string::npos if
// `match` is not ready or no capture group is flagged.
std::size_t index_of_matched_subexpression( const std::smatch& match )
{
    // a result that is not ready is treated as having no groups to inspect
    const std::size_t group_count = match.ready() ? match.size() : 0 ;

    // group 0 is the whole match; capture groups start at 1
    for( std::size_t group = 1 ; group < group_count ; ++group )
    {
        if( match[group].matched ) return group ;
    }
    return std::string::npos ;
}
// Demo: find every match of the alternation in `text` by calling
// std::regex_search() repeatedly, printing each match, its offset in `text`
// and the number of the capture group that produced it.
int main()
{
    const std::regex re( "(a+b)|(b+c)|(c+a)" ) ;
    const std::string text = "..ab..bbc..ccca..bbbbc..aaaaab..cccccca" ;

    // the text, with a column ruler printed beneath it
    std::cout << text << "\n0123456789012345678901234567890123456789\n\n" ;

    std::smatch match ;

    // each search resumes at the end of the previous overall match
    for( auto cursor = text.cbegin() ;
         std::regex_search( cursor, text.end(), match, re ) ;
         cursor = match[0].second )
    {
        const auto offset = start_position_of_match( text, cursor, match ) ;
        const auto group = index_of_matched_subexpression(match) ;

        std::cout << "found \"" << match[0] << "\" starting at " << offset
                  << " matched $" << group << '\n' ;
    }
}
|
Output:
..ab..bbc..ccca..bbbbc..aaaaab..cccccca
0123456789012345678901234567890123456789
found "ab" starting at 2 matched $1
found "bbc" starting at 6 matched $2
found "ccca" starting at 11 matched $3
found "bbbbc" starting at 17 matched $2
found "aaaaab" starting at 24 matched $1
found "cccccca" starting at 32 matched $3 |
With
std::sregex_iterator
, the code would be shorter and sweeter:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
|
#include <regex>
#include <string>
#include <iostream>
// Returns the index (counting from 1) of the first capture group that took
// part in the match `iter` currently refers to.
//
// Returns std::string::npos when `iter` is the end-of-sequence iterator or
// when none of the capture groups of the current match is flagged as matched.
//
// The iterator is taken by const reference so that a call does not copy the
// iterator object.
std::size_t index_of_matched_subexpression( const std::sregex_iterator& iter )
{
    // the end-of-sequence iterator must not be dereferenced
    if( iter == std::sregex_iterator() ) return std::string::npos ;

    const std::smatch& match = *iter ;

    // group 0 is the whole match; capture groups start at 1
    for( std::size_t i = 1 ; i < match.size() ; ++i )
        if( match[i].matched ) return i ;

    return std::string::npos ;
}
// Demo: the same search as a loop over std::sregex_iterator, which visits
// every match in `text` in turn. For each match it prints the matched text,
// its position and the number of the capture group that produced it.
int main()
{
    const std::regex re( "(a+b)|(b+c)|(c+a)" ) ;
    const std::string text = "..ab..bbc..ccca..bbbbc..aaaaab..cccccca" ;

    // the text, with a column ruler printed beneath it
    std::cout << text << "\n0123456789012345678901234567890123456789\n\n" ;

    std::sregex_iterator last ; // default-constructed: the end-of-sequence marker
    std::sregex_iterator current( text.begin(), text.end(), re ) ;

    while( current != last )
    {
        const std::smatch& hit = *current ;
        std::cout << "found \"" << hit.str() << "\" starting at " << hit.position()
                  << " matched $" << index_of_matched_subexpression(current) << '\n' ;
        ++current ;
    }
}
|
Output:
..ab..bbc..ccca..bbbbc..aaaaab..cccccca
0123456789012345678901234567890123456789
found "ab" starting at 2 matched $1
found "bbc" starting at 6 matched $2
found "ccca" starting at 11 matched $3
found "bbbbc" starting at 17 matched $2
found "aaaaab" starting at 24 matched $1
found "cccccca" starting at 32 matched $3 |