// utf8_iterator: decodes a UTF-8 encoded byte range one code point at a time.
#include <ciso646>
/// Read-only iterator adaptor that walks a byte range and yields one decoded
/// code point per step.
///
/// Accepts UTF-8 as well as Modified UTF-8 and CESU-8: overlong forms (such as
/// C0 80 for U+0000) and surrogate code points encoded as three bytes are
/// decoded as-is rather than rejected. Anything that cannot be decoded yields
/// UREPLACEMENT_CHAR.
///
/// @tparam Iterator  Iterator over the encoded bytes. It is copied and the copy
///                   is advanced inside operator*, so the same position must be
///                   readable more than once. The value type must be
///                   convertible to unsigned char.
template <typename Iterator>
struct utf8_iterator
{
  /// Code point returned whenever the input cannot be decoded.
  enum { UREPLACEMENT_CHAR = 0xFFFD };

  /// @param begin  Position of the first byte of the current character.
  /// @param end    One past the last byte of the underlying sequence; decoding
  ///               and advancing never read at or beyond this position.
  utf8_iterator( Iterator begin, Iterator end ) : begin(begin), end(end) { }

  // Copy construction/assignment are the implicitly generated member-wise ones.

  /// Decodes the character starting at the current position without moving.
  ///
  /// @return The decoded code point, or UREPLACEMENT_CHAR when the iterator is
  ///         at the end, the lead byte is not a valid lead byte, the sequence
  ///         is truncated, a trailing byte is not a continuation byte, or the
  ///         decoded value is rejected by validate().
  char32_t operator * () const
  {
    // Sequence length indexed by the top five bits of the lead byte.
    // 0 marks bytes that cannot start a sequence (10xxxxxx and 11111xxx).
    static const unsigned char nbytes[] =
    {
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
    };
    // Payload bits carried by the lead byte, indexed by sequence length.
    static const unsigned char masks[] = { 0, 0x7F, 0x1F, 0x0F, 0x07 };

    Iterator s = begin;
    if (s == end) return UREPLACEMENT_CHAR;

    const unsigned char lead = static_cast<unsigned char>( *s );
    unsigned char n = nbytes[ lead >> 3 ];
    if (!n) return UREPLACEMENT_CHAR;

    char32_t c = lead & masks[ n ];
    ++s;
    while (--n && (s != end))
    {
      const unsigned char b = static_cast<unsigned char>( *s );
      // A trailing byte must look like 10xxxxxx; otherwise the sequence is
      // malformed and the offending byte belongs to the next character.
      if (!is_continuation( b )) return UREPLACEMENT_CHAR;
      c = (c << 6) | (b & 0x3F);
      ++s;
    }
    // n is non-zero here only when the input ended in the middle of a sequence.
    return n ? UREPLACEMENT_CHAR : validate( c );
  }

  /// Advances to the start of the next character: steps over the current byte
  /// and then over any continuation bytes (10xxxxxx) that follow it. Lead bytes
  /// of subsequent characters are never skipped. Does nothing at the end.
  utf8_iterator& operator ++ ()
  {
    if (begin != end) ++begin;
    while ((begin != end) && is_continuation( static_cast<unsigned char>( *begin ) )) ++begin;
    return *this;
  }

  /// Post-increment: advances and returns a copy of the previous position.
  utf8_iterator operator ++ (int)
  {
    utf8_iterator result( begin, end );
    operator ++ ();
    return result;
  }

  /// Two iterators compare equal when they refer to the same byte position;
  /// the stored end positions are not compared.
  bool operator == ( const utf8_iterator& that ) const { return this->begin == that.begin; }
  bool operator != ( const utf8_iterator& that ) const { return this->begin != that.begin; }

private:
  Iterator begin, end;

  /// True for bytes of the form 10xxxxxx.
  static bool is_continuation( unsigned char b )
  {
    return (b & 0xC0) == 0x80;
  }

  /// Maps values whose low 16 bits are FFFE or FFFF, and values above
  /// 0x10FFFF, to UREPLACEMENT_CHAR; every other value passes through.
  char32_t validate( char32_t c ) const
  {
    return (((c & 0xFFFF) > 0xFFFD) || (c > 0x10FFFF)) ? UREPLACEMENT_CHAR : c;
  }
};
|