// utf8-to-wchar_t.cpp
//
// This program is an example of how to read a UTF-8 encoded file into a
// wchar_t sequence (be it a string or, as in this case, another file).
//
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
using namespace std;
//----------------------------------------------------------------------------
// Here's a little consumer-transformer following the STL design philosophy.
// Notice how, since UTF-8 is bound to specific bit-patterns, our types are
// only generic in what the input and output containers are.
//
// For more on the UTF-8 layout, see
//
// http://en.wikipedia.org/wiki/Utf8
//
// Specifically,
//   0xxxxxxx                            --> 00000000 00000000 0xxxxxxx
//   110yyyyy 10xxxxxx                   --> 00000000 00000yyy yyxxxxxx
//   1110zzzz 10yyyyyy 10xxxxxx          --> 00000000 zzzzyyyy yyxxxxxx
//   11110www 10zzzzzz 10yyyyyy 10xxxxxx --> 000wwwzz zzzzyyyy yyxxxxxx
//
// Notice how the first form is identical to ASCII.
//
// This algorithm does NOT consider whether or not your wchar_t is large
// enough to hold a 21-bit character. (UTF-8 is specified over U+0000 to
// U+10FFFF. Most modern C++ compilers use a 32-bit wchar_t, particularly
// on Linux, but some older ones still have a 16-bit wchar_t, truncating
// the range to U+0000 to U+FFFF.)
//
// Decodes the UTF-8 bytes in [begin, end) and writes one code point per
// decoded character to result.
//
//   begin, end  Input range of bytes. The range is traversed once, front to
//               back, so single-pass input iterators (such as
//               istream_iterator <char>) are sufficient.
//   result      Output iterator; every code point is assigned to it as an
//               unsigned value.
//   returns     The output iterator, advanced once per code point written.
//
// A U+FEFF byte-order mark is dropped when it is the first character decoded.
// (A 0xEF lead byte on its own does not identify a BOM: it starts every
// character in U+F000..U+FFFF, so the decoded value is what gets tested.)
//
// Malformed input is skipped instead of being decoded: stray continuation
// bytes, the lead bytes 0xF8..0xFF, and sequences that are cut short either
// by the end of the input or by a byte that is not a continuation byte.
// Overlong encodings, surrogates and values above U+10FFFF are NOT rejected,
// and no check is made that the destination type can hold 21 bits.
//
template <
  typename InputIterator,
  typename OutputIterator
  >
OutputIterator utf8_to_wchar_t(
  InputIterator  begin,
  InputIterator  end,
  OutputIterator result
  ) {
  bool at_start = true;  // true until the first character has been decoded

  while (begin != end)
  {
    unsigned c = static_cast <unsigned char> (*begin);
    ++begin;

    // Resynchronize after errors: a continuation byte cannot start a sequence
    if ((c & 0xC0) == 0x80) continue;

    // Count the leading 1-bits: that is the number of bytes in the sequence
    // (0 for plain ASCII)...
    int      count = 0;
    unsigned mask  = 0x80;
    for (; c & mask; mask >>= 1) ++count;

    // ...of which UTF-8 allows at most four
    if (count > 4) continue;

    // Strip the high-code-bits from the character value
    c &= mask - 1;

    // Append the payload bits of each continuation byte. The next byte is
    // only consumed once it is known to exist and to be a continuation
    // byte; otherwise it is left in place to start the next sequence.
    bool complete = true;
    for (; count > 1; --count)
    {
      if (begin == end) { complete = false; break; }
      const unsigned next = static_cast <unsigned char> (*begin);
      if ((next & 0xC0) != 0x80) { complete = false; break; }
      c = (c << 6) | (next & 0x3F);
      ++begin;
    }
    if (!complete) continue;

    // Skip the UTF-8 BOM that Windows programs add at the start of a file
    const bool leading_bom = at_start && (c == 0xFEFF);
    at_start = false;
    if (leading_bom) continue;

    // And we store the result in the output container
    *result = c;
    ++result;
  }

  // The usual generic stuff
  return result;
}
//----------------------------------------------------------------------------
// Reports on the standard error stream that `filename` could not be opened
// for `method` (the callers pass "reading" or "writing"), then hands back
// the value 1 so the caller can return it directly as the exit status.
int complain( const char* filename, const char* method )
{
  const int failure_status = 1;

  std::cerr << "I could not open the file \"" << filename << "\" for " << method;
  std::cerr << std::endl;

  return failure_status;
}
//----------------------------------------------------------------------------
// This little type is to help with actual wide streams (since the STL doesn't
// have any -- see widen() and narrow() for all the disappointing details).
//
struct widechar
{
  // Byte orders that can be selected for all widechar values.
  enum endianness_t { big_endian, little_endian };

  // The stored character value.
  unsigned value;

  // Deliberately not explicit, so an unsigned value converts to a widechar
  // implicitly. A default-constructed widechar holds zero.
  widechar( unsigned v = 0 ): value( v ) { }

  // Reads / changes the byte-order setting. The setting is a static member,
  // so it is shared by every widechar rather than stored per object.
  static endianness_t endianness() { return byte_order; }
  static void endianness( endianness_t order ) { byte_order = order; }

private:
  static endianness_t byte_order;
};

// The shared setting starts out as big-endian.
widechar::endianness_t widechar::byte_order = widechar::big_endian;
//............................................................................
// Writes the low 32 bits of wc.value to outs as four separate chars.
// With the little-endian setting the least significant byte goes first;
// otherwise the most significant byte goes first. Returns outs.
std::ostream& operator << ( std::ostream& outs, widechar wc )
{
  const bool low_byte_first = (wc.endianness() == widechar::little_endian);

  for (int position = 0; position < 4; ++position)
  {
    // Bit offset of the byte that belongs at this output position
    const int shift = low_byte_first ? (8 * position) : (8 * (3 - position));
    outs << (char)((wc.value >> shift) & 0xFF);
  }

  return outs;
}
//----------------------------------------------------------------------------
// Program entry point.
//
//   argv[ 1 ]  name of the UTF-8 input file
//   argv[ 2 ]  name of the output file
//
// Pass 1 writes a byte-order mark to the output file followed by every
// character decoded from the input, each as a little-endian widechar.
// Pass 2 re-reads the input line by line, converts each line to a wstring and
// reports on standard output whether (and where) it contains U+00BF.
//
// Returns 1 when fewer than two file names are given or when a file cannot
// be opened, and 0 otherwise.
//
int main( int argc, char** argv )
{
  // If necessary, give the user instructions
  if (argc < 3)
  {
    cout <<
      "Convert a UTF-8 file to a wchar file.\n"
      "usage:\n " << argv[ 0 ] << " UTF8-FILENAME WCHAR-FILENAME\n";
    return 1;
  }

  // Otherwise, convert the named UTF-8 input file to the named wchar_t output.
  //
  // The input is opened and checked BEFORE the output stream is constructed:
  // constructing the ofstream creates/truncates the output file, and that
  // must not happen when there is nothing to convert.
  ifstream inf( argv[ 1 ], ios::binary );
  if (!inf) return complain( argv[ 1 ], "reading" );

  ofstream outf( argv[ 2 ], ios::binary );
  if (!outf) return complain( argv[ 2 ], "writing" );

  inf >> noskipws;  // We want all data (including spaces, newlines, etc).

  // This will help on Win32; the command prompt will display a little-endian
  // stream correctly, but it will display a big-endian stream with some garbage.
  widechar::endianness( widechar::little_endian );
  outf << (widechar)0x0000FEFF;  // byte order mark

  // Here I use a iostream iterator directly, but any appropriate sequence
  // container will do. You can convert std::strings or whatever you like
  // in the usual way.
  //
  utf8_to_wchar_t(
    istream_iterator <char> (inf),
    istream_iterator <char> (),
    ostream_iterator <widechar> (outf)
    );

  outf.close();
  inf .close();

  //..........................................................................
  // Here's an example using a wstring sequence
  //
  // Again, iostream_iterators play havoc with streams, so we just reopen
  // the file to play safe.
  //
  // The first pass ran the stream to end-of-file, which leaves eofbit and
  // failbit set. Only C++11 and later libraries reset them in open(), so
  // clear them explicitly, and make sure the reopen actually worked.
  inf.clear();
  inf.open( argv[ 1 ], ios::binary );
  if (!inf) return complain( argv[ 1 ], "reading" );
  inf >> noskipws;

  // For each line of text...
  string   line;
  unsigned line_number = 1;
  while (getline( inf, line ))
  {
    // ...First convert it to a wstring
    wstring wline;
    utf8_to_wchar_t(
      line.begin(),
      line.end(),
      back_insert_iterator <wstring> (wline)
      );

    // Then see if it has the Spanish leading-question mark (U+00BF) in it
    wstring::size_type index = wline.find( (wchar_t)0xBF );

    cout << "line " << line_number << ": ";
    if (index == wstring::npos)
      cout << "the upside-down question-mark does not appear in this line.\n";
    else
      // The reported position is 1-based
      cout << "the upside-down question-mark is at index " << (index + 1) << "\n";

    ++line_number;
  }

  inf.close();
  return 0;
}
// end utf8-to-wchar_t.cpp