// Helpers for encoding Unicode code points as UTF-8 style byte sequences.
#pragma once
#ifndef DUTHOMHAS_UTF_SEQUENCE_HPP_SNIPPIT
#define DUTHOMHAS_UTF_SEQUENCE_HPP_SNIPPIT
#include <ciso646>
#include <iterator>
#include <stdint.h>
//------------------------------------------------------------------------
// Unsigned 32-bit integer type used to carry a single code point value.
typedef uint32_t uchar;
//------------------------------------------------------------------------
// Special code point values used when processing Unicode.
//
enum
{
UREPLACEMENT_CHAR = 0x00FFFD, // U+FFFD: substituted for values that cannot be encoded
UMAX_BMP_CHAR = 0x00FFFF, // U+FFFF: largest value representable in 16 bits
UMAX_CHAR = 0x10FFFF // U+10FFFF: largest value the encoders accept as-is
};
//------------------------------------------------------------------------
// Encode one code point as a UTF-8 style byte sequence WITHOUT rejecting
// surrogates.
//
// This is not a Unicode-conformant UTF-8 encoder because it permits the
// encoding of high-surrogate and low-surrogate code points. It is
// otherwise identical to UTF-8.
//
// NOTE: despite the name, values above UMAX_BMP_CHAR are written as a
// single four-byte sequence, not as the pair of three-byte surrogate
// sequences that CESU-8 (Unicode Technical Report #26) specifies.
//
// Parameters:
//   iter   Output iterator that receives the encoded bytes; one to four
//          values are assigned through it.
//   value  Code point to encode. Anything greater than UMAX_CHAR is
//          replaced with UREPLACEMENT_CHAR before encoding.
//
// Returns the iterator advanced past the last byte written.
//
template <typename OutputByteIterator>
OutputByteIterator
encode_cesu8( OutputByteIterator iter, uchar value )
{
  // Lead-byte parameters, indexed by (sequence length - 1). The tables are
  // read-only, so they are const rather than mutable function statics.
  static const uint8_t mask [ 4 ] = { 0x7F, 0x1F, 0x0F, 0x07 };  // payload bits kept in the lead byte
  static const uint8_t mark [ 4 ] = { 0x00, 0xC0, 0xE0, 0xF0 };  // length marker OR-ed into the lead byte
  static const uint8_t shift[ 4 ] = {    0,    6,   12,   18 };  // position of the lead byte's payload

  // Out-of-range input cannot be represented in four bytes; substitute.
  if (value > UMAX_CHAR)
  {
    value = UREPLACEMENT_CHAR;
  }

  // count == number of continuation bytes == bytes to write - 1
  const int count = (value < 0x80)    ? 0
                  : (value < 0x800)   ? 1
                  : (value < 0x10000) ? 2
                  :                     3;

  // Lead byte: the top payload bits plus the length marker.
  *iter++ = ((value >> shift[ count ]) & mask[ count ]) | mark[ count ];

  // Continuation bytes, six payload bits each, most significant group
  // first. Each case deliberately falls through to emit the lower groups.
  switch (count)
  {
    case 3: *iter++ = ((value >> 12) & 0x3F) | 0x80;  // fall through
    case 2: *iter++ = ((value >>  6) & 0x3F) | 0x80;  // fall through
    case 1: *iter++ = ( value        & 0x3F) | 0x80;
            break;
    default: break;  // count == 0: single-byte sequence, nothing more to write
  }

  return iter;
}
//------------------------------------------------------------------------
// Encode one code point as a UTF-8 byte sequence.
//
// Any value for which is_unicode() reports false is replaced with
// UREPLACEMENT_CHAR; the byte-level encoding itself is delegated to
// encode_cesu8().
// NOTE(review): is_unicode() is declared outside this file; presumably it
// rejects surrogates and out-of-range values -- confirm against its
// definition.
//
// Returns the iterator advanced past the last byte written.
//
template <typename OutputByteIterator>
inline
OutputByteIterator
encode_utf8( OutputByteIterator iter, uchar value )
{
  if (!is_unicode( value ))
  {
    value = UREPLACEMENT_CHAR;
  }
  return encode_cesu8( iter, value );
}
#endif
// End of include guard DUTHOMHAS_UTF_SEQUENCE_HPP_SNIPPIT.