wchar_t and reading unicode strings

Forum

Forum
General C++ Programming
wchar_t and reading unicode strings

wchar_t and reading unicode strings

Hello.
With this code by helios(thank you) all in UTF8 are ok.
But now i must use ranges from U+10000 to U+17FFF
There is an update code?
Thanks very much.

Jim

/*

#define BOM8A 0xEF
#define BOM8B 0xBB
#define BOM8C 0xBF 
	

* Copyright (c) 2009, Helios (helios.vmg@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*     * Redistributions of source code must retain the above copyright notice,
*       this list of conditions and the following disclaimer.
*     * Redistributions in binary form must reproduce the above copyright
*       notice, this list of conditions and the following disclaimer in the
*       documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY HELIOS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL HELIOS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/

typedef unsigned char uchar;

/*
string: a UTF-8-encoded C string (nul terminated)
Return value: a wchar_t C string.

The function handles memory allocation on its own.

Limitations: Only handles the range [U+0000;U+FFFF], higher code points are
changed to '?'.

Assumptions: sizeof(wchar_t)>=2
*/
wchar_t *UTF8_to_WChar(const char *string){
	long b=0,
		c=0;
	if ((uchar)string[0]==BOM8A && (uchar)string[1]==BOM8B && (uchar)string[2]==BOM8C)
		string+=3;
	for (const char *a=string;*a;a++)
		if (((uchar)*a)<128 || (*a&192)==192)
			c++;
	wchar_t *res=new wchar_t[c+1];
	res[c]=0;
	for (uchar *a=(uchar*)string;*a;a++){
		if (!(*a&128))
			//Byte represents an ASCII character. Direct copy will do.
			res[b]=*a;
		else if ((*a&192)==128)
			//Byte is the middle of an encoded character. Ignore.
			continue;
		else if ((*a&224)==192)
			//Byte represents the start of an encoded character in the range
			//U+0080 to U+07FF
			res[b]=((*a&31)<<6)|a[1]&63;
		else if ((*a&240)==224)
			//Byte represents the start of an encoded character in the range
			//U+07FF to U+FFFF
			res[b]=((*a&15)<<12)|((a[1]&63)<<6)|a[2]&63;
		else if ((*a&248)==240){
			//Byte represents the start of an encoded character beyond the
			//U+FFFF limit of 16-bit integers
			res[b]='?';
		}
		b++;
	}
	return res;
}

//Do not call me.
long getUTF8size(const wchar_t *string){
	if (!string)
		return 0;
	long res=0;
	for (;*string;string++){
		if (*string<0x80)
			res++;
		else if (*string<0x800)
			res+=2;
		else
			res+=3;
	}
	return res;
}

/*
string: a wchar_t C string (nul terminated)
Return value: a UTF-8-encoded C string.

The function handles memory allocation on its own.

Limitations: Only handles the range [U+0000;U+FFFF], higher code points are
changed to '?'.

Assumptions: sizeof(wchar_t)>=2
*/
char *WChar_to_UTF8(const wchar_t *string){
	long fSize=getUTF8size(string);
	char *res=new char[fSize+1];
	res[fSize]=0;
	if (!string)
		return res;
	long b=0;
	for (;*string;string++,b++){
		if (*string<0x80)
			res[b]=(char)*string;
		else if (*string<0x800){
			res[b++]=(*string>>6)|192;
			res[b]=*string&63|128;
		}else{
			res[b++]=(*string>>12)|224;
			res[b++]=((*string&4095)>>6)|128;
			res[b]=*string&63|128;
		}
	}
	return res;
}

helios (17607)

(Thanks to LB for pointing out this thread to me.)

There's a complete rewrite that uses an entirely different algorithm:
https://github.com/Helios-vmg/CopperRat/blob/master/src/jni/src/CommonFunctions.h
(Lines 104-200)
https://github.com/Helios-vmg/CopperRat/blob/master/src/jni/src/CommonFunctions.cpp
(Lines 70-82)

Notes:
* I believe this works for the entire 32-bit range.
* I haven't done much (any?) testing outside the BMP.
* The code is written for maximum speed, not adherence to the Unicode standard. This means it accepts and decodes some UTF-8 sequences that more proper implementations reject. You should not use it if that matters to you.

dkaip (196)

Great man. Thanks.
Jim

Last edited on

dkaip (196)

Good Morning.
I made a project in which I use the routines and because it is difficult to make changes to the new codes most interest is to alter existing there that interests me.
How about the increase of the code? It worked?
Thanks again.
Jim


long getUTF8size(const wchar_t *string){
	if (!string)
		return 0;
	long res=0;
	for (;*string;string++){
		if (*string<0x80)
			res++;
		else if (*string<0x800)
			res+=2;
		else if (*string<0x10000)
			res+=3;
		else
		res+=4;
	}
	return res;
}

char *WChar_to_UTF8(const wchar_t *string){
	long fSize=getUTF8size(string);
	char *res=new char[fSize+1];
	res[fSize]=0;
	if (!string)
		return res;
	long b=0;
	for (;*string;string++,b++){
		if (*string<0x80)
			res[b]=(char)*string;
		else if (*string<0x800){
			res[b++]=(*string>>6)|192;
			res[b]=*string&63|128;
		}else if (*string<0x10000){
			res[b++]=(*string>>12)|224;
			res[b++]=((*string&4095)>>6)|128;
			res[b]=*string&63|128;
		}else{
			res[b++]=(*string>>18)|240;
			res[b++]=((*string&63)>>12)|128;
			res[b++]=((*string&63)>>6)|128;
			res[b]=*string&63|128;
		}
	}
	return res;
}

Last edited on

helios (17607)

Why do you need to make changes to the code?

dkaip (196)

I have codepoints up to U+FFFF.
I must write those codepoints to utf8 file.

Last edited on

helios (17607)

You don't need to modify anything for that, both the old and the new versions support code points below 1 << 16.

dkaip (196)

Thanks. I must look on code from int to wchar_t, if i have error ...
Jim.

Last edited on

modoran (2077)

Windows already have MultiByteToWideChar and WideCharToMultiByte whih does conversion from/to UTF8 and UTF16. Why do you need to write your own algorithm ?

http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072%28v=vs.85%29.aspx
http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx

dkaip (196)

Hello, i just open forum.
I use Code::Blocks and never visual studio. I want a function for all OS.
I have made a function from int to wchar_t, i just find it.
For ranges until U+FFFF works fine, but i must check for up range.

wstring giveWStringFromASCII(size_t i)
{
    wchar_t character[]= {i,0};
    return wstring(character);
}

Last edited on

dkaip (196)

I just check giveWStringFromASCII. Until U+FFFF gives right result, but from upper no.
Any idea? Maybe i have wrong?
I made a font with 2 glypts in 65000 and 70000 codepoint places.
Then with code make a file. Opening, char at 65000 looks ok, but in 70000 gives two chars.

ofstream outfile ("log",std::ofstream::binary);
        wstring s1=giveWStringFromASCII(65000);
        wstring s2=giveWStringFromASCII(70000);
        outfile<<WChar_to_UTF8(s1.c_str());
        outfile<<WChar_to_UTF8(s2.c_str());
    outfile.close();

Last edited on

helios (17607)

The size of wchar_t is 16 bits on Windows. That's why the second version of my functions read from and write to std::basic_string<T> rather than std::wstring, so you can pass std::basic_string<unsigned>.

giveWStringFromASCII() is very bad code, by the way.

dkaip (196)

Hello, new code ...
old giveWStringFromASCII dont works for chars upper U+FFFF and you have right is very bad ...
New one works fine ...

void StringToWString(std::wstring &ws, const std::string &s)
{
    std::wstring wsTmp(s.begin(), s.end());
    ws = wsTmp;
}

wstring StringToWString(const std::string &s)
{
    std::wstring wsTmp(s.begin(), s.end());
    return wsTmp;
}


std::string UnicodeToUTF8(unsigned int codepoint)
{
    std::string out;

    if (codepoint <= 0x7f)
        out.append(1, static_cast<char>(codepoint));
    else if (codepoint <= 0x7ff)
    {
        out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else if (codepoint <= 0xffff)
    {
        out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else
    {
        out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    return out;
}

wstring giveWStringFromASCII(size_t i)
{
return StringToWString(UnicodeToUTF8(i));
}



int main()
{
    ofstream outfile ("log",std::ofstream::binary);
    string s1=UnicodeToUTF8(65000);
    string s2=UnicodeToUTF8(70000);
    wstring s3,s4;
    StringToWString(s3,s1);
    StringToWString(s4,s2);
    outfile<<s1.c_str()<<endl;
    outfile<<s2.c_str()<<endl;
    outfile<<"-----------------"<<endl;
    outfile<<WChar_to_UTF8(s3.c_str())<<endl;
    outfile<<WChar_to_UTF8(s4.c_str())<<endl;
    outfile<<"-----------------"<<endl;
    wstring s5=giveWStringFromASCII(65000);
    wstring s6=giveWStringFromASCII(70000);
    outfile<<WChar_to_UTF8(s5.c_str())<<endl;
    outfile<<WChar_to_UTF8(s6.c_str())<<endl;
    outfile.close();
    return 0;
}

Last edited on

JLBorges (13770)

#include <string>
#include <codecvt>
#include <locale>
#include <iostream>

std::u16string to_utf16( std::string str ) // utf-8 to utf16
{ return std::wstring_convert< std::codecvt_utf8_utf16<char16_t>, char16_t >{}.from_bytes(str); }

std::string to_utf8( std::u16string str16 )
{ return std::wstring_convert< std::codecvt_utf8_utf16<char16_t>, char16_t >{}.to_bytes(str16); }

std::u32string to_utf32( std::string str )
{ return std::wstring_convert< std::codecvt_utf8<char32_t>, char32_t >{}.from_bytes(str); }

std::string to_utf8( std::u32string str32 )
{ return std::wstring_convert< std::codecvt_utf8<char32_t>, char32_t >{}.to_bytes(str32); }

std::wstring to_wchar_t( std::string str )
{ return std::wstring_convert< std::codecvt_utf8<wchar_t>, wchar_t >{}.from_bytes(str); }

std::string to_utf8( std::wstring wstr )
{ return std::wstring_convert< std::codecvt_utf8<wchar_t>, wchar_t >{}.to_bytes(wstr); }

int main()
{
    const std::string utf8 = "hello world!\n" ;

    const std::u16string utf16 = to_utf16( utf8 ) ;
    std::cout << to_utf8(utf16) ;

    const std::u32string utf32 = to_utf32( utf8 ) ;
    std::cout << to_utf8(utf32) ;
    
    const std::wstring wstr = to_wchar_t(utf8) ;
    std::wcout << wstr ;
    std::cout << to_utf8(wstr) ;
}

http://coliru.stacked-crooked.com/a/c6100c660bb4039c
http://rextester.com/SNNS17135

Note: does not work with the GNU library (it does not have the standard header <codecvt>)

helios (17607)

wstring giveWStringFromASCII(size_t i)
{
return StringToWString(UnicodeToUTF8(i));
}

WTF could you ever want that for?

Topic archived. No new replies allowed.

C++

Forum

wchar_t and reading unicode strings