Conversion between Unicode UTF-16 and UTF-8 in C++/Win32

For updated and richer information, including modern C++ usage, please read my MSDN Magazine article (published in the September 2016 issue):

Unicode Encoding Conversions with STL Strings and Win32 APIs

Updated modern C++ code can be found here on GitHub.


Check out My Pluralsight Courses here.


Code working with ATL’s CStringW/A classes and throwing exceptions via AtlThrow() can be found here on GitHub. For convenience, the core part of that code is copied below:

//////////////////////////////////////////////////////////////////////////////
//
// *** Functions to convert between Unicode UTF-8 and Unicode UTF-16 ***
//                      using ATL CStringA/W classes
//
// By Giovanni Dicanio 
//
//////////////////////////////////////////////////////////////////////////////


//----------------------------------------------------------------------------
// FUNCTION: Utf8ToUtf16
// DESC:     Converts Unicode UTF-8 text to Unicode UTF-16 (Windows default).
//----------------------------------------------------------------------------
CStringW Utf8ToUtf16(const CStringA& utf8)
{
    // Special case of empty input string
    if (utf8.IsEmpty())
    {
        // Return empty string
        return CStringW();
    }


    // "Code page" value used with MultiByteToWideChar() for UTF-8 conversion 
    const UINT codePageUtf8 = CP_UTF8;

    // Safely fails if an invalid UTF-8 character is encountered
    const DWORD flags = MB_ERR_INVALID_CHARS;

    // Get the length, in WCHARs, of the resulting UTF-16 string
    const int utf16Length = ::MultiByteToWideChar(
            codePageUtf8,       // source string is in UTF-8
            flags,              // conversion flags
            utf8.GetString(),   // source UTF-8 string
            utf8.GetLength(),   // length of source UTF-8 string, in chars
            nullptr,            // unused - no conversion done in this step
            0);                 // request size of destination buffer, in WCHARs
    if (utf16Length == 0)
    {
        // Conversion error
        AtlThrowLastWin32();
    }


    // Allocate destination buffer to store the resulting UTF-16 string
    CStringW utf16;
    WCHAR* const utf16Buffer = utf16.GetBuffer(utf16Length);
    ATLASSERT(utf16Buffer != nullptr);


    // Do the conversion from UTF-8 to UTF-16
    int result = ::MultiByteToWideChar(
            codePageUtf8,       // source string is in UTF-8
            flags,              // conversion flags
            utf8.GetString(),   // source UTF-8 string
            utf8.GetLength(),   // length of source UTF-8 string, in chars
            utf16Buffer,        // pointer to destination buffer
            utf16Length);       // size of destination buffer, in WCHARs  
    if (result == 0)
    {
        // Conversion error
        AtlThrowLastWin32();
    }

    // Don't forget to release internal CString buffer 
    // before returning the string to the caller
    utf16.ReleaseBufferSetLength(utf16Length);

    // Return resulting UTF-16 string
    return utf16;
}



//----------------------------------------------------------------------------
// FUNCTION: Utf16ToUtf8
// DESC:     Converts Unicode UTF-16 (Windows default) text to Unicode UTF-8.
//----------------------------------------------------------------------------
CStringA Utf16ToUtf8(const CStringW& utf16)
{
    // Special case of empty input string
    if (utf16.IsEmpty())
    {
        // Return empty string
        return CStringA();
    }


    // "Code page" value used with WideCharToMultiByte() for UTF-8 conversion 
    const UINT codePageUtf8 = CP_UTF8;

    // Safely fails if an invalid UTF-16 character is encountered
    const DWORD flags = WC_ERR_INVALID_CHARS;

    // Get the length, in chars, of the resulting UTF-8 string
    const int utf8Length = ::WideCharToMultiByte(
            codePageUtf8,       // convert to UTF-8
            flags,              // conversion flags
            utf16.GetString(),  // source UTF-16 string
            utf16.GetLength(),  // length of source UTF-16 string, in WCHARs
            nullptr,            // unused - no conversion required in this step
            0,                  // request size of destination buffer, in chars
            nullptr, nullptr);  // unused
    if (utf8Length == 0)
    {
        // Conversion error
        AtlThrowLastWin32();
    }


    // Allocate destination buffer to store the resulting UTF-8 string
    CStringA utf8;
    char* const utf8Buffer = utf8.GetBuffer(utf8Length);
    ATLASSERT(utf8Buffer != nullptr);


    // Do the conversion from UTF-16 to UTF-8
    int result = ::WideCharToMultiByte(
            codePageUtf8,       // convert to UTF-8
            flags,              // conversion flags
            utf16.GetString(),  // source UTF-16 string
            utf16.GetLength(),  // length of source UTF-16 string, in WCHARs
            utf8Buffer,         // pointer to destination buffer
            utf8Length,         // size of destination buffer, in chars
            nullptr, nullptr);  // unused
    if (result == 0)
    {
        // Conversion error
        AtlThrowLastWin32();
    }


    // Don't forget to release internal CString buffer 
    // before returning the string to the caller
    utf8.ReleaseBufferSetLength(utf8Length);

    // Return resulting UTF-8 string
    return utf8;
}
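

For completeness, here is a minimal usage sketch of the two functions above (this snippet is mine, not part of the GitHub code). With the default ATL settings, AtlThrowLastWin32() throws a CAtlException carrying an HRESULT built from GetLastError(), so that is what the catch handler expects:

// Minimal usage sketch (assumes the default AtlThrow behavior, i.e. exceptions
// of type CAtlException, and that <atlstr.h> is included).
#include <atlstr.h>

void ConversionDemo()
{
    try
    {
        // UTF-8 encoded text ("konnichiwa" in Japanese) stored in a CStringA
        const CStringA utf8Text("\xE3\x81\x93\xE3\x82\x93\xE3\x81\xAB\xE3\x81\xA1\xE3\x81\xAF");

        // Round-trip: UTF-8 -> UTF-16 -> UTF-8
        const CStringW utf16Text = Utf8ToUtf16(utf8Text);
        const CStringA roundTrip = Utf16ToUtf8(utf16Text);

        ATLASSERT(roundTrip == utf8Text);
    }
    catch (const CAtlException& e)
    {
        // CAtlException converts implicitly to the underlying HRESULT
        const HRESULT hr = e;
        (void)hr; // report or handle the conversion error as appropriate
    }
}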

 

6 thoughts on “Conversion between Unicode UTF-16 and UTF-8 in C++/Win32”

  1. I am just wondering how the buffer allocation using CStringA|W works. It allocates a buffer using a number of characters only. How does this work, since the number of bytes depends on exactly which characters are present for variable-width encodings such as UTF-8 and UTF-16? (Unless what MultiByteToWideChar returns isn’t strictly the number of characters?)

  2. @Ben, MultiByteToWideChar tells you how large a buffer of WCHAR values is needed to store the output string (on the first invocation).

    When it says “characters” it really means “code-units” (*not* code-points), in this case WCHAR objects (each of which is 16 bits since we have UTF-16). A small sketch illustrating this appears after the comments below.

  3. Very good, but a simplified function based on this that works follows:

    char* _UTF16ToUTF8( nunichar * pszTextUTF16 ){
        if ( (pszTextUTF16 == NULL) || (*pszTextUTF16 == L'\0') ) {
            return 0;
        }
        int cchUTF16;
        cchUTF16 = n_strlen( pszTextUTF16 ) + 1;
        int cbUTF8 = WideCharToMultiByte(CP_UTF8, 0, pszTextUTF16, cchUTF16, NULL, 0 /* request buffer size */, NULL, NULL);
        ASSERT2(cbUTF8);
        char *strUTF8 = new char[cbUTF8], *pszUTF8 = strUTF8;
        int result = WideCharToMultiByte(CP_UTF8, 0, pszTextUTF16, cchUTF16, pszUTF8, cbUTF8, NULL, NULL);
        ASSERT2(result);
        return strUTF8;
    }

  4. This is the correct function (sorry):

    char* _UTF16ToUTF8( wchar_t * pszTextUTF16 ){
        if ( (pszTextUTF16 == NULL) || (*pszTextUTF16 == L'\0') ) {
            return 0;
        }
        int cchUTF16;
        cchUTF16 = wcslen( pszTextUTF16 ) + 1;
        int cbUTF8 = WideCharToMultiByte(CP_UTF8, 0, pszTextUTF16, cchUTF16, NULL, 0 /* request buffer size */, NULL, NULL);
        ASSERT(cbUTF8);
        char *strUTF8 = new char[cbUTF8], *pszUTF8 = strUTF8;
        int result = WideCharToMultiByte(CP_UTF8, 0, pszTextUTF16, cchUTF16, pszUTF8, cbUTF8, NULL, NULL);
        ASSERT(result);
        return strUTF8;
    }

  5. @Nei Amaral F.
    I’d add const correctness to your function, using “const” for “pszTextUTF16”.

    Moreover, I’d prefer using a string class for return value instead of a raw char* pointer (which the caller must manually free with delete[], and is a potential source for memory leaks).
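

Following up on the suggestion in the last comment, here is a minimal sketch (mine, not from the thread; the name Utf16ToUtf8StdString is made up for illustration) of a const-correct variant that returns a std::string instead of a raw owning char* pointer. Error handling is simplified to std::runtime_error rather than AtlThrow():

#include <Windows.h>
#include <stdexcept>
#include <string>

// Sketch: const-correct input pointer, std::string return value
// (no manual delete[] required by the caller).
std::string Utf16ToUtf8StdString(const wchar_t* pszTextUtf16)
{
    if (pszTextUtf16 == nullptr || *pszTextUtf16 == L'\0')
    {
        return std::string();
    }

    // First call: query the required destination size, in bytes.
    // Passing -1 as the source length processes the whole NUL-terminated
    // string, so the returned size includes the terminating NUL.
    const int utf8Length = ::WideCharToMultiByte(
        CP_UTF8, WC_ERR_INVALID_CHARS,
        pszTextUtf16, -1,
        nullptr, 0,
        nullptr, nullptr);
    if (utf8Length == 0)
    {
        throw std::runtime_error("WideCharToMultiByte failed (size query)");
    }

    // Second call: convert into the string's own buffer.
    std::string utf8(utf8Length, '\0');
    const int result = ::WideCharToMultiByte(
        CP_UTF8, WC_ERR_INVALID_CHARS,
        pszTextUtf16, -1,
        &utf8[0], utf8Length,
        nullptr, nullptr);
    if (result == 0)
    {
        throw std::runtime_error("WideCharToMultiByte failed (conversion)");
    }

    // Drop the embedded terminating NUL produced by the -1 source length.
    utf8.resize(utf8Length - 1);
    return utf8;
}

Returning the string by value avoids the memory-leak risk mentioned above, since the caller never owns a raw buffer.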
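

To make the code-unit point from the first two comments concrete, here is a small sketch (again mine, not from the original post): U+1F600, the grinning face emoji, is a single code point, but it takes two WCHAR code units in UTF-16 (a surrogate pair) and four bytes in UTF-8, and those code-unit counts are exactly what the length-query calls report:

#include <Windows.h>
#include <atlstr.h>

void CodeUnitDemo()
{
    // UTF-16 string containing just U+1F600, encoded as the surrogate pair D83D DE00
    const CStringW emoji(L"\xD83D\xDE00");
    ATLASSERT(emoji.GetLength() == 2);   // two UTF-16 code units, one code point

    // Size query: WideCharToMultiByte reports the UTF-8 length in code units (bytes)
    const int utf8Length = ::WideCharToMultiByte(
        CP_UTF8, WC_ERR_INVALID_CHARS,
        emoji.GetString(), emoji.GetLength(),
        nullptr, 0,
        nullptr, nullptr);
    ATLASSERT(utf8Length == 4);          // four UTF-8 code units (bytes)
}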
