SFML - Simple and Fast Multimedia Library

previous page next page
File List
Unicode.inl

00001 
00002 //
00003 // SFML - Simple and Fast Multimedia Library
00004 // Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
00005 //
00006 // This software is provided 'as-is', without any express or implied warranty.
00007 // In no event will the authors be held liable for any damages arising from the use of this software.
00008 //
00009 // Permission is granted to anyone to use this software for any purpose,
00010 // including commercial applications, and to alter it and redistribute it freely,
00011 // subject to the following restrictions:
00012 //
00013 // 1. The origin of this software must not be misrepresented;
00014 //    you must not claim that you wrote the original software.
00015 //    If you use this software in a product, an acknowledgment
00016 //    in the product documentation would be appreciated but is not required.
00017 //
00018 // 2. Altered source versions must be plainly marked as such,
00019 //    and must not be misrepresented as being the original software.
00020 //
00021 // 3. This notice may not be removed or altered from any source distribution.
00022 //
00024 
00025 
00030 template <typename In, typename Out>
00031 inline Out Unicode::UTF32ToANSI(In Begin, In End, Out Output, char Replacement, const std::locale& Locale)
00032 {
00033     #ifdef __MINGW32__
00034 
00035         // MinGW has a almost no support for unicode stuff
00036         // As a consequence, the MinGW version of this function can only use the default locale
00037         // and ignores the one passed as parameter
00038         while (Begin < End)
00039         {
00040             char Char = 0;
00041             if (wctomb(&Char, static_cast<wchar_t>(*Begin++)) >= 0)
00042                 *Output++ = Char;
00043             else if (Replacement)
00044                 *Output++ = Replacement;
00045         }
00046 
00047     #else
00048 
00049         // Get the facet of the locale which deals with character conversion
00050         const std::ctype<wchar_t>& Facet = std::use_facet< std::ctype<wchar_t> >(Locale);
00051 
00052         // Use the facet to convert each character of the input string
00053         while (Begin < End)
00054             *Output++ = Facet.narrow(static_cast<wchar_t>(*Begin++), Replacement);
00055 
00056     #endif
00057 
00058     return Output;
00059 }
00060 
00061 
00066 template <typename In, typename Out>
00067 inline Out Unicode::ANSIToUTF32(In Begin, In End, Out Output, const std::locale& Locale)
00068 {
00069     #ifdef __MINGW32__
00070 
00071         // MinGW has a almost no support for unicode stuff
00072         // As a consequence, the MinGW version of this function can only use the default locale
00073         // and ignores the one passed as parameter
00074         while (Begin < End)
00075         {
00076             wchar_t Char = 0;
00077             mbtowc(&Char, &*Begin, 1);
00078             Begin++;
00079             *Output++ = static_cast<Uint32>(Char);
00080         }
00081 
00082     #else
00083 
00084         // Get the facet of the locale which deals with character conversion
00085         const std::ctype<wchar_t>& Facet = std::use_facet< std::ctype<wchar_t> >(Locale);
00086 
00087         // Use the facet to convert each character of the input string
00088         while (Begin < End)
00089             *Output++ = static_cast<Uint32>(Facet.widen(*Begin++));
00090 
00091     #endif
00092 
00093     return Output;
00094 }
00095 
00096 
00101 template <typename In, typename Out>
00102 inline Out Unicode::UTF8ToUTF16(In Begin, In End, Out Output, Uint16 Replacement)
00103 {
00104     while (Begin < End)
00105     {
00106         Uint32 c = 0;
00107         int TrailingBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
00108         if (Begin + TrailingBytes < End)
00109         {
00110             // First decode the UTF-8 character
00111             switch (TrailingBytes)
00112             {
00113                 case 5 : c += *Begin++; c <<= 6;
00114                 case 4 : c += *Begin++; c <<= 6;
00115                 case 3 : c += *Begin++; c <<= 6;
00116                 case 2 : c += *Begin++; c <<= 6;
00117                 case 1 : c += *Begin++; c <<= 6;
00118                 case 0 : c += *Begin++;
00119             }
00120             c -= UTF8Offsets[TrailingBytes];
00121 
00122             // Then encode it in UTF-16
00123             if (c < 0xFFFF)
00124             {
00125                 // Character can be converted directly to 16 bits, just need to check it's in the valid range
00126                 if ((c >= 0xD800) && (c <= 0xDFFF))
00127                 {
00128                     // Invalid character (this range is reserved)
00129                     if (Replacement)
00130                         *Output++ = Replacement;
00131                 }
00132                 else
00133                 {
00134                     // Valid character directly convertible to 16 bits
00135                     *Output++ = static_cast<Uint16>(c);
00136                 }
00137             }
00138             else if (c > 0x0010FFFF)
00139             {
00140                 // Invalid character (greater than the maximum unicode value)
00141                 if (Replacement)
00142                     *Output++ = Replacement;
00143             }
00144             else
00145             {
00146                 // Character will be converted to 2 UTF-16 elements
00147                 c -= 0x0010000;
00148                 *Output++ = static_cast<Uint16>((c >> 10)     + 0xD800);
00149                 *Output++ = static_cast<Uint16>((c & 0x3FFUL) + 0xDC00);
00150             }
00151         }
00152     }
00153 
00154     return Output;
00155 }
00156 
00157 
00162 template <typename In, typename Out>
00163 inline Out Unicode::UTF8ToUTF32(In Begin, In End, Out Output, Uint32 Replacement)
00164 {
00165     while (Begin < End)
00166     {
00167         Uint32 c = 0;
00168         int TrailingBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
00169         if (Begin + TrailingBytes < End)
00170         {
00171             // First decode the UTF-8 character
00172             switch (TrailingBytes)
00173             {
00174                 case 5 : c += *Begin++; c <<= 6;
00175                 case 4 : c += *Begin++; c <<= 6;
00176                 case 3 : c += *Begin++; c <<= 6;
00177                 case 2 : c += *Begin++; c <<= 6;
00178                 case 1 : c += *Begin++; c <<= 6;
00179                 case 0 : c += *Begin++;
00180             }
00181             c -= UTF8Offsets[TrailingBytes];
00182 
00183             // Then write it if valid
00184             if ((c < 0xD800) || (c > 0xDFFF))
00185             {
00186                 // Valid UTF-32 character
00187                 *Output++ = c;
00188             }
00189             else
00190             {
00191                 // Invalid UTF-32 character
00192                 if (Replacement)
00193                     *Output++ = Replacement;
00194             }
00195         }
00196     }
00197 
00198     return Output;
00199 }
00200 
00201 
00206 template <typename In, typename Out>
00207 inline Out Unicode::UTF16ToUTF8(In Begin, In End, Out Output, Uint8 Replacement)
00208 {
00209     while (Begin < End)
00210     {
00211         Uint32 c = *Begin++;
00212 
00213         // If it's a surrogate pair, first convert to a single UTF-32 character
00214         if ((c >= 0xD800) && (c <= 0xDBFF))
00215         {
00216             if (Begin < End)
00217             {
00218                 // The second element is valid : convert the two elements to a UTF-32 character
00219                 Uint32 d = *Begin++;
00220                 if ((d >= 0xDC00) && (d <= 0xDFFF))
00221                     c = static_cast<Uint32>(((c - 0xD800) << 10) + (d - 0xDC00) + 0x0010000);
00222             }
00223             else
00224             {
00225                 // Invalid second element
00226                 if (Replacement)
00227                     *Output++ = Replacement;
00228             }
00229         }
00230 
00231         // Then convert to UTF-8
00232         if (c > 0x0010FFFF)
00233         {
00234             // Invalid character (greater than the maximum unicode value)
00235             if (Replacement)
00236                 *Output++ = Replacement;
00237         }
00238         else
00239         {
00240             // Valid character
00241 
00242             // Get number of bytes to write
00243             int BytesToWrite = 1;
00244             if      (c <  0x80)       BytesToWrite = 1;
00245             else if (c <  0x800)      BytesToWrite = 2;
00246             else if (c <  0x10000)    BytesToWrite = 3;
00247             else if (c <= 0x0010FFFF) BytesToWrite = 4;
00248 
00249             // Extract bytes to write
00250             Uint8 Bytes[4];
00251             switch (BytesToWrite)
00252             {
00253                 case 4 : Bytes[3] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
00254                 case 3 : Bytes[2] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
00255                 case 2 : Bytes[1] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
00256                 case 1 : Bytes[0] = static_cast<Uint8> (c | UTF8FirstBytes[BytesToWrite]);
00257             }
00258 
00259             // Add them to the output
00260             const Uint8* CurByte = Bytes;
00261             switch (BytesToWrite)
00262             {
00263                 case 4 : *Output++ = *CurByte++;
00264                 case 3 : *Output++ = *CurByte++;
00265                 case 2 : *Output++ = *CurByte++;
00266                 case 1 : *Output++ = *CurByte++;
00267             }
00268         }
00269     }
00270 
00271     return Output;
00272 }
00273 
00274 
00279 template <typename In, typename Out>
00280 inline Out Unicode::UTF16ToUTF32(In Begin, In End, Out Output, Uint32 Replacement)
00281 {
00282     while (Begin < End)
00283     {
00284         Uint16 c = *Begin++;
00285         if ((c >= 0xD800) && (c <= 0xDBFF))
00286         {
00287             // We have a surrogate pair, ie. a character composed of two elements
00288             if (Begin < End)
00289             {
00290                 Uint16 d = *Begin++;
00291                 if ((d >= 0xDC00) && (d <= 0xDFFF))
00292                 {
00293                     // The second element is valid : convert the two elements to a UTF-32 character
00294                     *Output++ = static_cast<Uint32>(((c - 0xD800) << 10) + (d - 0xDC00) + 0x0010000);
00295                 }
00296                 else
00297                 {
00298                     // Invalid second element
00299                     if (Replacement)
00300                         *Output++ = Replacement;
00301                 }
00302             }
00303         }
00304         else if ((c >= 0xDC00) && (c <= 0xDFFF))
00305         {
00306             // Invalid character
00307             if (Replacement)
00308                 *Output++ = Replacement;
00309         }
00310         else
00311         {
00312             // Valid character directly convertible to UTF-32
00313             *Output++ = static_cast<Uint32>(c);
00314         }
00315     }
00316 
00317     return Output;
00318 }
00319 
00320 
00325 template <typename In, typename Out>
00326 inline Out Unicode::UTF32ToUTF8(In Begin, In End, Out Output, Uint8 Replacement)
00327 {
00328     while (Begin < End)
00329     {
00330         Uint32 c = *Begin++;
00331         if (c > 0x0010FFFF)
00332         {
00333             // Invalid character (greater than the maximum unicode value)
00334             if (Replacement)
00335                 *Output++ = Replacement;
00336         }
00337         else
00338         {
00339             // Valid character
00340 
00341             // Get number of bytes to write
00342             int BytesToWrite = 1;
00343             if      (c <  0x80)       BytesToWrite = 1;
00344             else if (c <  0x800)      BytesToWrite = 2;
00345             else if (c <  0x10000)    BytesToWrite = 3;
00346             else if (c <= 0x0010FFFF) BytesToWrite = 4;
00347 
00348             // Extract bytes to write
00349             Uint8 Bytes[4];
00350             switch (BytesToWrite)
00351             {
00352                 case 4 : Bytes[3] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
00353                 case 3 : Bytes[2] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
00354                 case 2 : Bytes[1] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
00355                 case 1 : Bytes[0] = static_cast<Uint8> (c | UTF8FirstBytes[BytesToWrite]);
00356             }
00357 
00358             // Add them to the output
00359             const Uint8* CurByte = Bytes;
00360             switch (BytesToWrite)
00361             {
00362                 case 4 : *Output++ = *CurByte++;
00363                 case 3 : *Output++ = *CurByte++;
00364                 case 2 : *Output++ = *CurByte++;
00365                 case 1 : *Output++ = *CurByte++;
00366             }
00367         }
00368     }
00369 
00370     return Output;
00371 }
00372 
00373 
00378 template <typename In, typename Out>
00379 inline Out Unicode::UTF32ToUTF16(In Begin, In End, Out Output, Uint16 Replacement)
00380 {
00381     while (Begin < End)
00382     {
00383         Uint32 c = *Begin++;
00384         if (c < 0xFFFF)
00385         {
00386             // Character can be converted directly to 16 bits, just need to check it's in the valid range
00387             if ((c >= 0xD800) && (c <= 0xDFFF))
00388             {
00389                 // Invalid character (this range is reserved)
00390                 if (Replacement)
00391                     *Output++ = Replacement;
00392             }
00393             else
00394             {
00395                 // Valid character directly convertible to 16 bits
00396                 *Output++ = static_cast<Uint16>(c);
00397             }
00398         }
00399         else if (c > 0x0010FFFF)
00400         {
00401             // Invalid character (greater than the maximum unicode value)
00402             if (Replacement)
00403                 *Output++ = Replacement;
00404         }
00405         else
00406         {
00407             // Character will be converted to 2 UTF-16 elements
00408             c -= 0x0010000;
00409             *Output++ = static_cast<Uint16>((c >> 10)     + 0xD800);
00410             *Output++ = static_cast<Uint16>((c & 0x3FFUL) + 0xDC00);
00411         }
00412     }
00413 
00414     return Output;
00415 }
00416 
00417 
00421 template <typename In>
00422 inline std::size_t Unicode::GetUTF8Length(In Begin, In End)
00423 {
00424     std::size_t Length = 0;
00425     while (Begin < End)
00426     {
00427         int NbBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
00428         if (Begin + NbBytes < End)
00429             ++Length;
00430 
00431         Begin += NbBytes + 1;
00432     }
00433 
00434     return Length;
00435 }
00436 
00437 
00441 template <typename In>
00442 inline std::size_t Unicode::GetUTF16Length(In Begin, In End)
00443 {
00444     std::size_t Length = 0;
00445     while (Begin < End)
00446     {
00447         if ((*Begin >= 0xD800) && (*Begin <= 0xDBFF))
00448         {
00449             ++Begin;
00450             if ((Begin < End) && ((*Begin >= 0xDC00) && (*Begin <= 0xDFFF)))
00451             {
00452                 ++Length;
00453             }
00454         }
00455         else
00456         {
00457             ++Length;
00458         }
00459 
00460         ++Begin;
00461     }
00462 
00463     return Length;
00464 }
00465 
00466 
00470 template <typename In>
00471 inline std::size_t Unicode::GetUTF32Length(In Begin, In End)
00472 {
00473     return End - Begin;
00474 }
previous page start next page