#pragma region CPL License /* Nuclex Native Framework Copyright (C) 2002-2023 Nuclex Development Labs This library is free software; you can redistribute it and/or modify it under the terms of the IBM Common Public License as published by the IBM Corporation; either version 1.0 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the IBM Common Public License for more details. You should have received a copy of the IBM Common Public License along with this library */ #pragma endregion // CPL License #ifndef NUCLEX_SUPPORT_TEXT_UNICODEHELPER_H #define NUCLEX_SUPPORT_TEXT_UNICODEHELPER_H #include "Nuclex/Support/Config.h" #include // for std::size_t #include // for assert() namespace Nuclex { namespace Support { namespace Text { // ------------------------------------------------------------------------------------------- // /// Helper methods for dealing with unicode and its different encodings /// /// /// Short overview of unicode: the "unicode consortium" has taken symbols from /// all languages of the world and put them into a giant table. Said table is defined /// with room for about 1.1 million symbols, but only some 140,000 symbols have been /// filled so far. Nominally, the table is divided into 17 "planes" of /// 65,536 characters each, separating latin-based languages from asian language and /// from funny poop emojis, but that part is only important for font designers. /// /// /// An index into the unicode table is called a "code point". So what used to /// be a characters in an ASCII string are now code points in a unicode string. /// The easiest way to store them would be to just keep an array of 32 bit integers, /// each sufficient to hold one code point. That's precisely what UTF-32 is. While easy /// to deal with, its downsides are wasted space and endian issues. /// /// /// Enter UTF-8. It is a variable-length encoding where the first byte tells the number /// of bytes that follow, up to 3. Amusingly, if the first byte's uppermost bit is unset, /// this indicates a single-byte code point using 7 bits which happen to be mapped to /// ASCII in a 1:1 fashion, in other words, any 7-bit ASCII string is a valid UTF-8 /// string. Consisting of only bytes, it isn't prone to endian issues. /// /// /// Cool fact: in UTF-8 code points requiring 2, 3 or 4 bytes to encode, all of /// the bytes have their highest bit set. That means that no single byte will intrude /// into the 7-bit ASCII range. So if, for example, the byte 0x2f, '/', a path separator, /// appears in the bytes of an UTF-8 string, it *is* the path separator since no /// follow-up-byte in a 2, 3 or 4 byte code point can ever use the values 0x00-0x7f. /// This allows UTF-8 to harmlessly pass through a lot of old software and/or code. /// /// /// UTF-16 combines the worst of either: endian issues and wasted space. So naturally /// Microsoft used it for all unicode in Windows. A code point is represented by one or /// two 16 bit integers, again using the leading integer's high bits to indicate whether /// the code point is complete or formed together with the 16 bit integer that follows. /// Lots of Windows software, holds the opinion that one 16 bit integer, aka one wchar_t, /// is one glyph, which tends to work until you localize to Asian languages. /// /// /// One last confusing thing: whenever I write that UTF-8 encodes unicode code points /// as 1-4 bytes, UTF-16 as one or two 16 bit integers and UTF-32 as a 32 bit integer, /// the correct term in place of "bytes" and "integers" would be /// "characters". That's why in C++ the new data types are /// char8_t, char16_t and char32_t. /// So "character" has been (re-?)defined to mean "encoding atom" /// and it is not always enough to represent an entire letter (aka code point). /// /// /// A series of characters encoding a unicode code point is called a sequence. /// /// class NUCLEX_SUPPORT_TYPE UnicodeHelper { /// UTF-8 character of which either 1, 2, 3 or 4 specify one codepoint /// /// Under C++20, this will be a native type like char16_t and char32_t. There will also /// be an std::u8string using this character type to unambiguously indicate that /// the contents of the string are supposed to be UTF-8 encoded. /// #if defined(__cpp_char8_t) public: typedef char8_t Char8Type; #else public: typedef unsigned char Char8Type; #endif /// The symbol used to indicate a code point is invalid or corrupted public: static const constexpr char32_t ReplacementCodePoint = char32_t(0xFFFD); /// Checks whether the specified unicode code point is valid /// Code point that will be checked /// True if the code point is valid, false otherwise public: NUCLEX_SUPPORT_API static constexpr bool IsValidCodePoint(char32_t codePoint); /// /// Returns the number of characters in a sequence by looking at the lead character /// /// Lead character of the UTF-8 sequence /// /// The length of the sequence or std::size_t(-1) if the character /// is not the lead character of a sequence (or is not valid UTF-8 at all). /// /// /// This method can be used to figure out if a character is the lead character, too. /// public: NUCLEX_SUPPORT_API static constexpr std::size_t GetSequenceLength( Char8Type leadCharacter ); /// /// Returns the number of characters in a sequence by looking at the lead character /// /// Lead character of the UTF-16 sequence /// /// The length of the sequence or std::size_t(-1) if the character /// is not the lead character of a sequence (or is not valid UTF-16 at all). /// /// /// This method can be used to figure out if a character is the lead character, too. /// It doesn't do any big/little endian conversion. If you know the input is from /// in the endianness opposite of the current platform, byte-swap each char16_t. /// public: NUCLEX_SUPPORT_API static constexpr std::size_t GetSequenceLength( char16_t leadCharacter ); /// /// Counts the number of UTF-8 characters needed to represent a code point /// /// /// Code point for which the needed UTF-8 characters will be counted /// /// The number of characters needed to encode the code point in UTF-8 public: NUCLEX_SUPPORT_API static constexpr std::size_t CountUtf8Characters( char32_t codePoint ); /// /// Counts the number of UTF-16 characters needed to represent a code point /// /// /// Code point for which the needed UTF-16 characters will be counted /// /// The number of characters needed to encode the code point in UTF-16 public: NUCLEX_SUPPORT_API static constexpr std::size_t CountUtf16Characters( char32_t codePoint ); /// Reads a code point from a variable-length UTF-8 sequence /// /// Address of the UTF-8 lead character, will be updated to the next lead /// character if the read succeeds. /// /// Character at which the UTF-8 string ends /// The unicode code point index, identical to UTF-32. /// /// If the end is reached or if the character is incomplete or invalid, this method /// returns char32_t(-1) to indicate failure. You should check the position of your /// read pointer before calling to distinguish between a normal end of the string and /// bad UTF-8 data. /// public: NUCLEX_SUPPORT_API static char32_t ReadCodePoint( const Char8Type *¤t, const Char8Type *end ); /// Reads a code point from a variable-length UTF-8 sequence /// /// Address of the UTF-8 lead character, will be updated to the next lead /// character if the read succeeds. /// /// Character at which the UTF-8 string ends /// The unicode code point index, identical to UTF-32. /// /// If the end is reached or if the character is incomplete or invalid, this method /// returns char32_t(-1) to indicate failure. You should check the position of your /// read pointer before calling to distinguish between a normal end of the string and /// bad UTF-8 data. /// public: NUCLEX_SUPPORT_API static char32_t ReadCodePoint( Char8Type *¤t, const Char8Type *end ); /// Reads a code point from a variable-length UTF-16 sequence /// /// Address of the UTF-16 lead character, will be updated to the next lead /// character if the read succeeds. /// /// Character at which the UTF-16 string ends /// The unicode code point index, identical to UTF-32. /// /// If the end is reached or if the character is incomplete or invalid, this method /// returns char32_t(-1) to indicate failure. You should check the position of your /// read pointer before calling to distinguish between a normal end of the string and /// bad UTF-16 data. /// public: NUCLEX_SUPPORT_API static char32_t ReadCodePoint( const char16_t *¤t, const char16_t *end ); /// Reads a code point from a variable-length UTF-16 sequence /// /// Address of the UTF-16 lead character, will be updated to the next lead /// character if the read succeeds. /// /// Character at which the UTF-16 string ends /// The unicode code point index, identical to UTF-32. /// /// If the end is reached or if the character is incomplete or invalid, this method /// returns char32_t(-1) to indicate failure. You should check the position of your /// read pointer before calling to distinguish between a normal end of the string and /// bad UTF-16 data. /// public: NUCLEX_SUPPORT_API static char32_t ReadCodePoint( char16_t *¤t, const char16_t *end ); /// Reads a code point from a UTF-32 character /// /// Address of the UTF-32 character, will be updated to the next character /// if the read succeeds. /// /// Character at which the UTF-32 string ends /// The unicode code point index, identical to UTF-32. /// /// If the end is reached or if the character is incomplete or invalid, this method /// returns char32_t(-1) to indicate failure. You should check the position of your /// read pointer before calling to distinguish between a normal end of the string and /// bad UTF-16 data. /// public: NUCLEX_SUPPORT_API static char32_t ReadCodePoint( const char32_t *¤t, const char32_t *end ); /// Reads a code point from a UTF-32 character /// /// Address of the UTF-32 character, will be updated to the next character /// if the read succeeds. /// /// Character at which the UTF-32 string ends /// The unicode code point index, identical to UTF-32. /// /// If the end is reached or if the character is incomplete or invalid, this method /// returns char32_t(-1) to indicate failure. You should check the position of your /// read pointer before calling to distinguish between a normal end of the string and /// bad UTF-16 data. /// public: NUCLEX_SUPPORT_API static char32_t ReadCodePoint( char32_t *¤t, const char32_t *end ); /// Encodes the specified code point into UTF-8 characters /// Code point that will be encoded as UTF-8 /// /// Address at which the UTF-8 characters will be deposited. Needs to have at /// least 4 bytes of usable space and will be moved to after the encoded characters /// /// /// The number of characters that have been encoded or std::size_t(-1) if /// you specified an invalid code point. /// public: NUCLEX_SUPPORT_API static std::size_t WriteCodePoint( Char8Type *&target, char32_t codePoint ); /// Encodes the specified code point into UTF-16 characters /// Code point that will be encoded as UTF-16 /// /// Address at which the UTF-16 characters will be deposited. Needs to have at /// least 4 bytes of usable space and will be moved to after the encoded characters /// /// /// The number of characters that have been encoded or std::size_t(-1) if /// you specified an invalid code point. /// public: NUCLEX_SUPPORT_API static std::size_t WriteCodePoint( char16_t *&target, char32_t codePoint ); /// Encodes the specified code point into UTF-32 characters /// Code point that will be encoded as UTF-32 /// /// Address at which the UTF-16 characters will be deposited. Needs to have at /// least 4 bytes of usable space and will be moved to after the encoded characters /// /// /// The number of characters that have been encoded or std::size_t(-1) if /// you specified an invalid code point. /// public: NUCLEX_SUPPORT_API static std::size_t WriteCodePoint( char32_t *&target, char32_t codePoint ); /// Converts the specified Unicode code point to folded lowercase /// /// Unicode code point that will be converted to folded lowercase /// /// The character or its folded lowercase equivalent /// /// /// Folded lowercase is a special variant of lowercase that will result in a string of /// equal or shorter length when encoded to UTF-8 or UTF-16. It is not intended for /// display and some mappings may lead to incorrect lowercase characters for such. /// /// /// Comparing the case-folded translations of two strings will produce the result of /// a case-insensitive comparison. This makes case folding very useful for case /// insensitive comparison logic and associative containers which can store /// pre-case-folded strings for their indexes if they need to be case insensitive. /// /// /// Warning: really, don't use this for displayed strings. It may even replace /// lowercase characters with something weird in case their UTF-8-encoded code point /// would be longer than its uppercase variant. /// /// public: NUCLEX_SUPPORT_API static char32_t ToFoldedLowercase(char32_t codePoint); }; // ------------------------------------------------------------------------------------------- // inline constexpr bool UnicodeHelper::IsValidCodePoint(char32_t codePoint) { return ( (codePoint < 0xD800) || ( (codePoint >= 0xE000) && (codePoint < 1114111) ) ); } // ------------------------------------------------------------------------------------------- // inline constexpr std::size_t UnicodeHelper::GetSequenceLength(Char8Type leadCharacter) { if(leadCharacter < 128) { return 1; } else if((leadCharacter & 0xE0) == 0xC0) { return 2; } else if((leadCharacter & 0xF0) == 0xE0) { return 3; } else if((leadCharacter & 0xF8) == 0xF0) { return 4; } else { return std::size_t(-1); } } // ------------------------------------------------------------------------------------------- // inline constexpr std::size_t UnicodeHelper::GetSequenceLength(char16_t leadCharacter) { if(leadCharacter < char16_t(0xD800)) { return 1; // Single character code point, below surrogate range } else if(leadCharacter >= char16_t(0xE000)) { return 1; // Single character code point, above surrogate range } else if(leadCharacter < char16_t(0xDC00)) { return 2; // Two-character code point, lead surrogate } else { return std::size_t(-1); // It's a trail surrogate, thus no lead character } } // ------------------------------------------------------------------------------------------- // inline constexpr std::size_t UnicodeHelper::CountUtf8Characters(char32_t codePoint) { if(codePoint < 128) { return 1; } else if(codePoint < 2048) { return 2; } else if(codePoint < 65536) { return 3; } else if(codePoint < 1114111) { return 4; } else { return std::size_t(-1); } } // ------------------------------------------------------------------------------------------- // inline constexpr std::size_t UnicodeHelper::CountUtf16Characters(char32_t codePoint) { if(codePoint < 0xD800) { return 1; } else if((codePoint >= 0xE000) && (codePoint < 1114111)) { return 2; } else { return std::size_t(-1); } } // ------------------------------------------------------------------------------------------- // inline char32_t UnicodeHelper::ReadCodePoint(const char32_t *¤t, const char32_t *end) { assert((current < end) && u8"At least one UTF-32 character of input must be available"); NUCLEX_SUPPORT_NDEBUG_UNUSED(end); char32_t codePoint = *current; ++current; return codePoint; } // ------------------------------------------------------------------------------------------- // inline char32_t UnicodeHelper::ReadCodePoint(char32_t *¤t, const char32_t *end) { assert((current < end) && u8"At least one UTF-32 character of input must be available"); NUCLEX_SUPPORT_NDEBUG_UNUSED(end); char32_t codePoint = *current; ++current; return codePoint; } // ------------------------------------------------------------------------------------------- // inline std::size_t UnicodeHelper::WriteCodePoint(Char8Type *&target, char32_t codePoint) { if(codePoint < 128) { *target = static_cast(codePoint); ++target; return 1; } else if(codePoint < 2048) { *target = Char8Type(0xC0) | static_cast(codePoint >> 6); ++target; *target = Char8Type(0x80) | static_cast(codePoint & 0x3F); ++target; return 2; } else if(codePoint < 65536) { *target = Char8Type(0xE0) | static_cast(codePoint >> 12); ++target; *target = Char8Type(0x80) | static_cast((codePoint >> 6) & 0x3F); ++target; *target = Char8Type(0x80) | static_cast(codePoint & 0x3F); ++target; return 3; } else if(codePoint < 1114111) { *target = Char8Type(0xF0) | static_cast(codePoint >> 18); ++target; *target = Char8Type(0x80) | static_cast((codePoint >> 12) & 0x3F); ++target; *target = Char8Type(0x80) | static_cast((codePoint >> 6) & 0x3F); ++target; *target = Char8Type(0x80) | static_cast(codePoint & 0x3F); ++target; return 4; } else { return std::size_t(-1); } } // ------------------------------------------------------------------------------------------- // inline std::size_t UnicodeHelper::WriteCodePoint(char16_t *&target, char32_t codePoint) { if(codePoint < 65536) { assert( ((codePoint < 0xDC00) || (codePoint >= 0xE000)) && u8"Unicode code point has to be outside surrogate range (0xDC00-0xDFFF)" ); *target = static_cast(codePoint); ++target; return 1; } else if(codePoint < 1114111) { codePoint -= char32_t(65536); *(target) = 0xD800 | static_cast(codePoint >> 10); *(target + 1) = 0xDC00 | static_cast(codePoint & 0x03FF); target += 2; return 2; } else { return std::size_t(-1); } } // ------------------------------------------------------------------------------------------- // inline std::size_t UnicodeHelper::WriteCodePoint(char32_t *&target, char32_t codePoint) { *target = codePoint; ++target; return 1; } // ------------------------------------------------------------------------------------------- // }}} // namespace Nuclex::Support::Text #endif // NUCLEX_SUPPORT_TEXT_UNICODEHELPER_H