#pragma region CPL License /* Nuclex Native Framework Copyright (C) 2002-2023 Nuclex Development Labs This library is free software; you can redistribute it and/or modify it under the terms of the IBM Common Public License as published by the IBM Corporation; either version 1.0 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the IBM Common Public License for more details. You should have received a copy of the IBM Common Public License along with this library */ #pragma endregion // CPL License // If the library is compiled as a DLL, this ensures symbols are exported #define NUCLEX_SUPPORT_SOURCE 1 #include "Nuclex/Support/Text/UnicodeHelper.h" #include // for std::uint8_t #include namespace Nuclex { namespace Support { namespace Text { // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, DetectsInvalidCodePoints) { char32_t validCodePoint = U'ร˜'; EXPECT_TRUE(UnicodeHelper::IsValidCodePoint(validCodePoint)); char32_t invalidCodePoint = 1114111; // the first invalid code point EXPECT_FALSE(UnicodeHelper::IsValidCodePoint(invalidCodePoint)); } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, CalculatesUtf8CharacterCount) { char32_t asciiCodePoint = U'x'; EXPECT_EQ(UnicodeHelper::CountUtf8Characters(asciiCodePoint), 1U); char32_t centCodePoint = U'ยข'; EXPECT_EQ(UnicodeHelper::CountUtf8Characters(centCodePoint), 2U); char32_t euroCodePoint = U'โ‚ฌ'; EXPECT_EQ(UnicodeHelper::CountUtf8Characters(euroCodePoint), 3U); char32_t gothicCodePoint = U'๐ˆ'; EXPECT_EQ(UnicodeHelper::CountUtf8Characters(gothicCodePoint), 4U); char32_t invalidCodePoint = 1114111; EXPECT_EQ(UnicodeHelper::CountUtf8Characters(invalidCodePoint), std::size_t(-1)); } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, CalculatesUtf16CharacterCount) { char32_t asciiCodePoint = U'x'; EXPECT_EQ(UnicodeHelper::CountUtf16Characters(asciiCodePoint), 1U); char32_t centCodePoint = U'ยข'; EXPECT_EQ(UnicodeHelper::CountUtf16Characters(centCodePoint), 1U); char32_t euroCodePoint = U'โ‚ฌ'; EXPECT_EQ(UnicodeHelper::CountUtf16Characters(euroCodePoint), 1U); char32_t gothicCodePoint = U'๐ˆ'; EXPECT_EQ(UnicodeHelper::CountUtf16Characters(gothicCodePoint), 2U); char32_t surrogateCodePoint = char32_t(0xDD00); // surrogate range EXPECT_EQ(UnicodeHelper::CountUtf16Characters(surrogateCodePoint), std::size_t(-1)); char32_t invalidCodePoint = 1114111; EXPECT_EQ(UnicodeHelper::CountUtf16Characters(invalidCodePoint), std::size_t(-1)); } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, TellsSequenceLengthFromUtf8LeadCharacter) { using Char8Type = UnicodeHelper::Char8Type; const char *ascii = u8"A"; EXPECT_EQ( UnicodeHelper::GetSequenceLength(*reinterpret_cast(ascii)), 1U ); const char *cents = u8"ยข"; EXPECT_EQ( UnicodeHelper::GetSequenceLength(*reinterpret_cast(cents)), 2U ); const char *euros = u8"โ‚ฌ"; EXPECT_EQ( UnicodeHelper::GetSequenceLength(*reinterpret_cast(euros)), 3U ); const char *gothic = u8"๐ˆ"; EXPECT_EQ( UnicodeHelper::GetSequenceLength(*reinterpret_cast(gothic)), 4U ); EXPECT_EQ( UnicodeHelper::GetSequenceLength(Char8Type(0x80)), std::size_t(-1) ); } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, ReadsCodePointFromUtf8) { using Char8Type = UnicodeHelper::Char8Type; { const char ascii[] = u8"A"; const Char8Type *start = reinterpret_cast(ascii); const Char8Type *end = reinterpret_cast(ascii) + sizeof(ascii); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'A'); EXPECT_EQ(start, reinterpret_cast(ascii) + 1); } { const char cents[] = u8"ยข"; const Char8Type *start = reinterpret_cast(cents); const Char8Type *end = reinterpret_cast(cents) + sizeof(cents); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'ยข'); EXPECT_EQ(start, reinterpret_cast(cents) + 2); } { const char euros[] = u8"โ‚ฌ"; const Char8Type *start = reinterpret_cast(euros); const Char8Type *end = reinterpret_cast(euros) + sizeof(euros); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'โ‚ฌ'); EXPECT_EQ(start, reinterpret_cast(euros) + 3); } { const char gothic[] = u8"๐ˆ"; const Char8Type *start = reinterpret_cast(gothic); const Char8Type *end = reinterpret_cast(gothic) + sizeof(gothic); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'๐ˆ'); EXPECT_EQ(start, reinterpret_cast(gothic) + 4); } } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, ReadingInvalidCodePointFromUtf8Fails) { using Char8Type = UnicodeHelper::Char8Type; // Invalid second byte should be detected { char invalid[] = u8"๐ˆ"; *reinterpret_cast(&invalid[1]) = 0xC0; // 0b11xxxxxx const Char8Type *start = reinterpret_cast(invalid); const Char8Type *end = reinterpret_cast(invalid) + sizeof(invalid); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, char32_t(-1)); EXPECT_EQ(start, reinterpret_cast(invalid)); } // Invalid length (5 bytes, possible by encoding, but always invalid since // it's either an out-of-range code point or an overlong code point). { char invalid[] = u8"๐ˆ"; *reinterpret_cast(&invalid[0]) = 0xF8; // 0b11111000 const Char8Type *start = reinterpret_cast(invalid); const Char8Type *end = reinterpret_cast(invalid) + sizeof(invalid); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, char32_t(-1)); EXPECT_EQ(start, reinterpret_cast(invalid)); } } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, ReadsCodePointFromUtf16) { { const char16_t ascii[] = u"A"; const char16_t *start = reinterpret_cast(ascii); const char16_t *end = reinterpret_cast(ascii) + sizeof(ascii); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'A'); EXPECT_EQ(start, reinterpret_cast(ascii) + 1); } { const char16_t cent[] = u"ยข"; const char16_t *start = reinterpret_cast(cent); const char16_t *end = reinterpret_cast(cent) + sizeof(cent); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'ยข'); EXPECT_EQ(start, reinterpret_cast(cent) + 1); } { const char16_t euro[] = u"โ‚ฌ"; const char16_t *start = reinterpret_cast(euro); const char16_t *end = reinterpret_cast(euro) + sizeof(euro); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'โ‚ฌ'); EXPECT_EQ(start, reinterpret_cast(euro) + 1); } { const char16_t gothic[] = u"๐ˆ"; const char16_t *start = reinterpret_cast(gothic); const char16_t *end = reinterpret_cast(gothic) + sizeof(gothic); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, U'๐ˆ'); EXPECT_EQ(start, reinterpret_cast(gothic) + 2); } } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, ReadingInvalidCodePointFromUtf16Fails) { { char16_t gothic[] = u"๐ˆ"; { // Flip lead/trail surrogates char16_t temp = gothic[0]; gothic[0] = gothic[1]; gothic[1] = temp; } const char16_t *start = reinterpret_cast(gothic); const char16_t *end = reinterpret_cast(gothic) + sizeof(gothic); char32_t codePoint = UnicodeHelper::ReadCodePoint(start, end); EXPECT_EQ(codePoint, char32_t(-1)); EXPECT_EQ(start, reinterpret_cast(gothic)); } } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, EncodesCodePointsToUtf8) { using Char8Type = UnicodeHelper::Char8Type; { Char8Type ascii[4] = { 255, 255, 255, 255 }; Char8Type *start = reinterpret_cast(ascii); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'A'); EXPECT_EQ(count, 1U); EXPECT_EQ(start, reinterpret_cast(ascii) + 1); EXPECT_EQ(ascii[0], u8'A'); } { Char8Type cent[4] = { 255, 255, 255, 255 }; Char8Type *start = reinterpret_cast(cent); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'ยข'); EXPECT_EQ(count, 2U); EXPECT_EQ(start, reinterpret_cast(cent) + 2); const Char8Type expected[] = u8"ยข"; EXPECT_EQ(cent[0], expected[0]); EXPECT_EQ(cent[1], expected[1]); } { Char8Type euro[4] = { 255, 255, 255, 255 }; Char8Type *start = reinterpret_cast(euro); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'โ‚ฌ'); EXPECT_EQ(count, 3U); EXPECT_EQ(start, reinterpret_cast(euro) + 3); const Char8Type expected[] = u8"โ‚ฌ"; EXPECT_EQ(euro[0], expected[0]); EXPECT_EQ(euro[1], expected[1]); EXPECT_EQ(euro[2], expected[2]); } { Char8Type gothic[4] = { 255, 255, 255, 255 }; Char8Type *start = reinterpret_cast(gothic); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'๐ˆ'); EXPECT_EQ(count, 4U); EXPECT_EQ(start, reinterpret_cast(gothic) + 4); const Char8Type expected[] = u8"๐ˆ"; EXPECT_EQ(gothic[0], expected[0]); EXPECT_EQ(gothic[1], expected[1]); EXPECT_EQ(gothic[2], expected[2]); EXPECT_EQ(gothic[3], expected[3]); } } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, EncodesCodePointsToUtf16) { { char16_t ascii[2] = { 65535, 65535}; char16_t *start = reinterpret_cast(ascii); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'A'); EXPECT_EQ(count, 1U); EXPECT_EQ(start, reinterpret_cast(ascii) + 1); EXPECT_EQ(ascii[0], u'A'); } { char16_t cent[4] = { 255, 255, 255, 255 }; char16_t *start = reinterpret_cast(cent); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'ยข'); EXPECT_EQ(count, 1U); EXPECT_EQ(start, reinterpret_cast(cent) + 1); const char16_t expected[] = u"ยข"; EXPECT_EQ(cent[0], expected[0]); } { char16_t euro[4] = { 255, 255, 255, 255 }; char16_t *start = reinterpret_cast(euro); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'โ‚ฌ'); EXPECT_EQ(count, 1U); EXPECT_EQ(start, reinterpret_cast(euro) + 1); const char16_t expected[] = u"โ‚ฌ"; EXPECT_EQ(euro[0], expected[0]); } { char16_t gothic[4] = { 255, 255, 255, 255 }; char16_t *start = reinterpret_cast(gothic); std::size_t count = UnicodeHelper::WriteCodePoint(start, U'๐ˆ'); EXPECT_EQ(count, 2U); EXPECT_EQ(start, reinterpret_cast(gothic) + 2); const char16_t expected[] = u"๐ˆ"; EXPECT_EQ(gothic[0], expected[0]); EXPECT_EQ(gothic[1], expected[1]); } } // ------------------------------------------------------------------------------------------- // TEST(UnicodeHelperTest, CaseFoldingAllowsCaseInsensitiveComparison) { EXPECT_EQ( UnicodeHelper::ToFoldedLowercase(U'A'), UnicodeHelper::ToFoldedLowercase(U'a') ); EXPECT_EQ( UnicodeHelper::ToFoldedLowercase(U'ฤ€'), UnicodeHelper::ToFoldedLowercase(U'ฤ') ); EXPECT_EQ( UnicodeHelper::ToFoldedLowercase(U'ฮฉ'), UnicodeHelper::ToFoldedLowercase(U'ฯ‰') ); EXPECT_EQ( UnicodeHelper::ToFoldedLowercase(U'๐‘ขฐ'), UnicodeHelper::ToFoldedLowercase(U'๐‘ฃ') ); } // ------------------------------------------------------------------------------------------- // }}} // namespace Nuclex::Support::Text