#pragma region CPL License /* Nuclex Native Framework Copyright (C) 2002-2023 Nuclex Development Labs This library is free software; you can redistribute it and/or modify it under the terms of the IBM Common Public License as published by the IBM Corporation; either version 1.0 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the IBM Common Public License for more details. You should have received a copy of the IBM Common Public License along with this library */ #pragma endregion // CPL License // If the library is compiled as a DLL, this ensures symbols are exported #define NUCLEX_SUPPORT_SOURCE 1 #include "Nuclex/Support/Text/StringHelper.h" #include "Nuclex/Support/Text/UnicodeHelper.h" #include "Nuclex/Support/Text/ParserHelper.h" #include "Nuclex/Support/Errors/CorruptStringError.h" namespace { // ------------------------------------------------------------------------------------------- // /// /// Collapses any instance of two or more consecutive whitespaces into a single whitespace /// /// Type of string the method will be working on /// /// Type of the UTF characters in the string, must be char8_t, char16_t or char32_t /// /// String in which whitespace will be collapsed template void collapseDuplicateWhitespaceAndTrim(StringType &targetString) { using Nuclex::Support::Text::UnicodeHelper; using Nuclex::Support::Text::ParserHelper; CharType *read = reinterpret_cast(targetString.data()); CharType *write = read; // 'write' tracks the shift target position const CharType *end = read + targetString.length(); // If the string is of zero length, we don't need to do anything if(unlikely(read == end)) { return; } // Read the first character. This variant does trimming, so the first character // decides if we can even run the scan-only loop (and doing the check outside of // the loop simplifies the conditions that need to be checked inside the loop) char32_t codePoint = UnicodeHelper::ReadCodePoint(read, end); if(unlikely(codePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } // If it was not a whitespace, we can fast-forward until we find a duplicate whitespace if(ParserHelper::IsWhitespace(codePoint)) { for(;;) { if(unlikely(read >= end)) { targetString.resize(0); // Only whitespace + trim = string becomes empty return; } codePoint = UnicodeHelper::ReadCodePoint(read, end); if(unlikely(codePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } if(likely(!ParserHelper::IsWhitespace(codePoint))) { break; // Exit without updating write pointer since we're trimming } } // for ever } else { // initial character is not a whitespace write = read; std::size_t successiveWhitespaceCount = 0; for(;;) { if(unlikely(read >= end)) { targetString.resize(write - reinterpret_cast(targetString.data())); return; } codePoint = UnicodeHelper::ReadCodePoint(read, end); if(unlikely(codePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } if(unlikely(ParserHelper::IsWhitespace(codePoint))) { ++successiveWhitespaceCount; } else if(unlikely(successiveWhitespaceCount >= 2)) { // String will need backshifting UnicodeHelper::WriteCodePoint(write, U' '); break; } else { // Character after single whitespace (which we'll just skip over) write = read; // write pointer tracks last non-whitespace successiveWhitespaceCount = 0; } } // for ever } // if initial character whitespace / not whitespace // At this point: // - 'read' is on a non-whitespace character // - 'write' is at the current backshifting target position // - 'codePoint' contains one code point that yet needs to be backshifted UnicodeHelper::WriteCodePoint(write, codePoint); // Backshifting loop { std::size_t successiveWhitespaceCount = 0; char32_t whitespaceCodePoint = codePoint; while(likely(read < end)) { codePoint = UnicodeHelper::ReadCodePoint(read, end); if(unlikely(codePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } if(unlikely(ParserHelper::IsWhitespace(codePoint))) { whitespaceCodePoint = codePoint; ++successiveWhitespaceCount; } else { if(unlikely(successiveWhitespaceCount >= 2)) { // Normalize multiple whitespaces into one UnicodeHelper::WriteCodePoint(write, U' '); } else if(unlikely(successiveWhitespaceCount == 1)) { // Pass through single whitespace UnicodeHelper::WriteCodePoint(write, whitespaceCodePoint); } UnicodeHelper::WriteCodePoint(write, codePoint); successiveWhitespaceCount = 0; } } // while read characters remain targetString.resize(write - reinterpret_cast(targetString.data())); } // beauty scope } // ------------------------------------------------------------------------------------------- // /// /// Collapses any instance of two or more consecutive whitespaces into a single whitespace /// /// Type of string the method will be working on /// /// Type of the UTF characters in the string, must be char8_t, char16_t or char32_t /// /// String in which whitespace will be collapsed template void collapseDuplicateWhitespaceWithoutTrim(StringType &targetString) { using Nuclex::Support::Text::UnicodeHelper; using Nuclex::Support::Text::ParserHelper; CharType *read = reinterpret_cast(targetString.data()); CharType *write = read; // 'write' tracks the shift target position const CharType *end = read + targetString.length(); std::size_t successiveWhitespaceCount = 0; char32_t codePoint; for(;;) { if(unlikely(read >= end)) { if(unlikely(successiveWhitespaceCount >= 2)) { UnicodeHelper::WriteCodePoint(write, U' '); targetString.resize(write - reinterpret_cast(targetString.data())); } // Otherwise, even if final character was single whitespace, string is fine. return; } codePoint = UnicodeHelper::ReadCodePoint(read, end); if(unlikely(codePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } if(unlikely(ParserHelper::IsWhitespace(codePoint))) { ++successiveWhitespaceCount; } else if(unlikely(successiveWhitespaceCount >= 2)) { UnicodeHelper::WriteCodePoint(write, U' '); successiveWhitespaceCount = 0; break; // From here on out, we need to backshift the string } else { write = read; // Write pointer keeps tracking last non-whitespace character successiveWhitespaceCount = 0; } } // for ever // At this point: // - 'read' is on a non-whitespace character // - 'write' is at the current backshifting target position // - 'codePoint' contains one code point that yet needs to be backshifted UnicodeHelper::WriteCodePoint(write, codePoint); // Backshifting loop { char32_t whitespaceCodePoint = codePoint; while(likely(read < end)) { codePoint = UnicodeHelper::ReadCodePoint(read, end); if(unlikely(codePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } if(unlikely(ParserHelper::IsWhitespace(codePoint))) { whitespaceCodePoint = codePoint; ++successiveWhitespaceCount; } else { if(unlikely(successiveWhitespaceCount >= 2)) { // Normalize multiple whitespaces UnicodeHelper::WriteCodePoint(write, U' '); } else if(unlikely(successiveWhitespaceCount == 1)) { // Pass through single whitespace UnicodeHelper::WriteCodePoint(write, whitespaceCodePoint); } UnicodeHelper::WriteCodePoint(write, codePoint); successiveWhitespaceCount = 0; } } // while read characters remain if(unlikely(successiveWhitespaceCount >= 2)) { // Normalize multiple whitespaces into one UnicodeHelper::WriteCodePoint(write, U' '); } else if(unlikely(successiveWhitespaceCount == 1)) { // Pass through single whitespace UnicodeHelper::WriteCodePoint(write, whitespaceCodePoint); } targetString.resize(write - reinterpret_cast(targetString.data())); } // beauty scope } // ------------------------------------------------------------------------------------------- // /// Erases all first-level occurrences of the specified victim string /// Type of string the method will be working on /// /// Type of the UTF characters in the string, must be char8_t, char16_t or char32_t /// /// String in which victims will be erased /// String that will be erased from the target string template void eraseSubstrings(StringType &targetString, const StringType &victim) { using Nuclex::Support::Text::UnicodeHelper; using Nuclex::Support::Text::ParserHelper; // Gather some pointers for moving around in the substring for comparison const CharType *victimFromSecondCodePoint = ( reinterpret_cast(victim.c_str()) ); const CharType *victimEnd = ( victimFromSecondCodePoint + victim.length() ); if(victimFromSecondCodePoint >= victimEnd) { return; // victim is empty, we were asked to remove nothing, so we do nothing } char32_t firstCodePointOfVictim = UnicodeHelper::ReadCodePoint( victimFromSecondCodePoint, victimEnd ); if(unlikely(firstCodePointOfVictim == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } // CHECK: Should we optimize this to stop comparison when master < substring? // If there aren't enough characters left to fit the substring even once, // it cannot occur any more. But our expected typical use case is removal of // short tokens, so the additional check might even make things slower overall // We also need pointers into the master string so we can scan it for // occurrences of the substring and move characters to the left after removal. CharType *read = reinterpret_cast(targetString.data()); CharType *write = read; const CharType *end = read + targetString.length(); while(likely(read < end)) { char32_t currentCodePoint = UnicodeHelper::ReadCodePoint(read, end); if(unlikely(currentCodePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } // Once we encounter a character that matches the first character of the substring, // start comparing the rest of the substring to see if we have a match. if(unlikely(currentCodePoint == firstCodePointOfVictim)) { CharType *readForComparison = read; const CharType *victimCurrent = victimFromSecondCodePoint; while(likely(victimCurrent < victimEnd)) { if(readForComparison >= end) { break; // master string ended before full substring was compared } char32_t masterCodePoint = UnicodeHelper::ReadCodePoint(readForComparison, end); if(unlikely(masterCodePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } char32_t victimCodePoint = UnicodeHelper::ReadCodePoint( victimCurrent, victimEnd ); if(unlikely(victimCodePoint == char32_t(-1))) { throw Nuclex::Support::Errors::CorruptStringError(u8"Corrupt UTF-8 string"); } if(masterCodePoint != victimCodePoint) { break; // we found a difference, it doesn't match the full substring } } // If the full substring was matched if(victimCurrent == victimEnd) { read = readForComparison; // Skip over the substring } else { // Substring not matched, write character as normal UnicodeHelper::WriteCodePoint(write, currentCodePoint); } } else { // Character was not the start of the substring, write it as normal UnicodeHelper::WriteCodePoint(write, currentCodePoint); } } // Since the above loop keeps going until the end of the master string is reached, // in case substrings were found and skipped, it will already have moved all // of the remaining characters to the left, so the string contents are all in place. // We merely may need to tell the master string its new length in case it changed. if(read != write) { targetString.resize( write - reinterpret_cast(targetString.data()) ); } } // ------------------------------------------------------------------------------------------- // } // anonymous namespace namespace Nuclex { namespace Support { namespace Text { // ------------------------------------------------------------------------------------------- // void StringHelper::CollapseDuplicateWhitespace( std::string &utf8String, bool alsoTrim /* = true */ ) { if(alsoTrim) { collapseDuplicateWhitespaceAndTrim(utf8String); } else { collapseDuplicateWhitespaceWithoutTrim(utf8String); } } // ------------------------------------------------------------------------------------------- // void StringHelper::CollapseDuplicateWhitespace( std::wstring &wideString, bool alsoTrim /* = true */ ) { if(alsoTrim) { if constexpr(sizeof(std::wstring::value_type) == sizeof(char32_t)) { collapseDuplicateWhitespaceAndTrim(wideString); } else { collapseDuplicateWhitespaceAndTrim(wideString); } } else { if constexpr(sizeof(std::wstring::value_type) == sizeof(char32_t)) { collapseDuplicateWhitespaceWithoutTrim(wideString); } else { collapseDuplicateWhitespaceWithoutTrim(wideString); } } } // ------------------------------------------------------------------------------------------- // void StringHelper::EraseSubstrings( std::string &utf8String, const std::string &victim ) { eraseSubstrings(utf8String, victim); } // ------------------------------------------------------------------------------------------- // void StringHelper::EraseSubstrings( std::wstring &wideString, const std::wstring &victim ) { if constexpr(sizeof(std::wstring::value_type) == sizeof(char32_t)) { eraseSubstrings(wideString, victim); } else { eraseSubstrings(wideString, victim); } } // ------------------------------------------------------------------------------------------- // }}} // namespace Nuclex::Support::Text