#pragma region CPL License /* Nuclex Native Framework Copyright (C) 2002-2023 Nuclex Development Labs This library is free software; you can redistribute it and/or modify it under the terms of the IBM Common Public License as published by the IBM Corporation; either version 1.0 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the IBM Common Public License for more details. You should have received a copy of the IBM Common Public License along with this library */ #pragma endregion // CPL License // If the library is compiled as a DLL, this ensures symbols are exported #define NUCLEX_SUPPORT_SOURCE 1 #include "IniDocumentModel.FileParser.h" #include "Nuclex/Support/Text/ParserHelper.h" #include // for std::unique_ptr, std::align() #include // for std::is_base_of #include // for std::copy_n() #include // for assert() // Ambiguous cases and their resolution: // // ["Hello]" -> Malformed // [World -> Malformed // [Foo] = Bar -> Assignment, no section // [Woop][Woop] -> Two sections, one w/newline one w/o // [Foo] Bar = Baz -> Section and assignment // [[Yay] -> Malformed, section // Foo = Bar = Baz -> Malformed // [Yay = Nay] -> Malformed // "Hello -> Malformed // Foo = [Bar] -> Assignment, no section // Foo = ]][Bar -> Assignment // "Foo" Bar = Baz -> Malformed // Foo = "Bar" Baz -> Malformed // // Allocation schemes: // // By line -> lots of micro-allocations // In blocks (custom allocator) -> I have to do reference counting to free anything // Load pre-alloc, then by line -> Fast for typical case, no or few micro-allocations // But requires pre-scan of entire file + more code namespace { // ------------------------------------------------------------------------------------------- // /// Size if the chunks in which memory is allocated const std::size_t AllocationChunkSize = 4096; // bytes // ------------------------------------------------------------------------------------------- // /// Determines the size of a type plus padding for another aligned member /// Type whose size plus padding will be determined /// The size of the type plus padding with another aligned member template constexpr std::size_t getSizePlusAlignmentPadding() { constexpr std::size_t misalignment = (sizeof(T) % alignof(T)); if constexpr(misalignment > 0) { return sizeof(T) + (alignof(T) - misalignment); } else { return sizeof(T); } } // ------------------------------------------------------------------------------------------- // } // anonymous namespace namespace Nuclex { namespace Support { namespace Settings { // ------------------------------------------------------------------------------------------- // IniDocumentModel::FileParser::FileParser( const std::uint8_t *fileContents, std::size_t byteCount ) : target(nullptr), remainingChunkByteCount(0), currentSection(nullptr), fileBegin(fileContents), fileEnd(fileContents + byteCount), parsePosition(nullptr), lineStart(nullptr), nameStart(nullptr), nameEnd(nullptr), valueStart(nullptr), valueEnd(nullptr), sectionFound(false), equalsSignFound(false), lineIsMalformed(false), windowsLineBreaks(0), blankLines(0), paddedAssignments(0) {} // ------------------------------------------------------------------------------------------- // void IniDocumentModel::FileParser::ParseInto(IniDocumentModel *documentModel) { this->target = documentModel; // Reset the parser, just in case someone re-uses an instance resetState(); this->currentSection = nullptr; // These are only to collect heuristics on the loaded .ini file's formatting // They are not used for the parser state. bool previousWasCR = false; bool previousWasSpace = false; bool encounteredNonBlankCharacter = false; bool previousLineWasEmpty = false; //bool previousWasEqualsSign = false; // Go through the entire file contents byte-by-byte and select the correct parse // mode for the elements we encounter. All of these characters are in the ASCII range, // thus there are no UTF-8 sequences that could be mistaken for them (multi-byte UTF-8 // codepoints will have the highest bit set in all bytes) this->parsePosition = this->lineStart = this->fileBegin; while(this->parsePosition < this->fileEnd) { std::uint8_t current = *this->parsePosition; switch(current) { // Comments (any section or property already found still counts) case '#': case ';': { parseComment(); break; } // Equals sign, line is a property assignment case '=': { if(equalsSignFound) { parseMalformedLine(); } else { if(this->parsePosition > this->lineStart) { previousWasSpace = Text::ParserHelper::IsWhitespace( static_cast(*(this->parsePosition - 1)) ); } if(previousWasSpace) { ++this->paddedAssignments; } else { --this->paddedAssignments; } this->equalsSignFound = true; ++this->parsePosition; } break; } // Line break, submits the current line to the document model case '\n': { if(previousWasCR) { ++this->windowsLineBreaks; } else { --this->windowsLineBreaks; } submitLine(); // Update heuristics if(previousLineWasEmpty) { ++this->blankLines; } else { --this->blankLines; } previousLineWasEmpty = !encounteredNonBlankCharacter; encounteredNonBlankCharacter = false; break; } // Other character, parse as section name, property name or property value default: { previousWasCR = (current == '\r'); previousWasSpace = Text::ParserHelper::IsWhitespace(static_cast(current)); encounteredNonBlankCharacter |= (!previousWasSpace); if(previousWasSpace) { ++this->parsePosition; // skip over it } else if(equalsSignFound) { parseValue(); } else { parseName(); } break; } } // switch on current byte } // while parse position is before end of file // Even if the file's last line didn't end with a line break, // we still treat it as a line of its own if(this->parsePosition > this->lineStart) { submitLine(); } } // ------------------------------------------------------------------------------------------- // void IniDocumentModel::FileParser::parseComment() { while(this->parsePosition < this->fileEnd) { std::uint8_t current = *this->parsePosition; if(current == '\n') { //submitLine(); break; } else { // Skip everything that isn't a newline character ++this->parsePosition; } } } // ------------------------------------------------------------------------------------------- // void IniDocumentModel::FileParser::parseName() { bool isInQuote = false; bool quoteEncountered = false; bool isInSection = false; while(this->parsePosition < this->fileEnd) { std::uint8_t current = *this->parsePosition; // When inside a quote, ignore everything but the closing quote // (or newline / end-of-file which are handled in all cases) if(isInQuote) { nameEnd = this->parsePosition; // Quotes name includes anything until closing quote switch(current) { case '"': { isInQuote = false; break; } case '\n': { // Newline without closing quote? -> Line is malformed this->lineIsMalformed = true; return; } } isInQuote = (current != '"'); nameEnd = this->parsePosition; } else { // Outside of quote switch(current) { // Comment start found? case ';': case '#': { parseMalformedLine(); // Name without equals sign? -> Line is malformed return; } // Section start found? case '[': { if((this->nameStart != nullptr) || isInSection) { // Bracket is not first char? parseMalformedLine(); return; } else if(this->sectionFound) { // Did we already see a section in this line? submitLine(); } isInSection = true; //nameStart = this->parsePosition + 1; break; } // Section end found? case ']': { if((this->nameStart == nullptr) || !isInSection) { // Bracket is first char? parseMalformedLine(); return; } isInSection = false; //this->nameEnd = this->parsePosition; this->sectionFound = true; break; } // Quoted name found? case '"': { if((this->nameStart != nullptr) || quoteEncountered) { // Quote is not first char? parseMalformedLine(); return; } else { // Quote is first char encountered quoteEncountered = true; isInQuote = true; nameStart = this->parsePosition + 1; } break; } // Equals sign found? The name part is over, assignment follows case '=': { if(isInSection) { // Equals sign inside section name? -> line is malformed parseMalformedLine(); } // Just return, the root parser will set the equalsSignFound property. return; } // Newline found? Either the section was closed or the line is malformed. case '\n': { this->lineIsMalformed |= isInSection; return; } // Other characters without special meaning default: { if(!Text::ParserHelper::IsWhitespace(static_cast(current))) { if(quoteEncountered) { // Characters after quote? -> line is malformed parseMalformedLine(); return; } if(nameStart == nullptr) { nameStart = this->parsePosition; } nameEnd = this->parsePosition + 1; } break; } } // switch on current byte } // is outside of quote ++this->parsePosition; } // while parse position is before end of file } // ------------------------------------------------------------------------------------------- // void IniDocumentModel::FileParser::parseValue() { bool isInQuote = false; bool quoteEncountered = false; while(this->parsePosition < this->fileEnd) { std::uint8_t current = *this->parsePosition; // When inside a quote, ignore everything but the closing quote // (or newline / end-of-file which are handled in all cases) if(isInQuote) { valueEnd = this->parsePosition; // Quotes name includes anything until closing quote switch(current) { case '"': { isInQuote = false; break; } case '\n': { // Newline without closing quote? -> Line is malformed this->lineIsMalformed = true; return; } } } else { // Outside of quote switch(current) { // Comment start found? case ';': case '#': { parseComment(); return; } // Quoted value found? case '"': { if((this->valueStart != nullptr) || quoteEncountered) { // Quote is not first char? parseMalformedLine(); return; } else { // Quote is first char encountered quoteEncountered = true; isInQuote = true; valueStart = this->parsePosition + 1; } break; } // Another equals sign found? -> line is malformed case '=': { parseMalformedLine(); return; } // Newline found? The value ends, we're done case '\n': { return; } // Other characters without special meaning default: { if(!Text::ParserHelper::IsWhitespace(static_cast(current))) { if(quoteEncountered) { // Characters after quote? -> line is malformed parseMalformedLine(); return; } if(valueStart == nullptr) { valueStart = this->parsePosition; } valueEnd = this->parsePosition + 1; } break; } } // switch on current byte } // is outside of quote ++this->parsePosition; } // while parse position is before end of file } // ------------------------------------------------------------------------------------------- // void IniDocumentModel::FileParser::parseMalformedLine() { this->lineIsMalformed = true; while(this->parsePosition < this->fileEnd) { std::uint8_t current = *this->parsePosition; if(current == '\n') { break; } ++this->parsePosition; } } // ------------------------------------------------------------------------------------------- // void IniDocumentModel::FileParser::submitLine() { ++this->parsePosition; Line *newLine; if(this->lineIsMalformed) { newLine = allocateLineChunked( this->lineStart, this->parsePosition - this->lineStart ); } else if(this->equalsSignFound) { newLine = generatePropertyLine(); } else if(this->sectionFound) { newLine = generateSectionLine(); } else { newLine = allocateLineChunked( this->lineStart, this->parsePosition - this->lineStart ); } // If this is the first line we submit to the document model, // initialize the firstLine attribute so the file can be serialized top-to-bottom if(this->target->firstLine == nullptr) { this->target->firstLine = newLine; newLine->Previous = newLine; newLine->Next = newLine; } else { Line *lastLine = this->target->firstLine->Previous; newLine->Next = this->target->firstLine; newLine->Previous = lastLine; lastLine->Next = newLine; this->target->firstLine->Previous = newLine; } // The currentSection and index work is done by the generatePropertyLine() // and generateSectionLine() methods, so we're already done here! resetState(); } // ------------------------------------------------------------------------------------------- // IniDocumentModel::PropertyLine *IniDocumentModel::FileParser::generatePropertyLine() { PropertyLine *newPropertyLine = allocateLineChunked( this->lineStart, this->parsePosition - this->lineStart ); // Initialize the property value. This will allow the document model to look up // and read or write the property's value quickly when accessed by the user. if((this->valueStart != nullptr) && (this->valueEnd != nullptr)) { newPropertyLine->ValueStartIndex = this->valueStart - this->lineStart; newPropertyLine->ValueLength = this->valueEnd - this->valueStart; } else { newPropertyLine->ValueStartIndex = 0; newPropertyLine->ValueLength = 0; } // Place the property name in the declaration line and also properly initialize // a string we can use to look up or insert this property into the index. std::string propertyName; { if((this->nameStart != nullptr) && (this->nameEnd != nullptr)) { newPropertyLine->NameStartIndex = this->nameStart - this->lineStart; newPropertyLine->NameLength = this->nameEnd - this->nameStart; propertyName.assign(nameStart, nameEnd); } else { newPropertyLine->NameStartIndex = 0; newPropertyLine->NameLength = 0; // intentionally leaves propertyName as an empty string } } // Add the new property to the index so it can be looked up by name if(this->currentSection == nullptr) { this->currentSection = getOrCreateDefaultSection(); } if(this->currentSection->LastLine == nullptr) { this->currentSection->LastLine = newPropertyLine; } this->currentSection->Properties.insert( PropertyMap::value_type(propertyName, newPropertyLine) ); return newPropertyLine; } // ------------------------------------------------------------------------------------------- // IniDocumentModel::SectionLine *IniDocumentModel::FileParser::generateSectionLine() { SectionLine *newSectionLine = allocateLineChunked( this->lineStart, this->parsePosition - this->lineStart ); // Place the section name in the declaration line and also properly initialize // a string we can use to look up or insert this section into the index. std::string sectionName; { if((this->nameStart != nullptr) && (this->nameEnd != nullptr)) { newSectionLine->NameStartIndex = this->nameStart - this->lineStart; newSectionLine->NameLength = this->nameEnd - this->nameStart; sectionName.assign(nameStart, nameEnd); } else { newSectionLine->NameStartIndex = 0; newSectionLine->NameLength = 0; // intentionally leaves sectionName as an empty string } } // Update the currentSection attribute to SectionMap::iterator sectionIterator = this->target->sections.find(sectionName); if(sectionIterator == this->target->sections.end()) { IndexedSection *newSection = allocateChunked(0); new(newSection) IndexedSection(); newSection->DeclarationLine = newSectionLine; newSection->LastLine = newSectionLine; this->target->sections.insert( SectionMap::value_type(sectionName, newSection) ); this->currentSection = newSection; } else { // If a section appears twice or multiple .inis are loaded this->currentSection = sectionIterator->second; } this->currentSection->LastLine = newSectionLine; return newSectionLine; } // ------------------------------------------------------------------------------------------- // IniDocumentModel::IndexedSection *IniDocumentModel::FileParser::getOrCreateDefaultSection() { SectionMap::iterator sectionIterator = this->target->sections.find(std::string()); if(sectionIterator == this->target->sections.end()) { IndexedSection *newSection = allocateChunked(0); new(newSection) IndexedSection(); this->target->sections.insert( SectionMap::value_type(std::string(), newSection) ); return newSection; } else { return sectionIterator->second; } } // ------------------------------------------------------------------------------------------- // void IniDocumentModel::FileParser::resetState() { this->lineStart = this->parsePosition; this->nameStart = this->nameEnd = nullptr; this->valueStart = this->valueEnd = nullptr; this->sectionFound = this->equalsSignFound = this->lineIsMalformed = false; } // ------------------------------------------------------------------------------------------- // template TLine *IniDocumentModel::FileParser::allocateLineChunked( const std::uint8_t *contents, std::size_t byteCount ) { static_assert(std::is_base_of::value && u8"TLine inherits from Line"); // Allocate memory for a new line, assign its content pointer to hold // the line loaded from the .ini file and copy the line contents into it. TLine *newLine = allocateChunked(byteCount); { newLine->Contents = ( reinterpret_cast(newLine) + getSizePlusAlignmentPadding() ); newLine->Length = byteCount; std::copy_n(contents, byteCount, newLine->Contents); } return newLine; } // ------------------------------------------------------------------------------------------- // template T *IniDocumentModel::FileParser::allocateChunked(std::size_t extraByteCount /* = 0 */) { // While we're asked to allocate a specific type, making extra bytes available // requires us to allocate as std::uint8_t. The start address still needs to be // appropriately aligned for the requested type (otherwise we'd have to keep // separate pointers for delete[] and for the allocated type). #if defined(__STDCPP_DEFAULT_NEW_ALIGNMENT__) static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ >= alignof(T)); #endif // Try to obtain the requested memory. If it is larger than half the allocation chunk // size, it gets its own special allocation. Otherwise, it either fits in the current // chunk or we need to start a new one. The alignment of the extra bytes is only for // good manners but uses the same alignment as members of type T, relative to the start // address of type T, which is also aligned, so we don't need to look at the pointer. std::size_t totalByteCount = getSizePlusAlignmentPadding() + extraByteCount; if(totalByteCount * 2 < AllocationChunkSize) { // Calculate the offset within the chunk at which the new instance would start. // Since the chunk itself is already aligned (__STDCPP_DEFAULT_NEW_ALIGNMENT__), // we don't have to even look at the memory address itself. std::size_t occupiedByteCount = AllocationChunkSize - this->remainingChunkByteCount; { std::size_t misalignment = occupiedByteCount % alignof(T); if(misalignment > 0) { occupiedByteCount += alignof(T) - misalignment; } } // If the new instance fits into the current chunk, place it there. if(occupiedByteCount + totalByteCount < AllocationChunkSize) { this->remainingChunkByteCount = AllocationChunkSize - occupiedByteCount - totalByteCount; std::size_t chunkCount = this->target->loadedLinesMemory.size(); std::uint8_t *memory = this->target->loadedLinesMemory[chunkCount - 1]; return reinterpret_cast(memory + occupiedByteCount); } else { // Instance didn't fit in the current chunk or no chunk allocated std::unique_ptr newChunk(new std::uint8_t[AllocationChunkSize]); this->target->loadedLinesMemory.push_back(newChunk.get()); this->remainingChunkByteCount = AllocationChunkSize - totalByteCount; return reinterpret_cast(newChunk.release()); } } else { // Requested instance would take half the allocation chunk size or more std::unique_ptr newChunk(new std::uint8_t[totalByteCount]); this->target->createdLinesMemory.insert(newChunk.get()); return reinterpret_cast(newChunk.release()); } } // ------------------------------------------------------------------------------------------- // }}} // namespace Nuclex::Support::Settings