From d0ec708744a9f634bd53c54a8d8e563d4dd8c215 Mon Sep 17 00:00:00 2001
From: Felix Meisen
Date: Thu, 2 Jan 2025 11:22:24 +0100
Subject: [PATCH] Used the custom InputRangeMixin to lazily tokenize and normalize words of texts

---
 src/index/IndexImpl.Text.cpp          | 29 +-----------
 src/parser/WordsAndDocsFileParser.cpp | 19 ++++++++
 src/parser/WordsAndDocsFileParser.h   | 63 +++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 27 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 1bc6101c7..9f9e11ec6 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -21,32 +21,6 @@
 #include "util/Conversions.h"
 #include "util/Simple8bCode.h"
 
-namespace {
-
-// Custom delimiter class for tokenization of literals using `absl::StrSplit`.
-// The `Find` function returns the next delimiter in `text` after the given
-// `pos` or an empty substring if there is no next delimiter.
-struct LiteralsTokenizationDelimiter {
-  absl::string_view Find(absl::string_view text, size_t pos) {
-    auto isWordChar = [](char c) -> bool { return std::isalnum(c); };
-    auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
-    if (found == text.end()) return text.substr(text.size());
-    return {found, found + 1};
-  }
-};
-
-cppcoro::generator<std::string> tokenizeAndNormalizeTextLine(
-    std::string_view lineView, LocaleManager localeManager) {
-  // Currently it is not possible to use std::views or std::ranges with the
-  // splitter object returned by absl::StrSplit. Every solution I have seen
-  // will remove the lazy nature of StrSplit and views/ranges. (2024-12-28)
-  for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{},
-                                  absl::SkipEmpty{})) {
-    co_yield localeManager.getLowercaseUtf8(word);
-  }
-}
-}  // namespace
-
 // _____________________________________________________________________________
 cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
     const std::string& contextFile, bool addWordsFromLiterals) {
@@ -79,7 +53,8 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));
       textView.remove_prefix(1);
-      for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) {
+      TokenizeAndNormalizeText normalizedWords(textView, localeManager);
+      for (const auto& word : normalizedWords) {
         WordsFileLine wordLine{word, false, contextId, 1};
         co_yield wordLine;
       }
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index e45f47c53..b66932fbf 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -9,6 +9,25 @@
 #include "../util/Exception.h"
 #include "../util/StringUtils.h"
 
+// _____________________________________________________________________________
+void TokenizeAndNormalizeText::start() {
+  // Normalize the token `current_` points to, or clear the cached value when
+  // all tokens are consumed (this is what `isFinished()` checks).
+  if (current_ != end_) {
+    currentValue_ = normalizeToken(*current_);
+  } else {
+    currentValue_ = std::nullopt;
+  }
+}
+
+// _____________________________________________________________________________
+void TokenizeAndNormalizeText::next() {
+  ++current_;
+  // `start()` refreshes `currentValue_` from `current_`; reuse it instead of
+  // duplicating the branch.
+  start();
+}
+
 // _____________________________________________________________________________
 WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
                                                LocaleManager localeManager)
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index c72bf24d6..5216a29e1 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <absl/strings/str_split.h>
 #include <unicode/locid.h>
 
 #include <fstream>
@@ -28,6 +29,68 @@ struct DocsFileLine {
   DocumentIndex docId_;
 };
 
+// Custom delimiter class for tokenization of literals using `absl::StrSplit`.
+// The `Find` function returns the next delimiter in `text` after the given
+// `pos` or an empty substring if there is no next delimiter.
+struct LiteralsTokenizationDelimiter {
+  absl::string_view Find(absl::string_view text, size_t pos) {
+    // Take the byte as `unsigned char`: passing a negative `char` (any
+    // non-ASCII UTF-8 byte where `char` is signed) to `std::isalnum` is
+    // undefined behavior.
+    auto isWordChar = [](unsigned char c) { return std::isalnum(c) != 0; };
+    auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
+    if (found == text.end()) return text.substr(text.size());
+    return {found, found + 1};
+  }
+};
+
+// Lazily splits `text` into tokens with `LiteralsTokenizationDelimiter`
+// (empty tokens are skipped) and yields each token normalized by
+// `LocaleManager::getLowercaseUtf8`. `text` is taken as a non-owning view, so
+// the underlying characters must stay alive while iterating.
+class TokenizeAndNormalizeText
+    : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
+ public:
+  using StorageType = std::string;
+  explicit TokenizeAndNormalizeText(std::string_view text,
+                                    LocaleManager localeManager)
+      : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{},
+                                 absl::SkipEmpty{})},
+        current_{splitter_.begin()},
+        end_{splitter_.end()},
+        localeManager_(localeManager) {}
+
+  // Not default-constructible or copyable: `current_` and `end_` are
+  // iterators into this object's own `splitter_`.
+  TokenizeAndNormalizeText() = delete;
+  TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete;
+  TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete;
+
+ private:
+  using Splitter = decltype(absl::StrSplit(
+      std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{}));
+  Splitter splitter_;
+  Splitter::const_iterator current_;
+  Splitter::const_iterator end_;
+
+  // Normalized current token; `std::nullopt` once the input is exhausted.
+  std::optional<StorageType> currentValue_;
+
+  LocaleManager localeManager_;
+
+  std::string normalizeToken(std::string_view token) {
+    return localeManager_.getLowercaseUtf8(token);
+  }
+
+ public:
+  // Iteration primitives; presumably the hooks `InputRangeMixin` drives.
+  // `get()` must not be called once `isFinished()` returns true.
+  void start();
+  bool isFinished() const { return !currentValue_.has_value(); }
+  const StorageType& get() const { return *currentValue_; }
+  void next();
+};
+
 class WordsAndDocsFileParser {
  public:
   explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,