Used the custom InputRangeMixin to lazily tokenize and normalize words of texts
ad-freiburg · Jan 2, 2025 · d0ec708 · d0ec708
1 parent 479b763
commit d0ec708
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 27 deletions.
diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
@@ -21,32 +21,6 @@
 #include "util/Conversions.h"
 #include "util/Simple8bCode.h"
 
-namespace {
-
-// Custom delimiter class for tokenization of literals using `absl::StrSplit`.
-// The `Find` function returns the next delimiter in `text` after the given
-// `pos` or an empty substring if there is no next delimiter.
-struct LiteralsTokenizationDelimiter {
-  absl::string_view Find(absl::string_view text, size_t pos) {
-    auto isWordChar = [](char c) -> bool { return std::isalnum(c); };
-    auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
-    if (found == text.end()) return text.substr(text.size());
-    return {found, found + 1};
-  }
-};
-
-cppcoro::generator<std::string> tokenizeAndNormalizeTextLine(
-    std::string_view lineView, LocaleManager localeManager) {
-  // Currently it is not possible to use std::views or std::ranges with the
-  // splitter object returned by absl::StrSplit. Every solution I have seen
-  // will remove the lazy nature of StrSplit and views/ranges. (2024-12-28)
-  for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{},
-                                  absl::SkipEmpty{})) {
-    co_yield localeManager.getLowercaseUtf8(word);
-  }
-}
-}  // namespace
-
 // _____________________________________________________________________________
 cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
     const std::string& contextFile, bool addWordsFromLiterals) {
@@ -79,7 +53,8 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));
       textView.remove_prefix(1);
-      for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) {
+      TokenizeAndNormalizeText normalizedWords(textView, localeManager);
+      for (auto word : normalizedWords) {
         WordsFileLine wordLine{word, false, contextId, 1};
         co_yield wordLine;
       }

diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
@@ -9,6 +9,25 @@
 #include "../util/Exception.h"
 #include "../util/StringUtils.h"
 
+// _____________________________________________________________________________
+void TokenizeAndNormalizeText::start() {
+  // Cache the normalized first token so that `get()` can return it; an empty
+  // optional makes `isFinished()` report that there are no tokens at all.
+  const bool exhausted = current_ == end_;
+  currentValue_ = exhausted ? std::nullopt
+                            : std::make_optional(normalizeToken(*current_));
+}
+
+// _____________________________________________________________________________
+void TokenizeAndNormalizeText::next() {
+  // Advance the splitter and cache the normalized next token; the optional
+  // becomes empty once the last token has been consumed.
+  ++current_;
+  const bool exhausted = current_ == end_;
+  currentValue_ = exhausted ? std::nullopt
+                            : std::make_optional(normalizeToken(*current_));
+}
+
 // _____________________________________________________________________________
 WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
                                                LocaleManager localeManager)

diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <absl/strings/str_split.h>
 #include <unicode/locid.h>
 
 #include <fstream>
@@ -28,6 +29,57 @@ struct DocsFileLine {
   DocumentIndex docId_;
 };
 
+// Delimiter for tokenizing literals with `absl::StrSplit`. `Find` returns the
+// first non-alphanumeric byte at or after `pos` (empty view at the end if
+// none). `unsigned char` avoids UB of `std::isalnum` on negative `char`.
+struct LiteralsTokenizationDelimiter {
+  absl::string_view Find(absl::string_view text, size_t pos) {
+    auto isWordChar = [](unsigned char c) { return std::isalnum(c) != 0; };
+    auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
+    if (found == text.end()) return text.substr(text.size());
+    return {found, found + 1};
+  }
+};
+
+// Input range that lazily splits `text` at non-alphanumeric bytes, skips empty
+// tokens, and yields the rest lowercased via `LocaleManager::getLowercaseUtf8`.
+// NOTE(review): presumably only views `text`, so it must outlive this object.
+class TokenizeAndNormalizeText
+    : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
+ public:
+  using StorageType = std::string;
+  explicit TokenizeAndNormalizeText(std::string_view text,
+                                    LocaleManager localeManager)
+      : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{},
+                                 absl::SkipEmpty{})},
+        current_{splitter_.begin()},
+        end_{splitter_.end()},
+        localeManager_(localeManager) {}
+
+  // Not copyable: `current_` and `end_` are iterators obtained from `splitter_`.
+  TokenizeAndNormalizeText() = delete;
+  TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete;
+  TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete;
+
+  void start();  // Loads the first token (defined in the .cpp file).
+  bool isFinished() const { return !currentValue_.has_value(); }
+  const StorageType& get() const { return *currentValue_; }  // UB if finished.
+  void next();   // Advances to the following token.
+
+ private:
+  using Splitter = decltype(absl::StrSplit(
+      std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{}));
+  Splitter splitter_;
+  Splitter::const_iterator current_;  // Initialized from `splitter_`: keep order.
+  Splitter::const_iterator end_;
+  std::optional<StorageType> currentValue_;  // Empty <=> no current token.
+  LocaleManager localeManager_;
+
+  std::string normalizeToken(std::string_view token) {
+    return localeManager_.getLowercaseUtf8(token);
+  }
+};
+
 class WordsAndDocsFileParser {
  public:
   explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,