Skip to content

Commit

Permalink
Used the custom InputRangeMixin to lazily tokenize and normalize word…
Browse files Browse the repository at this point in the history
…s of texts
  • Loading branch information
Flixtastic committed Jan 2, 2025
1 parent 479b763 commit d0ec708
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 27 deletions.
29 changes: 2 additions & 27 deletions src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,6 @@
#include "util/Conversions.h"
#include "util/Simple8bCode.h"

namespace {

// Custom delimiter class for tokenization of literals using `absl::StrSplit`.
// The `Find` function returns the first delimiter in `text` at or after the
// given `pos` (a one-byte substring holding a non-alphanumeric byte), or an
// empty substring positioned at the end of `text` if there is no further
// delimiter.
struct LiteralsTokenizationDelimiter {
  absl::string_view Find(absl::string_view text, size_t pos) const {
    // `std::isalnum` has undefined behavior when its argument is neither EOF
    // nor representable as `unsigned char`. A plain `char` may be signed, so
    // bytes >= 0x80 (e.g. parts of multi-byte UTF-8 sequences) must be
    // converted to `unsigned char` before classification.
    // NOTE(review): the classification depends on the current C locale; in the
    // default "C" locale all non-ASCII bytes count as delimiters, so words
    // with non-ASCII letters are split. Confirm that this is intended.
    auto isWordChar = [](unsigned char c) -> bool {
      return std::isalnum(c) != 0;
    };
    auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
    if (found == text.end()) return text.substr(text.size());
    // The delimiter is exactly the one byte at `found`.
    return text.substr(static_cast<size_t>(found - text.begin()), 1);
  }
};

// Lazily split `lineView` into words (pieces separated by
// `LiteralsTokenizationDelimiter`, empty pieces are skipped) and yield each
// word after passing it through `localeManager.getLowercaseUtf8`.
cppcoro::generator<std::string> tokenizeAndNormalizeTextLine(
    std::string_view lineView, LocaleManager localeManager) {
  // Currently it is not possible to use std::views or std::ranges with the
  // splitter object returned by absl::StrSplit. Every solution I have seen
  // will remove the lazy nature of StrSplit and views/ranges. (2024-12-28)
  auto words = absl::StrSplit(lineView, LiteralsTokenizationDelimiter{},
                              absl::SkipEmpty{});
  for (auto it = words.begin(); it != words.end(); ++it) {
    co_yield localeManager.getLowercaseUtf8(*it);
  }
}
} // namespace

// _____________________________________________________________________________
cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals) {
Expand Down Expand Up @@ -79,7 +53,8 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) {
TokenizeAndNormalizeText normalizedWords(textView, localeManager);
for (auto word : normalizedWords) {
WordsFileLine wordLine{word, false, contextId, 1};
co_yield wordLine;
}
Expand Down
19 changes: 19 additions & 0 deletions src/parser/WordsAndDocsFileParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,25 @@
#include "../util/Exception.h"
#include "../util/StringUtils.h"

// _____________________________________________________________________________
// Initialize the iteration: if the splitter yields at least one token, store
// its normalized form in `currentValue_`; otherwise mark the range as
// finished by resetting `currentValue_` to `std::nullopt`.
void TokenizeAndNormalizeText::start() {
  if (current_ != end_) {
    currentValue_ = normalizeToken(*current_);
  } else {
    // No tokens at all (e.g. empty input text).
    // NOTE: the coverage check on the original commit reported this branch as
    // not covered by tests.
    currentValue_ = std::nullopt;
  }
}

// _____________________________________________________________________________
void TokenizeAndNormalizeText::next() {
  // Step to the following token of the splitter.
  ++current_;

  // Past the last token: an empty `currentValue_` signals that the range is
  // finished.
  if (current_ == end_) {
    currentValue_ = std::nullopt;
    return;
  }

  // Otherwise cache the normalized form of the new current token.
  currentValue_ = normalizeToken(*current_);
}

// _____________________________________________________________________________
WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager)
Expand Down
52 changes: 52 additions & 0 deletions src/parser/WordsAndDocsFileParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include <absl/strings/str_split.h>
#include <unicode/locid.h>

#include <fstream>
Expand All @@ -28,6 +29,57 @@ struct DocsFileLine {
DocumentIndex docId_;
};

// Custom delimiter class for tokenization of literals using `absl::StrSplit`.
// The `Find` function returns the first delimiter in `text` at or after the
// given `pos` (a one-byte substring holding a non-alphanumeric byte), or an
// empty substring positioned at the end of `text` if there is no further
// delimiter.
struct LiteralsTokenizationDelimiter {
  absl::string_view Find(absl::string_view text, size_t pos) const {
    // `std::isalnum` has undefined behavior when its argument is neither EOF
    // nor representable as `unsigned char`. A plain `char` may be signed, so
    // bytes >= 0x80 (e.g. parts of multi-byte UTF-8 sequences) must be
    // converted to `unsigned char` before classification.
    // NOTE(review): the classification depends on the current C locale; in the
    // default "C" locale all non-ASCII bytes count as delimiters, so words
    // with non-ASCII letters are split. Confirm that this is intended.
    auto isWordChar = [](unsigned char c) -> bool {
      return std::isalnum(c) != 0;
    };
    auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
    if (found == text.end()) return text.substr(text.size());
    // The delimiter is exactly the one byte at `found`.
    return text.substr(static_cast<size_t>(found - text.begin()), 1);
  }
};

// Input range over the words of a text. The text is split with
// `absl::StrSplit` using `LiteralsTokenizationDelimiter` (empty pieces are
// skipped), and each word is passed through `LocaleManager::getLowercaseUtf8`
// only when the iteration reaches it.
// NOTE(review): the splitter is built from a `std::string_view`, so presumably
// the viewed text must outlive this object - confirm against callers.
class TokenizeAndNormalizeText
    : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
 public:
  // Type of the elements this range hands out.
  using StorageType = std::string;

  // Set up the splitter for `text`; no token is normalized before `start()`
  // is called.
  explicit TokenizeAndNormalizeText(std::string_view text,
                                    LocaleManager localeManager)
      : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{},
                                 absl::SkipEmpty{})},
        current_{splitter_.begin()},
        end_{splitter_.end()},
        localeManager_(localeManager) {}

  // Delete unsafe constructors. `current_` and `end_` are iterators obtained
  // from this object's own `splitter_`, so a copied object would presumably
  // hold iterators that still refer to the source object's splitter.
  TokenizeAndNormalizeText() = delete;
  TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete;
  TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete;

  // Iteration protocol (presumably the functions that
  // `ad_utility::InputRangeMixin` expects from the derived class).
  // `start()` and `next()` are defined in the corresponding .cpp file.
  void start();
  void next();
  // The range is exhausted as soon as no normalized token is cached.
  bool isFinished() const { return currentValue_ == std::nullopt; }
  // Return the cached normalized token. Must only be called while
  // `isFinished()` is false.
  const StorageType& get() const { return *currentValue_; }

 private:
  // The exact type returned by `absl::StrSplit` for our argument types.
  using Splitter = decltype(absl::StrSplit(
      std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{}));

  // NOTE: The declaration order matters, because `current_` and `end_` are
  // initialized from `splitter_`.
  Splitter splitter_;
  Splitter::const_iterator current_;
  Splitter::const_iterator end_;

  // Normalized form of the token at `current_`, or `std::nullopt` when there
  // is no current token.
  std::optional<StorageType> currentValue_;

  LocaleManager localeManager_;

  // Normalize a single token via the stored `LocaleManager`.
  std::string normalizeToken(std::string_view token) {
    return localeManager_.getLowercaseUtf8(token);
  }
};

class WordsAndDocsFileParser {
public:
explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
Expand Down

0 comments on commit d0ec708

Please sign in to comment.