From 9f9738c76d294482d57c250c464873eca6b4c983 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 11 Jan 2025 12:46:24 +0100 Subject: [PATCH] Addressed more SonarQube problems --- src/index/IndexImpl.Text.cpp | 86 +++++++++++++++++---------- src/index/IndexImpl.h | 14 ++++- src/parser/WordsAndDocsFileParser.cpp | 6 +- src/parser/WordsAndDocsFileParser.h | 4 +- 4 files changed, 71 insertions(+), 39 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 08afa9a11..a27a5a781 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -23,7 +23,7 @@ // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals) const { + std::string contextFile, bool addWordsFromLiterals) const { auto localeManager = textVocab_.getLocaleManager(); // ROUND 1: If context file aka wordsfile is not empty, read words from there. // Remember the last context id for the (optional) second round. @@ -62,6 +62,56 @@ cppcoro::generator IndexImpl::wordsInTextRecords( } } +// _____________________________________________________________________________ +void IndexImpl::processEntityCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& entitiesInContext, size_t& nofLiterals, + size_t& entityNotFoundErrorMsgCount) const { + VocabIndex eid; + // TODO Currently only IRIs and strings from the vocabulary can + // be tagged entities in the text index (no doubles, ints, etc). + if (getVocab().getId(line.word_, &eid)) { + // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have + // to be contiguous. + entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_; + if (line.isLiteralEntity_) { + ++nofLiterals; + } + } else { + logEntityNotFound(line.word_, entityNotFoundErrorMsgCount); + } +} + +// _____________________________________________________________________________ +void IndexImpl::processWordCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& wordsInContext) const { + // TODO Let the `textVocab_` return a `WordIndex` directly. + WordVocabIndex vid; + bool ret = textVocab_.getId(line.word_, &vid); + WordIndex wid = vid.get(); + if (!ret) { + LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" " + << "not found in textVocab. Terminating\n"; + AD_FAIL(); + } + wordsInContext[wid] += line.score_; +} + +// _____________________________________________________________________________ +void IndexImpl::logEntityNotFound(const string& word, + size_t& entityNotFoundErrorMsgCount) const { + if (entityNotFoundErrorMsgCount < 20) { + LOG(WARN) << "Entity from text not in KB: " << word << '\n'; + if (++entityNotFoundErrorMsgCount == 20) { + LOG(WARN) << "There are more entities not in the KB..." + << " suppressing further warnings...\n"; + } + } else { + entityNotFoundErrorMsgCount++; + } +} + // _____________________________________________________________________________ void IndexImpl::addTextFromContextFile(const string& contextFile, bool addWordsFromLiterals) { @@ -234,39 +284,11 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile, } if (line.isEntity_) { ++nofEntityPostings; - // TODO Currently only IRIs and strings from the vocabulary can - // be tagged entities in the text index (no doubles, ints, etc). - VocabIndex eid; - if (getVocab().getId(line.word_, &eid)) { - // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have - // to be contiguous. - entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_; - if (line.isLiteralEntity_) { - ++nofLiterals; - } - } else { - if (entityNotFoundErrorMsgCount < 20) { - LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n'; - if (++entityNotFoundErrorMsgCount == 20) { - LOG(WARN) << "There are more entities not in the KB..." - << " suppressing further warnings...\n"; - } - } else { - entityNotFoundErrorMsgCount++; - } - } + processEntityCaseDuringInvertedListProcessing( + line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount); } else { ++nofWordPostings; - // TODO Let the `textVocab_` return a `WordIndex` directly. - WordVocabIndex vid; - bool ret = textVocab_.getId(line.word_, &vid); - WordIndex wid = vid.get(); - if (!ret) { - LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" " - << "not found in textVocab. Terminating\n"; - AD_FAIL(); - } - wordsInContext[wid] += line.score_; + processWordCaseDuringInvertedListProcessing(line, wordsInContext); } } if (entityNotFoundErrorMsgCount > 0) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 1b491b04a..ac0003db8 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -522,7 +522,19 @@ class IndexImpl { // testing phase, once it works, it should be easy to include the IRIs and // literals from the external vocabulary as well). cppcoro::generator wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals) const; + std::string contextFile, bool addWordsFromLiterals) const; + + void processEntityCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& entitiesInContxt, size_t& nofLiterals, + size_t& entityNotFoundErrorMsgCount) const; + + void processWordCaseDuringInvertedListProcessing( + const WordsFileLine& line, + ad_utility::HashMap& wordsInContext) const; + + void logEntityNotFound(const string& word, + size_t& entityNotFoundErrorMsgCount) const; size_t processWordsForVocabulary(const string& contextFile, bool addWordsFromLiterals); diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp index d5756e01d..e7d36974c 100644 --- a/src/parser/WordsAndDocsFileParser.cpp +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -21,7 +21,7 @@ ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { string l; if (!std::getline(getInputStream(), l)) { return std::nullopt; - }; + } std::string_view lineView(l); size_t i = lineView.find('\t'); assert(i != string::npos); @@ -48,11 +48,11 @@ ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { // _____________________________________________________________________________ ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { - DocsFileLine line; string l; if (!std::getline(getInputStream(), l)) { return std::nullopt; - }; + } + DocsFileLine line; size_t i = l.find('\t'); assert(i != string::npos); line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index ebacfc2eb..1fc80523f 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -87,8 +87,6 @@ struct WordsFileLine { * TextRecordIndex as type of one column. Those get * mapped to the next bigger or equal docId which is * then used to extract the text from the docsDB. - * TODO: check if this behaviour is consistently - * implemented. * - string docContent_: The whole text given after the first tab of a line of * docsfile. */ @@ -145,7 +143,7 @@ class WordsAndDocsFileParser { protected: std::ifstream& getInputStream() { return in_; } - const LocaleManager& getLocaleManager() { return localeManager_; } + const LocaleManager& getLocaleManager() const { return localeManager_; } private: std::ifstream in_;