From 9f9738c76d294482d57c250c464873eca6b4c983 Mon Sep 17 00:00:00 2001
From: Felix Meisen <fesemeisen@outlook.de>
Date: Sat, 11 Jan 2025 12:46:24 +0100
Subject: [PATCH] Addressed more SonarQube problems

---
 src/index/IndexImpl.Text.cpp          | 86 +++++++++++++++++----------
 src/index/IndexImpl.h                 | 14 ++++-
 src/parser/WordsAndDocsFileParser.cpp |  6 +-
 src/parser/WordsAndDocsFileParser.h   |  4 +-
 4 files changed, 71 insertions(+), 39 deletions(-)
diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 08afa9a11..a27a5a781 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -23,7 +23,7 @@
 
 // _____________________________________________________________________________
 cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
-    const std::string& contextFile, bool addWordsFromLiterals) const {
+    std::string contextFile, bool addWordsFromLiterals) const {
   auto localeManager = textVocab_.getLocaleManager();
   // ROUND 1: If context file aka wordsfile is not empty, read words from there.
   // Remember the last context id for the (optional) second round.
@@ -62,6 +62,56 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
   }
 }
 
+// _____________________________________________________________________________
+void IndexImpl::processEntityCaseDuringInvertedListProcessing(
+    const WordsFileLine& line,
+    ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
+    size_t& entityNotFoundErrorMsgCount) const {
+  VocabIndex eid;
+  // TODO<joka921> Currently only IRIs and strings from the vocabulary can
+  // be tagged entities in the text index (no doubles, ints, etc).
+  if (getVocab().getId(line.word_, &eid)) {
+    // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
+    // to be contiguous.
+    entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
+    if (line.isLiteralEntity_) {
+      ++nofLiterals;
+    }
+  } else {
+    logEntityNotFound(line.word_, entityNotFoundErrorMsgCount);
+  }
+}
+
+// _____________________________________________________________________________
+void IndexImpl::processWordCaseDuringInvertedListProcessing(
+    const WordsFileLine& line,
+    ad_utility::HashMap<WordIndex, Score>& wordsInContext) const {
+  // TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
+  WordVocabIndex vid;
+  bool ret = textVocab_.getId(line.word_, &vid);
+  WordIndex wid = vid.get();
+  if (!ret) {
+    LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
+               << "not found in textVocab. Terminating\n";
+    AD_FAIL();
+  }
+  wordsInContext[wid] += line.score_;
+}
+
+// _____________________________________________________________________________
+void IndexImpl::logEntityNotFound(const string& word,
+                                  size_t& entityNotFoundErrorMsgCount) const {
+  if (entityNotFoundErrorMsgCount < 20) {
+    LOG(WARN) << "Entity from text not in KB: " << word << '\n';
+    if (++entityNotFoundErrorMsgCount == 20) {
+      LOG(WARN) << "There are more entities not in the KB..."
+                << " suppressing further warnings...\n";
+    }
+  } else {
+    entityNotFoundErrorMsgCount++;
+  }
+}
+
 // _____________________________________________________________________________
 void IndexImpl::addTextFromContextFile(const string& contextFile,
                                        bool addWordsFromLiterals) {
@@ -234,39 +284,11 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
     }
     if (line.isEntity_) {
       ++nofEntityPostings;
-      // TODO<joka921> Currently only IRIs and strings from the vocabulary can
-      // be tagged entities in the text index (no doubles, ints, etc).
-      VocabIndex eid;
-      if (getVocab().getId(line.word_, &eid)) {
-        // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
-        // to be contiguous.
-        entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
-        if (line.isLiteralEntity_) {
-          ++nofLiterals;
-        }
-      } else {
-        if (entityNotFoundErrorMsgCount < 20) {
-          LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n';
-          if (++entityNotFoundErrorMsgCount == 20) {
-            LOG(WARN) << "There are more entities not in the KB..."
-                      << " suppressing further warnings...\n";
-          }
-        } else {
-          entityNotFoundErrorMsgCount++;
-        }
-      }
+      processEntityCaseDuringInvertedListProcessing(
+          line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
     } else {
       ++nofWordPostings;
-      // TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
-      WordVocabIndex vid;
-      bool ret = textVocab_.getId(line.word_, &vid);
-      WordIndex wid = vid.get();
-      if (!ret) {
-        LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
-                   << "not found in textVocab. Terminating\n";
-        AD_FAIL();
-      }
-      wordsInContext[wid] += line.score_;
+      processWordCaseDuringInvertedListProcessing(line, wordsInContext);
     }
   }
   if (entityNotFoundErrorMsgCount > 0) {
diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
index 1b491b04a..ac0003db8 100644
--- a/src/index/IndexImpl.h
+++ b/src/index/IndexImpl.h
@@ -522,7 +522,19 @@ class IndexImpl {
   // testing phase, once it works, it should be easy to include the IRIs and
   // literals from the external vocabulary as well).
   cppcoro::generator<WordsFileLine> wordsInTextRecords(
-      const std::string& contextFile, bool addWordsFromLiterals) const;
+      std::string contextFile, bool addWordsFromLiterals) const;
+
+  void processEntityCaseDuringInvertedListProcessing(
+      const WordsFileLine& line,
+      ad_utility::HashMap<Id, Score>& entitiesInContxt, size_t& nofLiterals,
+      size_t& entityNotFoundErrorMsgCount) const;
+
+  void processWordCaseDuringInvertedListProcessing(
+      const WordsFileLine& line,
+      ad_utility::HashMap<WordIndex, Score>& wordsInContext) const;
+
+  void logEntityNotFound(const string& word,
+                         size_t& entityNotFoundErrorMsgCount) const;
 
   size_t processWordsForVocabulary(const string& contextFile,
                                    bool addWordsFromLiterals);
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index d5756e01d..e7d36974c 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -21,7 +21,7 @@ ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
   string l;
   if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
-  };
+  }
   std::string_view lineView(l);
   size_t i = lineView.find('\t');
   assert(i != string::npos);
@@ -48,11 +48,11 @@ ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
 
 // _____________________________________________________________________________
 ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
-  DocsFileLine line;
   string l;
   if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
-  };
+  }
+  DocsFileLine line;
   size_t i = l.find('\t');
   assert(i != string::npos);
   line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index ebacfc2eb..1fc80523f 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -87,8 +87,6 @@ struct WordsFileLine {
  *                         TextRecordIndex as type of one column. Those get
  *                         mapped to the next bigger or equal docId which is
  *                         then used to extract the text from the docsDB.
- *                         TODO: check if this behaviour is consistently
- *                         implemented.
  * - string docContent_: The whole text given after the first tab of a line of
  *                       docsfile.
  */
@@ -145,7 +143,7 @@ class WordsAndDocsFileParser {
 
  protected:
   std::ifstream& getInputStream() { return in_; }
-  const LocaleManager& getLocaleManager() { return localeManager_; }
+  const LocaleManager& getLocaleManager() const { return localeManager_; }
 
  private:
   std::ifstream in_;