Automatically identify and fix spelling errors using codespell. (#1368)

Add a GitHub action and a locally installable pre-commit hook that runs the spell checker `codespell` on the codebase and fails if codespell identifies possible spelling errors. Also fix all the errors that the initial run of codespell detected. Should the tool have too many false positives in the future, we can still deactivate it and run it manually every now and then, but after a review of the initial suggestions I (@joka921) don't consider that to be likely.
ad-freiburg · Jun 14, 2024 · f9e730c · f9e730c
1 parent e45f499
commit f9e730c
Show file tree

Hide file tree

Showing 132 changed files with 343 additions and 292 deletions.
diff --git a/.codespellrc b/.codespellrc
@@ -0,0 +1,8 @@
+[codespell]
+# Ref: https://github.com/codespell-project/codespell#using-a-config-file
+skip = .git*,.codespellrc,*.pdf,generated
+check-hidden = true
+# Ignore mixedCase variables, lines with latin, lines with codespell-ignore pragma, etc
+ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*
+# alph - is used frequently in tests, just ignore altogether
+ignore-words-list = ser,alph,inbetween,interm
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
@@ -0,0 +1,25 @@
+# Codespell configuration is within .codespellrc
+---
+name: Codespell
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+permissions:
+  contents: read
+
+jobs:
+  codespell:
+    name: Check for spelling errors
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Annotate locations with typos
+        uses: codespell-project/codespell-problem-matcher@v1
+      - name: Codespell
+        uses: codespell-project/actions-codespell@v2
diff --git a/.pre-commit-config-local.yaml b/.pre-commit-config-local.yaml
@@ -14,3 +14,9 @@ repos:
         args: [ "-style=file" ]
         require_serial: false
 
+
+  - repo: https://github.com/codespell-project/codespell
+    # Configuration for codespell is in .codespellrc
+    rev: v2.2.6
+    hooks:
+    - id: codespell
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,3 +14,9 @@ repos:
     hooks:
       - id: clang-format
         'types_or': [ c++, c ]
+
+  - repo: https://github.com/codespell-project/codespell
+    # Configuration for codespell is in .codespellrc
+    rev: v2.2.6
+    hooks:
+    - id: codespell
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -348,7 +348,7 @@ message(STATUS ---)
 include_directories(src)
 
 # Run the script `CompilationInfo.cmake` that creates the file `CompilationInfo.cpp`
-# with the current git hash and the curent time and date. When specifying
+# with the current git hash and the current time and date. When specifying
 # `-DDONT_UPDATE_COMPILATION_INFO=true` as an argument to `cmake`, the compilation info is
 # never updated. This is useful during development to avoid a relinking of the binaries for
 # every compilation.

diff --git a/Dockerfile b/Dockerfile
@@ -41,7 +41,7 @@ ENV MEMORY_FOR_QUERIES 70
 ENV CACHE_MAX_SIZE_GB 30
 ENV CACHE_MAX_SIZE_GB_SINGLE_ENTRY 5
 ENV CACHE_MAX_NUM_ENTRIES 1000
-# Need the shell to get the INDEX_PREFIX envirionment variable
+# Need the shell to get the INDEX_PREFIX environment variable
 ENTRYPOINT ["/bin/sh", "-c", "exec ServerMain -i \"/index/${INDEX_PREFIX}\" -j 8 -m ${MEMORY_FOR_QUERIES} -c ${CACHE_MAX_SIZE_GB} -e ${CACHE_MAX_SIZE_GB_SINGLE_ENTRY} -k ${CACHE_MAX_NUM_ENTRIES} -p 7001 \"$@\"", "--"]
 
 # Build image:  docker build -t qlever.master .

diff --git a/Dockerfiles/Dockerfile.Ubuntu18.04 b/Dockerfiles/Dockerfile.Ubuntu18.04
@@ -49,7 +49,7 @@ ENV MEMORY_FOR_QUERIES 70
 ENV CACHE_MAX_SIZE_GB 30
 ENV CACHE_MAX_SIZE_GB_SINGLE_ENTRY 5
 ENV CACHE_MAX_NUM_ENTRIES 1000
-# Need the shell to get the INDEX_PREFIX envirionment variable
+# Need the shell to get the INDEX_PREFIX environment variable
 ENTRYPOINT ["/bin/sh", "-c", "exec ServerMain -i \"/index/${INDEX_PREFIX}\" -j 8 -m ${MEMORY_FOR_QUERIES} -c ${CACHE_MAX_SIZE_GB} -e ${CACHE_MAX_SIZE_GB_SINGLE_ENTRY} -k ${CACHE_MAX_NUM_ENTRIES} -p 7001 \"$@\"", "--"]
 
 # Build image:  docker build -t qlever.master .

diff --git a/Dockerfiles/Dockerfile.Ubuntu20.04 b/Dockerfiles/Dockerfile.Ubuntu20.04
@@ -45,7 +45,7 @@ ENV MEMORY_FOR_QUERIES 70
 ENV CACHE_MAX_SIZE_GB 30
 ENV CACHE_MAX_SIZE_GB_SINGLE_ENTRY 5
 ENV CACHE_MAX_NUM_ENTRIES 1000
-# Need the shell to get the INDEX_PREFIX envirionment variable
+# Need the shell to get the INDEX_PREFIX environment variable
 ENTRYPOINT ["/bin/sh", "-c", "exec ServerMain -i \"/index/${INDEX_PREFIX}\" -j 8 -m ${MEMORY_FOR_QUERIES} -c ${CACHE_MAX_SIZE_GB} -e ${CACHE_MAX_SIZE_GB_SINGLE_ENTRY} -k ${CACHE_MAX_NUM_ENTRIES} -p 7001 \"$@\"", "--"]
 
 # Build image:  docker build -t qlever.master .

diff --git a/benchmark/BenchmarkExamples.cpp b/benchmark/BenchmarkExamples.cpp
@@ -62,7 +62,7 @@ class ConfigOptions : public BenchmarkInterface {
                          "allowed for the configuration option \"num-signs\".",
                          numSigns);
 
-    manager.addOption("coin-flip-try", "The number of succesful coin flips.",
+    manager.addOption("coin-flip-try", "The number of successful coin flips.",
                       &wonOnTryX_, {false, false, false, false, false});
 
     // Sub manager can be used to organize things better. They are basically
@@ -94,7 +94,7 @@ class BMSingleMeasurements : public ConfigOptions {
       exponentiate(number);
     });
     auto& multipleTimes = results.addMeasurement(
-        "Recursivly exponentiate multiple times", [&number, &exponentiate]() {
+        "Recursively exponentiate multiple times", [&number, &exponentiate]() {
           size_t toExponentiate = number;
           for (size_t i = 0; i < 10'000'000'000; i++) {
             toExponentiate = exponentiate(toExponentiate);

diff --git a/benchmark/JoinAlgorithmBenchmark.cpp b/benchmark/JoinAlgorithmBenchmark.cpp
@@ -130,7 +130,7 @@ struct SetOfIdTableColumnElements {
 /*
 @brief Create an overlap between the join columns of the IdTables, by randomly
 choosing distinct elements from the join column of the smaller table and
-overiding all their occurrences in the join column with randomly choosen
+overriding all their occurrences in the join column with randomly chosen
 distinct elements from the join column of the bigger table.
 
 @param smallerTable The table, where distinct join column elements will be
@@ -167,7 +167,7 @@ static size_t createOverlapRandomly(IdTableAndJoinColumn* const smallerTable,
   // Collect and count the table elements.
   SetOfIdTableColumnElements biggerTableJoinColumnSet(biggerTableJoinColumnRef);
 
-  // Seeds for the random generators, so that things are less similiar.
+  // Seeds for the random generators, so that things are less similar.
   const std::array<ad_utility::RandomSeed, 2> seeds =
       createArrayOfRandomSeeds<2>(std::move(randomSeed));
 
@@ -229,7 +229,7 @@ static size_t createOverlapRandomly(IdTableAndJoinColumn* const smallerTable,
           Assign the value to itself.
           This is needed, because so we can indirectly track the first
           'encounter' with an distinct element in the smaller table and save
-          ressources.
+          resources.
           */
           smallerTableElementToNewElement.emplace(id, id);
         }
@@ -275,7 +275,7 @@ static size_t createOverlapRandomly(IdTableAndJoinColumn* const smallerTable,
   SetOfIdTableColumnElements smallerTableJoinColumnSet(
       smallerTableJoinColumnRef);
 
-  // Seeds for the random generators, so that things are less similiar.
+  // Seeds for the random generators, so that things are less similar.
   const std::array<ad_utility::RandomSeed, 2> seeds =
       createArrayOfRandomSeeds<2>(std::move(randomSeed));
 
@@ -304,7 +304,7 @@ static size_t createOverlapRandomly(IdTableAndJoinColumn* const smallerTable,
         const auto& biggerTableId{biggerTableJoinColumnSet.uniqueElements_.at(
             randomBiggerTableElement())};
 
-        // Skip this possibilty, if we have an overflow.
+        // Skip this possibility, if we have an overflow.
         size_t newMatches{
             smallerTableJoinColumnSet.numOccurrences_.at(smallerTableId)};
         if (const size_t numOccurencesBiggerTable{
@@ -338,12 +338,12 @@ static size_t createOverlapRandomly(IdTableAndJoinColumn* const smallerTable,
 }
 
 /*
-The columns of the automaticly generated benchmark tables contain the following
-informations:
+The columns of the automatically generated benchmark tables contain the
+following information:
 - The parameter, that changes with every row.
 - Time needed for sorting `IdTable`s.
 - Time needed for merge/galloping join.
-- Time needed for sorting and merge/galloping added togehter.
+- Time needed for sorting and merge/galloping added together.
 - Time needed for the hash join.
 - How many rows the result of joining the tables has.
 - How much faster the hash join is. For example: Two times faster.
@@ -390,7 +390,7 @@ concept exactlyOneGrowthFunction =
     ((growthFunction<Ts, size_t> || growthFunction<Ts, float>)+...) == 1;
 
 /*
-@brief Calculates the smalles whole exponent $n$, so that $base^n$ is equal, or
+@brief Calculates the smallest whole exponent $n$, so that $base^n$ is equal, or
 bigger, than the `startingPoint`.
 */
 template <std::convertible_to<double> T>
@@ -530,7 +530,7 @@ struct ConfigVariables {
   `IdTables` gets bigger with every row, while the other attributes stay the
   same.
   For the attributes, that don't stay the same, inclusive boundaries are
-  defined. Sometimes implicitely via other configuration option. (With the
+  defined. Sometimes implicitly via other configuration option. (With the
   exception of the sample size ratios for the special benchmarking class
   `BmSampleSizeRatio`.)
 
@@ -759,7 +759,7 @@ class GeneralInterfaceImplementation : public BenchmarkInterface {
     @brief The generated lambda returns true, iff if it is called with a value,
     that is bigger than the given minimum value
 
-    @param canBeEqual If true, the generated lamba also returns true, if the
+    @param canBeEqual If true, the generated lambda also returns true, if the
     values are equal.
     */
     auto generateBiggerEqualLambda = []<typename T>(const T& minimumValue,
@@ -1107,7 +1107,7 @@ class GeneralInterfaceImplementation : public BenchmarkInterface {
   - Return values of the parameter, you gave a function for.
   - Time needed for sorting `IdTable`s.
   - Time needed for merge/galloping join.
-  - Time needed for sorting and merge/galloping added togehter.
+  - Time needed for sorting and merge/galloping added together.
   - Time needed for the hash join.
   - How many rows the result of joining the tables has.
   - How much faster the hash join is. For example: Two times faster.
@@ -1145,7 +1145,7 @@ class GeneralInterfaceImplementation : public BenchmarkInterface {
   to be exactly the same.
   @param randomSeed Seed for the random generators.
   @param smallerTableSorted, biggerTableSorted Should the bigger/smaller table
-  be sorted by his join column before being joined? More specificly, some
+  be sorted by his join column before being joined? More specifically, some
   join algorithm require one, or both, of the IdTables to be sorted. If this
   argument is false, the time needed for sorting the required table will
   added to the time of the join algorithm.
@@ -1200,7 +1200,7 @@ class GeneralInterfaceImplementation : public BenchmarkInterface {
     // Returns the first argument, that is a growth function.
     auto returnFirstGrowthFunction =
         [&isGrowthFunction]<typename... Ts>(Ts&... args) -> auto& {
-      // Put them into a tuple, so that we can easly look them up.
+      // Put them into a tuple, so that we can easily look them up.
       auto tup = std::tuple<Ts&...>{AD_FWD(args)...};
 
       // Get the index of the first growth function.
@@ -1479,7 +1479,7 @@ class GeneralInterfaceImplementation : public BenchmarkInterface {
                       biggerTableJoinColumnSampleSizeRatio)) -
         1;
 
-    // Seeds for the random generators, so that things are less similiar
+    // Seeds for the random generators, so that things are less similar
     // between the tables.
     const std::array<ad_utility::RandomSeed, 5> seeds =
         createArrayOfRandomSeeds<5>(std::move(randomSeed));
@@ -1891,7 +1891,7 @@ class BmSampleSizeRatio final : public GeneralInterfaceImplementation {
           "'max-memory' and 'bigger-table-num-columns'";
     } else {
       smallerTableNumRowsDescription =
-          "divison of 'max-bigger-table-rows' with 'min-ratio-rows'";
+          "division of 'max-bigger-table-rows' with 'min-ratio-rows'";
       smallerTableNumRowsConfigurationOptions = "'max-bigger-table-rows'";
     }
 
@@ -1918,7 +1918,7 @@ class BmSampleSizeRatio final : public GeneralInterfaceImplementation {
     };
 
     /*
-    Calculate the expexcted number of rows in the result for the simplified
+    Calculate the expected number of rows in the result for the simplified
     creation model of input tables join columns and overlaps, with the biggest
     sample size ratio used for both input tables. The simplified creation model
     assumes, that:
@@ -1927,12 +1927,13 @@ class BmSampleSizeRatio final : public GeneralInterfaceImplementation {
     overlaps are inserted later.
     - The join column entries of the smaller table have a uniform distribution,
     are made up out of the join column elements of both tables (smaller and
-    bigger) and the generation of one row entry is independet from the
+    bigger) and the generation of one row entry is independent from the
     generation of all other row entries.
     - The join column entries of the bigger table have a uniform distribution,
     are made up out of only the elements of the bigger tables and the generation
-    of one row entry is independet from the generation of all other row entries.
-    - The generation of join column entries in the smaller table is independet
+    of one row entry is independent from the generation of all other row
+    entries.
+    - The generation of join column entries in the smaller table is independent
     from the generation in the bigger table.
 
     Note: In reality, the set of possible join column entries for the smaller
@@ -2155,7 +2156,7 @@ class BmSmallerTableGrowsBiggerTableRemainsSameSize final
   - Number of rows in the smaller table.
   - Time needed for sorting `IdTable`s.
   - Time needed for merge/galloping join.
-  - Time needed for sorting and merge/galloping added togehter.
+  - Time needed for sorting and merge/galloping added together.
   - Time needed for the hash join.
   - How many rows the result of joining the tables has.
   - How much faster the hash join is. For example: Two times faster.
@@ -2165,7 +2166,7 @@ class BmSmallerTableGrowsBiggerTableRemainsSameSize final
   @param tableDescriptor A identifier for the to be created benchmark table, so
   that it can be easier identified later.
   @param smallerTableSorted, biggerTableSorted Should the bigger/smaller table
-  be sorted by his join column before being joined? More specificly, some
+  be sorted by his join column before being joined? More specifically, some
   join algorithm require one, or both, of the IdTables to be sorted. If this
   argument is false, the time needed for sorting the required table will
   added to the time of the join algorithm.

diff --git a/benchmark/Usage.md b/benchmark/Usage.md
@@ -68,7 +68,7 @@ BenchmarkResults runAllBenchmarks(){
   /*
   Create an empty table with a number of rows and columns. Doesn't measure anything.
   The number of columns can not be changed after creation, but the number of rows can.
-  Important: The row names aren't saved in a seperate container, but INSIDE the
+  Important: The row names aren't saved in a separate container, but INSIDE the
   first column of the table.
   */
   auto& table = results.addTable(identifier, {"rowName1", "rowName2", "etc."},

diff --git a/benchmark/infrastructure/Benchmark.h b/benchmark/infrastructure/Benchmark.h
@@ -98,7 +98,7 @@ class BenchmarkResults {
   }
 
   /*
-   * @brief Returns a vector of all the singe measurements.
+   * @brief Returns a vector of all the single measurements.
    */
   std::vector<ResultEntry> getSingleMeasurements() const;
 
@@ -185,7 +185,7 @@ class BenchmarkInterface {
   const ad_utility::ConfigManager& getConfigManager() const;
 
   /*
-  @brief Only used for manipulaton via the infrastructure. Is called directly
+  @brief Only used for manipulation via the infrastructure. Is called directly
   before `runAllBenchmarks`.
 
   Add/update the default metadata of the benchmark class. Currently
@@ -205,7 +205,7 @@ class BenchmarkRegister {
   using BenchmarkPointer = std::unique_ptr<BenchmarkInterface>;
 
   /*
-  Static vector of all registered benchmark classe instances.
+  Static vector of all registered benchmark class instances.
    */
   inline static std::vector<BenchmarkPointer> registeredBenchmarks{};
 
@@ -216,7 +216,7 @@ class BenchmarkRegister {
    *  implemented the `BenchmarkInterface`. Shouldn't take up much space
    *  and I couldn't find a better way of doing it.
    *
-   * @param benchmarkClasseInstance The memory managment of the passed
+   * @param benchmarkClasseInstance The memory management of the passed
    *  instances will be taken over by `BenchmarkRegister`.
    */
   explicit BenchmarkRegister(BenchmarkPointer&& benchmarkClasseInstance);
@@ -230,9 +230,9 @@ class BenchmarkRegister {
   /*
   @brief For each registered benchmark:
   - Update the default class metadata.
-  - Run the measurments.
+  - Run the measurements.
 
-  @return Every benchmark class get's measured with their own
+  @return Every benchmark class gets measured with their own
   `BenchmarkResults`. They should be in the same order as the registrations.
   */
   static std::vector<BenchmarkResults> runAllRegisteredBenchmarks();
@@ -247,7 +247,7 @@ class BenchmarkRegister {
 Macros for easier registering of benchmark classes.
 `declareRegisterVariable` and `declareRegisterVariableHidden` are needed for
 the implementation. Only `registerBenchmark` needs to be 'called', when one
-want's to register a benchmark class.
+wants to register a benchmark class.
 */
 #define AD_DECLARE_REGISTER_VARIABLE_HIDDEN(line, benchmarkClass, ...) \
   static BenchmarkRegister gRegisterVariable##benchmarkClass##line{    \