From 8a0c7074081fd6e63fbd3f0db87a5c45334bf693 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 13 Oct 2023 13:23:01 -0700 Subject: [PATCH] Improve: wrap-around indexing in the hash-table --- .vscode/launch.json | 4 +-- c/test.c | 6 ++-- docs/compilation.md | 4 ++- include/usearch/index_plugins.hpp | 55 ++++++++++++++++++++----------- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 4c791b4e..931cd104 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,7 +5,7 @@ "name": "Unit Test C++", "type": "cppdbg", "request": "launch", - "program": "${workspaceFolder}/build_debug/test", + "program": "${workspaceFolder}/build_debug/test_cpp", "cwd": "${workspaceFolder}", "environment": [ { @@ -29,7 +29,7 @@ "windows": { "preLaunchTask": "Windows Build Debug", "type": "cppvsdbg", - "program": "${workspaceFolder}/build_debug/test.exe" + "program": "${workspaceFolder}/build_debug/test_cpp.exe" } }, { diff --git a/c/test.c b/c/test.c index cd5a8753..96e08212 100644 --- a/c/test.c +++ b/c/test.c @@ -142,9 +142,8 @@ void test_find_vector(size_t const collection_size, size_t const dimensions) { // Find the vectors for (size_t i = 0; i < collection_size; i++) { - size_t found_count = - usearch_search(index, data + i * dimensions, usearch_scalar_f32_k, collection_size, keys, distances, &error); - printf("count %i : %i cs %i d\n", (int)found_count, (int)collection_size, (int)dimensions); + size_t found_count = usearch_search(index, data + i * dimensions, usearch_scalar_f32_k, collection_size, keys, + distances, &error); ASSERT(!error, error); ASSERT(found_count >= 1 && found_count <= collection_size, "Vector is missing"); } @@ -184,7 +183,6 @@ void test_get_vector(size_t const collection_size, size_t const dimensions) { // Retrieve vectors from index size_t found_count = usearch_get(index, key, collection_size, vectors, usearch_scalar_f32_k, &error); - printf("found_count %zu : %zu cs %zu d\n", found_count, collection_size, dimensions); ASSERT(found_count == collection_size, "Vector is missing"); free(vectors); diff --git a/docs/compilation.md b/docs/compilation.md index 6f7999e4..80380e4a 100644 --- a/docs/compilation.md +++ b/docs/compilation.md @@ -54,7 +54,9 @@ cppcheck --enable=all --force --suppress=cstyleCast --suppress=unusedFunction \ Testing: ```sh -cmake -DCMAKE_BUILD_TYPE=Debug -B ./build_debug && make -C ./build_debug && ./build_debug/test_cpp +cmake -DUSEARCH_BUILD_TEST_CPP=1 -B ./build_debug +cmake --build ./build_debug --config Debug +./build_debug/test_cpp ``` ## Python 3 diff --git a/include/usearch/index_plugins.hpp b/include/usearch/index_plugins.hpp index bba10f71..5de7caf6 100644 --- a/include/usearch/index_plugins.hpp +++ b/include/usearch/index_plugins.hpp @@ -1732,6 +1732,7 @@ class exact_search_t { * @section Layout * * For every slot we store 2 extra bits for 3 possible states: empty, populated, or deleted. + * With linear probing the hashes at the end of the populated region will spill into its first half. */ template > class flat_hash_multi_set_gt { @@ -1950,18 +1951,15 @@ class flat_hash_multi_set_gt { using pointer = element_t*; using reference = element_t&; - equal_iterator_gt(std::size_t index, flat_hash_multi_set_gt* parent, const query_at& query, - const equals_t& equals) + equal_iterator_gt(std::size_t index, flat_hash_multi_set_gt* parent, query_at const& query, + equals_t const& equals) : index_(index), parent_(parent), query_(query), equals_(equals) {} // Pre-increment equal_iterator_gt& operator++() { do { - ++index_; - if (index_ >= parent_->capacity_slots_) { - break; - } - } while (!equals_(parent_->slot_ref(index_).element, query_) || + index_ = (index_ + 1) & (parent_->capacity_slots_ - 1); + } while (!equals_(parent_->slot_ref(index_).element, query_) && !(parent_->slot_ref(index_).header.populated & parent_->slot_ref(index_).mask)); return *this; } @@ -1986,6 +1984,12 @@ class flat_hash_multi_set_gt { equals_t equals_; // Store the equals functor }; + /** + * @brief Returns an iterator range of all elements matching the given query. + * + * Technically, the second iterator points to the first empty slot after a + * range of equal values and non-equal values with similar hashes. + */ template std::pair, equal_iterator_gt> equal_range(query_at const& query) const noexcept { @@ -1998,28 +2002,41 @@ class flat_hash_multi_set_gt { hash_t hasher; std::size_t hash_value = hasher(query); - std::size_t slot_index = hash_value & (capacity_slots_ - 1); - std::size_t const start_index = slot_index; - std::size_t first_equal_index = capacity_slots_; + std::size_t first_equal_index = hash_value & (capacity_slots_ - 1); + std::size_t const start_index = first_equal_index; // Linear probing to find the first equal element do { - slot_ref_t slot = slot_ref(slot_index); + slot_ref_t slot = slot_ref(first_equal_index); if (slot.header.populated & ~slot.header.deleted & slot.mask) { - if (equals(slot.element, query)) { - first_equal_index = slot_index; + if (equals(slot.element, query)) break; - } } // Stop if we find an empty slot else if (~slot.header.populated & slot.mask) - break; + return {end, end}; // Move to the next slot - slot_index = (slot_index + 1) & (capacity_slots_ - 1); - } while (slot_index != start_index); + first_equal_index = (first_equal_index + 1) & (capacity_slots_ - 1); + } while (first_equal_index != start_index); - return {equal_iterator_gt(first_equal_index, this_ptr, query, equals), end}; + // If no matching element was found, return end iterators + if (first_equal_index == capacity_slots_) + return {end, end}; + + // Start from the first matching element and find the end of the populated range + std::size_t first_empty_index = first_equal_index; + do { + first_empty_index = (first_empty_index + 1) & (capacity_slots_ - 1); + slot_ref_t slot = slot_ref(first_empty_index); + + // If we find an empty slot, this is our end + if (~slot.header.populated & slot.mask) + break; + } while (first_empty_index != start_index); + + return {equal_iterator_gt(first_equal_index, this_ptr, query, equals), + equal_iterator_gt(first_empty_index, this_ptr, query, equals)}; } template bool pop_first(similar_at&& query, element_t& popped_value) noexcept { @@ -2065,7 +2082,7 @@ class flat_hash_multi_set_gt { equals_t equals; std::size_t hash_value = hasher(query); std::size_t slot_index = hash_value & (capacity_slots_ - 1); // Assuming capacity_slots_ is a power of 2 - std::size_t start_index = slot_index; // To detect loop in probing + std::size_t const start_index = slot_index; // To detect loop in probing std::size_t count = 0; // Count of elements removed // Linear probing to find all matches