From 8a9c14023fe88e3c6688a37ed91477c30a4862bd Mon Sep 17 00:00:00 2001
From: Johannes Terblanche <6612981+Affie@users.noreply.github.com>
Date: Thu, 14 Nov 2024 07:58:01 +0200
Subject: [PATCH] Implement a BagOfWords tree and DB (#1026)

* Implement BagOfWords

Co-authored-by: Dehann Fourie <6412556+dehann@users.noreply.github.com>
---
 Project.toml                 |   5 +
 src/Caesar.jl                |   5 +-
 src/bagofwords/BagOfWords.jl | 342 +++++++++++++++++++++++++++++++++++
 3 files changed, 351 insertions(+), 1 deletion(-)
 create mode 100644 src/bagofwords/BagOfWords.jl

diff --git a/Project.toml b/Project.toml
index 7a639724f..3199331cb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.16.3"
 [deps]
 ApproxManifoldProducts = "9bbbb610-88a1-53cd-9763-118ce10c1f89"
 Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 ColorVectorSpace = "c3611d14-8923-5661-9e6a-0046d554d3a4"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -14,6 +15,7 @@ CoordinateTransformations = "150eb455-5306-5404-9cee-2592286d6298"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 DistributedFactorGraphs = "b5cc3c7e-6572-11e9-2517-99fb8daf2f04"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
@@ -23,6 +25,7 @@ FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
 GeometricalPredicates = "fd0ad045-b25c-564e-8f9c-8ef5c5f21267"
 GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
+Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 ImageDraw = "4381153b-2b60-58ae-a1ba-fd683676385f"
 IncrementalInference = "904591bb-b899-562f-9e6f-b8df64c7d480"
@@ -34,6 +37,7 @@ JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 KernelDensityEstimate = "2472808a-b354-52ea-a80e-1658a3c6056d"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Manifolds = "1cead3c2-87b3-11e9-0ccd-23c62b72b94e"
+MetaGraphsNext = "fa8bd995-216d-47f1-8a91-f3b68fbeb377"
 MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
 NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
@@ -46,6 +50,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 RoME = "91fb55c2-4c03-5a59-ba21-f4ea956187b8"
 Rotations = "6038ab10-8711-5258-84ad-4b1120ba62dc"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
diff --git a/src/Caesar.jl b/src/Caesar.jl
index 17c825eb3..1f2313cf2 100644
--- a/src/Caesar.jl
+++ b/src/Caesar.jl
@@ -73,6 +73,7 @@ include("services/WorkflowBlobNames.jl")
 include("services/PointUtils.jl")
 include("services/DataUtils.jl")
 include("services/UserFunctions.jl")
+include("services/ToImage.jl")
 
 # SAS-SLAM
 include("beamforming/czt.jl")
@@ -81,8 +82,10 @@ include("beamforming/MatchedFilter.jl")
 include("beamforming/SASBearing2D.jl")
 include("beamforming/SASUtils.jl")
 
-include("services/ToImage.jl")
+# Bag of words
+include("bagofwords/BagOfWords.jl")
 
+# manual type-implementation of Point Cloud Library
 include("3rdParty/_PCL/_PCL.jl")
 
 # object affordance work
diff --git a/src/bagofwords/BagOfWords.jl b/src/bagofwords/BagOfWords.jl
new file mode 100644
index 000000000..9dd1ffc1e
--- /dev/null
+++ b/src/bagofwords/BagOfWords.jl
@@ -0,0 +1,342 @@
+using MetaGraphsNext
+using Graphs
+using Clustering
+using StaticArrays
+using TensorCast
+using Distances
+using SparseArrays
+
+# References:
+# [SZ 2003]: Sivic and Zisserman, 2003, October. Video Google: A text retrieval approach to object matching in videos. In Proceedings ninth IEEE international conference on computer vision (pp. 1470-1477). IEEE.
+# [Wang 2011] Wang, X., Yang, M., Cour, T., Zhu, S., Yu, K., & Han, T. X. (2011, November). Contextual weighting for vocabulary tree based image retrieval. In 2011 International conference on computer vision (pp. 209-216). IEEE.
+# [Gálvez-López, 2012] Gálvez-López, D., & Tardos, J. D. (2012). Bags of binary words for fast place recognition in image sequences. IEEE Transactions on robotics, 28(5), 1188-1197.
+# [Nister, 2006] Nister, David, and Henrik Stewenius. "Scalable recognition with a vocabulary tree." 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06). Vol. 2. Ieee, 2006.
+
+## =============================================================================
+## Vocabulary
+## =============================================================================
+const Wordnode = @NamedTuple{
+    leaveId::Int, word::SVector{128, Float32}, count::Int, level::Int, weight::Float64}
+const Word = @NamedTuple{leaveId::Int, weight::Float64}
+function Base.zero(::Type{Wordnode})
+    (leaveId = 0, word = zeros(SVector{128, Float32}), count = 0, level = 0, weight = 0.0)
+end
+
+function add_voc_children!(tree, descriptors::Matrix, parent = 1, level = 0; progress)
+    children = Clustering.kmeans(descriptors, tree.graph_data[:n_children])
+    level += 1
+    for (i, (centre, cnt)) in enumerate(zip(eachcol(children.centers), children.counts))
+        next!(progress)
+        idx = nv(tree) + 1
+
+        # only create children if count is enough for n_children and this is not the last level
+        isLeaveNode = cnt < tree.graph_data[:n_children] ||
+                      level >= tree.graph_data[:n_levels]
+
+        if isLeaveNode
+            tree.graph_data[:n_leaves] += 1
+            leaveId = tree.graph_data[:n_leaves]
+        else
+            leaveId = 0
+        end
+
+        tree[idx] = (
+            leaveId = leaveId, word = centre, count = cnt, level = level, weight = 0.0)
+        add_edge!(tree, parent, idx)
+
+        if isLeaveNode
+            # push!(tree.graph_data[:leaveIds], idx)
+            continue
+        else
+            next_idxs = findall(children.assignments .== i)
+            add_voc_children!(tree, descriptors[:, next_idxs], idx, level; progress)
+        end
+    end
+
+    return nothing
+end
+
+function countOccurrence(voctree, image_descriptors)
+    image_word_count = [Set{Int}() for _ in 1:voctree.graph_data[:n_leaves]]
+    # FIXME make thread safe if needed for speed, got undef # Threads.@threads
+    for (idx, image_desc) in enumerate(image_descriptors)
+        img_words = getWords(voctree, image_desc)
+        for img_word in img_words
+            push!(image_word_count[img_word.leaveId], idx)
+        end
+    end
+    return length.(image_word_count)
+end
+
+# set inverse document frequency idf weights for the vocabulary [SZ 2003]
+function setVocabularyWeigths!(voctree, all_desc)
+    N = length(all_desc) # how many images used to train the vocabulary
+    occs = countOccurrence(voctree, all_desc)
+    _weights = map(occs) do n_i
+        n_i > 0 ? log(N / n_i) : 0.0 # [SZ 2003, Sec 4]; a word seen in no image gets 0, not Inf
+    end
+    @showprogress for l in labels(voctree)
+        node = voctree[l]
+        if node.leaveId > 0
+            voctree[l] = (node..., weight = _weights[node.leaveId])
+        end
+    end
+end
+
+"""
+    buildVocabulary(image_descriptors, n_children = 9, n_levels = 5)
+
+Build a vocabulary tree from a set of descriptors.
+image_descriptors is a vector of vectors of descriptors, each vector of descriptors is from one image.
+"""
+function buildVocabulary(image_descriptors::Vector, n_children = 9, n_levels = 5)
+    descriptors = reduce(vcat, image_descriptors)
+
+    # tree and root vertex
+    voctree = MetaGraph(
+        DiGraph(),
+        Int,
+        Wordnode,
+        Nothing,
+        Dict(
+            :n_children => n_children,
+            :n_levels => n_levels,
+            :n_leaves => 0
+        )
+    )
+
+    voctree[1] = (leaveId = 0, word = mean(descriptors),
+        count = length(descriptors), level = 0, weight = 0.0)
+    @cast desc_mat[j, i] := descriptors[i][j]
+
+    n_nodes = sum(n_children .^ (0:n_levels))
+    progress = Progress(n_nodes; dt = 1.0)
+
+    add_voc_children!(voctree, desc_mat; progress)
+
+    setVocabularyWeigths!(voctree, image_descriptors)
+
+    finish!(progress)
+    return voctree
+end
+
+## =============================================================================
+## Vocabulary Lookup
+## =============================================================================
+"""
+    getWord(tree, lookmeup, nodeIdx=1, level=0, MAX_LEVEL=tree.graph_data[:n_levels]; dist=Distances.Euclidean())
+
+Recursively traverse the vocabulary tree structure to find the closest word to `lookmeup` using a specified distance metric.
+
+# Arguments
+- `tree`: The vocabulary tree structure containing nodes with words.
+- `lookmeup`: The features to look up in the tree.
+- `nodeIdx`: The current node index (default is 1).
+- `level`: The current level in the tree (default is 0).
+- `MAX_LEVEL`: The maximum level to traverse in the tree (default is `tree.graph_data[:n_levels]`).
+- `dist`: The distance metric to use for comparison (default is `Distances.Euclidean()`).
+
+# Returns
+- A named tuple containing:
+    - `leaveId`: The ID of the leaf node.
+    - `weight`: The weight associated with the leaf node.
+"""
+function getWord(tree, lookmeup, nodeIdx = 1, level = 0, MAX_LEVEL = tree.graph_data[:n_levels];
+    dist = Distances.Euclidean()
+)
+    level += 1
+    children = outneighbors(tree, nodeIdx)
+    # @debug level nodeIdx children
+    if level <= MAX_LEVEL && !isempty(children)
+        dists = map(children) do i
+            dist(tree[i].word, lookmeup)
+        end
+        return getWord(tree, lookmeup, children[argmin(dists)], level, MAX_LEVEL; dist)
+    else
+        (; leaveId, weight) = tree[nodeIdx]
+        return (leaveId = leaveId, weight = weight)
+    end
+end
+
+"""
+    getWords(tree, lookupvec, nodeIdx=1, level=0, MAX_LEVEL=tree.graph_data[:n_levels]; dist=Distances.Euclidean())
+
+Given a vocabulary tree and a vector of lookup features, this function computes the corresponding words for each value in the lookup vector.
+
+# Arguments
+- `tree`: The tree structure containing the vocabulary.
+- `lookupvec`: A vector of values for which words need to be found.
+- `dist`: The distance metric to use (default is `Distances.Euclidean()`).
+
+# Returns
+- `words`: A vector of words corresponding to each value in the lookup vector.
+"""
+function getWords(tree,
+    lookupvec,
+    nodeIdx = 1,
+    level = 0,
+    MAX_LEVEL = tree.graph_data[:n_levels];
+    dist = Distances.Euclidean()
+)
+    # words = Vector{Wordnode}(undef, length(lookupvec))
+    # words = zeros(Wordnode, length(lookupvec))
+    words = Vector{Word}(undef, length(lookupvec))
+    Threads.@threads for idx in eachindex(lookupvec)
+        words[idx] = getWord(tree, lookupvec[idx], nodeIdx, level, MAX_LEVEL; dist)
+    end
+    return words
+end
+
+"""
+    getBowvector(voctree, image_words)
+
+Compute the Bag of Words (BoW) vector for a given image.
+
+# Arguments
+- `voctree`: The vocabulary tree used to generate the words.
+- `image_words`: The words extracted from the image.
+
+# Returns
+- A sparse vector representing the Term Frequency-Inverse Document Frequency (TF-IDF) of the image words.
+
+"""
+function getBowvector(voctree, image_words)
+    tfvec = spzeros(voctree[][:n_leaves])
+    idfvec = spzeros(voctree[][:n_leaves]) # IDF weights
+    # TODO can improve, bit inefficient, but easy
+    for (i, w) in image_words
+        # calculate n_id number of occurrences of word i in image d
+        tfvec[i] += 1
+        idfvec[i] = w
+    end
+    n_d = length(image_words) #total number of words in image d
+    # n_id/n_d*log(N/n_i), # [SZ 2003, Sec 4]
+    return (tfvec / n_d) .* idfvec # TF_IDF bowvec
+end
+
+"""
+    score_L1(v1, v2)
+
+Compute the L1 score between two vectors `v1` and `v2`.
+
+The L1 score is calculated as `1 - 0.5 * norm(v1 / norm(v1, 1) .- v2 / norm(v2, 1), 1)`, which measures the similarity between the two vectors.
+
+# Arguments
+- `v1::AbstractVector`: The first input vector.
+- `v2::AbstractVector`: The second input vector.
+
+# Returns
+- `Float64`: The L1 score between the two input vectors.
+
+# References
+- [Nister, 2006]
+- [Gálvez-López, 2012]
+"""
+function score_L1(v1, v2)
+    # [Gálvez-López, 2012] eq2 #TODO can optimize if needed with [Nister, 2006] eq 5
+    return 1 - 0.5 * norm(v1 / norm(v1, 1) .- v2 / norm(v2, 1), 1)
+end
+
+function score_L2(v1, v2)
+    # [Nister, 2006] eq 6
+    # return 2.0 - sqrt(1.0 - dot(v1, v2))
+    return 1 - 0.5 * norm(v1 / norm(v1) .- v2 / norm(v2))
+end
+
+function score_norm(p=2)
+    return (v1,v2) -> 1 - 0.5 * norm(v1 / norm(v1, p) .- v2 / norm(v2, p), p)
+end
+## =============================================================================
+## Image DB
+## =============================================================================
+#
+"""
+    createImageInverseIndex(voctree, image_descriptors)
+
+Create an image inverse index using Term Frequency-Inverse Document Frequency (TF-IDF) weighting.
+
+# Arguments
+- `voctree`: A vocabulary tree structure containing the graph data and other relevant information.
+- `image_descriptors`: A collection of image descriptors, where each descriptor is a pair consisting of an image identifier and its corresponding feature descriptors.
+
+# Returns
+- `image_index`: A sparse matrix where each column corresponds to an image and each row corresponds to a word in the vocabulary. The values are the TF-IDF weights.
+- `image_ids`: A vector of image identifiers corresponding to the columns of the `image_index`.
+"""
+function createImageInverseIndex(voctree, image_descriptors; dist = Distances.Euclidean())
+    #creation is a bit slower this way, but should be easier to create faster lookups
+    image_index = spzeros(voctree.graph_data[:n_leaves], length(image_descriptors))
+    @showprogress for (l, image_desc) in enumerate(image_descriptors)
+        img_words = getWords(voctree, image_desc.second; dist)
+        bow_vec = getBowvector(voctree, img_words)
+        for (i, bv) in zip(findnz(bow_vec)...)
+            image_index[i,l] = bv
+        end
+    end
+    return image_index, first.(image_descriptors)
+end
+
+function createImageInverseIndex_idf(voctree, image_descriptors)
+    image_index = [Tuple{Symbol, Float64}[] for _ in 1:voctree.graph_data[:n_leaves]]
+    word_index = Dict{Symbol, Vector{Int64}}()
+    for image_desc in image_descriptors
+        img_words = getWords(voctree, getproperty.(image_desc.second, :value))
+        for img_word in img_words
+            push!(image_index[img_word.leaveId], (image_desc.first, img_word.weight))
+            push!(get!(word_index, image_desc.first, Int[]), img_word.leaveId)
+        end
+    end
+    return image_index, word_index
+end
+
+
+## =============================================================================
+## Lookup
+## =============================================================================
+
+"""
+    findkImages_BF_binary(image_inverse_index, image_word; k=10)
+
+Finds the top `k` images that match the given `image_word` using a brute-force search using binary weights.
+
+# Returns
+- `Array`: An array of pairs where each pair consists of an image identifier and its corresponding score, sorted by score in descending order. Holds fewer than `k` pairs when fewer images matched.
+"""
+function findkImages_BF_binary(image_inverse_index, image_word; k=10)
+    worddict = OrderedDict{Symbol, Float64}()
+    # Threads.@threads
+    for words in image_word
+        poses = image_inverse_index[words[1]]
+        foreach(poses) do p
+            get!(worddict, p[1], 0)
+            worddict[p[1]] += 1
+        end
+    end
+    sort!(worddict; byvalue=true, rev=true)
+    return first(collect(pairs(worddict)), k) # at most k; no BoundsError when fewer images matched
+end
+
+"""
+    findkImages_BF(image_index, image_labels, bowvec, score=score_L1; k=10)
+
+Finds the top `k` images that best match the given bag-of-words vector (`bowvec`) using a brute-force approach.
+
+# Arguments
+- `image_index::Matrix{Float64}`: A matrix where each column represents the bag-of-words vector of an image in the DB.
+- `image_labels::Vector{Symbol}`: A vector containing the labels of the images in the DB.
+- `bowvec::Vector{Float64}`: The bag-of-words vector of the query image.
+- `score::Function`: A function to compute the similarity score between two bag-of-words vectors. Defaults to `score_L1`.
+- `k::Int`: The number of top matches to return. Defaults to 10.
+
+# Returns
+- `Vector{Pair{Symbol, Float64}}`: A vector of pairs where each pair consists of an image label and its corresponding similarity score, sorted in descending order of similarity. Only the top `k` matches are returned (fewer when the DB holds fewer than `k` images).
+"""
+function findkImages_BF(image_index, image_labels, bowvec, score=score_L1; k=10)
+    matches = Vector{Pair{Symbol, Float64}}(undef, size(image_index,2))
+    Threads.@threads for i in eachindex(matches)
+        vdb = image_index[:,i]
+        matches[i] = image_labels[i]=>score(vdb, bowvec)
+    end
+    sort!(matches, by=last, rev=true)
+    return first(matches, k) # at most k; no BoundsError when the DB has fewer than k images
+end