Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-58901: Introduce support for BM25 scoring #2113

Open
wants to merge 26 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
bbe4ae7
hacky start
metonymic-smokey Dec 5, 2024
a679009
use ctx in term srch
metonymic-smokey Dec 5, 2024
2d8a43d
field cardinality temp save
Thejas-bhat Dec 6, 2024
52b1768
average doc length stat for a field
Thejas-bhat Dec 6, 2024
42082f8
bm25 scoring first implementation
Thejas-bhat Dec 6, 2024
a52bd49
notes and keep the default tf-idf stuff
Thejas-bhat Dec 6, 2024
36159b6
bug fixes and BM25 UT pass
Thejas-bhat Dec 10, 2024
f3424b5
making bm25 presearch (i.e. global scoring) optional
Thejas-bhat Dec 10, 2024
d393616
field mapping to capture type of scoring; bm25 by default
Thejas-bhat Dec 11, 2024
55e63fd
bug fixes, unit test fixes
Thejas-bhat Dec 11, 2024
04e1e72
cleanup/refactor
Thejas-bhat Dec 12, 2024
ab58975
bug fixes
Thejas-bhat Dec 12, 2024
dbed957
fix scatter-gather path
Thejas-bhat Dec 13, 2024
52e318d
bug fixes after merge conflict resolution
Thejas-bhat Jan 2, 2025
36db386
score explanation
Thejas-bhat Jan 6, 2025
e83cca0
default similarity config for an index
Thejas-bhat Jan 6, 2025
a643a3b
cleanup
Thejas-bhat Jan 6, 2025
b5a7c9b
keeping scoring as an index level config for consistency
Thejas-bhat Jan 7, 2025
7c4873c
Upgrade bleve_index_api, scorch_segment_api, zapx
abhinavdangeti Jan 7, 2025
12c2c72
Bump up zapx's v11, v12, v13, v14, v15 on account of interface change
abhinavdangeti Jan 8, 2025
ce537e6
code comments and handling edge case
Thejas-bhat Jan 9, 2025
79bd0c1
unit tests fix
Thejas-bhat Jan 9, 2025
8cdb525
cleanup?
Thejas-bhat Jan 9, 2025
d478f4f
code comment, exposing the multipliers to be made configurable
Thejas-bhat Jan 10, 2025
eaca63a
update score explanation, code cleanup
Thejas-bhat Jan 13, 2025
fbd4ed8
update links
Thejas-bhat Jan 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,26 @@ go 1.21
require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/bits-and-blooms/bitset v1.12.0
github.com/blevesearch/bleve_index_api v1.2.0
github.com/blevesearch/bleve_index_api v1.2.1
github.com/blevesearch/geo v0.1.20
github.com/blevesearch/go-faiss v1.0.24
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
github.com/blevesearch/go-porterstemmer v1.0.3
github.com/blevesearch/goleveldb v1.0.1
github.com/blevesearch/gtreap v0.1.1
github.com/blevesearch/scorch_segment_api/v2 v2.3.0
github.com/blevesearch/scorch_segment_api/v2 v2.3.1
github.com/blevesearch/segment v0.9.1
github.com/blevesearch/snowball v0.6.1
github.com/blevesearch/snowballstem v0.9.0
github.com/blevesearch/stempel v0.2.0
github.com/blevesearch/upsidedown_store_api v1.0.2
github.com/blevesearch/vellum v1.1.0
github.com/blevesearch/zapx/v11 v11.3.10
github.com/blevesearch/zapx/v12 v12.3.10
github.com/blevesearch/zapx/v13 v13.3.10
github.com/blevesearch/zapx/v14 v14.3.10
github.com/blevesearch/zapx/v15 v15.3.17
github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38
github.com/blevesearch/zapx/v11 v11.3.11
github.com/blevesearch/zapx/v12 v12.3.11
github.com/blevesearch/zapx/v13 v13.3.11
github.com/blevesearch/zapx/v14 v14.3.11
github.com/blevesearch/zapx/v15 v15.3.18
github.com/blevesearch/zapx/v16 v16.1.11-0.20250107152255-021e66397612
github.com/couchbase/moss v0.2.0
github.com/golang/protobuf v1.3.2
github.com/spf13/cobra v1.7.0
Expand Down
32 changes: 16 additions & 16 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.2.1 h1:IuXwLvmyp7I7+e0FOA68gcHHLfzSQ4AqQ8wVab5uxk0=
github.com/blevesearch/bleve_index_api v1.2.1/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
Expand All @@ -19,8 +19,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
github.com/blevesearch/scorch_segment_api/v2 v2.3.1 h1:jjexIzwOdBtC9MlUceNErYHepLvoKxTdA5atbeZSRWE=
github.com/blevesearch/scorch_segment_api/v2 v2.3.1/go.mod h1:Np3Y03rsemM5TsyFxQ3wy+tG97EcviLTbp2S5W0tpRY=
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
Expand All @@ -33,18 +33,18 @@ github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMG
github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=
github.com/blevesearch/zapx/v12 v12.3.10/go.mod h1:0yeZg6JhaGxITlsS5co73aqPtM04+ycnI6D1v0mhbCs=
github.com/blevesearch/zapx/v13 v13.3.10 h1:0KY9tuxg06rXxOZHg3DwPJBjniSlqEgVpxIqMGahDE8=
github.com/blevesearch/zapx/v13 v13.3.10/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk=
github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz77pSwwKU=
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.17 h1:NkkMI98pYLq/uHnB6YWcITrrLpCVyvZ9iP+AyfpW1Ys=
github.com/blevesearch/zapx/v15 v15.3.17/go.mod h1:vXRQzJJvlGVCdmOD5hg7t7JdjUT5DmDPhsAfjvtzIq8=
github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38 h1:iJ3Q3sbyo2d0bjfb720RmGjj7cqzh/EdP3528ggDIMY=
github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38/go.mod h1:JTZseJiEpogtkepKSubIKAmfgbQiOReJXfmjxB1qta4=
github.com/blevesearch/zapx/v11 v11.3.11 h1:r6/wFHFAKWvXJb82f5aO53l6p+gRH6eiX7S1tb3VGc0=
github.com/blevesearch/zapx/v11 v11.3.11/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
github.com/blevesearch/zapx/v12 v12.3.11 h1:GBBAmXesxXLV5UZ+FZ0qILb7HPssT+kxEkbPPfp5HPM=
github.com/blevesearch/zapx/v12 v12.3.11/go.mod h1:0yeZg6JhaGxITlsS5co73aqPtM04+ycnI6D1v0mhbCs=
github.com/blevesearch/zapx/v13 v13.3.11 h1:H5ZvgS1qM1XKzsAuwp3kvDfh5sJFu9bLH/B8U6Im5e8=
github.com/blevesearch/zapx/v13 v13.3.11/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk=
github.com/blevesearch/zapx/v14 v14.3.11 h1:pg+c/YFzMJ32GkOwLzH/HAQ/GBr6y1Ar7/K5ZQpxTNo=
github.com/blevesearch/zapx/v14 v14.3.11/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.18 h1:yJcQnQyHGNF6rAiwq85OHn3HaXo26t7vgd83RclEw7U=
github.com/blevesearch/zapx/v15 v15.3.18/go.mod h1:vXRQzJJvlGVCdmOD5hg7t7JdjUT5DmDPhsAfjvtzIq8=
github.com/blevesearch/zapx/v16 v16.1.11-0.20250107152255-021e66397612 h1:LhORiqEVyUPUrVETzmmVuT0Yudsz2R3qGLFJWUpMsQo=
github.com/blevesearch/zapx/v16 v16.1.11-0.20250107152255-021e66397612/go.mod h1:+FIylxb+5Z/sFVmNaGpppGLHKBMUEnPSbkKoi+izER8=
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
Expand Down
9 changes: 7 additions & 2 deletions index/scorch/snapshot_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ type asynchSegmentResult struct {
dict segment.TermDictionary
dictItr segment.DictionaryIterator

index int
docs *roaring.Bitmap
cardinality int
index int
docs *roaring.Bitmap

thesItr segment.ThesaurusIterator

Expand Down Expand Up @@ -137,6 +138,7 @@ func (is *IndexSnapshot) newIndexSnapshotFieldDict(field string,

results := make(chan *asynchSegmentResult)
var totalBytesRead uint64
var fieldCardinality int64
for _, s := range is.segment {
go func(s *SegmentSnapshot) {
dict, err := s.segment.Dictionary(field)
Expand All @@ -146,6 +148,7 @@ func (is *IndexSnapshot) newIndexSnapshotFieldDict(field string,
if dictStats, ok := dict.(segment.DiskStatsReporter); ok {
atomic.AddUint64(&totalBytesRead, dictStats.BytesRead())
}
atomic.AddInt64(&fieldCardinality, int64(dict.Cardinality()))
if randomLookup {
results <- &asynchSegmentResult{dict: dict}
} else {
Expand All @@ -160,6 +163,7 @@ func (is *IndexSnapshot) newIndexSnapshotFieldDict(field string,
snapshot: is,
cursors: make([]*segmentDictCursor, 0, len(is.segment)),
}

for count := 0; count < len(is.segment); count++ {
asr := <-results
if asr.err != nil && err == nil {
Expand All @@ -183,6 +187,7 @@ func (is *IndexSnapshot) newIndexSnapshotFieldDict(field string,
}
}
}
rv.cardinality = int(fieldCardinality)
rv.bytesRead = totalBytesRead
// after ensuring we've read all items on channel
if err != nil {
Expand Down
14 changes: 10 additions & 4 deletions index/scorch/snapshot_index_dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ type segmentDictCursor struct {
}

type IndexSnapshotFieldDict struct {
snapshot *IndexSnapshot
cursors []*segmentDictCursor
entry index.DictEntry
bytesRead uint64
cardinality int
bytesRead uint64

snapshot *IndexSnapshot
cursors []*segmentDictCursor
entry index.DictEntry
}

func (i *IndexSnapshotFieldDict) BytesRead() uint64 {
Expand Down Expand Up @@ -94,6 +96,10 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
return &i.entry, nil
}

// Cardinality returns the cardinality recorded for this field dictionary.
// The value is assigned once when the dictionary is built, as the sum of the
// Cardinality() values reported by each segment's dictionary for the field.
func (i *IndexSnapshotFieldDict) Cardinality() int {
return i.cardinality
}

func (i *IndexSnapshotFieldDict) Close() error {
return nil
}
Expand Down
4 changes: 4 additions & 0 deletions index/upsidedown/field_dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ func (r *UpsideDownCouchFieldDict) Next() (*index.DictEntry, error) {

}

// Cardinality always returns 0 for an upsidedown field dictionary.
// NOTE(review): presumably this index type does not track field cardinality
// and the method exists only to satisfy the field dictionary interface — confirm.
func (r *UpsideDownCouchFieldDict) Cardinality() int {
return 0
}

func (r *UpsideDownCouchFieldDict) Close() error {
return r.iterator.Close()
}
64 changes: 58 additions & 6 deletions index_alias_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,11 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
// indicates that this index alias is set as an Index
// in another alias, so we need to do a preSearch search
// and NOT a real search
bm25PreSearch := isBM25Enabled(i.mapping)
flags := &preSearchFlags{
knn: requestHasKNN(req),
synonyms: !isMatchNoneQuery(req.Query),
bm25: bm25PreSearch,
}
return preSearchDataSearch(ctx, req, flags, i.indexes...)
}
Expand Down Expand Up @@ -234,7 +236,7 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
// - the request requires preSearch
var preSearchDuration time.Duration
var sr *SearchResult
flags, err := preSearchRequired(req, i.mapping)
flags, err := preSearchRequired(ctx, req, i.mapping)
if err != nil {
return nil, err
}
Expand All @@ -244,6 +246,7 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
if err != nil {
return nil, err
}

// check if the preSearch result has any errors and if so
// return the search result as is without executing the query
// so that the errors are not lost
Expand Down Expand Up @@ -573,11 +576,20 @@ type asyncSearchResult struct {
// preSearchFlags records which kinds of preSearch a search request needs.
type preSearchFlags struct {
knn bool // the request contains a KNN query (see requestHasKNN)
synonyms bool // a synonym preSearch is needed
bm25 bool // BM25 scoring is enabled on the mapping, which needs a preSearch as well
}

// preSearchRequired checks if preSearch is required and returns a boolean flag
// It only allocates the preSearchFlags struct if necessary
func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) (*preSearchFlags, error) {
func isBM25Enabled(m mapping.IndexMapping) bool {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's refactor this to scoringModel(..) which returns the scoring model to use instead.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, perhaps this is fine as is - given that IndexMappingImpl's ScoringModel is exported.

var rv bool
if m, ok := m.(*mapping.IndexMappingImpl); ok {
rv = m.ScoringModel == index.BM25Scoring
}
return rv
}

// preSearchRequired checks whether a preSearch is required and returns a
// preSearchFlags struct indicating which kinds of preSearch are needed,
// or nil when none is needed
func preSearchRequired(ctx context.Context, req *SearchRequest, m mapping.IndexMapping) (*preSearchFlags, error) {
// Check for KNN query
knn := requestHasKNN(req)
var synonyms bool
Expand All @@ -598,18 +610,32 @@ func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) (*preSearchFl
}
}
}
if knn || synonyms {
var bm25 bool
if !isMatchNoneQuery(req.Query) {
if ctx != nil {
if searchType := ctx.Value(search.SearchTypeKey); searchType != nil {
if searchType.(string) == search.FetchStatsAndSearch {
bm25 = isBM25Enabled(m)
}
}
}
}

if knn || synonyms || bm25 {
return &preSearchFlags{
knn: knn,
synonyms: synonyms,
bm25: bm25,
}, nil
}
return nil, nil
}

func preSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) {
// create a dummy request with a match none query
// since we only care about the preSearchData in PreSearch
var dummyQuery = req.Query
if !flags.synonyms {
if !flags.bm25 && !flags.synonyms {
// create a dummy request with a match none query
// since we only care about the preSearchData in PreSearch
dummyQuery = query.NewMatchNoneQuery()
Expand Down Expand Up @@ -694,6 +720,19 @@ func constructSynonymPreSearchData(rv map[string]map[string]interface{}, sr *Sea
return rv
}

// constructBM25PreSearchData copies the BM25 statistics carried by the
// preSearch result sr into the per-index preSearch data map rv, storing a
// fresh search.BM25Stats value under search.BM25PreSearchDataKey for every
// index in indexes. When sr carries no BM25 statistics, rv is returned
// unchanged. rv must already hold an inner map for each index name.
func constructBM25PreSearchData(rv map[string]map[string]interface{}, sr *SearchResult, indexes []Index) map[string]map[string]interface{} {
	stats := sr.BM25Stats
	if stats == nil {
		// nothing was gathered during preSearch, so there is nothing to hand out
		return rv
	}
	for pos := range indexes {
		name := indexes[pos].Name()
		// each index gets its own BM25Stats value built from the same source stats
		rv[name][search.BM25PreSearchDataKey] = &search.BM25Stats{
			DocCount:         stats.DocCount,
			FieldCardinality: stats.FieldCardinality,
		}
	}
	return rv
}

func constructPreSearchData(req *SearchRequest, flags *preSearchFlags,
preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) {
if flags == nil || preSearchResult == nil {
Expand All @@ -713,6 +752,9 @@ func constructPreSearchData(req *SearchRequest, flags *preSearchFlags,
if flags.synonyms {
mergedOut = constructSynonymPreSearchData(mergedOut, preSearchResult, indexes)
}
if flags.bm25 {
mergedOut = constructBM25PreSearchData(mergedOut, preSearchResult, indexes)
}
return mergedOut, nil
}

Expand Down Expand Up @@ -822,6 +864,12 @@ func redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]
rv[index.Name()][search.SynonymPreSearchDataKey] = fts
}
}

if bm25Data, ok := req.PreSearchData[search.BM25PreSearchDataKey].(*search.BM25Stats); ok {
for _, index := range indexes {
rv[index.Name()][search.BM25PreSearchDataKey] = bm25Data
}
}
return rv, nil
}

Expand Down Expand Up @@ -1009,3 +1057,7 @@ func (f *indexAliasImplFieldDict) Close() error {
defer f.index.mutex.RUnlock()
return f.fieldDict.Close()
}

// Cardinality returns the cardinality reported by the wrapped field dictionary.
func (f *indexAliasImplFieldDict) Cardinality() int {
return f.fieldDict.Cardinality()
}
Loading
Loading