From 36db386f13c05251ba502100be7694046cf32f1b Mon Sep 17 00:00:00 2001 From: Thejas-bhat Date: Mon, 6 Jan 2025 12:28:27 +0530 Subject: [PATCH] score explanation --- index_alias_impl.go | 2 +- mapping/index.go | 1 + search/scorer/scorer_term.go | 61 +++++++++++++++++++++++++--------- search/searcher/search_term.go | 2 +- 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 88c021a90..7956a7a72 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -591,7 +591,7 @@ func isBM25Enabled(req *SearchRequest, m mapping.IndexMapping) (bool, query.Fiel // scoring. Otherwise, we just skip the presearch for field := range fs { f := m.FieldMappingForPath(field) - if f.Similarity == "" || f.Similarity == index.BM25Similarity { + if f.Similarity == index.BM25Similarity { rv = true break } diff --git a/mapping/index.go b/mapping/index.go index 8a0d5e34a..6399bef20 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -50,6 +50,7 @@ type IndexMappingImpl struct { DefaultAnalyzer string `json:"default_analyzer"` DefaultDateTimeParser string `json:"default_datetime_parser"` DefaultSynonymSource string `json:"default_synonym_source,omitempty"` + DefaultSimilarity string `json:"default_similarity,omitempty"` DefaultField string `json:"default_field"` StoreDynamic bool `json:"store_dynamic"` IndexDynamic bool `json:"index_dynamic"` diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index 5f966e489..7c4d6fab9 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -132,6 +132,11 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) { } } +// multipliers deciding how much a doc length affects the score and also +// how much the term frequency can affect the score in BM25 scoring +var k1 float64 = 1.2 +var b float64 = 0.75 + func (s *TermQueryScorer) docScore(tf, norm float64) float64 { // tf-idf scoring by default score := tf * norm * s.idf @@ -140,16 +145,52 @@ func (s
*TermQueryScorer) docScore(tf, norm float64) float64 { // using the posting's norm value to recompute the field length for the doc num fieldLength := 1 / (norm * norm) - // multiplies deciding how much does a doc length affect the score and also - // how much can the term frequency affect the score - var k1 float64 = 1.2 - var b float64 = 0.75 score = s.idf * (tf * k1) / (tf + k1*(1-b+(b*fieldLength/s.avgDocLength))) } return score } +func (s *TermQueryScorer) scoreExplanation(tf float64, termMatch *index.TermFieldDoc) []*search.Explanation { + var rv []*search.Explanation + if s.avgDocLength > 0 { + fieldLength := 1 / (termMatch.Norm * termMatch.Norm) + fieldNormVal := 1 - b + (b * fieldLength / s.avgDocLength) + fieldNormalizeExplanation := &search.Explanation{ + Value: fieldNormVal, + Message: fmt.Sprintf("fieldNorm(field=%s, b=%f, fieldLength=%f, avgFieldLength=%f)", + s.queryField, b, fieldLength, s.avgDocLength), + } + + saturationExplanation := &search.Explanation{ + Value: k1 / (tf + k1*fieldNormVal), + Message: fmt.Sprintf("saturation(term:%s, k1=%f/(tf=%f + k1*fieldNorm=%f))", + termMatch.Term, k1, tf, fieldNormVal), + Children: []*search.Explanation{fieldNormalizeExplanation}, + } + + rv = make([]*search.Explanation, 3) + rv[0] = &search.Explanation{ + Value: tf, + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), + } + rv[1] = saturationExplanation + rv[2] = s.idfExplanation + } else { + rv = make([]*search.Explanation, 3) + rv[0] = &search.Explanation{ + Value: tf, + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), + } + rv[1] = &search.Explanation{ + Value: termMatch.Norm, + Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID), + } + rv[2] = s.idfExplanation + } + return rv +} + func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.TermFieldDoc) *search.DocumentMatch { rv := ctx.DocumentMatchPool.Get() // perform any
score computations only when needed @@ -163,18 +204,8 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term } score := s.docScore(tf, termMatch.Norm) - // todo: explain stuff properly if s.options.Explain { - childrenExplanations := make([]*search.Explanation, 3) - childrenExplanations[0] = &search.Explanation{ - Value: tf, - Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), - } - childrenExplanations[1] = &search.Explanation{ - Value: termMatch.Norm, - Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID), - } - childrenExplanations[2] = s.idfExplanation + childrenExplanations := s.scoreExplanation(tf, termMatch) scoreExplanation = &search.Explanation{ Value: score, Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index 55215b944..c052ea00c 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -139,7 +139,7 @@ func newTermSearcherFromReader(ctx context.Context, indexReader index.IndexReade if similaritModelCallback, ok := ctx.Value(search. GetSimilarityModelCallbackKey).(search.GetSimilarityModelCallbackFn); ok { similarityModel := similaritModelCallback(field) - if similarityModel == "" || similarityModel == index.BM25Similarity { + if similarityModel == index.BM25Similarity { // in case of bm25 need to fetch the multipliers as well (perhaps via context's presearch data) count, avgDocLength, err = bm25ScoreMetrics(ctx, field, indexReader) if err != nil {