Skip to content

Commit

Permalink
score explanation
Browse files Browse the repository at this point in the history
  • Loading branch information
Thejas-bhat committed Jan 6, 2025
1 parent 52e318d commit 36db386
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 17 deletions.
2 changes: 1 addition & 1 deletion index_alias_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ func isBM25Enabled(req *SearchRequest, m mapping.IndexMapping) (bool, query.Fiel
// scoring. Otherwise, we just skip the presearch
for field := range fs {
f := m.FieldMappingForPath(field)
if f.Similarity == "" || f.Similarity == index.BM25Similarity {
if f.Similarity == index.BM25Similarity {
rv = true
break
}
Expand Down
1 change: 1 addition & 0 deletions mapping/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type IndexMappingImpl struct {
DefaultAnalyzer string `json:"default_analyzer"`
DefaultDateTimeParser string `json:"default_datetime_parser"`
DefaultSynonymSource string `json:"default_synonym_source,omitempty"`
DefaultSimilarity string `json:"default_similarity,omitempty"`
DefaultField string `json:"default_field"`
StoreDynamic bool `json:"store_dynamic"`
IndexDynamic bool `json:"index_dynamic"`
Expand Down
61 changes: 46 additions & 15 deletions search/scorer/scorer_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
}
}

// k1 and b are the BM25 free parameters: k1 caps how much a term's
// frequency can boost the score (term-frequency saturation), and b
// controls how strongly the document's field length normalizes the score.
var k1 float64 = 1.2
var b float64 = 0.75

func (s *TermQueryScorer) docScore(tf, norm float64) float64 {
// tf-idf scoring by default
score := tf * norm * s.idf
Expand All @@ -140,16 +145,52 @@ func (s *TermQueryScorer) docScore(tf, norm float64) float64 {
// using the posting's norm value to recompute the field length for the doc num
fieldLength := 1 / (norm * norm)

// multiplies deciding how much does a doc length affect the score and also
// how much can the term frequency affect the score
var k1 float64 = 1.2
var b float64 = 0.75
score = s.idf * (tf * k1) /
(tf + k1*(1-b+(b*fieldLength/s.avgDocLength)))
}
return score
}

// scoreExplanation builds the three child explanations (tf, norm/saturation,
// idf) for a term match. When avgDocLength > 0 the scorer is in BM25 mode and
// the middle child describes the BM25 saturation and field-length
// normalization terms; otherwise it describes the stored field norm used by
// the default tf-idf scoring.
func (s *TermQueryScorer) scoreExplanation(tf float64, termMatch *index.TermFieldDoc) []*search.Explanation {
	rv := make([]*search.Explanation, 3)
	// The tf and idf children are identical for both scoring models.
	rv[0] = &search.Explanation{
		Value:   tf,
		Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
	}
	rv[2] = s.idfExplanation
	if s.avgDocLength > 0 {
		// BM25: recover the doc's field length from the stored norm
		// (norm is presumably 1/sqrt(fieldLength), matching docScore's
		// fieldLength = 1/(norm*norm) — confirm against the index encoding).
		fieldLength := 1 / (termMatch.Norm * termMatch.Norm)
		fieldNormVal := 1 - b + (b * fieldLength / s.avgDocLength)
		fieldNormalizeExplanation := &search.Explanation{
			Value: fieldNormVal,
			Message: fmt.Sprintf("fieldNorm(field=%s), b=%f, fieldLength=%f, avgFieldLength=%f)",
				s.queryField, b, fieldLength, s.avgDocLength),
		}
		rv[1] = &search.Explanation{
			Value: k1 / (tf + k1*fieldNormVal),
			// Argument order fixed: the placeholders are k1 then tf, so the
			// values must be passed as k1, tf (they were previously swapped).
			Message: fmt.Sprintf("saturation(term:%s), k1=%f/(tf=%f + k1*fieldNorm=%f))",
				termMatch.Term, k1, tf, fieldNormVal),
			Children: []*search.Explanation{fieldNormalizeExplanation},
		}
	} else {
		// Default tf-idf: the stored norm is the field norm for this doc.
		rv[1] = &search.Explanation{
			Value:   termMatch.Norm,
			Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID),
		}
	}
	return rv
}

func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.TermFieldDoc) *search.DocumentMatch {
rv := ctx.DocumentMatchPool.Get()
// perform any score computations only when needed
Expand All @@ -163,18 +204,8 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
}

score := s.docScore(tf, termMatch.Norm)
// todo: explain stuff properly
if s.options.Explain {
childrenExplanations := make([]*search.Explanation, 3)
childrenExplanations[0] = &search.Explanation{
Value: tf,
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
}
childrenExplanations[1] = &search.Explanation{
Value: termMatch.Norm,
Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID),
}
childrenExplanations[2] = s.idfExplanation
childrenExplanations := s.scoreExplanation(tf, termMatch)
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID),
Expand Down
2 changes: 1 addition & 1 deletion search/searcher/search_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func newTermSearcherFromReader(ctx context.Context, indexReader index.IndexReade
if similaritModelCallback, ok := ctx.Value(search.
GetSimilarityModelCallbackKey).(search.GetSimilarityModelCallbackFn); ok {
similarityModel := similaritModelCallback(field)
if similarityModel == "" || similarityModel == index.BM25Similarity {
if similarityModel == index.BM25Similarity {
// in case of bm25 need to fetch the multipliers as well (perhaps via context's presearch data)
count, avgDocLength, err = bm25ScoreMetrics(ctx, field, indexReader)
if err != nil {
Expand Down

0 comments on commit 36db386

Please sign in to comment.