Skip to content

Commit

Permalink
Add code comments and expose the BM25 multipliers so they can be made configurable
Browse files Browse the repository at this point in the history
  • Loading branch information
Thejas-bhat committed Jan 10, 2025
1 parent 8cdb525 commit d478f4f
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 20 deletions.
21 changes: 8 additions & 13 deletions search/scorer/scorer_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ type TermQueryScorer struct {
queryTerm string
queryField string
queryBoost float64
docTerm uint64
docTotal uint64
docTerm uint64 // number of documents containing the term
docTotal uint64 // total number of documents in the index
avgDocLength float64
idf float64
options search.SearcherOptions
Expand Down Expand Up @@ -132,11 +132,6 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
}
}

// multipliers deciding how much a doc's length affects the score and also
// how much the term frequency can affect the score in BM25 scoring
var k1 float64 = 1.2
var b float64 = 0.75

func (s *TermQueryScorer) docScore(tf, norm float64) float64 {
// tf-idf scoring by default
score := tf * norm * s.idf
Expand All @@ -145,8 +140,8 @@ func (s *TermQueryScorer) docScore(tf, norm float64) float64 {
// using the posting's norm value to recompute the field length for the doc num
fieldLength := 1 / (norm * norm)

score = s.idf * (tf * k1) /
(tf + k1*(1-b+(b*fieldLength/s.avgDocLength)))
score = s.idf * (tf * search.BM25_k1) /
(tf + search.BM25_k1*(1-search.BM25_b+(search.BM25_b*fieldLength/s.avgDocLength)))
}
return score
}
Expand All @@ -155,17 +150,17 @@ func (s *TermQueryScorer) scoreExplanation(tf float64, termMatch *index.TermFiel
var rv []*search.Explanation
if s.avgDocLength > 0 {
fieldLength := 1 / (termMatch.Norm * termMatch.Norm)
fieldNormVal := 1 - b + (b * fieldLength / s.avgDocLength)
fieldNormVal := 1 - search.BM25_b + (search.BM25_b * fieldLength / s.avgDocLength)
fieldNormalizeExplanation := &search.Explanation{
Value: fieldNormVal,
Message: fmt.Sprintf("fieldNorm(field=%s), b=%f, fieldLength=%f, avgFieldLength=%f)",
s.queryField, b, fieldLength, s.avgDocLength),
s.queryField, search.BM25_b, fieldLength, s.avgDocLength),
}

saturationExplanation := &search.Explanation{
Value: k1 / (tf + k1*fieldNormVal),
Value: search.BM25_k1 / (tf + search.BM25_k1*fieldNormVal),
Message: fmt.Sprintf("saturation(term:%s), k1=%f/(tf=%f + k1*fieldNorm=%f))",
termMatch.Term, k1, tf, fieldNormVal),
termMatch.Term, search.BM25_k1, tf, fieldNormVal),
Children: []*search.Explanation{fieldNormalizeExplanation},
}

Expand Down
13 changes: 6 additions & 7 deletions search/searcher/search_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,18 +67,17 @@ func NewTermSearcherBytes(ctx context.Context, indexReader index.IndexReader,
return newTermSearcherFromReader(ctx, indexReader, reader, term, field, boost, options)
}

func tfTDFScoreMetrics(indexReader index.IndexReader) (uint64, float64, error) {
func tfIDFScoreMetrics(indexReader index.IndexReader) (uint64, error) {
// default tf-idf stats
count, err := indexReader.DocCount()
if err != nil {
return 0, 0, err
return 0, err
}
// field cardinality metric is not used in the tf-idf scoring algo.
fieldCardinality := 0

if count == 0 {
return 0, 0, nil
return 0, nil
}
return count, float64(fieldCardinality / int(count)), nil
return count, nil
}

func bm25ScoreMetrics(ctx context.Context, field string,
Expand Down Expand Up @@ -137,7 +136,7 @@ func newTermSearcherFromReader(ctx context.Context, indexReader index.IndexReade
case index.TFIDFScoring:
fallthrough
default:
count, avgDocLength, err = tfTDFScoreMetrics(indexReader)
count, err = tfIDFScoreMetrics(indexReader)
if err != nil {
_ = reader.Close()
return nil, err
Expand Down
7 changes: 7 additions & 0 deletions search/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,13 @@ func (f FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) {
}
}

// BM25-specific multipliers that affect the scoring of a document. They are
// declared as package-level variables (not constants) so that they can be
// adjusted.
var (
	// BM25_k1 decides how much the term frequency can affect the score.
	BM25_k1 float64 = 1.2

	// BM25_b decides how much a doc's field length affects the score.
	BM25_b float64 = 0.75
)

type BM25Stats struct {
DocCount float64 `json:"doc_count"`
FieldCardinality map[string]int `json:"field_cardinality"`
Expand Down

0 comments on commit d478f4f

Please sign in to comment.