Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MB-35347: Synonyms interface support + MB-58901: fieldDict's cardinality API #25

Merged
merged 3 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ func (d *FieldDict) Next() (*index.DictEntry, error) {
return nil, nil
}

// Cardinality returns len(d.terms), the number of entries in the
// dictionary's term slice.
func (d *FieldDict) Cardinality() int {
return len(d.terms)
}

func (d *FieldDict) BytesRead() uint64 {
// not implemented
return 0
Expand Down
2 changes: 1 addition & 1 deletion fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

package sear

func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (dist int, exceeded bool, reuse []int) {
func levenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (dist int, exceeded bool, reuse []int) {
la := len(a)
lb := len(b)

Expand Down
4 changes: 2 additions & 2 deletions fuzzy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,13 @@ func TestFuzzyMatch(t *testing.T) {
test := test
t.Run(fmt.Sprintf("%s-%d", test.searchTerm, test.fuzziness), func(t *testing.T) {
for _, sm := range test.shouldMatch {
dist, exceeded, _ := LevenshteinDistanceMaxReuseSlice(test.searchTerm, sm, test.fuzziness, nil)
dist, exceeded, _ := levenshteinDistanceMaxReuseSlice(test.searchTerm, sm, test.fuzziness, nil)
if dist > test.fuzziness || exceeded {
t.Errorf("expected %s to match, did not", sm)
}
}
for _, snm := range test.shouldNotMatch {
dist, exceeded, _ := LevenshteinDistanceMaxReuseSlice(test.searchTerm, snm, test.fuzziness, nil)
dist, exceeded, _ := levenshteinDistanceMaxReuseSlice(test.searchTerm, snm, test.fuzziness, nil)
if dist <= test.fuzziness && !exceeded {
t.Errorf("expected %s not to match, did", snm)
}
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ module github.com/blevesearch/sear
go 1.21

require (
github.com/blevesearch/bleve_index_api v1.1.12
github.com/blevesearch/vellum v1.0.10
github.com/blevesearch/bleve_index_api v1.2.1
github.com/blevesearch/vellum v1.1.0
)

require (
Expand Down
8 changes: 4 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA=
github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY=
github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.2.1 h1:IuXwLvmyp7I7+e0FOA68gcHHLfzSQ4AqQ8wVab5uxk0=
github.com/blevesearch/bleve_index_api v1.2.1/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI=
github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a h1:dGzPydgVsqGcTRVwiLJ1jVbufYwmzD3LfVPLKsKg+0k=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
71 changes: 63 additions & 8 deletions reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ import (
"sort"
"strings"

"github.com/blevesearch/vellum"

index "github.com/blevesearch/bleve_index_api"
"github.com/blevesearch/vellum"
vellev "github.com/blevesearch/vellum/levenshtein"
velreg "github.com/blevesearch/vellum/regexp"
)

Expand Down Expand Up @@ -150,29 +150,41 @@ func automatonMatch(la vellum.Automaton, termStr string) bool {
}

// FieldDictRegexp returns the field dictionary produced by fieldDictRegexp
// for the given field and regular expression, discarding the automaton
// that fieldDictRegexp returns alongside it.
func (r *Reader) FieldDictRegexp(field, regexStr string) (index.FieldDict, error) {
	dict, _, err := r.fieldDictRegexp(field, regexStr)
	return dict, err
}

// FieldDictRegexpAutomaton returns both the field dictionary and the regex
// automaton produced by fieldDictRegexp for the given field and regular
// expression.
func (r *Reader) FieldDictRegexpAutomaton(field, regexStr string) (
	index.FieldDict, index.RegexAutomaton, error) {
	dict, automaton, err := r.fieldDictRegexp(field, regexStr)
	return dict, automaton, err
}

func (r *Reader) fieldDictRegexp(field, regexStr string) (
index.FieldDict, index.RegexAutomaton, error) {
regex, cached := r.velregCache[regexStr]
if !cached {
var err error
regex, err = velreg.New(regexStr)
if err != nil {
return nil, fmt.Errorf("error compiling regexp: %v", err)
return nil, nil, fmt.Errorf("error compiling regexp: %v", err)
}
r.velregCache[regexStr] = regex
}
if r.s.doc == nil {
return fieldDictEmpty, nil
return fieldDictEmpty, regex, nil
}
fieldSortedTerms, err := r.s.doc.SortedTermsForField(field)
if err != nil {
// only error is field doesn't exist in doc
return fieldDictEmpty, nil
return fieldDictEmpty, regex, nil
}
return NewFieldDictWithTerms(fieldSortedTerms, func(s string) bool {
return automatonMatch(regex, s)
}), nil
}), regex, nil
}

func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string) (index.FieldDict, error) {
func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string) (
index.FieldDict, error) {
if r.s.doc == nil {
return fieldDictEmpty, nil
}
Expand All @@ -184,14 +196,30 @@ func (r *Reader) FieldDictFuzzy(field, term string, fuzziness int, prefix string
return NewFieldDictWithTerms(fieldSortedTerms, func(indexTerm string) bool {
var dist int
var exceeded bool
dist, exceeded, r.levSlice = LevenshteinDistanceMaxReuseSlice(term, indexTerm, fuzziness, r.levSlice)
dist, exceeded, r.levSlice = levenshteinDistanceMaxReuseSlice(
term, indexTerm, fuzziness, r.levSlice)
if dist <= fuzziness && !exceeded {
return true
}
return false
}), nil
}

// FieldDictFuzzyAutomaton returns the fuzzy field dictionary for term
// (as produced by FieldDictFuzzy) together with the Levenshtein automaton
// built for term at the requested fuzziness.
//
// The automaton is built first; if that fails, the dictionary and automaton
// are nil and the error is returned. The automaton result is only populated
// when the value returned by getLevAutomaton implements
// vellum.FuzzyAutomaton; otherwise it is left nil.
func (r *Reader) FieldDictFuzzyAutomaton(field, term string, fuzziness int, prefix string) (
	index.FieldDict, index.FuzzyAutomaton, error) {
	// getLevAutomaton takes a uint8. Converting an out-of-range int would
	// silently wrap (e.g. 257 -> 1) and build an automaton for a different
	// edit distance than the one FieldDictFuzzy filters with, so reject such
	// values up front with the same error getLevAutomaton reports for an
	// unsupported fuzziness.
	if fuzziness < 0 || fuzziness > 255 {
		return nil, nil, fmt.Errorf("fuzziness exceeds the max limit")
	}
	a, err := getLevAutomaton(term, uint8(fuzziness))
	if err != nil {
		return nil, nil, err
	}
	var fa index.FuzzyAutomaton
	if vfa, ok := a.(vellum.FuzzyAutomaton); ok {
		fa = vfa
	}

	fd, err := r.FieldDictFuzzy(field, term, fuzziness, prefix)
	return fd, fa, err
}

func (r *Reader) FieldDictContains(field string) (index.FieldDictContains, error) {
if r.s.doc == nil {
return fieldDictContainsEmpty, nil
Expand Down Expand Up @@ -253,3 +281,30 @@ func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
// Close is a no-op: it performs no work and always returns nil.
func (r *Reader) Close() error {
return nil
}

// -----------------------------------------------------------------------------

// Reusable, thread-safe Levenshtein automaton builders, created once in
// init: lb1 is constructed with edit distance 1 and lb2 with edit distance 2.
var lb1, lb2 *vellev.LevenshteinAutomatonBuilder

// init constructs the package-level Levenshtein automaton builders and
// panics if either of them cannot be created.
func init() {
	var err error
	if lb1, err = vellev.NewLevenshteinAutomatonBuilder(1, true); err != nil {
		panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
	}
	if lb2, err = vellev.NewLevenshteinAutomatonBuilder(2, true); err != nil {
		panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
	}
}

// getLevAutomaton builds a Levenshtein DFA for term using the package-level
// builder that corresponds to fuzziness: lb1 for 1 and lb2 for 2. Any other
// fuzziness value yields a nil automaton and an error.
//
// See https://github.com/blevesearch/bleve/blob/77458c4/index/scorch/snapshot_index.go#L291
func getLevAutomaton(term string, fuzziness uint8) (vellum.Automaton, error) {
	switch fuzziness {
	case 1:
		return lb1.BuildDfa(term, fuzziness)
	case 2:
		return lb2.BuildDfa(term, fuzziness)
	default:
		return nil, fmt.Errorf("fuzziness exceeds the max limit")
	}
}
Loading