diff --git a/analysis/benchmark_test.go b/analysis/benchmark_test.go index c5b6647aa..6c61c5231 100644 --- a/analysis/benchmark_test.go +++ b/analysis/benchmark_test.go @@ -32,7 +32,10 @@ func BenchmarkAnalysis(b *testing.B) { b.Fatal(err) } - ts := analyzer.Analyze(bleveWikiArticle) + ts, err := analysis.AnalyzeForTokens(analyzer, bleveWikiArticle) + if err != nil { + b.Fatalf("error analyzing text: %v", err) + } freqs := analysis.TokenFrequency(ts, nil, index.IncludeTermVectors) if len(freqs) != 511 { b.Errorf("expected %d freqs, got %d", 511, len(freqs)) diff --git a/analysis/lang/ar/analyzer_ar_test.go b/analysis/lang/ar/analyzer_ar_test.go index 437d69fd9..29590d20a 100644 --- a/analysis/lang/ar/analyzer_ar_test.go +++ b/analysis/lang/ar/analyzer_ar_test.go @@ -175,7 +175,10 @@ func TestArabicAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term) diff --git a/analysis/lang/cjk/analyzer_cjk_test.go b/analysis/lang/cjk/analyzer_cjk_test.go index afd895788..8b8f7d232 100644 --- a/analysis/lang/cjk/analyzer_cjk_test.go +++ b/analysis/lang/cjk/analyzer_cjk_test.go @@ -617,7 +617,10 @@ func TestCJKAnalyzer(t *testing.T) { if err != nil { t.Fatal(err) } - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/ckb/analyzer_ckb_test.go b/analysis/lang/ckb/analyzer_ckb_test.go index 9e6adab8e..ec368ff14 100644 --- a/analysis/lang/ckb/analyzer_ckb_test.go +++ b/analysis/lang/ckb/analyzer_ckb_test.go @@ -69,7 +69,10 @@ func TestSoraniAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/ckb/sorani_stemmer_filter_test.go b/analysis/lang/ckb/sorani_stemmer_filter_test.go index 4dd9a9fda..0a56aae1c 100644 --- a/analysis/lang/ckb/sorani_stemmer_filter_test.go +++ b/analysis/lang/ckb/sorani_stemmer_filter_test.go @@ -26,7 +26,7 @@ func TestSoraniStemmerFilter(t *testing.T) { // in order to match the lucene tests // we will test with an analyzer, not just the stemmer - analyzer := analysis.DefaultAnalyzer{ + analyzer := &analysis.DefaultAnalyzer{ Tokenizer: single.NewSingleTokenTokenizer(), TokenFilters: []analysis.TokenFilter{ NewSoraniNormalizeFilter(), @@ -283,7 +283,11 @@ func TestSoraniStemmerFilter(t *testing.T) { } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Errorf("error analyzing input: %v", err) + } + if !reflect.DeepEqual(actual, test.output) { t.Errorf("for input %s(% x)", test.input, test.input) t.Errorf("\texpected:") diff --git a/analysis/lang/da/analyzer_da_test.go b/analysis/lang/da/analyzer_da_test.go index e22f32567..de862fd74 100644 --- a/analysis/lang/da/analyzer_da_test.go +++ b/analysis/lang/da/analyzer_da_test.go @@ -63,7 +63,10 @@ func TestDanishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/de/analyzer_de_test.go b/analysis/lang/de/analyzer_de_test.go index f404ded94..ce1dc6383 100644 --- a/analysis/lang/de/analyzer_de_test.go +++ b/analysis/lang/de/analyzer_de_test.go @@ -147,7 +147,10 @@ func TestGermanAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/en/analyzer_en_test.go b/analysis/lang/en/analyzer_en_test.go index 6db7c3000..47e5688bb 100644 --- a/analysis/lang/en/analyzer_en_test.go +++ b/analysis/lang/en/analyzer_en_test.go @@ -97,7 +97,10 @@ func TestEnglishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/es/analyzer_es_test.go b/analysis/lang/es/analyzer_es_test.go index ad3b1f650..6c209de2f 100644 --- a/analysis/lang/es/analyzer_es_test.go +++ b/analysis/lang/es/analyzer_es_test.go @@ -114,7 +114,10 @@ func TestSpanishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/fa/analyzer_fa_test.go b/analysis/lang/fa/analyzer_fa_test.go index f648261f9..f0f416125 100644 --- a/analysis/lang/fa/analyzer_fa_test.go +++ b/analysis/lang/fa/analyzer_fa_test.go @@ -305,7 +305,10 @@ func TestPersianAnalyzerVerbs(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } @@ -600,7 +603,10 @@ func TestPersianAnalyzerVerbsDefective(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } @@ -671,7 +677,10 @@ func TestPersianAnalyzerOthers(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/fi/analyzer_fi_test.go b/analysis/lang/fi/analyzer_fi_test.go index 45aa242de..c08537848 100644 --- a/analysis/lang/fi/analyzer_fi_test.go +++ b/analysis/lang/fi/analyzer_fi_test.go @@ -57,7 +57,10 @@ func TestFinishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/fr/analyzer_fr_test.go b/analysis/lang/fr/analyzer_fr_test.go index 38f89e079..ed742e4d4 100644 --- a/analysis/lang/fr/analyzer_fr_test.go +++ b/analysis/lang/fr/analyzer_fr_test.go @@ -196,7 +196,10 @@ func TestFrenchAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/hi/analyzer_hi_test.go b/analysis/lang/hi/analyzer_hi_test.go index a86aeefd8..568398850 100644 --- a/analysis/lang/hi/analyzer_hi_test.go +++ b/analysis/lang/hi/analyzer_hi_test.go @@ -58,7 +58,10 @@ func TestHindiAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/hr/analyzer_hr_test.go b/analysis/lang/hr/analyzer_hr_test.go index e1ab35afc..6adbe7e92 100644 --- a/analysis/lang/hr/analyzer_hr_test.go +++ b/analysis/lang/hr/analyzer_hr_test.go @@ -84,7 +84,10 @@ func TestCroatianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/hu/analyzer_hu_test.go b/analysis/lang/hu/analyzer_hu_test.go index 8745668c2..c4e03abe8 100644 --- a/analysis/lang/hu/analyzer_hu_test.go +++ b/analysis/lang/hu/analyzer_hu_test.go @@ -57,7 +57,10 @@ func TestHungarianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/it/analyzer_it_test.go b/analysis/lang/it/analyzer_it_test.go index 19b9d4dfe..65a3a0222 100644 --- a/analysis/lang/it/analyzer_it_test.go +++ b/analysis/lang/it/analyzer_it_test.go @@ -83,7 +83,10 @@ func TestItalianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/nl/analyzer_nl_test.go b/analysis/lang/nl/analyzer_nl_test.go index 707655f01..fe5fcc48d 100644 --- a/analysis/lang/nl/analyzer_nl_test.go +++ b/analysis/lang/nl/analyzer_nl_test.go @@ -57,7 +57,10 @@ func TestDutchAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/no/analyzer_no_test.go b/analysis/lang/no/analyzer_no_test.go index b37cb4d1c..5d7fab141 100644 --- a/analysis/lang/no/analyzer_no_test.go +++ b/analysis/lang/no/analyzer_no_test.go @@ -57,7 +57,10 @@ func TestNorwegianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/pl/analyzer_pl_test.go b/analysis/lang/pl/analyzer_pl_test.go index 073a28f79..8f0e8ba91 100644 --- a/analysis/lang/pl/analyzer_pl_test.go +++ b/analysis/lang/pl/analyzer_pl_test.go @@ -136,7 +136,10 @@ func TestPolishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/pt/analyzer_pt_test.go b/analysis/lang/pt/analyzer_pt_test.go index 417e64066..ea5ae3cfe 100644 --- a/analysis/lang/pt/analyzer_pt_test.go +++ b/analysis/lang/pt/analyzer_pt_test.go @@ -57,7 +57,10 @@ func TestPortugueseAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/ro/analyzer_ro_test.go b/analysis/lang/ro/analyzer_ro_test.go index 0fe4645bd..b8ec2bcb4 100644 --- a/analysis/lang/ro/analyzer_ro_test.go +++ b/analysis/lang/ro/analyzer_ro_test.go @@ -57,7 +57,10 @@ func TestRomanianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/ru/analyzer_ru_test.go b/analysis/lang/ru/analyzer_ru_test.go index 38534aff2..ef231853d 100644 --- a/analysis/lang/ru/analyzer_ru_test.go +++ b/analysis/lang/ru/analyzer_ru_test.go @@ -109,7 +109,10 @@ func TestRussianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/sv/analyzer_sv_test.go b/analysis/lang/sv/analyzer_sv_test.go index a3bd5f161..b7b4bc40b 100644 --- a/analysis/lang/sv/analyzer_sv_test.go +++ b/analysis/lang/sv/analyzer_sv_test.go @@ -57,7 +57,10 @@ func TestSwedishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/tr/analyzer_tr_test.go b/analysis/lang/tr/analyzer_tr_test.go index 3c4592b6e..720996bda 100644 --- a/analysis/lang/tr/analyzer_tr_test.go +++ b/analysis/lang/tr/analyzer_tr_test.go @@ -77,7 +77,10 @@ func TestTurkishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/type.go b/analysis/type.go index e3a7f201b..0fc725795 100644 --- a/analysis/type.go +++ b/analysis/type.go @@ -72,8 +72,58 @@ type TokenFilter interface { Filter(TokenStream) TokenStream } +// ----------------------------------------------------------------------------- + +type AnalyzerType int + +const ( + TokensAnalyzer AnalyzerType = iota + VectorAnalyzer +) + type Analyzer interface { - Analyze([]byte) TokenStream + Type() AnalyzerType + Analyze([]byte) (any, error) +} + +// Convenience method to analyze given input using an Analyzer. +// Performs type assertion to ensure that the output is a token stream. +func AnalyzeForTokens(analyzer Analyzer, input []byte) (TokenStream, error) { + if analyzer.Type() != TokensAnalyzer { + return nil, fmt.Errorf("incompatible analyzer type") + } + + output, err := analyzer.Analyze(input) + if err != nil { + return nil, err + } + + rv, ok := output.(TokenStream) + if !ok { + return nil, fmt.Errorf("unexpected output type, expected TokenStream") + } + + return rv, nil +} + +// Convenience method to analyze given input using an Analyzer. +// Performs type assertion to ensure that the output is a vector. +func AnalyzeForVectors(analyzer Analyzer, input []byte) ([]float32, error) { + if analyzer.Type() != VectorAnalyzer { + return nil, fmt.Errorf("incompatible analyzer type") + } + + output, err := analyzer.Analyze(input) + if err != nil { + return nil, err + } + + rv, ok := output.([]float32) + if !ok { + return nil, fmt.Errorf("unexpected output type, expected []float32") + } + + return rv, nil } type DefaultAnalyzer struct { @@ -82,7 +132,7 @@ type DefaultAnalyzer struct { TokenFilters []TokenFilter } -func (a *DefaultAnalyzer) Analyze(input []byte) TokenStream { +func (a *DefaultAnalyzer) Analyze(input []byte) (any, error) { if a.CharFilters != nil { for _, cf := range a.CharFilters { input = cf.Filter(input) @@ -94,9 +144,15 @@ func (a *DefaultAnalyzer) Analyze(input []byte) TokenStream { tokens = tf.Filter(tokens) } } - return tokens + return tokens, nil } +func (a *DefaultAnalyzer) Type() AnalyzerType { + return TokensAnalyzer +} + +// ----------------------------------------------------------------------------- + var ErrInvalidDateTime = fmt.Errorf("unable to parse datetime with any of the layouts") var ErrInvalidTimestampString = fmt.Errorf("unable to parse timestamp string") diff --git a/document/field.go b/document/field.go index eb104e2df..0b590a91a 100644 --- a/document/field.go +++ b/document/field.go @@ -29,7 +29,7 @@ type Field interface { // "doc1", then "field" in "doc2". ArrayPositions() []uint64 Options() index.FieldIndexingOptions - Analyze() + Analyze() error Value() []byte // NumPlainTextBytes should return the number of plain text bytes diff --git a/document/field_boolean.go b/document/field_boolean.go index 8c2987a7f..72379b453 100644 --- a/document/field_boolean.go +++ b/document/field_boolean.go @@ -66,7 +66,7 @@ func (b *BooleanField) Options() index.FieldIndexingOptions { return b.options } -func (b *BooleanField) Analyze() { +func (b *BooleanField) Analyze() error { tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ Start: 0, @@ -78,6 +78,8 @@ func (b *BooleanField) Analyze() { b.length = len(tokens) b.frequencies = analysis.TokenFrequency(tokens, b.arrayPositions, b.options) + + return nil } func (b *BooleanField) Value() []byte { diff --git a/document/field_composite.go b/document/field_composite.go index e0ba8af7a..57da02b3c 100644 --- a/document/field_composite.go +++ b/document/field_composite.go @@ -94,7 +94,8 @@ func (c *CompositeField) Options() index.FieldIndexingOptions { return c.options } -func (c *CompositeField) Analyze() { +func (c *CompositeField) Analyze() error { + return nil } func (c *CompositeField) Value() []byte { diff --git a/document/field_datetime.go b/document/field_datetime.go index f3b859c43..0f3a5141c 100644 --- a/document/field_datetime.go +++ b/document/field_datetime.go @@ -98,7 +98,7 @@ func (n *DateTimeField) splitValue() (numeric.PrefixCoded, string) { return numeric.PrefixCoded(parts[0]), string(parts[1]) } -func (n *DateTimeField) Analyze() { +func (n *DateTimeField) Analyze() error { valueWithoutLayout, _ := n.splitValue() tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ @@ -132,6 +132,8 @@ func (n *DateTimeField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *DateTimeField) Value() []byte { diff --git a/document/field_geopoint.go b/document/field_geopoint.go index 5795043f2..6d007afb8 100644 --- a/document/field_geopoint.go +++ b/document/field_geopoint.go @@ -82,7 +82,7 @@ func (n *GeoPointField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (n *GeoPointField) Analyze() { +func (n *GeoPointField) Analyze() error { tokens := make(analysis.TokenStream, 0, 8) tokens = append(tokens, &analysis.Token{ Start: 0, @@ -133,6 +133,8 @@ func (n *GeoPointField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *GeoPointField) Value() []byte { diff --git a/document/field_geoshape.go b/document/field_geoshape.go index 6bf7b010a..24a21fd6c 100644 --- a/document/field_geoshape.go +++ b/document/field_geoshape.go @@ -84,7 +84,7 @@ func (n *GeoShapeField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (n *GeoShapeField) Analyze() { +func (n *GeoShapeField) Analyze() error { // compute the bytes representation for the coordinates tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ @@ -111,6 +111,8 @@ func (n *GeoShapeField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *GeoShapeField) Value() []byte { diff --git a/document/field_ip.go b/document/field_ip.go index 80a353a01..0a0dbabc6 100644 --- a/document/field_ip.go +++ b/document/field_ip.go @@ -79,7 +79,7 @@ func (n *IPField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (b *IPField) Analyze() { +func (b *IPField) Analyze() error { tokens := analysis.TokenStream{ &analysis.Token{ @@ -92,6 +92,8 @@ func (b *IPField) Analyze() { } b.length = 1 b.frequencies = analysis.TokenFrequency(tokens, b.arrayPositions, b.options) + + return nil } func (b *IPField) Value() []byte { diff --git a/document/field_numeric.go b/document/field_numeric.go index 1ee7b75ee..2de8ebe83 100644 --- a/document/field_numeric.go +++ b/document/field_numeric.go @@ -81,7 +81,7 @@ func (n *NumericField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (n *NumericField) Analyze() { +func (n *NumericField) Analyze() error { tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ Start: 0, @@ -114,6 +114,8 @@ func (n *NumericField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *NumericField) Value() []byte { diff --git a/document/field_text.go b/document/field_text.go index d35e74732..75ab9e779 100644 --- a/document/field_text.go +++ b/document/field_text.go @@ -79,8 +79,9 @@ func (t *TextField) AnalyzedTokenFrequencies() index.TokenFrequencies { return t.frequencies } -func (t *TextField) Analyze() { +func (t *TextField) Analyze() error { var tokens analysis.TokenStream + var err error if t.analyzer != nil { bytesToAnalyze := t.Value() if t.options.IsStored() { @@ -89,7 +90,7 @@ func (t *TextField) Analyze() { copy(bytesCopied, bytesToAnalyze) bytesToAnalyze = bytesCopied } - tokens = t.analyzer.Analyze(bytesToAnalyze) + tokens, err = analysis.AnalyzeForTokens(t.analyzer, bytesToAnalyze) } else { tokens = analysis.TokenStream{ &analysis.Token{ @@ -101,8 +102,13 @@ func (t *TextField) Analyze() { }, } } - t.length = len(tokens) // number of tokens in this doc field - t.frequencies = analysis.TokenFrequency(tokens, t.arrayPositions, t.options) + + if err == nil { + t.length = len(tokens) // number of tokens in this doc field + t.frequencies = analysis.TokenFrequency(tokens, t.arrayPositions, t.options) + } + + return err } func (t *TextField) Analyzer() analysis.Analyzer { diff --git a/document/field_vector.go b/document/field_vector.go index 53334d202..63f8968c5 100644 --- a/document/field_vector.go +++ b/document/field_vector.go @@ -82,8 +82,9 @@ func (n *VectorField) AnalyzedTokenFrequencies() index.TokenFrequencies { return nil } -func (n *VectorField) Analyze() { +func (n *VectorField) Analyze() error { // vectors aren't analyzed + return nil } func (n *VectorField) Value() []byte { diff --git a/document/field_vector_base64.go b/document/field_vector_base64.go index 31d6cbffd..0deb6f79d 100644 --- a/document/field_vector_base64.go +++ b/document/field_vector_base64.go @@ -79,7 +79,8 @@ func (n *VectorBase64Field) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.vectorField.AnalyzedTokenFrequencies() } -func (n *VectorBase64Field) Analyze() { +func (n *VectorBase64Field) Analyze() error { + return nil } func (n *VectorBase64Field) Value() []byte { diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 429d1daa9..b86bdac89 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -409,6 +409,11 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { if doc != nil { // put the work on the queue s.analysisQueue.Queue(func() { + // ignoring the analysis errors here. + // Data for all the fields with analysis errors will be ignored. + // + // todo: collect stats on analysis errors and also maintain + // records of such unique errors. analyze(doc, s.setSpatialAnalyzerPlugin) resultChan <- doc }) @@ -674,8 +679,8 @@ func (s *Scorch) StatsMap() map[string]interface{} { return m } -func (s *Scorch) Analyze(d index.Document) { - analyze(d, s.setSpatialAnalyzerPlugin) +func (s *Scorch) Analyze(d index.Document) map[string]error { + return analyze(d, s.setSpatialAnalyzerPlugin) } type customAnalyzerPluginInitFunc func(field index.Field) @@ -691,14 +696,19 @@ func (s *Scorch) setSpatialAnalyzerPlugin(f index.Field) { } } -func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { +func analyze(d index.Document, fn customAnalyzerPluginInitFunc) map[string]error { + rv := make(map[string]error) d.VisitFields(func(field index.Field) { if field.Options().IsIndexed() { if fn != nil { fn(field) } - field.Analyze() + err := field.Analyze() + if err != nil { + rv[field.Name()] = err + return + } if d.HasComposite() && field.Name() != "_id" { // see if any of the composite fields need this @@ -708,6 +718,8 @@ func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { } } }) + + return rv } func (s *Scorch) AddEligibleForRemoval(epoch uint64) { diff --git a/mapping/index.go b/mapping/index.go index fe8c96713..a151342bf 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -425,7 +425,7 @@ func (im *IndexMappingImpl) AnalyzeText(analyzerName string, text []byte) (analy if err != nil { return nil, err } - return analyzer.Analyze(text), nil + return analysis.AnalyzeForTokens(analyzer, text) } // FieldAnalyzer returns the name of the analyzer used on a field. diff --git a/registry/analyzer.go b/registry/analyzer.go index f4753bc1c..3192a0648 100644 --- a/registry/analyzer.go +++ b/registry/analyzer.go @@ -16,20 +16,64 @@ package registry import ( "fmt" + "sync" "github.com/blevesearch/bleve/v2/analysis" ) -func RegisterAnalyzer(name string, constructor AnalyzerConstructor) { - _, exists := analyzers[name] +type AnalyzerConstructor func(config map[string]interface{}, cache *Cache) (analysis.Analyzer, error) + +// This registry serves as a cache of analyzer constructors. +// On the event of index creation/opening, all the analyzers referenced in the +// index mapping are instantiated using the constructors stored in this registry. +// Each Index maintain a cache of instantiated analyzers. +type AnalyzerRegistry struct { + m sync.RWMutex + cons map[string]AnalyzerConstructor +} + +func NewAnalyzerRegistry() *AnalyzerRegistry { + return &AnalyzerRegistry{ + cons: make(map[string]AnalyzerConstructor), + } +} + +func RegisterAnalyzer(name string, constructor AnalyzerConstructor) error { + analyzers.m.Lock() + defer analyzers.m.Unlock() + + _, exists := analyzers.cons[name] if exists { panic(fmt.Errorf("attempted to register duplicate analyzer named '%s'", name)) } - analyzers[name] = constructor + + analyzers.cons[name] = constructor + + return nil } -type AnalyzerConstructor func(config map[string]interface{}, cache *Cache) (analysis.Analyzer, error) -type AnalyzerRegistry map[string]AnalyzerConstructor +// Replace an existing analyzer constructor with a new one +// or register a new analyzer constructor if it doesn't exist +// +// It is the caller's responsibility to ensure that all indexes using the +// existing analyzer are closed and rebuilt after replacing the analyzer. +func ReplaceAnalyzer(name string, constructor AnalyzerConstructor) { + analyzers.m.Lock() + analyzers.cons[name] = constructor + analyzers.m.Unlock() +} + +// Remove an analyzer constructor from the registry +// +// It is the caller's responsibility to ensure that all indexes using the +// analyzer are closed and rebuilt after deregistering the analyzer. +func DeregisterAnalyzer(name string) { + analyzers.m.Lock() + delete(analyzers.cons, name) + analyzers.m.Unlock() +} + +// ----------------------------------------------------------------------------- type AnalyzerCache struct { *ConcurrentCache @@ -42,15 +86,18 @@ func NewAnalyzerCache() *AnalyzerCache { } func AnalyzerBuild(name string, config map[string]interface{}, cache *Cache) (interface{}, error) { - cons, registered := analyzers[name] - if !registered { - return nil, fmt.Errorf("no analyzer with name or type '%s' registered", name) - } - analyzer, err := cons(config, cache) - if err != nil { - return nil, fmt.Errorf("error building analyzer: %v", err) + analyzers.m.RLock() + defer analyzers.m.RUnlock() + + if cons, registered := analyzers.cons[name]; registered { + analyzer, err := cons(config, cache) + if err != nil { + return nil, fmt.Errorf("error building analyzer: %v", err) + } + return analyzer, nil } - return analyzer, nil + + return nil, fmt.Errorf("no analyzer with name or type '%s' registered", name) } func (c *AnalyzerCache) AnalyzerNamed(name string, cache *Cache) (analysis.Analyzer, error) { @@ -73,11 +120,15 @@ func (c *AnalyzerCache) DefineAnalyzer(name string, typ string, config map[strin } func AnalyzerTypesAndInstances() ([]string, []string) { + analyzers.m.RLock() + defer analyzers.m.RUnlock() + emptyConfig := map[string]interface{}{} emptyCache := NewCache() var types []string var instances []string - for name, cons := range analyzers { + + for name, cons := range analyzers.cons { _, err := cons(emptyConfig, emptyCache) if err == nil { instances = append(instances, name) diff --git a/registry/registry.go b/registry/registry.go index 1954d0896..eefaafaf0 100644 --- a/registry/registry.go +++ b/registry/registry.go @@ -34,7 +34,7 @@ var charFilters = make(CharFilterRegistry, 0) var tokenizers = make(TokenizerRegistry, 0) var tokenMaps = make(TokenMapRegistry, 0) var tokenFilters = make(TokenFilterRegistry, 0) -var analyzers = make(AnalyzerRegistry, 0) +var analyzers = NewAnalyzerRegistry() var dateTimeParsers = make(DateTimeParserRegistry, 0) type Cache struct { diff --git a/search/query/match.go b/search/query/match.go index 074d11d34..a57232ffb 100644 --- a/search/query/match.go +++ b/search/query/match.go @@ -18,6 +18,7 @@ import ( "context" "fmt" + "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/util" @@ -134,7 +135,10 @@ func (q *MatchQuery) Searcher(ctx context.Context, i index.IndexReader, m mappin return nil, fmt.Errorf("no analyzer named '%s' registered", q.Analyzer) } - tokens := analyzer.Analyze([]byte(q.Match)) + tokens, err := analysis.AnalyzeForTokens(analyzer, []byte(q.Match)) + if err != nil { + return nil, fmt.Errorf("error analyzing input, err:%v", err) + } if len(tokens) > 0 { tqs := make([]Query, len(tokens)) diff --git a/search/query/match_phrase.go b/search/query/match_phrase.go index 63a16a534..726e63ef7 100644 --- a/search/query/match_phrase.go +++ b/search/query/match_phrase.go @@ -84,7 +84,10 @@ func (q *MatchPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m return nil, fmt.Errorf("no analyzer named '%s' registered", q.Analyzer) } - tokens := analyzer.Analyze([]byte(q.MatchPhrase)) + tokens, err := analysis.AnalyzeForTokens(analyzer, []byte(q.MatchPhrase)) + if err != nil { + return nil, fmt.Errorf("error analyzing input, err:%v", err) + } if len(tokens) > 0 { phrase := tokenStreamToPhrase(tokens) phraseQuery := NewMultiPhraseQuery(phrase, field)