Skip to content

Commit

Permalink
Address PR comments
Browse files Browse the repository at this point in the history
  • Loading branch information
swethakann committed Apr 22, 2024
1 parent f848734 commit 913d062
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 56 deletions.
14 changes: 5 additions & 9 deletions clientlib/src/main/proto/yelp/nrtsearch/search.proto
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,11 @@ enum RewriteMethod {
}

message FuzzyParams {
int32 prefixLength = 1; // Length of common (non-fuzzy) prefix
int32 maxExpansions = 2; // The maximum number of terms to match.
bool transpositions = 3; // True if transpositions should be treated as a primitive edit operation. If this is false (default), comparisons will implement the classic Levenshtein algorithm.

// Fuzziness can be AUTO or based on number of edits. AUTO will determine maxEdits based in term length.
oneof Fuzziness {
int32 maxEdits = 4; // The maximum allowed Levenshtein Edit Distance (or number of edits). Possible values are 0, 1 and 2.
AutoFuzziness auto = 5; // Auto fuzziness which determines the max edits based on the term length. AUTO is the preferred setting.
}
int32 maxEdits = 1; // The maximum allowed Levenshtein Edit Distance (or number of edits). Possible values are 0, 1 and 2. Either set this or auto.
int32 prefixLength = 2; // Length of common (non-fuzzy) prefix
int32 maxExpansions = 3; // The maximum number of terms to match.
bool transpositions = 4; // True if transpositions should be treated as a primitive edit operation. If this is false (default), comparisons will implement the classic Levenshtein algorithm.
AutoFuzziness auto = 5; // Auto fuzziness which determines the max edits based on the term length. AUTO is the preferred setting. Either set this or maxEdits.

// Optional low and high values for auto fuzziness. Defaults to low: 3 and high: 6 if both are unset. Valid values are low >= 0 and low < high
message AutoFuzziness {
Expand Down
16 changes: 6 additions & 10 deletions docs/queries/match.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ A query that analyzes the text before finding matching documents. The tokens res

Fuzziness: Allows inexact fuzzy matching using FuzzyParams. When querying text or keyword fields, fuzziness is interpreted as Levenshtein Edit Distance (the number of single-character changes needed to turn one string into another).

The FuzzyParams can be specified as:
The FuzzyParams can be specified as follows. Only one of the two should be set.

maxEdits - 0, 1, 2 which is the maximum allowed Levenshtein Edit Distance.
AutoFuzziness - edit distance is computed based on the term length. It optionally takes low and high values, which default to 3 and 6.
Expand All @@ -27,15 +27,11 @@ Proto definition:
}
message FuzzyParams {
int32 prefixLength = 1; // Length of common (non-fuzzy) prefix
int32 maxExpansions = 2; // The maximum number of terms to match.
bool transpositions = 3; // True if transpositions should be treated as a primitive edit operation. If this is false (default), comparisons will implement the classic Levenshtein algorithm.
// Fuzziness can be AUTO or based on number of edits. AUTO will determine maxEdits based in term length.
oneof Fuzziness {
int32 maxEdits = 4; // The maximum allowed Levenshtein Edit Distance (or number of edits). Possible values are 0, 1 and 2.
AutoFuzziness auto = 5; // Auto fuzziness which determines the max edits based on the term length. AUTO is the preferred setting.
}
int32 maxEdits = 1; // The maximum allowed Levenshtein Edit Distance (or number of edits). Possible values are 0, 1 and 2. Either set this or auto.
int32 prefixLength = 2; // Length of common (non-fuzzy) prefix
int32 maxExpansions = 3; // The maximum number of terms to match.
bool transpositions = 4; // True if transpositions should be treated as a primitive edit operation. If this is false (default), comparisons will implement the classic Levenshtein algorithm.
AutoFuzziness auto = 5; // Auto fuzziness which determines the max edits based on the term length. AUTO is the preferred setting. Either set this or maxEdits.
// Optional low and high values for auto fuzziness. Defaults to low: 3 and high: 6 if both are unset. Valid values are low >= 0 and low < high
message AutoFuzziness {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
package com.yelp.nrtsearch.server.luceneserver;

import static com.yelp.nrtsearch.server.utils.QueryUtils.computeMaxEditsFromTermLength;

import com.yelp.nrtsearch.server.grpc.FuzzyParams;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
Expand All @@ -24,62 +26,30 @@

public class MatchQueryBuilder extends QueryBuilder {

private final int DEFAULT_LOW = 3;
private final int DEFAULT_HIGH = 6;
int prefixLength;
int maxExpansions;
boolean transpositions;
FuzzyParams fuzzyParams;
int maxEdits;

public MatchQueryBuilder(Analyzer analyzer, FuzzyParams fuzzyParams) {
super(analyzer);
this.fuzzyParams = fuzzyParams;
this.maxEdits = fuzzyParams.getMaxEdits();
this.prefixLength = fuzzyParams.getPrefixLength();
this.maxExpansions = fuzzyParams.getMaxExpansions();
this.transpositions = fuzzyParams.getTranspositions();
this.fuzzyParams = fuzzyParams;
}

@Override
protected Query newTermQuery(Term term) {
FuzzyParams.FuzzinessCase fuzziness = fuzzyParams.getFuzzinessCase();
int maxEdits;
switch (fuzziness) {
case AUTO -> maxEdits = computeMaxEditsFromTermLength(term);
case MAXEDITS -> maxEdits = fuzzyParams.getMaxEdits();
default -> {
return super.newTermQuery(term); // If fuzziness is not set
}
if (fuzzyParams.hasAuto()) {
maxEdits = computeMaxEditsFromTermLength(term, fuzzyParams.getAuto());
}
if (maxEdits == 0) {
return super.newTermQuery(term);
} else {
return new FuzzyQuery(term, maxEdits, prefixLength, maxExpansions, transpositions);
}
}

protected int computeMaxEditsFromTermLength(Term term) {
int maxEdits;
int low = fuzzyParams.getAuto().getLow();
int high = fuzzyParams.getAuto().getHigh();
int termLength = term.bytes().length;
// If both values are not set, use default values
if (low == 0 && high == 0) {
low = DEFAULT_LOW;
high = DEFAULT_HIGH;
}
if (low < 0) {
throw new IllegalArgumentException("AutoFuzziness low value cannot be negative");
}
if (low >= high) {
throw new IllegalArgumentException("AutoFuzziness low value should be < high value");
}
if (termLength >= 0 && termLength < low) {
maxEdits = 0;
} else if (termLength >= low && termLength < high) {
maxEdits = 1;
} else {
maxEdits = 2;
}
return maxEdits;
}
}
51 changes: 51 additions & 0 deletions src/main/java/com/yelp/nrtsearch/server/utils/QueryUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright 2024 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.utils;

import com.yelp.nrtsearch.server.grpc.FuzzyParams;
import org.apache.lucene.index.Term;

/** Static helpers shared by query builders. */
public class QueryUtils {
  private static final int DEFAULT_LOW = 3; // default low value for auto fuzziness
  private static final int DEFAULT_HIGH = 6; // default high value for auto fuzziness

  /**
   * Choose the maximum edit distance for a term from its length, using the auto fuzziness
   * thresholds.
   *
   * <p>Terms shorter than {@code low} get 0 edits, terms whose length is in {@code [low, high)}
   * get 1 edit, and all longer terms get 2 edits. When both {@code low} and {@code high} are 0
   * (i.e. unset), the defaults 3 and 6 are used.
   *
   * <p>The term length is measured in Unicode code points, not UTF-8 bytes, so that multi-byte
   * characters (accented letters, CJK, emoji) count as one character each. Counting bytes would
   * grant short non-ASCII terms more edits than their visible length warrants.
   *
   * @param term term whose text length drives the decision
   * @param autoFuzziness low/high thresholds from the request
   * @return 0, 1 or 2
   * @throws IllegalArgumentException if {@code low} is negative or {@code low >= high}
   */
  public static int computeMaxEditsFromTermLength(
      Term term, FuzzyParams.AutoFuzziness autoFuzziness) {
    int low = autoFuzziness.getLow();
    int high = autoFuzziness.getHigh();
    // If both values are not set, use default values
    if (low == 0 && high == 0) {
      low = DEFAULT_LOW;
      high = DEFAULT_HIGH;
    }
    if (low < 0) {
      throw new IllegalArgumentException("AutoFuzziness low value cannot be negative");
    }
    if (low >= high) {
      throw new IllegalArgumentException("AutoFuzziness low value should be < high value");
    }

    // Count characters (code points) rather than encoded bytes; see the method documentation.
    String text = term.text();
    int termLength = text.codePointCount(0, text.length());

    if (termLength < low) {
      return 0;
    }
    if (termLength < high) {
      return 1;
    }
    return 2;
  }
}

0 comments on commit 913d062

Please sign in to comment.