diff --git a/clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto b/clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto index 280940fa0..da99a7535 100644 --- a/clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto +++ b/clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto @@ -574,6 +574,9 @@ message Field { VectorElementType vectorElementType = 34; // Position increment gap for indexing multi valued TEXT fields. Must be >= 0, defaulting to 100 when not set. optional int32 positionIncrementGap = 35; + // For arrays of strings, ignoreAbove is applied to each array element separately, and string elements longer than ignoreAbove will not be indexed or stored. + // This option is also useful for protecting against Lucene's term byte-length limit of 32766. Length is counted in characters (Java string length), not bytes, so leave headroom for multi-byte UTF-8 text. + optional int32 ignoreAbove = 36; } // Vector field element type diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/field/TextBaseFieldDef.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/field/TextBaseFieldDef.java index decb50368..fa4cc2608 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/field/TextBaseFieldDef.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/field/TextBaseFieldDef.java @@ -62,6 +62,7 @@ public abstract class TextBaseFieldDef extends IndexableFieldDef public final Map ordinalLookupCache = new HashMap<>(); private final Object ordinalBuilderLock = new Object(); + private final int ignoreAbove; /** * Field constructor. Uses {@link IndexableFieldDef#IndexableFieldDef(String, Field)} to do common @@ -76,6 +77,7 @@ protected TextBaseFieldDef(String name, Field requestField) { indexAnalyzer = parseIndexAnalyzer(requestField); searchAnalyzer = parseSearchAnalyzer(requestField); eagerFieldGlobalOrdinals = requestField.getEagerFieldGlobalOrdinals(); + ignoreAbove = requestField.hasIgnoreAbove() ? 
requestField.getIgnoreAbove() : Integer.MAX_VALUE; } @Override @@ -236,18 +238,20 @@ public void parseDocumentField( BytesRef stringBytes = new BytesRef(fieldStr); if (docValuesType == DocValuesType.BINARY) { document.add(new BinaryDocValuesField(getName(), stringBytes)); - } else if (docValuesType == DocValuesType.SORTED) { - document.add(new SortedDocValuesField(getName(), stringBytes)); - } else if (docValuesType == DocValuesType.SORTED_SET) { - document.add(new SortedSetDocValuesField(getName(), stringBytes)); - } else { - throw new IllegalArgumentException( - String.format( - "Unsupported doc value type %s for field %s", docValuesType, this.getName())); + } else if (fieldStr.length() <= ignoreAbove) { + if (docValuesType == DocValuesType.SORTED) { + document.add(new SortedDocValuesField(getName(), stringBytes)); + } else if (docValuesType == DocValuesType.SORTED_SET) { + document.add(new SortedSetDocValuesField(getName(), stringBytes)); + } else { + throw new IllegalArgumentException( + String.format( + "Unsupported doc value type %s for field %s", docValuesType, this.getName())); + } } } - if (isStored() || isSearchable()) { + if ((isStored() || isSearchable()) && fieldStr.length() <= ignoreAbove) { document.add(new FieldWithData(getName(), fieldType, fieldStr)); } diff --git a/src/test/java/com/yelp/nrtsearch/server/grpc/IgnoreAboveTest.java b/src/test/java/com/yelp/nrtsearch/server/grpc/IgnoreAboveTest.java new file mode 100644 index 000000000..43be04058 --- /dev/null +++ b/src/test/java/com/yelp/nrtsearch/server/grpc/IgnoreAboveTest.java @@ -0,0 +1,140 @@ +/* + * Copyright 2022 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.yelp.nrtsearch.server.grpc; + +import static org.junit.Assert.assertEquals; + +import com.yelp.nrtsearch.server.config.IndexStartConfig.IndexDataLocationType; +import com.yelp.nrtsearch.server.grpc.AddDocumentRequest.MultiValuedField; +import java.io.IOException; +import java.util.List; +import java.util.stream.Stream; +import org.junit.After; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class IgnoreAboveTest { + + @Rule public final TemporaryFolder folder = new TemporaryFolder(); + + private static final List fields = + List.of( + Field.newBuilder() + .setName("id") + .setType(FieldType._ID) + .setStoreDocValues(true) + .setSearch(true) + .build(), + Field.newBuilder() + .setName("field1") + .setStoreDocValues(true) + .setSearch(true) + .setMultiValued(true) + .setIgnoreAbove(12) + .setType(FieldType.TEXT) + .build()); + + @After + public void cleanup() { + TestServer.cleanupAll(); + } + + private void addInitialDoc(TestServer testServer) { + AddDocumentRequest addDocumentRequest = + AddDocumentRequest.newBuilder() + .setIndexName("test_index") + .putFields("id", MultiValuedField.newBuilder().addValue("1").build()) + .putFields("field1", MultiValuedField.newBuilder().addValue("first Vendor").build()) + .build(); + testServer.addDocs(Stream.of(addDocumentRequest)); + } + + private void addAdditionalDoc(TestServer testServer) { + AddDocumentRequest addDocumentRequest = + AddDocumentRequest.newBuilder() + .setIndexName("test_index") + .putFields("id", 
MultiValuedField.newBuilder().addValue("2").build()) + .putFields( + "field1", + MultiValuedField.newBuilder() + .addValue("second Vendor") + .addValue("new Vendor") + .build()) + .build(); + testServer.addDocs(Stream.of(addDocumentRequest)); + } + + private void verifyDocs(TestServer testServer) { + SearchRequest request = + SearchRequest.newBuilder() + .setIndexName("test_index") + .addRetrieveFields("id") + .setStartHit(0) + .setTopHits(10) + .setQuery( + Query.newBuilder() + .setMatchQuery( + MatchQuery.newBuilder().setField("field1").setQuery("first").build()) + .build()) + .build(); + SearchResponse response = testServer.getClient().getBlockingStub().search(request); + assertEquals(1, response.getHitsCount()); + request = + SearchRequest.newBuilder() + .setIndexName("test_index") + .addRetrieveFields("id") + .setStartHit(0) + .setTopHits(10) + .setQuery( + Query.newBuilder() + .setMatchQuery( + MatchQuery.newBuilder().setField("field1").setQuery("second").build()) + .build()) + .build(); + response = testServer.getClient().getBlockingStub().search(request); + assertEquals(0, response.getHitsCount()); + request = + SearchRequest.newBuilder() + .setIndexName("test_index") + .addRetrieveFields("id") + .setStartHit(0) + .setTopHits(10) + .setQuery( + Query.newBuilder() + .setMatchQuery( + MatchQuery.newBuilder().setField("field1").setQuery("new").build()) + .build()) + .build(); + response = testServer.getClient().getBlockingStub().search(request); + assertEquals(1, response.getHitsCount()); + } + + @Test + public void testIgnoreAbove() throws IOException { + TestServer primaryServer = + TestServer.builder(folder) + .withAutoStartConfig(true, Mode.PRIMARY, 0, IndexDataLocationType.LOCAL) + .build(); + primaryServer.createIndex("test_index"); + primaryServer.registerFields("test_index", fields); + primaryServer.startPrimaryIndex("test_index", -1, null); + addInitialDoc(primaryServer); + addAdditionalDoc(primaryServer); + primaryServer.refresh("test_index"); + 
verifyDocs(primaryServer); + } +}