From 354624f5a682a82d07727f2d06f6a2da8dabfb9e Mon Sep 17 00:00:00 2001
From: Yannick Welsch
Date: Tue, 22 Mar 2022 09:41:43 +0100
Subject: [PATCH 01/19] Support old postings formats

---
 .../core/internal/io/IOUtils.java                  |   42 +
 .../xpack/lucene/bwc/codecs/BWCCodec.java          |   81 +-
 .../LegacyAdaptingPerFieldPostingsFormat.java      |  200 ++
 .../blocktree/CompressionAlgorithm.java            |   77 +
 .../lucene40/blocktree/FieldReader.java            |  207 ++
 .../blocktree/IntersectTermsEnum.java              |  577 ++++++
 .../blocktree/IntersectTermsEnumFrame.java         |  358 ++++
 .../Lucene40BlockTreeTermsReader.java              |  395 ++++
 .../lucene40/blocktree/SegmentTermsEnum.java       | 1170 +++++++++++
 .../blocktree/SegmentTermsEnumFrame.java           |  765 +++++++
 .../bwc/codecs/lucene40/blocktree/Stats.java       |  277 +++
 .../lucene50/BWCLucene50PostingsFormat.java        |  477 +++++
 .../lucene/bwc/codecs/lucene50/ForUtil.java        |  235 +++
 .../lucene50/Lucene50PostingsReader.java           | 1787 +++++++++++++++++
 .../lucene50/Lucene50ScoreSkipReader.java          |  167 ++
 .../codecs/lucene50/Lucene50SkipReader.java        |  210 ++
 .../bwc/codecs/lucene60/Lucene60Codec.java         |   22 +
 .../bwc/codecs/lucene62/Lucene62Codec.java         |   23 +
 .../bwc/codecs/lucene70/BWCLucene70Codec.java      |   13 +
 .../bwc/codecs/lucene70/fst/BitTableUtil.java      |  176 ++
 .../lucene70/fst/ByteSequenceOutputs.java          |  164 ++
 .../codecs/lucene70/fst/BytesRefFSTEnum.java       |  129 ++
 .../bwc/codecs/lucene70/fst/BytesStore.java        |  520 +++++
 .../lucene/bwc/codecs/lucene70/fst/FST.java        | 1569 +++++++++++++++
 .../bwc/codecs/lucene70/fst/FSTCompiler.java       |  804 ++++++++
 .../bwc/codecs/lucene70/fst/FSTEnum.java           |  660 ++++++
 .../bwc/codecs/lucene70/fst/FSTStore.java          |   37 +
 .../lucene70/fst/ForwardBytesReader.java           |   64 +
 .../bwc/codecs/lucene70/fst/NodeHash.java          |  192 ++
 .../codecs/lucene70/fst/OffHeapFSTStore.java       |   79 +
 .../codecs/lucene70/fst/OnHeapFSTStore.java        |  103 +
 .../bwc/codecs/lucene70/fst/Outputs.java           |  108 +
 .../lucene70/fst/ReverseBytesReader.java           |   62 +
 .../fst/ReverseRandomAccessReader.java             |   67 +
 .../lucene/bwc/codecs/lucene70/fst/Util.java       |  903 +++++++++
 .../Lucene40BlockTreeTermsWriter.java              | 1124 +++++++++++
 .../lucene50/BlockPostingsFormat2Tests.java        |  149 ++
 .../lucene50/BlockPostingsFormat3Tests.java        |  477 +++++
 .../lucene50/BlockPostingsFormatTests.java         |  138 ++
 .../lucene50/Lucene50PostingsWriter.java           |  513 +++++
 .../lucene50/Lucene50RWPostingsFormat.java         |   56 +
 .../codecs/lucene50/Lucene50SkipWriter.java        |  233 +++
 .../org.apache.lucene.codecs.PostingsFormat        |   16 +
 .../oldrepos/OldRepositoryAccessIT.java            |   19 +
 44 files changed, 15368 insertions(+), 77 deletions(-)
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/CompressionAlgorithm.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnum.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnumFrame.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnum.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnumFrame.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/ForUtil.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50ScoreSkipReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesStore.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTStore.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ForwardBytesReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/NodeHash.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseBytesReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseRandomAccessReader.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat2Tests.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat3Tests.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormatTests.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50RWPostingsFormat.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipWriter.java
 create mode 100644 x-pack/plugin/old-lucene-versions/src/test/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat

diff --git a/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java b/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java
index 0a45fe010bbb5..183ff4111b693 100644
--- a/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java
+++ b/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java
@@ -316,4 +316,46 @@ public static void fsync(final Path fileToSync, final boolean isDir, final boole
             }
         }
     }
+
+    /**
+     * This utility method takes a previously caught (non-null) {@code Throwable} and rethrows either
+     * the original argument if it was a subclass of {@code IOException} or a {@code
+     * RuntimeException} with the cause set to the argument.
+     *
+     * <p>This method never returns any value, even though it declares a return value
+     * of type {@link Error}. The return value declaration is very useful to let the compiler know
+     * that the code path following the invocation of this method is unreachable. So in most cases the
+     * invocation of this method will be guarded by an {@code if} and used together with a {@code
+     * throw} statement, as in:
+     *
+     * <pre>{@code
+     * if (t != null) throw IOUtils.rethrowAlways(t)
+     * }</pre>
+ * + * @param th The throwable to rethrow, must not be null. + * @return This method always results in an exception, it never returns any value. See method + * documentation for details and usage example. + * @throws IOException if the argument was an instance of IOException + * @throws RuntimeException with the {@link RuntimeException#getCause()} set to the argument, if + * it was not an instance of IOException. + */ + public static Error rethrowAlways(Throwable th) throws IOException, RuntimeException { + if (th == null) { + throw new AssertionError("rethrow argument must not be null."); + } + + if (th instanceof IOException) { + throw (IOException) th; + } + + if (th instanceof RuntimeException) { + throw (RuntimeException) th; + } + + if (th instanceof Error) { + throw (Error) th; + } + + throw new RuntimeException(th); + } } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java index b350f6a62404f..be5be0bc6a965 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java @@ -10,30 +10,20 @@ import org.apache.lucene.backward_codecs.lucene70.Lucene70Codec; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FieldInfosFormat; -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsFormat; -import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.Terms; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.BWCLucene70Codec; import java.io.IOException; import java.util.ArrayList; -import java.util.Iterator; import java.util.List; /** @@ -41,17 +31,10 @@ */ public abstract class BWCCodec extends Codec { - private final PostingsFormat postingsFormat = new EmptyPostingsFormat(); - protected BWCCodec(String name) { super(name); } - @Override - public PostingsFormat postingsFormat() { - return postingsFormat; - } - @Override public NormsFormat normsFormat() { throw new UnsupportedOperationException(); @@ -72,62 +55,6 @@ public KnnVectorsFormat knnVectorsFormat() { throw new UnsupportedOperationException(); } - /** - * In-memory postings format that shows no postings available. - * TODO: Remove once https://issues.apache.org/jira/browse/LUCENE-10291 is fixed. 
- */ - static class EmptyPostingsFormat extends PostingsFormat { - - protected EmptyPostingsFormat() { - super("EmptyPostingsFormat"); - } - - @Override - public FieldsConsumer fieldsConsumer(SegmentWriteState state) { - return new FieldsConsumer() { - @Override - public void write(Fields fields, NormsProducer norms) { - throw new UnsupportedOperationException(); - } - - @Override - public void close() { - - } - }; - } - - @Override - public FieldsProducer fieldsProducer(SegmentReadState state) { - return new FieldsProducer() { - @Override - public void close() { - - } - - @Override - public void checkIntegrity() { - - } - - @Override - public Iterator iterator() { - return null; - } - - @Override - public Terms terms(String field) { - return null; - } - - @Override - public int size() { - return 0; - } - }; - } - } - protected static SegmentInfoFormat wrap(SegmentInfoFormat wrapped) { return new SegmentInfoFormat() { @Override @@ -158,7 +85,7 @@ public void write(Directory directory, SegmentInfo segmentInfo, String segmentSu }; } - // mark all fields as having no postings, no term vectors, no norms, no payloads, no points, and no vectors. + // mark all fields as no term vectors, no norms, no payloads, no points, and no vectors. private static FieldInfos filterFields(FieldInfos fieldInfos) { List fieldInfoCopy = new ArrayList<>(fieldInfos.size()); for (FieldInfo fieldInfo : fieldInfos) { @@ -167,9 +94,9 @@ private static FieldInfos filterFields(FieldInfos fieldInfos) { fieldInfo.name, fieldInfo.number, false, + true, false, - false, - IndexOptions.NONE, + fieldInfo.getIndexOptions(), fieldInfo.getDocValuesType(), fieldInfo.getDocValuesGen(), fieldInfo.attributes(), @@ -202,7 +129,7 @@ public static SegmentInfo wrap(SegmentInfo segmentInfo) { codec, segmentInfo.getDiagnostics(), segmentInfo.getId(), - segmentInfo.getAttributes(), + segmentInfo.getAttributes(), // adapt attributes so that per-field format codecs are overriden segmentInfo.getIndexSort() ); segmentInfo1.setFiles(segmentInfo.files()); diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java new file mode 100644 index 0000000000000..8aefcd875834c --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java @@ -0,0 +1,200 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.lucene.bwc.codecs; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.elasticsearch.core.internal.io.IOUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.IdentityHashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +public abstract class LegacyAdaptingPerFieldPostingsFormat extends PostingsFormat { + /** Name of this {@link PostingsFormat}. */ + public static final String PER_FIELD_NAME = "PerField40"; + + /** {@link FieldInfo} attribute name used to store the format name for each field. */ + public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format"; + + /** {@link FieldInfo} attribute name used to store the segment suffix name for each field. */ + public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix"; + + /** Sole constructor. */ + protected LegacyAdaptingPerFieldPostingsFormat() { + super(PER_FIELD_NAME); + } + + static String getSuffix(String formatName, String suffix) { + return formatName + "_" + suffix; + } + + protected PostingsFormat getPostingsFormat(String formatName) { + throw new IllegalArgumentException(formatName); + } + + private class FieldsWriter extends FieldsConsumer { + final SegmentWriteState writeState; + final List toClose = new ArrayList(); + + FieldsWriter(SegmentWriteState writeState) { + this.writeState = writeState; + } + + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void merge(MergeState mergeState, NormsProducer norms) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void close() throws IOException { + IOUtils.close(toClose); + } + } + + private static class FieldsReader extends FieldsProducer { + + private final Map fields = new TreeMap<>(); + private final Map formats = new HashMap<>(); + private final String segment; + + // clone for merge + FieldsReader(FieldsReader other) { + Map oldToNew = new IdentityHashMap<>(); + // First clone all formats + for (Map.Entry ent : other.formats.entrySet()) { + FieldsProducer values = ent.getValue().getMergeInstance(); + formats.put(ent.getKey(), values); + oldToNew.put(ent.getValue(), values); + } + + // Then rebuild fields: + for (Map.Entry ent : other.fields.entrySet()) { + FieldsProducer producer = oldToNew.get(ent.getValue()); + assert producer != null; + fields.put(ent.getKey(), producer); + } + + segment = other.segment; + } + + FieldsReader(final SegmentReadState readState, LegacyAdaptingPerFieldPostingsFormat legacyAdaptingPerFieldPostingsFormat) + throws IOException { + + // Read _X.per and init each format: + boolean success = false; + try { + // Read field name -> format name + for (FieldInfo fi : 
readState.fieldInfos) { + if (fi.getIndexOptions() != IndexOptions.NONE) { + final String fieldName = fi.name; + final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY); + if (formatName != null) { + // null formatName means the field is in fieldInfos, but has no postings! + final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY); + if (suffix == null) { + throw new IllegalStateException("missing attribute: " + PER_FIELD_SUFFIX_KEY + " for field: " + fieldName); + } + PostingsFormat format = legacyAdaptingPerFieldPostingsFormat.getPostingsFormat(formatName); + String segmentSuffix = getSuffix(formatName, suffix); + if (formats.containsKey(segmentSuffix) == false) { + formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix))); + } + fields.put(fieldName, formats.get(segmentSuffix)); + } + } + } + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(formats.values()); + } + } + + this.segment = readState.segmentInfo.name; + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + FieldsProducer fieldsProducer = fields.get(field); + return fieldsProducer == null ? null : fieldsProducer.terms(field); + } + + @Override + public int size() { + return fields.size(); + } + + @Override + public void close() throws IOException { + IOUtils.close(formats.values()); + } + + @Override + public void checkIntegrity() throws IOException { + for (FieldsProducer producer : formats.values()) { + producer.checkIntegrity(); + } + } + + @Override + public FieldsProducer getMergeInstance() { + return new FieldsReader(this); + } + + @Override + public String toString() { + return "PerFieldPostings(segment=" + segment + " formats=" + formats.size() + ")"; + } + } + + @Override + public final FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new FieldsWriter(state); + } + + @Override + public final FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + return new FieldsReader(state, this); + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

The field to format mapping is written to the index, so this method is only invoked when + * writing, not when reading. + */ + public abstract PostingsFormat getPostingsFormatForField(String field); +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/CompressionAlgorithm.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/CompressionAlgorithm.java new file mode 100644 index 0000000000000..c353279451a3e --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/CompressionAlgorithm.java @@ -0,0 +1,77 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.compress.LowercaseAsciiCompression; + +import java.io.IOException; + +/** Compression algorithm used for suffixes of a block of terms. */ +enum CompressionAlgorithm { + NO_COMPRESSION(0x00) { + + @Override + void read(DataInput in, byte[] out, int len) throws IOException { + in.readBytes(out, 0, len); + } + }, + + LOWERCASE_ASCII(0x01) { + + @Override + void read(DataInput in, byte[] out, int len) throws IOException { + LowercaseAsciiCompression.decompress(in, out, len); + } + }, + + LZ4(0x02) { + + @Override + void read(DataInput in, byte[] out, int len) throws IOException { + org.apache.lucene.util.compress.LZ4.decompress(EndiannessReverserUtil.wrapDataInput(in), len, out, 0); + } + }; + + private static final CompressionAlgorithm[] BY_CODE = new CompressionAlgorithm[3]; + + static { + for (CompressionAlgorithm alg : CompressionAlgorithm.values()) { + BY_CODE[alg.code] = alg; + } + } + + /** Look up a {@link CompressionAlgorithm} by its {@link CompressionAlgorithm#code}. 
*/ + static CompressionAlgorithm byCode(int code) { + if (code < 0 || code >= BY_CODE.length) { + throw new IllegalArgumentException("Illegal code for a compression algorithm: " + code); + } + return BY_CODE[code]; + } + + public final int code; + + CompressionAlgorithm(int code) { + this.code = code; + } + + abstract void read(DataInput in, byte[] out, int len) throws IOException; +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java new file mode 100644 index 0000000000000..3d24e82edd18b --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java @@ -0,0 +1,207 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.ByteSequenceOutputs; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.OffHeapFSTStore; + +import java.io.IOException; + +/** + * BlockTree's implementation of {@link Terms}. 
+ * + * @lucene.internal + */ +public final class FieldReader extends Terms { + + // private final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + final long numTerms; + final FieldInfo fieldInfo; + final long sumTotalTermFreq; + final long sumDocFreq; + final int docCount; + final long rootBlockFP; + final BytesRef rootCode; + final BytesRef minTerm; + final BytesRef maxTerm; + final Lucene40BlockTreeTermsReader parent; + + final FST index; + // private boolean DEBUG; + + FieldReader( + Lucene40BlockTreeTermsReader parent, + FieldInfo fieldInfo, + long numTerms, + BytesRef rootCode, + long sumTotalTermFreq, + long sumDocFreq, + int docCount, + long indexStartFP, + IndexInput metaIn, + IndexInput indexIn, + BytesRef minTerm, + BytesRef maxTerm + ) throws IOException { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + // DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); + this.parent = parent; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.rootCode = rootCode; + this.minTerm = minTerm; + this.maxTerm = maxTerm; + // if (DEBUG) { + // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + // + rootCode + " divisor=" + indexDivisor); + // } + rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() + >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; + // Initialize FST always off-heap. + final IndexInput clone = indexIn.clone(); + clone.seek(indexStartFP); + if (metaIn == indexIn) { // Only true before Lucene 8.6 + index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); + } else { + index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); + } + /* + if (false) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(index, w, false, false); + System.out.println("FST INDEX: SAVED to " + dotFileName); + w.close(); + } + */ + } + + @Override + public BytesRef getMin() throws IOException { + if (minTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMin(); + } else { + return minTerm; + } + } + + @Override + public BytesRef getMax() throws IOException { + if (maxTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMax(); + } else { + return maxTerm; + } + } + + /** For debugging -- used by CheckIndex too */ + @Override + public Stats getStats() throws IOException { + return new SegmentTermsEnum(this).computeBlockStats(); + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(this); + } + + @Override + public long size() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() { + return sumDocFreq; + } + + @Override + public 
int getDocCount() { + return docCount; + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + // if (DEBUG) System.out.println(" FieldReader.intersect startTerm=" + + // BlockTreeTermsWriter.brToString(startTerm)); + // System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton); + // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum? + // can we optimize knowing that...? + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm); + } + + @Override + public String toString() { + return "BlockTreeTerms(seg=" + + parent.segment + + " terms=" + + numTerms + + ",postings=" + + sumDocFreq + + ",positions=" + + sumTotalTermFreq + + ",docs=" + + docCount + + ")"; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnum.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnum.java new file mode 100644 index 0000000000000..7bc765a78bd0e --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnum.java @@ -0,0 +1,577 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.Transition; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.ByteSequenceOutputs; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.Outputs; + +import java.io.IOException; + +/** + * This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot + * seek, except for the initial term on init. 
It just "nexts" through the intersection of the + * automaton and the terms. It does not use the terms index at all: on init, it loads the root + * block, and scans its way to the initial term. Likewise, in next it scans until it finds a term + * that matches the current automaton transition. + */ +final class IntersectTermsEnum extends BaseTermsEnum { + + // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + final IndexInput in; + static final Outputs fstOutputs = ByteSequenceOutputs.getSingleton(); + + IntersectTermsEnumFrame[] stack; + + @SuppressWarnings({ "rawtypes", "unchecked" }) + private FST.Arc[] arcs = new FST.Arc[5]; + + final RunAutomaton runAutomaton; + final Automaton automaton; + final BytesRef commonSuffix; + + private IntersectTermsEnumFrame currentFrame; + private Transition currentTransition; + + private final BytesRef term = new BytesRef(); + + private final FST.BytesReader fstReader; + + final FieldReader fr; + + private BytesRef savedStartTerm; + + // TODO: in some cases we can filter by length? eg + // regexp foo*bar must be at least length 6 bytes + IntersectTermsEnum(FieldReader fr, Automaton automaton, RunAutomaton runAutomaton, BytesRef commonSuffix, BytesRef startTerm) + throws IOException { + this.fr = fr; + + assert automaton != null; + assert runAutomaton != null; + + this.runAutomaton = runAutomaton; + this.automaton = automaton; + this.commonSuffix = commonSuffix; + + in = fr.parent.termsIn.clone(); + stack = new IntersectTermsEnumFrame[5]; + for (int idx = 0; idx < stack.length; idx++) { + stack[idx] = new IntersectTermsEnumFrame(this, idx); + } + for (int arcIdx = 0; arcIdx < arcs.length; arcIdx++) { + arcs[arcIdx] = new FST.Arc<>(); + } + + fstReader = fr.index.getBytesReader(); + + // TODO: if the automaton is "smallish" we really + // should use the terms index to seek at least to + // the initial term and likely to subsequent terms + // (or, maybe just fallback to ATE for such cases). + // Else the seek cost of loading the frames will be + // too costly. + + final FST.Arc arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + + // Special pushFrame since it's the first one: + final IntersectTermsEnumFrame f = stack[0]; + f.fp = f.fpOrig = fr.rootBlockFP; + f.prefix = 0; + f.setState(0); + f.arc = arc; + f.outputPrefix = arc.output(); + f.load(fr.rootCode); + + // for assert: + assert setSavedStartTerm(startTerm); + + currentFrame = f; + if (startTerm != null) { + seekToStartTerm(startTerm); + } + currentTransition = currentFrame.transition; + } + + // only for assert: + private boolean setSavedStartTerm(BytesRef startTerm) { + savedStartTerm = startTerm == null ? 
null : BytesRef.deepCopyOf(startTerm); + return true; + } + + @Override + public TermState termState() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.clone(); + } + + private IntersectTermsEnumFrame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final IntersectTermsEnumFrame[] next = new IntersectTermsEnumFrame[ArrayUtil.oversize( + 1 + ord, + RamUsageEstimator.NUM_BYTES_OBJECT_REF + )]; + System.arraycopy(stack, 0, next, 0, stack.length); + for (int stackOrd = stack.length; stackOrd < next.length; stackOrd++) { + next[stackOrd] = new IntersectTermsEnumFrame(this, stackOrd); + } + stack = next; + } + assert stack[ord].ord == ord; + return stack[ord]; + } + + private FST.Arc getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings({ "rawtypes", "unchecked" }) + final FST.Arc[] next = new FST.Arc[ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for (int arcOrd = arcs.length; arcOrd < next.length; arcOrd++) { + next[arcOrd] = new FST.Arc<>(); + } + arcs = next; + } + return arcs[ord]; + } + + private IntersectTermsEnumFrame pushFrame(int state) throws IOException { + assert currentFrame != null; + + final IntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1 + currentFrame.ord); + + f.fp = f.fpOrig = currentFrame.lastSubFP; + f.prefix = currentFrame.prefix + currentFrame.suffix; + f.setState(state); + + // Walk the arc through the index -- we only + // "bother" with this so we can get the floor data + // from the index and skip floor blocks when + // possible: + FST.Arc arc = currentFrame.arc; + int idx = currentFrame.prefix; + assert currentFrame.suffix > 0; + BytesRef output = currentFrame.outputPrefix; + while (idx < f.prefix) { + final int target = term.bytes[idx] & 0xff; + // TODO: we could be more efficient for the next() + // case by using current arc as starting point, + // passed to findTargetArc + arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader); + assert arc != null; + output = fstOutputs.add(output, arc.output()); + idx++; + } + + f.arc = arc; + f.outputPrefix = output; + assert arc.isFinal(); + f.load(fstOutputs.add(output, arc.nextFinalOutput())); + return f; + } + + @Override + public BytesRef term() { + return term; + } + + @Override + public int docFreq() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.impacts(fr.fieldInfo, currentFrame.termState, flags); + } + + private int getState() { + int state = currentFrame.state; + for (int idx = 0; idx < currentFrame.suffix; idx++) { + state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff); + assert state != -1; + } + return state; + } + + // NOTE: specialized to only doing the first-time + // seek, but we could generalize it to allow + // arbitrary seekExact/Ceil. Note that this is a + // seekFloor! 
+ private void seekToStartTerm(BytesRef target) throws IOException { + assert currentFrame.ord == 0; + if (term.length < target.length) { + term.bytes = ArrayUtil.grow(term.bytes, target.length); + } + FST.Arc arc = arcs[0]; + assert arc == currentFrame.arc; + + for (int idx = 0; idx <= target.length; idx++) { + + while (true) { + final int savNextEnt = currentFrame.nextEnt; + final int savePos = currentFrame.suffixesReader.getPosition(); + final int saveLengthPos = currentFrame.suffixLengthsReader.getPosition(); + final int saveStartBytePos = currentFrame.startBytePos; + final int saveSuffix = currentFrame.suffix; + final long saveLastSubFP = currentFrame.lastSubFP; + final int saveTermBlockOrd = currentFrame.termState.termBlockOrd; + + final boolean isSubBlock = currentFrame.next(); + + term.length = currentFrame.prefix + currentFrame.suffix; + if (term.bytes.length < term.length) { + term.bytes = ArrayUtil.grow(term.bytes, term.length); + } + System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); + + if (isSubBlock && StringHelper.startsWith(target, term)) { + // Recurse + currentFrame = pushFrame(getState()); + break; + } else { + final int cmp = term.compareTo(target); + if (cmp < 0) { + if (currentFrame.nextEnt == currentFrame.entCount) { + if (currentFrame.isLastInFloor == false) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + continue; + } else { + return; + } + } + continue; + } else if (cmp == 0) { + return; + } else { + // Fallback to prior entry: the semantics of + // this method is that the first call to + // next() will return the term after the + // requested term + currentFrame.nextEnt = savNextEnt; + currentFrame.lastSubFP = saveLastSubFP; + currentFrame.startBytePos = saveStartBytePos; + currentFrame.suffix = saveSuffix; + currentFrame.suffixesReader.setPosition(savePos); + currentFrame.suffixLengthsReader.setPosition(saveLengthPos); + currentFrame.termState.termBlockOrd = saveTermBlockOrd; + System.arraycopy( + currentFrame.suffixBytes, + currentFrame.startBytePos, + term.bytes, + currentFrame.prefix, + currentFrame.suffix + ); + term.length = currentFrame.prefix + currentFrame.suffix; + // If the last entry was a block we don't + // need to bother recursing and pushing to + // the last term under it because the first + // next() will simply skip the frame anyway + return; + } + } + } + } + + assert false; + } + + private boolean popPushNext() throws IOException { + // Pop finished frames + while (currentFrame.nextEnt == currentFrame.entCount) { + if (currentFrame.isLastInFloor == false) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + break; + } else { + if (currentFrame.ord == 0) { + throw NoMoreTermsException.INSTANCE; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + currentTransition = currentFrame.transition; + assert currentFrame.lastSubFP == lastFP; + } + } + + return currentFrame.next(); + } + + // Only used internally when there are no more terms in next(): + private static final class NoMoreTermsException extends RuntimeException { + + // Only used internally when there are no more terms in next(): + public static final NoMoreTermsException INSTANCE = new NoMoreTermsException(); + + private NoMoreTermsException() {} + + @Override + public Throwable fillInStackTrace() { + // Do nothing: + return this; + } + } + + @Override + public BytesRef next() throws IOException { + try { + return _next(); + } 
catch (@SuppressWarnings("unused") NoMoreTermsException eoi) { + // Provoke NPE if we are (illegally!) called again: + currentFrame = null; + return null; + } + } + + private BytesRef _next() throws IOException { + + boolean isSubBlock = popPushNext(); + + nextTerm: while (true) { + assert currentFrame.transition == currentTransition; + + int state; + int lastState; + + // NOTE: suffix == 0 can only happen on the first term in a block, when + // there is a term exactly matching a prefix in the index. If we + // could somehow re-org the code so we only checked this case immediately + // after pushing a frame... + if (currentFrame.suffix != 0) { + + final byte[] suffixBytes = currentFrame.suffixBytes; + + // This is the first byte of the suffix of the term we are now on: + final int label = suffixBytes[currentFrame.startBytePos] & 0xff; + + if (label < currentTransition.min) { + // Common case: we are scanning terms in this block to "catch up" to + // current transition in the automaton: + int minTrans = currentTransition.min; + while (currentFrame.nextEnt < currentFrame.entCount) { + isSubBlock = currentFrame.next(); + if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) { + continue nextTerm; + } + } + + // End of frame: + isSubBlock = popPushNext(); + continue nextTerm; + } + + // Advance where we are in the automaton to match this label: + + while (label > currentTransition.max) { + if (currentFrame.transitionIndex >= currentFrame.transitionCount - 1) { + // Pop this frame: no further matches are possible because + // we've moved beyond what the max transition will allow + if (currentFrame.ord == 0) { + // Provoke NPE if we are (illegally!) called again: + currentFrame = null; + return null; + } + currentFrame = stack[currentFrame.ord - 1]; + currentTransition = currentFrame.transition; + isSubBlock = popPushNext(); + continue nextTerm; + } + currentFrame.transitionIndex++; + automaton.getNextTransition(currentTransition); + + if (label < currentTransition.min) { + int minTrans = currentTransition.min; + while (currentFrame.nextEnt < currentFrame.entCount) { + isSubBlock = currentFrame.next(); + if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) { + continue nextTerm; + } + } + + // End of frame: + isSubBlock = popPushNext(); + continue nextTerm; + } + } + + if (commonSuffix != null && isSubBlock == false) { + final int termLen = currentFrame.prefix + currentFrame.suffix; + if (termLen < commonSuffix.length) { + // No match + isSubBlock = popPushNext(); + continue nextTerm; + } + + final byte[] commonSuffixBytes = commonSuffix.bytes; + + final int lenInPrefix = commonSuffix.length - currentFrame.suffix; + assert commonSuffix.offset == 0; + int suffixBytesPos; + int commonSuffixBytesPos = 0; + + if (lenInPrefix > 0) { + // A prefix of the common suffix overlaps with + // the suffix of the block prefix so we first + // test whether the prefix part matches: + final byte[] termBytes = term.bytes; + int termBytesPos = currentFrame.prefix - lenInPrefix; + assert termBytesPos >= 0; + final int termBytesPosEnd = currentFrame.prefix; + while (termBytesPos < termBytesPosEnd) { + if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + isSubBlock = popPushNext(); + continue nextTerm; + } + } + suffixBytesPos = currentFrame.startBytePos; + } else { + suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length; + } + + // Test overlapping suffix part: + final int commonSuffixBytesPosEnd = commonSuffix.length; + while 
(commonSuffixBytesPos < commonSuffixBytesPosEnd) { + if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + isSubBlock = popPushNext(); + continue nextTerm; + } + } + } + + // TODO: maybe we should do the same linear test + // that AutomatonTermsEnum does, so that if we + // reach a part of the automaton where .* is + // "temporarily" accepted, we just blindly .next() + // until the limit + + // See if the term suffix matches the automaton: + + // We know from above that the first byte in our suffix (label) matches + // the current transition, so we step from the 2nd byte + // in the suffix: + lastState = currentFrame.state; + state = currentTransition.dest; + + int end = currentFrame.startBytePos + currentFrame.suffix; + for (int idx = currentFrame.startBytePos + 1; idx < end; idx++) { + lastState = state; + state = runAutomaton.step(state, suffixBytes[idx] & 0xff); + if (state == -1) { + // No match + isSubBlock = popPushNext(); + continue nextTerm; + } + } + } else { + state = currentFrame.state; + lastState = currentFrame.lastState; + } + + if (isSubBlock) { + // Match! Recurse: + copyTerm(); + currentFrame = pushFrame(state); + currentTransition = currentFrame.transition; + currentFrame.lastState = lastState; + } else if (runAutomaton.isAccept(state)) { + copyTerm(); + assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0 + : "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString(); + return term; + } else { + // This term is a prefix of a term accepted by the automaton, but is not itself accepted + } + + isSubBlock = popPushNext(); + } + } + + // for debugging + @SuppressWarnings("unused") + static String brToString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + + private void copyTerm() { + final int len = currentFrame.prefix + currentFrame.suffix; + if (term.bytes.length < len) { + term.bytes = ArrayUtil.grow(term.bytes, len); + } + System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); + term.length = len; + } + + @Override + public boolean seekExact(BytesRef text) { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) { + throw new UnsupportedOperationException(); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnumFrame.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnumFrame.java new file mode 100644 index 0000000000000..ab515b958b689 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/IntersectTermsEnumFrame.java @@ -0,0 +1,358 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Transition; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST; + +import java.io.IOException; +import java.util.Arrays; + +// TODO: can we share this with the frame in STE? +final class IntersectTermsEnumFrame { + final int ord; + long fp; + long fpOrig; + long fpEnd; + long lastSubFP; + + // private static boolean DEBUG = IntersectTermsEnum.DEBUG; + + // State in automaton + int state; + + // State just before the last label + int lastState; + + int metaDataUpto; + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] suffixLengthBytes; + final ByteArrayDataInput suffixLengthsReader; + + byte[] statBytes = new byte[64]; + int statsSingletonRunLength = 0; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + byte[] floorData = new byte[32]; + final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + int numFollowFloorBlocks; + int nextFloorLabel; + + final Transition transition = new Transition(); + int transitionIndex; + int transitionCount; + + FST.Arc arc; + + final BlockTermState termState; + + // metadata buffer + byte[] bytes = new byte[32]; + + final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + // Cumulative output so far + BytesRef outputPrefix; + + int startBytePos; + int suffix; + + private final IntersectTermsEnum ite; + private final int version; + + IntersectTermsEnumFrame(IntersectTermsEnum ite, int ord) throws IOException { + this.ite = ite; + this.ord = ord; + this.termState = ite.fr.parent.postingsReader.newTermState(); + this.termState.totalTermFreq = -1; + this.version = ite.fr.parent.version; + if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { + suffixLengthBytes = new byte[32]; + suffixLengthsReader = new ByteArrayDataInput(); + } else { + suffixLengthBytes = null; + suffixLengthsReader = suffixesReader; + } + } + + void loadNextFloorBlock() throws IOException { + assert numFollowFloorBlocks > 0 : "nextFloorLabel=" + nextFloorLabel; + + do { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + 
numFollowFloorBlocks--; + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min); + + load(null); + } + + public void setState(int state) { + this.state = state; + transitionIndex = 0; + transitionCount = ite.automaton.getNumTransitions(state); + if (transitionCount != 0) { + ite.automaton.initTransition(state, transition); + ite.automaton.getNextTransition(transition); + } else { + + // Must set min to -1 so the "label < min" check never falsely triggers: + transition.min = -1; + + // Must set max to -1 so we immediately realize we need to step to the next transition and + // then pop this frame: + transition.max = -1; + } + } + + void load(BytesRef frameIndexData) throws IOException { + if (frameIndexData != null) { + floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); + // Skip first long -- has redundant fp, hasTerms + // flag, isFloor flag + final long code = floorDataReader.readVLong(); + if ((code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { + // Floor frame + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + + // If current state is not accept, and has transitions, we must process + // first block in case it has empty suffix: + if (ite.runAutomaton.isAccept(state) == false && transitionCount != 0) { + // Maybe skip floor blocks: + assert transitionIndex == 0 : "transitionIndex=" + transitionIndex; + while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + } + } + } + } + + ite.in.seek(fp); + int code = ite.in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + // term suffixes: + if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { + final long codeL = ite.in.readVLong(); + isLeafBlock = (codeL & 0x04) != 0; + final int numSuffixBytes = (int) (codeL >>> 3); + if (suffixBytes.length < numSuffixBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numSuffixBytes, 1)]; + } + final CompressionAlgorithm compressionAlg; + try { + compressionAlg = CompressionAlgorithm.byCode((int) codeL & 0x03); + } catch (IllegalArgumentException e) { + throw new CorruptIndexException(e.getMessage(), ite.in, e); + } + compressionAlg.read(ite.in, suffixBytes, numSuffixBytes); + suffixesReader.reset(suffixBytes, 0, numSuffixBytes); + + int numSuffixLengthBytes = ite.in.readVInt(); + final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0; + numSuffixLengthBytes >>>= 1; + if (suffixLengthBytes.length < numSuffixLengthBytes) { + suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)]; + } + if (allEqual) { + Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte()); + } else { + ite.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes); + } + suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes); + } else { + code = ite.in.readVInt(); + isLeafBlock = (code & 1) != 0; + int numBytes = code >>> 1; + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + } + + // stats + int 
numBytes = ite.in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + statsSingletonRunLength = 0; + metaDataUpto = 0; + + termState.termBlockOrd = 0; + nextEnt = 0; + + // metadata + numBytes = ite.in.readVInt(); + if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + + if (isLastInFloor == false) { + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = ite.in.getFilePointer(); + } + } + + // TODO: maybe add scanToLabel; should give perf boost + + // Decodes next entry; returns true if it's a sub-block + public boolean next() { + if (isLeafBlock) { + nextLeaf(); + return false; + } else { + return nextNonLeaf(); + } + } + + public void nextLeaf() { + assert nextEnt != -1 && nextEnt < entCount : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixLengthsReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + } + + public boolean nextNonLeaf() { + assert nextEnt != -1 && nextEnt < entCount : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixLengthsReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + if ((code & 1) == 0) { + // A normal term + termState.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + lastSubFP = fp - suffixLengthsReader.readVLong(); + return true; + } + } + + public int getTermBlockOrd() { + return isLeafBlock ? nextEnt : termState.termBlockOrd; + } + + public void decodeMetaData() throws IOException { + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; + assert limit > 0; + + // TODO: better API would be "jump straight to term=N"??? 
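// Catch-up loop: decodes the stats (docFreq/totalTermFreq) and then the
// postings metadata for every term up to the current one. Only the first
// term decoded after a block load is passed absolute=true to the postings
// reader; subsequent terms are decoded as deltas from the previous term.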
+ while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + + // stats + if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { + if (statsSingletonRunLength > 0) { + termState.docFreq = 1; + termState.totalTermFreq = 1; + statsSingletonRunLength--; + } else { + int token = statsReader.readVInt(); + if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES && (token & 1) == 1) { + termState.docFreq = 1; + termState.totalTermFreq = 1; + statsSingletonRunLength = token >>> 1; + } else { + termState.docFreq = token >>> 1; + if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { + termState.totalTermFreq = termState.docFreq; + } else { + termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); + } + } + } + } else { + termState.docFreq = statsReader.readVInt(); + // if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { + termState.totalTermFreq = termState.docFreq; // all postings have freq=1 + } else { + termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); + // if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + } + // metadata + ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute); + + metaDataUpto++; + absolute = false; + } + termState.termBlockOrd = metaDataUpto; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java new file mode 100644 index 0000000000000..807b821d8d145 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java @@ -0,0 +1,395 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.ByteSequenceOutputs; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.Outputs; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * A block-based terms index and dictionary that assigns terms to variable length blocks according + * to how they share prefixes. The terms index is a prefix trie whose leaves are term blocks. The + * advantage of this approach is that seekExact is often able to determine a term cannot exist + * without doing any IO, and intersection with Automata is very fast. Note that this terms + * dictionary has its own fixed terms index (ie, it does not support a pluggable terms index + * implementation). + * + *
<p>
NOTE: this terms dictionary supports min/maxItemsPerBlock during indexing to control + * how much memory the terms index uses. + * + *
<p>
The data structure used by this implementation is very similar to a burst trie + * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499), but with added logic to break + * up too-large blocks of all terms sharing a given prefix into smaller ones. + * + *
<p>
Use {@link org.apache.lucene.index.CheckIndex} with the -verbose option to see + * summary statistics on the blocks in the dictionary. + * + *
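The leading vLong of each terms-index output packs the target block's file pointer together
with two flags, roughly {@code (fp << 2) | (hasTerms ? 2 : 0) | (isFloor ? 1 : 0)}; see the
{@code OUTPUT_FLAG_*} constants below.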
<p>
See {@code BlockTreeTermsWriter}. + * + * @lucene.experimental + */ +public final class Lucene40BlockTreeTermsReader extends FieldsProducer { + + static final Outputs FST_OUTPUTS = ByteSequenceOutputs.getSingleton(); + + static final BytesRef NO_OUTPUT = FST_OUTPUTS.getNoOutput(); + + static final int OUTPUT_FLAGS_NUM_BITS = 2; + static final int OUTPUT_FLAGS_MASK = 0x3; + static final int OUTPUT_FLAG_IS_FLOOR = 0x1; + static final int OUTPUT_FLAG_HAS_TERMS = 0x2; + + /** Extension of terms file */ + static final String TERMS_EXTENSION = "tim"; + + static final String TERMS_CODEC_NAME = "BlockTreeTermsDict"; + + /** Initial terms format. */ + public static final int VERSION_START = 2; + + /** Auto-prefix terms have been superseded by points. */ + public static final int VERSION_AUTO_PREFIX_TERMS_REMOVED = 3; + + /** The long[] + byte[] metadata has been replaced with a single byte[]. */ + public static final int VERSION_META_LONGS_REMOVED = 4; + + /** Suffixes are compressed to save space. */ + public static final int VERSION_COMPRESSED_SUFFIXES = 5; + + /** Metadata is written to its own file. */ + public static final int VERSION_META_FILE = 6; + + /** Current terms format. */ + public static final int VERSION_CURRENT = VERSION_META_FILE; + + /** Extension of terms index file */ + static final String TERMS_INDEX_EXTENSION = "tip"; + + static final String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex"; + + /** Extension of terms meta file */ + static final String TERMS_META_EXTENSION = "tmd"; + + static final String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta"; + + // Open input to the main terms dict file (_X.tib) + final IndexInput termsIn; + // Open input to the terms index file (_X.tip) + final IndexInput indexIn; + + // private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + final PostingsReaderBase postingsReader; + + private final Map fieldMap; + private final List fieldList; + + final String segment; + + final int version; + + /** Sole constructor. 
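Opens the {@code .tim} terms dictionary and {@code .tip} index inputs (and, for
{@code VERSION_META_FILE} and newer, the {@code .tmd} metadata file) and reads the
per-field metadata.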
*/ + public Lucene40BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException { + boolean success = false; + + this.postingsReader = postingsReader; + this.segment = state.segmentInfo.name; + + try { + String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION); + termsIn = EndiannessReverserUtil.openInput(state.directory, termsName, state.context); + version = CodecUtil.checkIndexHeader( + termsIn, + TERMS_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + if (version < VERSION_AUTO_PREFIX_TERMS_REMOVED) { + // pre-6.2 index, records whether auto-prefix terms are enabled in the header + byte b = termsIn.readByte(); + if (b != 0) { + throw new CorruptIndexException("Index header pretends the index has auto-prefix terms: " + b, termsIn); + } + } + + String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION); + indexIn = EndiannessReverserUtil.openInput(state.directory, indexName, state.context); + CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix); + + if (version < VERSION_META_FILE) { + // Have PostingsReader init itself + postingsReader.init(termsIn, state); + + // Verifying the checksum against all bytes would be too costly, but for now we at least + // verify proper structure of the checksum footer. This is cheap and can detect some forms + // of corruption such as file truncation. + CodecUtil.retrieveChecksum(indexIn); + CodecUtil.retrieveChecksum(termsIn); + } + + // Read per-field details + String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION); + Map fieldMap = null; + Throwable priorE = null; + long indexLength = -1, termsLength = -1; + try ( + ChecksumIndexInput metaIn = version >= VERSION_META_FILE + ? EndiannessReverserUtil.openChecksumInput(state.directory, metaName, state.context) + : null + ) { + try { + final IndexInput indexMetaIn, termsMetaIn; + if (version >= VERSION_META_FILE) { + CodecUtil.checkIndexHeader( + metaIn, + TERMS_META_CODEC_NAME, + version, + version, + state.segmentInfo.getId(), + state.segmentSuffix + ); + indexMetaIn = termsMetaIn = metaIn; + postingsReader.init(metaIn, state); + } else { + seekDir(termsIn); + seekDir(termsIn); + seekDir(indexIn); + indexMetaIn = indexIn; + termsMetaIn = termsIn; + } + + final int numFields = termsMetaIn.readVInt(); + if (numFields < 0) { + throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn); + } + fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1); + for (int i = 0; i < numFields; ++i) { + final int field = termsMetaIn.readVInt(); + final long numTerms = termsMetaIn.readVLong(); + if (numTerms <= 0) { + throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn); + } + final BytesRef rootCode = readBytesRef(termsMetaIn); + final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); + if (fieldInfo == null) { + throw new CorruptIndexException("invalid field number: " + field, termsMetaIn); + } + final long sumTotalTermFreq = termsMetaIn.readVLong(); + // when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is + // written. + final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS + ? 
sumTotalTermFreq + : termsMetaIn.readVLong(); + final int docCount = termsMetaIn.readVInt(); + if (version < VERSION_META_LONGS_REMOVED) { + final int longsSize = termsMetaIn.readVInt(); + if (longsSize < 0) { + throw new CorruptIndexException( + "invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, + termsMetaIn + ); + } + } + BytesRef minTerm = readBytesRef(termsMetaIn); + BytesRef maxTerm = readBytesRef(termsMetaIn); + if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs + throw new CorruptIndexException( + "invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), + termsMetaIn + ); + } + if (sumDocFreq < docCount) { // #postings must be >= #docs with field + throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn); + } + if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings + throw new CorruptIndexException( + "invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, + termsMetaIn + ); + } + final long indexStartFP = indexMetaIn.readVLong(); + FieldReader previous = fieldMap.put( + fieldInfo.name, + new FieldReader( + this, + fieldInfo, + numTerms, + rootCode, + sumTotalTermFreq, + sumDocFreq, + docCount, + indexStartFP, + indexMetaIn, + indexIn, + minTerm, + maxTerm + ) + ); + if (previous != null) { + throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn); + } + } + if (version >= VERSION_META_FILE) { + indexLength = metaIn.readLong(); + termsLength = metaIn.readLong(); + } + } catch (Throwable exception) { + priorE = exception; + } finally { + if (metaIn != null) { + CodecUtil.checkFooter(metaIn, priorE); + } else if (priorE != null) { + IOUtils.rethrowAlways(priorE); + } + } + } + if (version >= VERSION_META_FILE) { + // At this point the checksum of the meta file has been verified so the lengths are likely + // correct + CodecUtil.retrieveChecksum(indexIn, indexLength); + CodecUtil.retrieveChecksum(termsIn, termsLength); + } else { + assert indexLength == -1 : indexLength; + assert termsLength == -1 : termsLength; + } + List fieldList = new ArrayList<>(fieldMap.keySet()); + fieldList.sort(null); + this.fieldMap = fieldMap; + this.fieldList = Collections.unmodifiableList(fieldList); + success = true; + } finally { + if (success == false) { + // this.close() will close in: + IOUtils.closeWhileHandlingException(this); + } + } + } + + private static BytesRef readBytesRef(IndexInput in) throws IOException { + int numBytes = in.readVInt(); + if (numBytes < 0) { + throw new CorruptIndexException("invalid bytes length: " + numBytes, in); + } + + BytesRef bytes = new BytesRef(); + bytes.length = numBytes; + bytes.bytes = new byte[numBytes]; + in.readBytes(bytes.bytes, 0, numBytes); + + return bytes; + } + + /** Seek {@code input} to the directory offset. 
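The offset is stored as a fixed 8-byte long immediately before the codec footer; this is only
used for versions older than {@code VERSION_META_FILE}, where the per-field metadata is
embedded at the end of the .tim/.tip files themselves.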
*/ + private static void seekDir(IndexInput input) throws IOException { + input.seek(input.length() - CodecUtil.footerLength() - 8); + long offset = input.readLong(); + input.seek(offset); + } + + // for debugging + // private static String toHex(int v) { + // return "0x" + Integer.toHexString(v); + // } + + @Override + public void close() throws IOException { + try { + IOUtils.close(indexIn, termsIn, postingsReader); + } finally { + // Clear so refs to terms index is GCable even if + // app hangs onto us: + fieldMap.clear(); + } + } + + @Override + public Iterator iterator() { + return fieldList.iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert field != null; + return fieldMap.get(field); + } + + @Override + public int size() { + return fieldMap.size(); + } + + // for debugging + String brToString(BytesRef b) { + if (b == null) { + return "null"; + } else { + try { + return b.utf8ToString() + " " + b; + } catch (@SuppressWarnings("unused") Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + } + + @Override + public void checkIntegrity() throws IOException { + // terms index + CodecUtil.checksumEntireFile(indexIn); + + // term dictionary + CodecUtil.checksumEntireFile(termsIn); + + // postings + postingsReader.checkIntegrity(); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(fields=" + fieldMap.size() + ",delegate=" + postingsReader + ")"; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnum.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnum.java new file mode 100644 index 0000000000000..60aa82a8255bd --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnum.java @@ -0,0 +1,1170 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.Util; + +import java.io.IOException; +import java.io.PrintStream; + +/** Iterates through terms in this field. */ +final class SegmentTermsEnum extends BaseTermsEnum { + + // Lazy init: + IndexInput in; + + private SegmentTermsEnumFrame[] stack; + private final SegmentTermsEnumFrame staticFrame; + SegmentTermsEnumFrame currentFrame; + boolean termExists; + final FieldReader fr; + + private int targetBeforeCurrentLength; + + // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); + + // What prefix of the current term was present in the index; when we only next() through the + // index, this stays at 0. It's only set when + // we seekCeil/Exact: + private int validIndexPrefix; + + // assert only: + private boolean eof; + + final BytesRefBuilder term = new BytesRefBuilder(); + private final FST.BytesReader fstReader; + + @SuppressWarnings({ "rawtypes", "unchecked" }) + private FST.Arc[] arcs = new FST.Arc[1]; + + SegmentTermsEnum(FieldReader fr) throws IOException { + this.fr = fr; + + // if (DEBUG) { + // System.out.println("BTTR.init seg=" + fr.parent.segment); + // } + stack = new SegmentTermsEnumFrame[0]; + + // Used to hold seek by TermState, or cached seek + staticFrame = new SegmentTermsEnumFrame(this, -1); + + if (fr.index == null) { + fstReader = null; + } else { + fstReader = fr.index.getBytesReader(); + } + + // Init w/ root block; don't use index since it may + // not (and need not) have been loaded + for (int arcIdx = 0; arcIdx < arcs.length; arcIdx++) { + arcs[arcIdx] = new FST.Arc<>(); + } + + currentFrame = staticFrame; + final FST.Arc arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + // currentFrame = pushFrame(arc, rootCode, 0); + // currentFrame.loadBlock(); + validIndexPrefix = 0; + // if (DEBUG) { + // System.out.println("init frame state " + currentFrame.ord); + // printSeekState(); + // } + + // System.out.println(); + // computeBlockStats().print(System.out); + } + + // Not private to avoid synthetic access$NNN methods + void initIndexInput() { + if (this.in == null) { + this.in = fr.parent.termsIn.clone(); + } + } + + /** Runs next() through the entire terms dict, computing aggregate statistics. */ + public Stats computeBlockStats() throws IOException { + + Stats stats = new Stats(fr.parent.segment, fr.fieldInfo.name); + if (fr.index != null) { + stats.indexNumBytes = fr.index.ramBytesUsed(); + } + + currentFrame = staticFrame; + FST.Arc arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! 
+ assert arc.isFinal(); + } else { + arc = null; + } + + // Empty string prefix must have an output in the + // index! + currentFrame = pushFrame(arc, fr.rootCode, 0); + currentFrame.fpOrig = currentFrame.fp; + currentFrame.loadBlock(); + validIndexPrefix = 0; + + stats.startBlock(currentFrame, currentFrame.isLastInFloor == false); + + allTerms: while (true) { + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + stats.endBlock(currentFrame); + if (currentFrame.isLastInFloor == false) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + stats.startBlock(currentFrame, true); + break; + } else { + if (currentFrame.ord == 0) { + break allTerms; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + assert lastFP == currentFrame.lastSubFP; + // if (DEBUG) { + // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + // } + } + } + + while (true) { + if (currentFrame.next()) { + // Push to new block: + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length()); + currentFrame.fpOrig = currentFrame.fp; + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.loadBlock(); + stats.startBlock(currentFrame, currentFrame.isLastInFloor == false); + } else { + stats.term(term.get()); + break; + } + } + } + + stats.finish(); + + // Put root frame back: + currentFrame = staticFrame; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, fr.rootCode, 0); + currentFrame.rewind(); + currentFrame.loadBlock(); + validIndexPrefix = 0; + term.clear(); + + return stats; + } + + private SegmentTermsEnumFrame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final SegmentTermsEnumFrame[] next = new SegmentTermsEnumFrame[ArrayUtil.oversize( + 1 + ord, + RamUsageEstimator.NUM_BYTES_OBJECT_REF + )]; + System.arraycopy(stack, 0, next, 0, stack.length); + for (int stackOrd = stack.length; stackOrd < next.length; stackOrd++) { + next[stackOrd] = new SegmentTermsEnumFrame(this, stackOrd); + } + stack = next; + } + assert stack[ord].ord == ord; + return stack[ord]; + } + + private FST.Arc getArc(int ord) { + if (ord >= arcs.length) { + @SuppressWarnings({ "rawtypes", "unchecked" }) + final FST.Arc[] next = new FST.Arc[ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, next, 0, arcs.length); + for (int arcOrd = arcs.length; arcOrd < next.length; arcOrd++) { + next[arcOrd] = new FST.Arc<>(); + } + arcs = next; + } + return arcs[ord]; + } + + // Pushes a frame we seek'd to + SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { + scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); + final long code = scratchReader.readVLong(); + final long fpSeek = code >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; + final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); + f.hasTerms = (code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; + f.hasTermsOrig = f.hasTerms; + f.isFloor = (code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0; + if (f.isFloor) { + f.setFloorData(scratchReader, frameData); + } + pushFrame(arc, fpSeek, length); + + return f; + } + + // Pushes next'd frame or seek'd frame; we later + // 
lazy-load the frame only when needed + SegmentTermsEnumFrame pushFrame(FST.Arc arc, long fp, int length) throws IOException { + final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); + f.arc = arc; + if (f.fpOrig == fp && f.nextEnt != -1) { + // if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " + // isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + + // f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + + // term.length + " vs prefix=" + f.prefix); + // if (f.prefix > targetBeforeCurrentLength) { + if (f.ord > targetBeforeCurrentLength) { + f.rewind(); + } else { + // if (DEBUG) { + // System.out.println(" skip rewind!"); + // } + } + assert length == f.prefix; + } else { + f.nextEnt = -1; + f.prefix = length; + f.state.termBlockOrd = 0; + f.fpOrig = f.fp = fp; + f.lastSubFP = -1; + // if (DEBUG) { + // final int sav = term.length; + // term.length = length; + // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + + // f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); + // term.length = sav; + // } + } + + return f; + } + + // asserts only + private boolean clearEOF() { + eof = false; + return true; + } + + // asserts only + private boolean setEOF() { + eof = true; + return true; + } + + /* + // for debugging + @SuppressWarnings("unused") + static String brToString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + + // for debugging + @SuppressWarnings("unused") + static String brToString(BytesRefBuilder b) { + return brToString(b.get()); + } + */ + + @Override + public boolean seekExact(BytesRef target) throws IOException { + + if (fr.index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + if (fr.size() > 0 && (target.compareTo(fr.getMin()) < 0 || target.compareTo(fr.getMax()) > 0)) { + return false; + } + + term.grow(1 + target.length); + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + + // fr.fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + // + termExists + ") validIndexPrefix=" + validIndexPrefix); + // printSeekState(System.out); + // } + + FST.Arc arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. 
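// (Sketch of the calling pattern that benefits, with a hypothetical
// TermsEnum "te" over this field:
//   te.seekExact(new BytesRef("foobar"));
//   te.seekExact(new BytesRef("foobaz")); // re-uses frames/arcs for "fooba"
// Seeking terms in sorted order thus avoids re-walking the FST index from
// the root on every call.)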
+ + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output(); + targetUpto = 0; + + SegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length(); + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TODO: reverse vLong byte order for better FST + // prefix output sharing + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + + // " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + // + " output=" + output); + // } + if (cmp != 0) { + break; + } + arc = arcs[1 + targetUpto]; + assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF) + : "arc.label=" + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + if (arc.output() != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); + } + if (arc.isFinal()) { + lastFrame = stack[1 + lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + + // Second compare the rest of the term, but + // don't save arc/output/frame; we only do this + // to find out if the target term is before, + // equal or after the current term + final int targetLimit2 = Math.min(target.length, term.length()); + while (targetUpto < targetLimit2) { + cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); + // if (DEBUG) { + // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + + // targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + + // targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); + // } + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = term.length() - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); + // frame.ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = lastFrame.ord; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); + // rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length() == target.length; + if (termExists) { + // if (DEBUG) { + // System.out.println(" target is same as current; return true"); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + // validIndexPrefix = currentFrame.depth; + // term.length = target.length; + // return termExists; + } + + } else { + + 
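// No prior seek state (currentFrame == staticFrame): start from the FST
// root arc and push the root block frame.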
targetBeforeCurrentLength = -1; + arc = fr.index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output() != null; + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + output = arc.output(); + + currentFrame = staticFrame; + + // term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " + // currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + + // targetBeforeCurrentLength); + // } + + // We are done sharing the common prefix with the incoming target and where we are currently + // seek'd; now continue walking the index: + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + + // toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefix; + // validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + if (currentFrame.hasTerms == false) { + termExists = false; + term.setByteAt(targetUpto, (byte) targetLabel); + term.setLength(1 + targetUpto); + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got " + result + "; return NOT_FOUND term=" + + // brToString(term)); + // } + return false; + } + } else { + // Follow this arc + arc = nextArc; + term.setByteAt(targetUpto, (byte) targetLabel); + // Aggregate output as we go: + assert arc.output() != null; + if (arc.output() != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); + } + + // if (DEBUG) { + // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + + // targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + // } + targetUpto++; + + if (arc.isFinal()) { + // if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), targetUpto); + // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + + // currentFrame.hasTerms); + } + } + } + + // validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + // Target term is entirely contained in the index: + if (currentFrame.hasTerms == false) { + termExists = false; + term.setLength(targetUpto); + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + brToString(term)); + // } + return false; + } + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + 
term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got result " + result + "; return NOT_FOUND term=" + + // term.utf8ToString()); + // } + + return false; + } + } + + @Override + public SeekStatus seekCeil(BytesRef target) throws IOException { + + if (fr.index == null) { + throw new IllegalStateException("terms index was not loaded"); + } + + term.grow(1 + target.length); + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + + // fr.fieldInfo.name + ":" + brToString(target) + " " + target + " current=" + brToString(term) + // + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix); + // printSeekState(System.out); + // } + + FST.Arc arc; + int targetUpto; + BytesRef output; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. + + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + arc = arcs[0]; + assert arc.isFinal(); + output = arc.output(); + targetUpto = 0; + + SegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length(); + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // TODO: we should write our vLong backwards (MSB + // first) to get better sharing from the FST + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + + // ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " + // vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")" + " arc.output=" + arc.output + + // " output=" + output); + // } + if (cmp != 0) { + break; + } + arc = arcs[1 + targetUpto]; + assert arc.label() == (target.bytes[target.offset + targetUpto] & 0xFF) + : "arc.label=" + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + // TODO: we could save the outputs in local + // byte[][] instead of making new objs ever + // seek; but, often the FST doesn't have any + // shared bytes (but this could change if we + // reverse vLong byte order) + if (arc.output() != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); + } + if (arc.isFinal()) { + lastFrame = stack[1 + lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + final int targetUptoMid = targetUpto; + // Second compare the rest of the term, but + // don't save arc/output/frame: + final int targetLimit2 = Math.min(target.length, term.length()); + while (targetUpto < targetLimit2) { + cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); + // if (DEBUG) { + // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + // + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")"); + // } + if (cmp != 0) { + break; + } + targetUpto++; + } + + if (cmp == 0) { + cmp = 
term.length() - target.length; + } + targetUpto = targetUptoMid; + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); + // clear frame.scanned ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); + // rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length() == target.length; + if (termExists) { + // if (DEBUG) { + // System.out.println(" target is same as current; return FOUND"); + // } + return SeekStatus.FOUND; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + } + + } else { + + targetBeforeCurrentLength = -1; + arc = fr.index.getFirstArc(arcs[0]); + + // Empty string prefix must have an output (block) in the index! + assert arc.isFinal(); + assert arc.output() != null; + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + output = arc.output(); + + currentFrame = staticFrame; + + // term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " + // currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + + // targetBeforeCurrentLength); + // } + + // We are done sharing the common prefix with the incoming target and where we are currently + // seek'd; now continue walking the index: + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final FST.Arc nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader); + + if (nextArc == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + + // targetLabel); + // } + + validIndexPrefix = currentFrame.prefix; + // validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + // if (DEBUG) System.out.println(" now scanToTerm"); + final SeekStatus result = currentFrame.scanToTerm(target, false); + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + + if (next() != null) { + // if (DEBUG) { + // System.out.println(" return NOT_FOUND term=" + brToString(term)); + // } + return SeekStatus.NOT_FOUND; + } else { + // if (DEBUG) { + // System.out.println(" return END"); + // } + return SeekStatus.END; + } + } else { + // if (DEBUG) { + // System.out.println(" return " + result + " term=" + brToString(term)); + // } + return result; + } + } else { + // Follow this arc + term.setByteAt(targetUpto, (byte) targetLabel); + arc = nextArc; + // Aggregate output as we go: + assert arc.output() != null; + if (arc.output() != Lucene40BlockTreeTermsReader.NO_OUTPUT) { + output = Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); + } + + // if 
(DEBUG) { + // System.out.println(" index: follow label=" + (target.bytes[target.offset + + // targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput); + // } + targetUpto++; + + if (arc.isFinal()) { + // if (DEBUG) System.out.println(" arc is final!"); + currentFrame = pushFrame(arc, Lucene40BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), targetUpto); + // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + + // currentFrame.hasTerms); + } + } + } + + // validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefix; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + if (next() != null) { + // if (DEBUG) { + // System.out.println(" return NOT_FOUND term=" + term.get().utf8ToString() + " " + term); + // } + return SeekStatus.NOT_FOUND; + } else { + // if (DEBUG) { + // System.out.println(" return END"); + // } + return SeekStatus.END; + } + } else { + return result; + } + } + + @SuppressWarnings("unused") + private void printSeekState(PrintStream out) throws IOException { + if (currentFrame == staticFrame) { + out.println(" no prior seek"); + } else { + out.println(" prior seek state:"); + int ord = 0; + boolean isSeekFrame = true; + while (true) { + SegmentTermsEnumFrame f = getFrame(ord); + assert f != null; + final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix); + if (f.nextEnt == -1) { + out.println( + " frame " + + (isSeekFrame ? "(seek)" : "(next)") + + " ord=" + + ord + + " fp=" + + f.fp + + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + + " prefixLen=" + + f.prefix + + " prefix=" + + prefix + + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + + " hasTerms=" + + f.hasTerms + + " isFloor=" + + f.isFloor + + " code=" + + ((f.fp << Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms + ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS + : 0) + (f.isFloor ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + + " isLastInFloor=" + + f.isLastInFloor + + " mdUpto=" + + f.metaDataUpto + + " tbOrd=" + + f.getTermBlockOrd() + ); + } else { + out.println( + " frame " + + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + + " ord=" + + ord + + " fp=" + + f.fp + + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + + " prefixLen=" + + f.prefix + + " prefix=" + + prefix + + " nextEnt=" + + f.nextEnt + + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + + " hasTerms=" + + f.hasTerms + + " isFloor=" + + f.isFloor + + " code=" + + ((f.fp << Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms + ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS + : 0) + (f.isFloor ? 
Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + + " lastSubFP=" + + f.lastSubFP + + " isLastInFloor=" + + f.isLastInFloor + + " mdUpto=" + + f.metaDataUpto + + " tbOrd=" + + f.getTermBlockOrd() + ); + } + if (fr.index != null) { + assert isSeekFrame == false || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; + if (f.prefix > 0 && isSeekFrame && f.arc.label() != (term.byteAt(f.prefix - 1) & 0xFF)) { + out.println( + " broken seek state: arc.label=" + + (char) f.arc.label() + + " vs term byte=" + + (char) (term.byteAt(f.prefix - 1) & 0xFF) + ); + throw new RuntimeException("seek state is broken"); + } + BytesRef output = Util.get(fr.index, prefix); + if (output == null) { + out.println(" broken seek state: prefix is not final in index"); + throw new RuntimeException("seek state is broken"); + } else if (isSeekFrame && f.isFloor == false) { + final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); + final long codeOrig = reader.readVLong(); + final long code = (f.fp << Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms + ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS + : 0) | (f.isFloor ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0); + if (codeOrig != code) { + out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code); + throw new RuntimeException("seek state is broken"); + } + } + } + if (f == currentFrame) { + break; + } + if (f.prefix == validIndexPrefix) { + isSeekFrame = false; + } + ord++; + } + } + } + + /* Decodes only the term bytes of the next term. If caller then asks for + metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) + decode all metadata up to the current term. */ + @Override + public BytesRef next() throws IOException { + if (in == null) { + // Fresh TermsEnum; seek to first term: + final FST.Arc arc; + if (fr.index != null) { + arc = fr.index.getFirstArc(arcs[0]); + // Empty string prefix must have an output in the index! + assert arc.isFinal(); + } else { + arc = null; + } + currentFrame = pushFrame(arc, fr.rootCode, 0); + currentFrame.loadBlock(); + } + + targetBeforeCurrentLength = currentFrame.ord; + + assert eof == false; + // if (DEBUG) { + // System.out.println("\nBTTR.next seg=" + fr.parent.segment + " term=" + brToString(term) + " + // termExists?=" + termExists + " field=" + fr.fieldInfo.name + " termBlockOrd=" + + // currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix); + // printSeekState(System.out); + // } + + if (currentFrame == staticFrame) { + // If seek was previously called and the term was + // cached, or seek(TermState) was called, usually + // caller is just going to pull a D/&PEnum or get + // docFreq, etc. 
But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + // if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + + // term); + final boolean result = seekExact(term.get()); + assert result; + } + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + if (currentFrame.isLastInFloor == false) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + break; + } else { + // if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) { + // if (DEBUG) System.out.println(" return null"); + assert setEOF(); + term.clear(); + validIndexPrefix = 0; + currentFrame.rewind(); + termExists = false; + return null; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + + if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) { + // We popped into a frame that's not loaded + // yet or not scan'd to the right entry + currentFrame.scanToFloorFrame(term.get()); + currentFrame.loadBlock(); + currentFrame.scanToSubBlock(lastFP); + } + + // Note that the seek state (last seek) has been + // invalidated beyond this depth + validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); + // if (DEBUG) { + // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + // } + } + } + + while (true) { + if (currentFrame.next()) { + // Push to new block: + // if (DEBUG) System.out.println(" push frame"); + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length()); + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.loadBlock(); + } else { + // if (DEBUG) System.out.println(" return term=" + brToString(term) + " currentFrame.ord=" + // + currentFrame.ord); + return term.get(); + } + } + } + + @Override + public BytesRef term() { + assert eof == false; + return term.get(); + } + + @Override + public int docFreq() throws IOException { + assert eof == false; + // if (DEBUG) System.out.println("BTR.docFreq"); + currentFrame.decodeMetaData(); + // if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq); + return currentFrame.state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + assert eof == false; + currentFrame.decodeMetaData(); + return currentFrame.state.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + assert eof == false; + // if (DEBUG) { + // System.out.println("BTTR.docs seg=" + segment); + // } + currentFrame.decodeMetaData(); + // if (DEBUG) { + // System.out.println(" state=" + currentFrame.state); + // } + return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.state, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + assert eof == false; + // if (DEBUG) { + // System.out.println("BTTR.docs seg=" + segment); + // } + currentFrame.decodeMetaData(); + // if (DEBUG) { + // System.out.println(" state=" + currentFrame.state); + // } + return fr.parent.postingsReader.impacts(fr.fieldInfo, currentFrame.state, flags); + } + + @Override + public void seekExact(BytesRef target, TermState otherState) { + // if (DEBUG) { + // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + + // target.utf8ToString() + " " + target + " state=" + otherState); + // } + assert clearEOF(); + if (target.compareTo(term.get()) 
!= 0 || termExists == false) { + assert otherState != null && otherState instanceof BlockTermState; + currentFrame = staticFrame; + currentFrame.state.copyFrom(otherState); + term.copyBytes(target); + currentFrame.metaDataUpto = currentFrame.getTermBlockOrd(); + assert currentFrame.metaDataUpto > 0; + validIndexPrefix = 0; + } else { + // if (DEBUG) { + // System.out.println(" skip seek: already on target state=" + currentFrame.state); + // } + } + } + + @Override + public TermState termState() throws IOException { + assert eof == false; + currentFrame.decodeMetaData(); + TermState ts = currentFrame.state.clone(); + // if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts); + return ts; + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnumFrame.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnumFrame.java new file mode 100644 index 0000000000000..82060c9cc5db3 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/SegmentTermsEnumFrame.java @@ -0,0 +1,765 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST; + +import java.io.IOException; +import java.util.Arrays; + +final class SegmentTermsEnumFrame { + // Our index in stack[]: + final int ord; + + boolean hasTerms; + boolean hasTermsOrig; + boolean isFloor; + + FST.Arc arc; + + // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // File pointer where this block was loaded from + long fp; + long fpOrig; + long fpEnd; + long totalSuffixBytes; // for stats + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] suffixLengthBytes; + final ByteArrayDataInput suffixLengthsReader; + + byte[] statBytes = new byte[64]; + int statsSingletonRunLength = 0; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + byte[] floorData = new byte[32]; + final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read, or -1 if the block + // isn't loaded yet + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + long lastSubFP; + + int nextFloorLabel; + int numFollowFloorBlocks; + + // Next term to decode metaData; we decode metaData + // lazily so that scanning to find the matching term is + // fast and only if you find a match and app wants the + // stats or docs/positions enums, will we decode the + // metaData + int metaDataUpto; + + final BlockTermState state; + + // metadata buffer + byte[] bytes = new byte[32]; + final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + private final SegmentTermsEnum ste; + private final int version; + + SegmentTermsEnumFrame(SegmentTermsEnum ste, int ord) throws IOException { + this.ste = ste; + this.ord = ord; + this.state = ste.fr.parent.postingsReader.newTermState(); + this.state.totalTermFreq = -1; + this.version = ste.fr.parent.version; + if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { + suffixLengthBytes = new byte[32]; + suffixLengthsReader = new ByteArrayDataInput(); + } else { + suffixLengthBytes = null; + suffixLengthsReader = suffixesReader; + } + } + + public void setFloorData(ByteArrayDataInput in, BytesRef source) { + final int numBytes = source.length - (in.getPosition() - source.offset); + if (numBytes > floorData.length) { + floorData = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes); + floorDataReader.reset(floorData, 0, numBytes); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + // if (DEBUG) { + // System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new + // BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + // + numFollowFloorBlocks + " nextFloorLabel=" + 
toHex(nextFloorLabel)); + // } + } + + public int getTermBlockOrd() { + return isLeafBlock ? nextEnt : state.termBlockOrd; + } + + void loadNextFloorBlock() throws IOException { + // if (DEBUG) { + // System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd); + // } + assert arc == null || isFloor : "arc=" + arc + " isFloor=" + isFloor; + fp = fpEnd; + nextEnt = -1; + loadBlock(); + } + + /* Does initial decode of next block of terms; this + doesn't actually decode the docFreq, totalTermFreq, + postings details (frq/prx offset, etc.) metadata; + it just loads them as byte[] blobs which are then + decoded on-demand if the metadata is ever requested + for any term in this block. This enables terms-only + intensive consumes (eg certain MTQs, respelling) to + not pay the price of decoding metadata they won't + use. */ + void loadBlock() throws IOException { + + // Clone the IndexInput lazily, so that consumers + // that just pull a TermsEnum to + // seekExact(TermState) don't pay this cost: + ste.initIndexInput(); + + if (nextEnt != -1) { + // Already loaded + return; + } + // System.out.println("blc=" + blockLoadCount); + + ste.in.seek(fp); + int code = ste.in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + assert arc == null || (isLastInFloor || isFloor) + : "fp=" + fp + " arc=" + arc + " isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor; + + // TODO: if suffixes were stored in random-access + // array structure, then we could do binary search + // instead of linear scan to find target term; eg + // we could have simple array of offsets + + final long startSuffixFP = ste.in.getFilePointer(); + // term suffixes: + if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { + final long codeL = ste.in.readVLong(); + isLeafBlock = (codeL & 0x04) != 0; + final int numSuffixBytes = (int) (codeL >>> 3); + if (suffixBytes.length < numSuffixBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numSuffixBytes, 1)]; + } + try { + compressionAlg = CompressionAlgorithm.byCode((int) codeL & 0x03); + } catch (IllegalArgumentException e) { + throw new CorruptIndexException(e.getMessage(), ste.in, e); + } + compressionAlg.read(ste.in, suffixBytes, numSuffixBytes); + suffixesReader.reset(suffixBytes, 0, numSuffixBytes); + + int numSuffixLengthBytes = ste.in.readVInt(); + final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0; + numSuffixLengthBytes >>>= 1; + if (suffixLengthBytes.length < numSuffixLengthBytes) { + suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)]; + } + if (allEqual) { + Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ste.in.readByte()); + } else { + ste.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes); + } + suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes); + } else { + code = ste.in.readVInt(); + isLeafBlock = (code & 1) != 0; + int numBytes = code >>> 1; + if (suffixBytes.length < numBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(suffixBytes, 0, numBytes); + suffixesReader.reset(suffixBytes, 0, numBytes); + } + totalSuffixBytes = ste.in.getFilePointer() - startSuffixFP; + + // stats + int numBytes = ste.in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + statsSingletonRunLength = 0; + metaDataUpto = 0; + + state.termBlockOrd = 0; + nextEnt = 0; + 
lastSubFP = -1; + + // TODO: we could skip this if !hasTerms; but + // that's rare so won't help much + // metadata + numBytes = ste.in.readVInt(); + if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = ste.in.getFilePointer(); + // if (DEBUG) { + // System.out.println(" fpEnd=" + fpEnd); + // } + } + + void rewind() { + + // Force reload: + fp = fpOrig; + nextEnt = -1; + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + assert numFollowFloorBlocks > 0; + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + } + + // Decodes next entry; returns true if it's a sub-block + public boolean next() throws IOException { + if (isLeafBlock) { + nextLeaf(); + return false; + } else { + return nextNonLeaf(); + } + } + + public void nextLeaf() { + // if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " + // entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixLengthsReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + ste.term.setLength(prefix + suffix); + ste.term.grow(ste.term.length()); + suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + ste.termExists = true; + } + + public boolean nextNonLeaf() throws IOException { + // if (DEBUG) System.out.println(" stef.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + // + entCount + " fp=" + suffixesReader.getPosition()); + while (true) { + if (nextEnt == entCount) { + assert arc == null || (isFloor && isLastInFloor == false) : "isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor; + loadNextFloorBlock(); + if (isLeafBlock) { + nextLeaf(); + return false; + } else { + continue; + } + } + + assert nextEnt != -1 && nextEnt < entCount : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixLengthsReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + ste.term.setLength(prefix + suffix); + ste.term.grow(ste.term.length()); + suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + if ((code & 1) == 0) { + // A normal term + ste.termExists = true; + subCode = 0; + state.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + ste.termExists = false; + subCode = suffixLengthsReader.readVLong(); + lastSubFP = fp - subCode; + // if (DEBUG) { + // System.out.println(" lastSubFP=" + lastSubFP); + // } + return true; + } + } + } + + // TODO: make this array'd so we can do bin search? + // likely not worth it? 
need to measure how many + // floor blocks we "typically" get + public void scanToFloorFrame(BytesRef target) { + + if (isFloor == false || target.length <= prefix) { + // if (DEBUG) { + // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + + // target.length + " vs prefix=" + prefix); + // } + return; + } + + final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; + + // if (DEBUG) { + // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + + // toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + // + numFollowFloorBlocks); + // } + + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" already on correct block"); + // } + return; + } + + assert numFollowFloorBlocks != 0; + + long newFP = fpOrig; + while (true) { + final long code = floorDataReader.readVLong(); + newFP = fpOrig + (code >>> 1); + hasTerms = (code & 1) != 0; + // if (DEBUG) { + // System.out.println(" label=" + toHex(nextFloorLabel) + " fp=" + newFP + " + // hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks); + // } + + isLastInFloor = numFollowFloorBlocks == 1; + numFollowFloorBlocks--; + + if (isLastInFloor) { + nextFloorLabel = 256; + // if (DEBUG) { + // System.out.println(" stop! last block nextFloorLabel=" + + // toHex(nextFloorLabel)); + // } + break; + } else { + nextFloorLabel = floorDataReader.readByte() & 0xff; + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" stop! nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } + } + } + + if (newFP != fp) { + // Force re-load of the block: + // if (DEBUG) { + // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp); + // } + nextEnt = -1; + fp = newFP; + } else { + // if (DEBUG) { + // System.out.println(" stay on same fp=" + newFP); + // } + } + } + + public void decodeMetaData() throws IOException { + + // if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + + // metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd); + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; + assert limit > 0; + + // TODO: better API would be "jump straight to term=N"??? 
+ while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + + if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) { + if (statsSingletonRunLength > 0) { + state.docFreq = 1; + state.totalTermFreq = 1; + statsSingletonRunLength--; + } else { + int token = statsReader.readVInt(); + if ((token & 1) == 1) { + state.docFreq = 1; + state.totalTermFreq = 1; + statsSingletonRunLength = token >>> 1; + } else { + state.docFreq = token >>> 1; + if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { + state.totalTermFreq = state.docFreq; + } else { + state.totalTermFreq = state.docFreq + statsReader.readVLong(); + } + } + } + } else { + assert statsSingletonRunLength == 0; + state.docFreq = statsReader.readVInt(); + // if (DEBUG) System.out.println(" dF=" + state.docFreq); + if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { + state.totalTermFreq = state.docFreq; // all postings have freq=1 + } else { + state.totalTermFreq = state.docFreq + statsReader.readVLong(); + // if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); + } + } + + // metadata + ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); + + metaDataUpto++; + absolute = false; + } + state.termBlockOrd = metaDataUpto; + } + + // Used only by assert + private boolean prefixMatches(BytesRef target) { + for (int bytePos = 0; bytePos < prefix; bytePos++) { + if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) { + return false; + } + } + + return true; + } + + // Scans to sub-block that has this target fp; only + // called by next(); NOTE: does not set + // startBytePos/suffix as a side effect + public void scanToSubBlock(long subFP) { + assert isLeafBlock == false; + // if (DEBUG) System.out.println(" scanToSubBlock fp=" + fp + " subFP=" + subFP + " entCount=" + // + entCount + " lastSubFP=" + lastSubFP); + // assert nextEnt == 0; + if (lastSubFP == subFP) { + // if (DEBUG) System.out.println(" already positioned"); + return; + } + assert subFP < fp : "fp=" + fp + " subFP=" + subFP; + final long targetSubCode = fp - subFP; + // if (DEBUG) System.out.println(" targetSubCode=" + targetSubCode); + while (true) { + assert nextEnt < entCount; + nextEnt++; + final int code = suffixLengthsReader.readVInt(); + suffixesReader.skipBytes(code >>> 1); + if ((code & 1) != 0) { + final long subCode = suffixLengthsReader.readVLong(); + if (targetSubCode == subCode) { + // if (DEBUG) System.out.println(" match!"); + lastSubFP = subFP; + return; + } + } else { + state.termBlockOrd++; + } + } + } + + // NOTE: sets startBytePos/suffix as a side effect + public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException { + return isLeafBlock ? 
scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly); + } + + private int startBytePos; + private int suffix; + private long subCode; + CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION; + + // for debugging + /* + @SuppressWarnings("unused") + static String brToString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + */ + + // Target's prefix matches this block's prefix; we + // scan the entries check if the suffix matches. + public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException { + + // if (DEBUG) System.out.println(" scanToTermLeaf: block fp=" + fp + " prefix=" + prefix + " + // nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + + // brToString(term)); + + assert nextEnt != -1; + + ste.termExists = true; + subCode = 0; + + if (nextEnt == entCount) { + if (exactOnly) { + fillTerm(); + } + return SeekStatus.END; + } + + assert prefixMatches(target); + + // TODO: binary search when all terms have the same length, which is common for ID fields, + // which are also the most sensitive to lookup performance? + // Loop over each entry (term or sub-block) in this block: + do { + nextEnt++; + + suffix = suffixLengthsReader.readVInt(); + + // if (DEBUG) { + // BytesRef suffixBytesRef = new BytesRef(); + // suffixBytesRef.bytes = suffixBytes; + // suffixBytesRef.offset = suffixesReader.getPosition(); + // suffixBytesRef.length = suffix; + // System.out.println(" cycle: term " + (nextEnt-1) + " (of " + entCount + ") suffix=" + // + brToString(suffixBytesRef)); + // } + + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + + // Loop over bytes in the suffix, comparing to the target + final int cmp = Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffix, + target.bytes, + target.offset + prefix, + target.offset + target.length + ); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); + + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! + + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: + + assert ste.termExists; + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; + } + } while (nextEnt < entCount); + + // It is possible (and OK) that terms index pointed us + // at this block, but, we scanned the entire block and + // did not find the term to position to. This happens + // when the target is after the last term in the block + // (but, before the next term in the index). EG + // target could be foozzz, and terms index pointed us + // to the foo* block, but the last term in this block + // was fooz (and, eg, first term in the next block will + // bee fop). + // if (DEBUG) System.out.println(" block end"); + if (exactOnly) { + fillTerm(); + } + + // TODO: not consistent that in the + // not-exact case we don't next() into the next + // frame here + return SeekStatus.END; + } + + // Target's prefix matches this block's prefix; we + // scan the entries check if the suffix matches. 
+ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException { + + // if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + + // " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + + // brToString(target)); + + assert nextEnt != -1; + + if (nextEnt == entCount) { + if (exactOnly) { + fillTerm(); + ste.termExists = subCode == 0; + } + return SeekStatus.END; + } + + assert prefixMatches(target); + + // Loop over each entry (term or sub-block) in this block: + while (nextEnt < entCount) { + + nextEnt++; + + final int code = suffixLengthsReader.readVInt(); + suffix = code >>> 1; + + // if (DEBUG) { + // BytesRef suffixBytesRef = new BytesRef(); + // suffixBytesRef.bytes = suffixBytes; + // suffixBytesRef.offset = suffixesReader.getPosition(); + // suffixBytesRef.length = suffix; + // System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + + // (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef)); + // } + + final int termLen = prefix + suffix; + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + ste.termExists = (code & 1) == 0; + if (ste.termExists) { + state.termBlockOrd++; + subCode = 0; + } else { + subCode = suffixLengthsReader.readVLong(); + lastSubFP = fp - subCode; + } + + final int cmp = Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffix, + target.bytes, + target.offset + prefix, + target.offset + target.length + ); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); + + // if (DEBUG) System.out.println(" maybe done exactOnly=" + exactOnly + " + // ste.termExists=" + ste.termExists); + + if (exactOnly == false && ste.termExists == false) { + // System.out.println(" now pushFrame"); + // TODO this + // We are on a sub-block, and caller wants + // us to position to the next term after + // the target, so we must recurse into the + // sub-frame(s): + ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); + ste.currentFrame.loadBlock(); + while (ste.currentFrame.next()) { + ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); + ste.currentFrame.loadBlock(); + } + } + + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! + + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: + + assert ste.termExists; + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; + } + } + + // It is possible (and OK) that terms index pointed us + // at this block, but, we scanned the entire block and + // did not find the term to position to. This happens + // when the target is after the last term in the block + // (but, before the next term in the index). EG + // target could be foozzz, and terms index pointed us + // to the foo* block, but the last term in this block + // was fooz (and, eg, first term in the next block will + // bee fop). 
+ // if (DEBUG) System.out.println(" block end"); + if (exactOnly) { + fillTerm(); + } + + // TODO: not consistent that in the + // not-exact case we don't next() into the next + // frame here + return SeekStatus.END; + } + + private void fillTerm() { + final int termLength = prefix + suffix; + ste.term.setLength(termLength); + ste.term.grow(termLength); + System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java new file mode 100644 index 0000000000000..90ee6d1115a57 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java @@ -0,0 +1,277 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.core.internal.io.IOUtils; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.util.Locale; + +/** + * BlockTree statistics for a single field returned by {@link FieldReader#getStats()}. + * + * @lucene.internal + */ +public class Stats { + /** Byte size of the index. */ + public long indexNumBytes; + + /** Total number of terms in the field. */ + public long totalTermCount; + + /** Total number of bytes (sum of term lengths) across all terms in the field. */ + public long totalTermBytes; + + /** The number of normal (non-floor) blocks in the terms file. */ + public int nonFloorBlockCount; + + /** + * The number of floor blocks (meta-blocks larger than the allowed {@code maxItemsPerBlock}) in + * the terms file. + */ + public int floorBlockCount; + + /** The number of sub-blocks within the floor blocks. */ + public int floorSubBlockCount; + + /** The number of "internal" blocks (that have both terms and sub-blocks). */ + public int mixedBlockCount; + + /** The number of "leaf" blocks (blocks that have only terms). */ + public int termsOnlyBlockCount; + + /** The number of "internal" blocks that do not contain terms (have only sub-blocks). */ + public int subBlocksOnlyBlockCount; + + /** Total number of blocks. */ + public int totalBlockCount; + + /** Number of blocks at each prefix depth. 
*/ + public int[] blockCountByPrefixLen = new int[10]; + + private int startBlockCount; + private int endBlockCount; + + /** Total number of bytes used to store term suffixes. */ + public long totalBlockSuffixBytes; + + /** + * Number of times each compression method has been used. 0 = uncompressed 1 = lowercase_ascii 2 = + * LZ4 + */ + public final long[] compressionAlgorithms = new long[3]; + + /** Total number of suffix bytes before compression. */ + public long totalUncompressedBlockSuffixBytes; + + /** + * Total number of bytes used to store term stats (not including what the {@link + * PostingsReaderBase} stores. + */ + public long totalBlockStatsBytes; + + /** + * Total bytes stored by the {@link PostingsReaderBase}, plus the other few vInts stored in the + * frame. + */ + public long totalBlockOtherBytes; + + /** Segment name. */ + public final String segment; + + /** Field name. */ + public final String field; + + Stats(String segment, String field) { + this.segment = segment; + this.field = field; + } + + void startBlock(SegmentTermsEnumFrame frame, boolean isFloor) { + totalBlockCount++; + if (isFloor) { + if (frame.fp == frame.fpOrig) { + floorBlockCount++; + } + floorSubBlockCount++; + } else { + nonFloorBlockCount++; + } + + if (blockCountByPrefixLen.length <= frame.prefix) { + blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1 + frame.prefix); + } + blockCountByPrefixLen[frame.prefix]++; + startBlockCount++; + totalBlockSuffixBytes += frame.totalSuffixBytes; + totalUncompressedBlockSuffixBytes += frame.suffixesReader.length(); + if (frame.suffixesReader != frame.suffixLengthsReader) { + totalUncompressedBlockSuffixBytes += frame.suffixLengthsReader.length(); + } + totalBlockStatsBytes += frame.statsReader.length(); + compressionAlgorithms[frame.compressionAlg.code]++; + } + + void endBlock(SegmentTermsEnumFrame frame) { + final int termCount = frame.isLeafBlock ? 
frame.entCount : frame.state.termBlockOrd; + final int subBlockCount = frame.entCount - termCount; + totalTermCount += termCount; + if (termCount != 0 && subBlockCount != 0) { + mixedBlockCount++; + } else if (termCount != 0) { + termsOnlyBlockCount++; + } else if (subBlockCount != 0) { + subBlocksOnlyBlockCount++; + } else { + throw new IllegalStateException(); + } + endBlockCount++; + final long otherBytes = frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.statsReader.length(); + assert otherBytes > 0 : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd; + totalBlockOtherBytes += otherBytes; + } + + void term(BytesRef term) { + totalTermBytes += term.length; + } + + void finish() { + assert startBlockCount == endBlockCount : "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount; + assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount + : "floorSubBlockCount=" + + floorSubBlockCount + + " nonFloorBlockCount=" + + nonFloorBlockCount + + " totalBlockCount=" + + totalBlockCount; + assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount + : "totalBlockCount=" + + totalBlockCount + + " mixedBlockCount=" + + mixedBlockCount + + " subBlocksOnlyBlockCount=" + + subBlocksOnlyBlockCount + + " termsOnlyBlockCount=" + + termsOnlyBlockCount; + } + + @Override + public String toString() { + final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); + PrintStream out; + try { + out = new PrintStream(bos, false, IOUtils.UTF_8); + } catch (UnsupportedEncodingException bogus) { + throw new RuntimeException(bogus); + } + + out.println(" index FST:"); + out.println(" " + indexNumBytes + " bytes"); + out.println(" terms:"); + out.println(" " + totalTermCount + " terms"); + out.println( + " " + + totalTermBytes + + " bytes" + + (totalTermCount != 0 + ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes) / totalTermCount) + " bytes/term)" + : "") + ); + out.println(" blocks:"); + out.println(" " + totalBlockCount + " blocks"); + out.println(" " + termsOnlyBlockCount + " terms-only blocks"); + out.println(" " + subBlocksOnlyBlockCount + " sub-block-only blocks"); + out.println(" " + mixedBlockCount + " mixed blocks"); + out.println(" " + floorBlockCount + " floor blocks"); + out.println(" " + (totalBlockCount - floorSubBlockCount) + " non-floor blocks"); + out.println(" " + floorSubBlockCount + " floor sub-blocks"); + out.println( + " " + + totalUncompressedBlockSuffixBytes + + " term suffix bytes before compression" + + (totalBlockCount != 0 + ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes) / totalBlockCount) + " suffix-bytes/block)" + : "") + ); + StringBuilder compressionCounts = new StringBuilder(); + for (int code = 0; code < compressionAlgorithms.length; ++code) { + if (compressionAlgorithms[code] == 0) { + continue; + } + if (compressionCounts.length() > 0) { + compressionCounts.append(", "); + } + compressionCounts.append(CompressionAlgorithm.byCode(code)); + compressionCounts.append(": "); + compressionCounts.append(compressionAlgorithms[code]); + } + out.println( + " " + + totalBlockSuffixBytes + + " compressed term suffix bytes" + + (totalBlockCount != 0 + ? 
" (" + + String.format(Locale.ROOT, "%.2f", ((double) totalBlockSuffixBytes) / totalUncompressedBlockSuffixBytes) + + " compression ratio - compression count by algorithm: " + + compressionCounts + : "") + + ")" + ); + out.println( + " " + + totalBlockStatsBytes + + " term stats bytes " + + (totalBlockCount != 0 + ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes) / totalBlockCount) + " stats-bytes/block)" + : "") + ); + out.println( + " " + + totalBlockOtherBytes + + " other bytes" + + (totalBlockCount != 0 + ? " (" + String.format(Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes) / totalBlockCount) + " other-bytes/block)" + : "") + ); + if (totalBlockCount != 0) { + out.println(" by prefix length:"); + int total = 0; + for (int prefix = 0; prefix < blockCountByPrefixLen.length; prefix++) { + final int blockCount = blockCountByPrefixLen[prefix]; + total += blockCount; + if (blockCount != 0) { + out.println(" " + String.format(Locale.ROOT, "%2d", prefix) + ": " + blockCount); + } + } + assert totalBlockCount == total; + } + + try { + return bos.toString(IOUtils.UTF_8); + } catch (UnsupportedEncodingException bogus) { + throw new RuntimeException(bogus); + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java new file mode 100644 index 0000000000000..fd04a28ce23fb --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java @@ -0,0 +1,477 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.MultiLevelSkipListWriter; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree.Lucene40BlockTreeTermsReader; + +import java.io.IOException; + +/** + * Lucene 5.0 postings format, which encodes postings in packed integer blocks for fast decode. + * + *

+ * Basic idea:
+ *
+ *   • Packed Blocks and VInt Blocks:
+ *     In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed
+ *     format}): the block size (i.e. the number of integers inside a block) is fixed, currently 128.
+ *     Additionally, blocks whose values are all equal are encoded in an optimized way.
+ *     In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block size is variable.
+ *
+ *   • Block structure:
+ *     When the postings are long enough, Lucene50PostingsFormat will try to encode most integer data
+ *     as a packed block. Take a term with 259 documents as an example: the first 256 document ids are
+ *     encoded as two packed blocks, while the remaining 3 are encoded as one VInt block.
+ *     Different kinds of data are always encoded separately into different packed blocks, but may
+ *     possibly be interleaved into the same VInt block. This strategy is applied to the pairs
+ *     <document number, frequency>, <position, payload length>, <position, offset start, offset length>,
+ *     and <position, payload length, offset start, offset length>.
+ *
+ *   • Skipdata settings:
+ *     The structure of the skip table is quite similar to the previous version of Lucene. The skip
+ *     interval is the same as the block size, and each skip entry points to the beginning of a block.
+ *     However, for the first block, skip data is omitted.
+ *
+ *   • Positions, Payloads, and Offsets:
+ *     A position is an integer indicating where the term occurs within one document. A payload is a
+ *     blob of metadata associated with the current position. An offset is a pair of integers indicating
+ *     the tokenized start/end offsets for a given term in the current position: it is essentially a
+ *     specialized payload. When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets
+ *     (assuming a null payload contributes one count). As mentioned in the block structure, it is
+ *     possible to encode these three either combined or separately.
+ *     In all cases, payloads and offsets are stored together. When encoded as a packed block, position
+ *     data is separated out as .pos, while payloads and offsets are encoded in .pay (payload metadata
+ *     will also be stored directly in .pay). When encoded as VInt blocks, all three are stored
+ *     interleaved into the .pos file (so is payload metadata).
+ *     With this strategy, the majority of payload and offset data will be outside the .pos file. So for
+ *     queries that require only position data, running on a full index with payloads and offsets, this
+ *     reduces disk pre-fetches.
+ *
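To make the 259-document example above concrete, here is a minimal, illustrative Java sketch (not part of this patch; the class name is hypothetical) of how a posting list of a given docFreq splits into fixed-size packed blocks plus a VInt-encoded tail:

// Illustrative only: split a posting list into 128-integer packed blocks plus a VInt tail.
class BlockSplitExample {
    static final int BLOCK_SIZE = 128; // fixed packed block size used by this format

    public static void main(String[] args) {
        int docFreq = 259;
        int packedBlocks = docFreq / BLOCK_SIZE; // 2 full packed blocks covering 256 documents
        int vIntTail = docFreq % BLOCK_SIZE;     // 3 remaining documents go into one VInt block
        System.out.println(packedBlocks + " packed blocks + " + vIntTail + " VInt-encoded docs");
    }
}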

+ * Files and detailed format:
+ *
+ * Term Dictionary
+ *
+ *   The .tim file contains the list of terms in each field along with per-term statistics (such as
+ *   docfreq) and pointers to the frequencies, positions, payload and skip data in the .doc, .pos, and
+ *   .pay files. See {@code BlockTreeTermsWriter} for more details on the format.
+ *
+ *   NOTE: The term dictionary can plug into different postings implementations: the postings
+ *   writer/reader are actually responsible for encoding and decoding the PostingsHeader and
+ *   TermMetadata sections described here:
+ *
+ *   • PostingsHeader --> Header, PackedBlockSize
+ *   • TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
+ *     SkipFPDelta?
+ *   • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ *   • PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}
+ *   • DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link DataOutput#writeVLong VLong}
+ *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ *   Notes:
+ *
+ *   • Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version information for
+ *     the postings.
+ *   • PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit width is
+ *     determined by the largest integer. A smaller block size results in smaller variance among the
+ *     widths of integers, hence smaller indexes; a larger block size results in more efficient bulk
+ *     I/O, hence better acceleration. This value should always be a multiple of 64, currently fixed as
+ *     128 as a tradeoff. It is also the skip interval used to accelerate
+ *     {@link org.apache.lucene.index.PostingsEnum#advance(int)}.
+ *   • DocFPDelta determines the position of this term's TermFreqs within the .doc file. In particular,
+ *     it is the difference of file offset between this term's data and the previous term's data (or
+ *     zero, for the first term in the block). On disk it is stored as the difference from the previous
+ *     value in the sequence.
+ *   • PosFPDelta determines the position of this term's TermPositions within the .pos file, while
+ *     PayFPDelta determines the position of this term's <TermPayloads, TermOffsets?> within the .pay
+ *     file. Similar to DocFPDelta, it is the difference between two file positions (or neglected, for
+ *     fields that omit payloads and offsets).
+ *   • PosVIntBlockFPDelta determines the position of this term's last TermPosition in the last pos
+ *     packed block within the .pos file. It is a synonym for PayVIntBlockFPDelta or
+ *     OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary to load the
+ *     following payloads and offsets from .pos instead of .pay. Every time a new block of positions is
+ *     to be loaded, the PostingsReader will use this value to check whether the current block is in
+ *     packed format or VInt. When in packed format, payloads and offsets are fetched from .pay,
+ *     otherwise from .pos. (This value is neglected when the total number of positions, i.e.
+ *     totalTermFreq, is less than or equal to PackedBlockSize.)
+ *   • SkipFPDelta determines the position of this term's SkipData within the .doc file. In particular,
+ *     it is the length of the TermFreq data. SkipDelta is only stored if DocFreq is not smaller than
+ *     SkipMinimum (i.e. 128 in Lucene50PostingsFormat).
+ *   • SingletonDocID is an optimization when a term only appears in one document. In this case, instead
+ *     of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location,
+ *     the single document ID is written to the term dictionary.
+ *
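The file-pointer fields above (DocFPDelta, PosFPDelta, PayFPDelta) are written as differences from the previous term's values, so a reader accumulates them back into absolute offsets. A minimal, illustrative sketch of that accumulation (hypothetical class name and values, not the patch's decodeTerm logic):

// Illustrative only: per-term file-pointer deltas are summed to recover absolute offsets.
class DeltaDecodeExample {
    public static void main(String[] args) {
        long[] docFPDelta = { 100, 40, 25 }; // deltas as they would be read for consecutive terms
        long docStartFP = 0;                 // running absolute file pointer into the .doc file
        for (long delta : docFPDelta) {
            docStartFP += delta;
            System.out.println("term doc data starts at file pointer " + docStartFP);
        }
        // prints 100, 140, 165
    }
}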
+ * Term Index
+ *
+ *   The .tip file contains an index into the term dictionary, so that it can be accessed randomly.
+ *   See {@code BlockTreeTermsWriter} for more details on the format.
+ *
+ * Frequencies and Skip Data
+ *
+ *   The .doc file contains the lists of documents which contain each term, along with the frequency of
+ *   the term in that document (except when frequencies are omitted: {@link IndexOptions#DOCS}). It also
+ *   saves skip data to the beginning of each packed or VInt block, when the length of the document list
+ *   is larger than the packed block size.
+ *
+ *   • docFile(.doc) --> Header, <TermFreqs, SkipData?>^TermCount, Footer
+ *   • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ *   • TermFreqs --> <PackedBlock>^PackedDocBlockNum, VIntBlock?
+ *   • PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?
+ *   • VIntBlock --> <DocDelta[, Freq?]>^(DocFreq - PackedBlockSize*PackedDocBlockNum)
+ *   • SkipData --> <<SkipLevelLength, SkipLevel>^(NumSkipLevels-1), SkipLevel>, SkipDatum?
+ *   • SkipLevel --> <SkipDatum>^(TrimmedDocFreq/(PackedBlockSize^(Level + 1)))
+ *   • SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?, PayFPSkip?>?,
+ *     SkipChildLevelPointer?
+ *   • PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}
+ *   • DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto, PayFPSkip -->
+ *     {@link DataOutput#writeVInt VInt}
+ *   • SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}
+ *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ *   Notes:
+ *
+ *   • PackedDocDeltaBlock is theoretically generated in two steps:
+ *     1. Calculate the difference between each document number and the previous one, and get a d-gaps
+ *        list (for the first document, use the absolute value);
+ *     2. For those d-gaps from the first one to the (PackedDocBlockNum*PackedBlockSize)-th, separately
+ *        encode as packed blocks.
+ *     If frequencies are not omitted, PackedFreqBlock is generated without the d-gap step.
+ *   • VIntBlock stores the remaining d-gaps (along with frequencies when possible) with a format that
+ *     encodes DocDelta and Freq:
+ *     DocDelta: if frequencies are indexed, this determines both the document number and the frequency.
+ *     In particular, DocDelta/2 is the difference between this document number and the previous
+ *     document number (or zero when this is the first document in a TermFreqs). When DocDelta is odd,
+ *     the frequency is one. When DocDelta is even, the frequency is read as another VInt. If
+ *     frequencies are omitted, DocDelta contains the gap (not multiplied by 2) between document numbers
+ *     and no frequency information is stored.
+ *     For example, the TermFreqs for a term which occurs once in document seven and three times in
+ *     document eleven, with frequencies indexed, would be the following sequence of VInts:
+ *     15, 8, 3
+ *     If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence of VInts
+ *     instead:
+ *     7, 4
+ *   • PackedDocBlockNum is the number of packed blocks for the current term's docids or frequencies. In
+ *     particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize).
+ *   • TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq. We use this trick since
+ *     the definition of a skip entry is a little different from the base interface. In
+ *     {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for the skipInterval-th,
+ *     2*skipInterval-th ... posting in the list. However, in Lucene50PostingsFormat, the skip data is
+ *     saved for the (skipInterval+1)-th, (2*skipInterval+1)-th ... posting (skipInterval==PackedBlockSize
+ *     in this case). When DocFreq is a multiple of PackedBlockSize, MultiLevelSkipListWriter would
+ *     expect one more skip datum than Lucene50SkipWriter.
+ *   • SkipDatum is the metadata of one skip entry. For the first block (no matter packed or VInt), it
+ *     is omitted.
+ *   • DocSkip records the document number of every PackedBlockSize-th document number in the postings
+ *     (i.e. the last document number in each packed block). On disk it is stored as the difference from
+ *     the previous value in the sequence.
+ *   • DocFPSkip records the file offset of each block after the first one, i.e. of the posting at the
+ *     (PackedBlockSize+1)-th, (2*PackedBlockSize+1)-th ... position, in the doc file. The file offsets
+ *     are relative to the start of the current term's TermFreqs. On disk it is also stored as the
+ *     difference from the previous SkipDatum in the sequence.
+ *   • Since positions and payloads are also block encoded, the skip should skip to the related block
+ *     first, then fetch the values according to the in-block offset. PosFPSkip and PayFPSkip record the
+ *     file offsets of the related block in .pos and .pay, respectively, while PosBlockOffset indicates
+ *     which value to fetch inside the related block (PayBlockOffset is unnecessary since it is always
+ *     equal to PosBlockOffset). Same as DocFPSkip, the file offsets are relative to the start of the
+ *     current term's TermFreqs, and stored as a difference sequence.
+ *   • PayByteUpto indicates the start offset of the current payload. It is equivalent to the sum of the
+ *     payload lengths in the current block up to PosBlockOffset.
+ *
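The DocDelta rule above can be reproduced with a small, self-contained sketch (illustrative only; the class name is hypothetical, not the patch's writer code) that re-derives the 15, 8, 3 sequence for a term occurring once in document 7 and three times in document 11:

// Illustrative only: VInt doc/freq encoding for the tail block, with frequencies indexed.
// DocDelta = gap*2 + 1 when the frequency is 1, otherwise gap*2 followed by the frequency.
class DocDeltaExample {
    public static void main(String[] args) {
        int[][] postings = { { 7, 1 }, { 11, 3 } }; // {docID, freq} pairs
        int lastDoc = 0;
        StringBuilder out = new StringBuilder();
        for (int[] posting : postings) {
            int gap = posting[0] - lastDoc;
            lastDoc = posting[0];
            if (posting[1] == 1) {
                out.append(gap * 2 + 1).append(' ');                             // odd value => freq == 1
            } else {
                out.append(gap * 2).append(' ').append(posting[1]).append(' ');  // even value, freq follows
            }
        }
        System.out.println(out.toString().trim()); // prints "15 8 3"
    }
}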
+ * Positions
+ *
+ *   The .pos file contains the lists of positions that each term occurs at within documents. It also
+ *   sometimes stores part of the payloads and offsets for speedup.
+ *
+ *   • PosFile(.pos) --> Header, <TermPositions>^TermCount, Footer
+ *   • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ *   • TermPositions --> <PackedPosDeltaBlock>^PackedPosBlockNum, VIntBlock?
+ *   • VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?, OffsetLength?>^PosVIntCount
+ *   • PackedPosDeltaBlock --> {@link PackedInts PackedInts}
+ *   • PositionDelta, OffsetDelta, OffsetLength --> {@link DataOutput#writeVInt VInt}
+ *   • PayloadData --> {@link DataOutput#writeByte byte}^PayLength
+ *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ *   Notes:
+ *
+ *   • TermPositions are ordered by term (terms are implicit, from the term dictionary), and position
+ *     values for each term-document pair are incremental, and ordered by document number.
+ *   • PackedPosBlockNum is the number of packed blocks for the current term's positions, payloads or
+ *     offsets. In particular, PackedPosBlockNum = floor(totalTermFreq/PackedBlockSize).
+ *   • PosVIntCount is the number of positions encoded in VInt format. In particular,
+ *     PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize.
+ *   • PackedPosDeltaBlock is generated the same way as PackedDocDeltaBlock in the chapter
+ *     Frequencies and Skip Data.
+ *   • PositionDelta is, if payloads are disabled for the term's field, the difference between the
+ *     position of the current occurrence in the document and the previous occurrence (or zero, if this
+ *     is the first occurrence in this document). If payloads are enabled for the term's field, then
+ *     PositionDelta/2 is the difference between the current and the previous position. If payloads are
+ *     enabled and PositionDelta is odd, then PayloadLength is stored, indicating the length of the
+ *     payload at the current term position.
+ *     For example, the TermPositions for a term which occurs as the fourth term in one document, and as
+ *     the fifth and ninth term in a subsequent document, would be the following sequence of VInts
+ *     (payloads disabled):
+ *     4, 5, 4
+ *   • PayloadData is metadata associated with the current term position. If PayloadLength is stored at
+ *     the current position, then it indicates the length of this payload. If PayloadLength is not
+ *     stored, then this payload has the same length as the payload at the previous position.
+ *   • OffsetDelta/2 is the difference between this position's startOffset and that of the previous
+ *     occurrence (or zero, if this is the first occurrence in this document). If OffsetDelta is odd,
+ *     then the length (endOffset-startOffset) differs from the previous occurrence and an OffsetLength
+ *     follows. Offset data is only written for
+ *     {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
+ *
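Likewise, the 4, 5, 4 example can be re-derived with a short illustrative sketch (hypothetical class name, payloads disabled), showing that position deltas restart at zero for every document:

// Illustrative only: with payloads disabled, each position delta is written directly as a VInt,
// and the "previous position" resets to zero at every document boundary.
class PositionDeltaExample {
    public static void main(String[] args) {
        int[][] positionsPerDoc = { { 4 }, { 5, 9 } }; // 4th term of doc A; 5th and 9th term of doc B
        StringBuilder out = new StringBuilder();
        for (int[] positions : positionsPerDoc) {
            int lastPosition = 0;                      // reset for every document
            for (int position : positions) {
                out.append(position - lastPosition).append(' ');
                lastPosition = position;
            }
        }
        System.out.println(out.toString().trim());     // prints "4 5 4"
    }
}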
+ * Payloads and Offsets
+ *
+ *   The .pay file stores payloads and offsets associated with certain term-document positions. Some
+ *   payloads and offsets are separated out into the .pos file, for performance reasons.
+ *
+ *   • PayFile(.pay) --> Header, <TermPayloads, TermOffsets?>^TermCount, Footer
+ *   • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ *   • TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData>^PackedPayBlockNum
+ *   • TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock>^PackedPayBlockNum
+ *   • PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}
+ *   • SumPayLength --> {@link DataOutput#writeVInt VInt}
+ *   • PayData --> {@link DataOutput#writeByte byte}^SumPayLength
+ *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ *   Notes:
+ *
+ *   • The order of TermPayloads/TermOffsets is the same as TermPositions; note that part of the
+ *     payloads/offsets are stored in .pos.
+ *   • PackedPayLengthBlock and PackedOffsetLengthBlock are generated the same way as PackedFreqBlock in
+ *     the chapter Frequencies and Skip Data, while PackedOffsetStartDeltaBlock follows the same
+ *     procedure as PackedDocDeltaBlock.
+ *   • PackedPayBlockNum is always equal to PackedPosBlockNum for the same term. It is also a synonym
+ *     for PackedOffsetBlockNum.
+ *   • SumPayLength is the total length of payloads written within one block; it should be the sum of
+ *     the PayLengths in one packed block.
+ *   • PayLength in PackedPayLengthBlock is the length of each payload associated with the current
+ *     position.
+ * + * @lucene.experimental + */ +public class BWCLucene50PostingsFormat extends PostingsFormat { + + /** + * Filename extension for document number, frequencies, and skip data. See chapter: Frequencies and Skip Data + */ + public static final String DOC_EXTENSION = "doc"; + + /** Filename extension for positions. See chapter: Positions */ + public static final String POS_EXTENSION = "pos"; + + /** + * Filename extension for payloads and offsets. See chapter: Payloads and + * Offsets + */ + public static final String PAY_EXTENSION = "pay"; + + /** + * Expert: The maximum number of skip levels. Smaller values result in slightly smaller indexes, + * but slower skipping in big posting lists. + */ + static final int MAX_SKIP_LEVELS = 10; + + static final String TERMS_CODEC = "Lucene50PostingsWriterTerms"; + static final String DOC_CODEC = "Lucene50PostingsWriterDoc"; + static final String POS_CODEC = "Lucene50PostingsWriterPos"; + static final String PAY_CODEC = "Lucene50PostingsWriterPay"; + + // Increment version to change it + static final int VERSION_START = 0; + static final int VERSION_IMPACT_SKIP_DATA = 1; + static final int VERSION_CURRENT = VERSION_IMPACT_SKIP_DATA; + + /** Fixed packed block size, number of integers encoded in a single packed block. */ + // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding + public static final int BLOCK_SIZE = 128; + + /** Creates {@code Lucene50PostingsFormat} with default settings. */ + public BWCLucene50PostingsFormat() { + super("Lucene50"); + } + + public BWCLucene50PostingsFormat(String name) { + super(name); + } + + @Override + public String toString() { + return getName() + "(blocksize=" + BLOCK_SIZE + ")"; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + throw new UnsupportedOperationException("Old formats can't be used for writing"); + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene50PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new Lucene40BlockTreeTermsReader(postingsReader, state); + success = true; + return ret; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } + + /** + * Holds all state required for {@link Lucene50PostingsReader} to produce a {@link + * org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict. + * + * @lucene.internal + */ + public static final class IntBlockTermState extends BlockTermState { + /** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */ + public long docStartFP; + /** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */ + public long posStartFP; + /** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */ + public long payStartFP; + /** + * file offset for the start of the skip list, relative to docStartFP, if there are more than + * {@link #BLOCK_SIZE} docs; otherwise -1 + */ + public long skipOffset; + /** + * file offset for the last position in the last block, if there are more than {@link + * #BLOCK_SIZE} positions; otherwise -1 + */ + public long lastPosBlockOffset; + /** + * docid when there is a single pulsed posting, otherwise -1. freq is always implicitly + * totalTermFreq in this case. + */ + public int singletonDocID; + + /** Sole constructor. 
*/ + public IntBlockTermState() { + skipOffset = -1; + lastPosBlockOffset = -1; + singletonDocID = -1; + } + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + skipOffset = other.skipOffset; + singletonDocID = other.singletonDocID; + } + + @Override + public String toString() { + return super.toString() + + " docStartFP=" + + docStartFP + + " posStartFP=" + + posStartFP + + " payStartFP=" + + payStartFP + + " lastPosBlockOffset=" + + lastPosBlockOffset + + " singletonDocID=" + + singletonDocID; + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/ForUtil.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/ForUtil.java new file mode 100644 index 0000000000000..a567f25869407 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/ForUtil.java @@ -0,0 +1,235 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.PackedInts.Decoder; +import org.apache.lucene.util.packed.PackedInts.FormatAndBits; + +import java.io.IOException; +import java.util.Arrays; + +import static org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE; + +/** + * Encode all values in normal area with fixed bit width, which is determined by the max value in + * this block. + */ +final class ForUtil { + + /** Special number of bits per value used whenever all values to encode are equal. */ + private static final int ALL_VALUES_EQUAL = 0; + + /** + * Upper limit of the number of bytes that might be required to stored BLOCK_SIZE + * encoded values. + */ + static final int MAX_ENCODED_SIZE = BLOCK_SIZE * 4; + + /** + * Upper limit of the number of values that might be decoded in a single call to {@link + * #readBlock(IndexInput, byte[], int[])}. 
Although values after BLOCK_SIZE are + * garbage, it is necessary to allocate value buffers whose size is {@code >= MAX_DATA_SIZE} to + * avoid {@link ArrayIndexOutOfBoundsException}s. + */ + static final int MAX_DATA_SIZE; + + static { + int maxDataSize = 0; + for (int version = PackedInts.VERSION_START; version <= PackedInts.VERSION_CURRENT; version++) { + for (PackedInts.Format format : PackedInts.Format.values()) { + for (int bpv = 1; bpv <= 32; ++bpv) { + if (format.isSupported(bpv) == false) { + continue; + } + final Decoder decoder = PackedInts.getDecoder(format, version, bpv); + final int iterations = computeIterations(decoder); + maxDataSize = Math.max(maxDataSize, iterations * decoder.byteValueCount()); + } + } + } + MAX_DATA_SIZE = maxDataSize; + } + + /** + * Compute the number of iterations required to decode BLOCK_SIZE values with the + * provided {@link Decoder}. + */ + private static int computeIterations(Decoder decoder) { + return (int) Math.ceil((float) BLOCK_SIZE / decoder.byteValueCount()); + } + + /** + * Compute the number of bytes required to encode a block of values that require + * bitsPerValue bits per value with format format. + */ + private static int encodedSize(PackedInts.Format format, int packedIntsVersion, int bitsPerValue) { + final long byteCount = format.byteCount(packedIntsVersion, BLOCK_SIZE, bitsPerValue); + assert byteCount >= 0 && byteCount <= Integer.MAX_VALUE : byteCount; + return (int) byteCount; + } + + private final int[] encodedSizes; + private final PackedInts.Encoder[] encoders; + private final Decoder[] decoders; + private final int[] iterations; + + /** Create a new {@link ForUtil} instance and save state into out. */ + ForUtil(float acceptableOverheadRatio, DataOutput out) throws IOException { + out.writeVInt(PackedInts.VERSION_CURRENT); + encodedSizes = new int[33]; + encoders = new PackedInts.Encoder[33]; + decoders = new Decoder[33]; + iterations = new int[33]; + + for (int bpv = 1; bpv <= 32; ++bpv) { + final FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(BLOCK_SIZE, bpv, acceptableOverheadRatio); + assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue); + assert formatAndBits.bitsPerValue <= 32; + encodedSizes[bpv] = encodedSize(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + encoders[bpv] = PackedInts.getEncoder(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + decoders[bpv] = PackedInts.getDecoder(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue); + iterations[bpv] = computeIterations(decoders[bpv]); + + out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1)); + } + } + + /** Restore a {@link ForUtil} from a {@link DataInput}. 
*/ + ForUtil(DataInput in) throws IOException { + int packedIntsVersion = in.readVInt(); + PackedInts.checkVersion(packedIntsVersion); + encodedSizes = new int[33]; + encoders = new PackedInts.Encoder[33]; + decoders = new Decoder[33]; + iterations = new int[33]; + + for (int bpv = 1; bpv <= 32; ++bpv) { + final int code = in.readVInt(); + final int formatId = code >>> 5; + final int bitsPerValue = (code & 31) + 1; + + final PackedInts.Format format = PackedInts.Format.byId(formatId); + assert format.isSupported(bitsPerValue); + encodedSizes[bpv] = encodedSize(format, packedIntsVersion, bitsPerValue); + encoders[bpv] = PackedInts.getEncoder(format, packedIntsVersion, bitsPerValue); + decoders[bpv] = PackedInts.getDecoder(format, packedIntsVersion, bitsPerValue); + iterations[bpv] = computeIterations(decoders[bpv]); + } + } + + /** + * Write a block of data (For format). + * + * @param data the data to write + * @param encoded a buffer to use to encode data + * @param out the destination output + * @throws IOException If there is a low-level I/O error + */ + void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException { + if (isAllEqual(data)) { + out.writeByte((byte) ALL_VALUES_EQUAL); + out.writeVInt(data[0]); + return; + } + + final int numBits = bitsRequired(data); + assert numBits > 0 && numBits <= 32 : numBits; + final PackedInts.Encoder encoder = encoders[numBits]; + final int iters = iterations[numBits]; + assert iters * encoder.byteValueCount() >= BLOCK_SIZE; + final int encodedSize = encodedSizes[numBits]; + assert iters * encoder.byteBlockCount() >= encodedSize; + + out.writeByte((byte) numBits); + + encoder.encode(data, 0, encoded, 0, iters); + out.writeBytes(encoded, encodedSize); + } + + /** + * Read the next block of data (For format). + * + * @param in the input to use to read data + * @param encoded a buffer that can be used to store encoded data + * @param decoded where to write decoded data + * @throws IOException If there is a low-level I/O error + */ + void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException { + final int numBits = in.readByte(); + assert numBits <= 32 : numBits; + + if (numBits == ALL_VALUES_EQUAL) { + final int value = in.readVInt(); + Arrays.fill(decoded, 0, BLOCK_SIZE, value); + return; + } + + final int encodedSize = encodedSizes[numBits]; + in.readBytes(encoded, 0, encodedSize); + + final Decoder decoder = decoders[numBits]; + final int iters = iterations[numBits]; + assert iters * decoder.byteValueCount() >= BLOCK_SIZE; + + decoder.decode(encoded, 0, decoded, 0, iters); + } + + /** + * Skip the next block of data. + * + * @param in the input where to read data + * @throws IOException If there is a low-level I/O error + */ + void skipBlock(IndexInput in) throws IOException { + final int numBits = in.readByte(); + if (numBits == ALL_VALUES_EQUAL) { + in.readVInt(); + return; + } + assert numBits > 0 && numBits <= 32 : numBits; + final int encodedSize = encodedSizes[numBits]; + in.seek(in.getFilePointer() + encodedSize); + } + + private static boolean isAllEqual(final int[] data) { + final int v = data[0]; + for (int i = 1; i < BLOCK_SIZE; ++i) { + if (data[i] != v) { + return false; + } + } + return true; + } + + /** Compute the number of bits required to serialize any of the longs in data. 
*/ + private static int bitsRequired(final int[] data) { + long or = 0; + for (int i = 0; i < BLOCK_SIZE; ++i) { + assert data[i] >= 0; + or |= data[i]; + } + return PackedInts.bitsRequired(or); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java new file mode 100644 index 0000000000000..206f5e1ae943b --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java @@ -0,0 +1,1787 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SlowImpactsEnum; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.IntBlockTermState; + +import java.io.IOException; +import java.util.Arrays; + +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.BLOCK_SIZE; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.DOC_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.MAX_SKIP_LEVELS; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.PAY_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.POS_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.TERMS_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.VERSION_CURRENT; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.VERSION_START; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.ForUtil.MAX_DATA_SIZE; +import 
static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.ForUtil.MAX_ENCODED_SIZE; + +/** + * Concrete class that reads docId(maybe frq,pos,offset,payloads) list with postings format. + * + * @lucene.experimental + */ +public final class Lucene50PostingsReader extends PostingsReaderBase { + + private final IndexInput docIn; + private final IndexInput posIn; + private final IndexInput payIn; + + final ForUtil forUtil; + private int version; + + /** Sole constructor. */ + public Lucene50PostingsReader(SegmentReadState state) throws IOException { + boolean success = false; + IndexInput docIn = null; + IndexInput posIn = null; + IndexInput payIn = null; + + // NOTE: these data files are too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + + String docName = IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + BWCLucene50PostingsFormat.DOC_EXTENSION + ); + try { + docIn = EndiannessReverserUtil.openInput(state.directory, docName, state.context); + version = CodecUtil.checkIndexHeader( + docIn, + DOC_CODEC, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + forUtil = new ForUtil(docIn); + CodecUtil.retrieveChecksum(docIn); + + if (state.fieldInfos.hasProx()) { + String proxName = IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + BWCLucene50PostingsFormat.POS_EXTENSION + ); + posIn = EndiannessReverserUtil.openInput(state.directory, proxName, state.context); + CodecUtil.checkIndexHeader(posIn, POS_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(posIn); + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payName = IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + BWCLucene50PostingsFormat.PAY_EXTENSION + ); + payIn = EndiannessReverserUtil.openInput(state.directory, payName, state.context); + CodecUtil.checkIndexHeader(payIn, PAY_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(payIn); + } + } + + this.docIn = docIn; + this.posIn = posIn; + this.payIn = payIn; + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(docIn, posIn, payIn); + } + } + } + + @Override + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { + // Make sure we are talking to the matching postings writer + CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + final int indexBlockSize = termsIn.readVInt(); + if (indexBlockSize != BLOCK_SIZE) { + throw new IllegalStateException("index-time BLOCK_SIZE (" + indexBlockSize + ") != read-time BLOCK_SIZE (" + BLOCK_SIZE + ")"); + } + } + + /** Read values that have been written using variable-length encoding instead of bit-packing. 
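+ * When the index stores freqs, each entry is a single vInt {@code code}: the doc delta is
+ * {@code code >>> 1}, and a set low bit means the freq is 1; otherwise the freq follows as its own vInt.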
*/ + static void readVIntBlock(IndexInput docIn, int[] docBuffer, int[] freqBuffer, int num, boolean indexHasFreq) throws IOException { + if (indexHasFreq) { + for (int i = 0; i < num; i++) { + final int code = docIn.readVInt(); + docBuffer[i] = code >>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } + } else { + for (int i = 0; i < num; i++) { + docBuffer[i] = docIn.readVInt(); + } + } + } + + @Override + public BlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void close() throws IOException { + IOUtils.close(docIn, posIn, payIn); + } + + @Override + public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean fieldHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + final boolean fieldHasPayloads = fieldInfo.hasPayloads(); + + if (absolute) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } + + termState.docStartFP += in.readVLong(); + if (fieldHasPositions) { + termState.posStartFP += in.readVLong(); + if (fieldHasOffsets || fieldHasPayloads) { + termState.payStartFP += in.readVLong(); + } + } + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); + } else { + termState.singletonDocID = -1; + } + if (fieldHasPositions) { + if (termState.totalTermFreq > BLOCK_SIZE) { + termState.lastPosBlockOffset = in.readVLong(); + } else { + termState.lastPosBlockOffset = -1; + } + } + if (termState.docFreq > BLOCK_SIZE) { + termState.skipOffset = in.readVLong(); + } else { + termState.skipOffset = -1; + } + } + + @Override + public PostingsEnum postings(FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException { + + boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + + if (indexHasPositions == false || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { + BlockDocsEnum docsEnum; + if (reuse instanceof BlockDocsEnum) { + docsEnum = (BlockDocsEnum) reuse; + if (docsEnum.canReuse(docIn, fieldInfo) == false) { + docsEnum = new BlockDocsEnum(fieldInfo); + } + } else { + docsEnum = new BlockDocsEnum(fieldInfo); + } + return docsEnum.reset((IntBlockTermState) termState, flags); + } else { + EverythingEnum everythingEnum; + if (reuse instanceof EverythingEnum) { + everythingEnum = (EverythingEnum) reuse; + if (everythingEnum.canReuse(docIn, fieldInfo) == false) { + everythingEnum = new EverythingEnum(fieldInfo); + } + } else { + everythingEnum = new EverythingEnum(fieldInfo); + } + return everythingEnum.reset((IntBlockTermState) termState, flags); + } + } + + @Override + public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) throws IOException { + if (state.docFreq <= BLOCK_SIZE || version < BWCLucene50PostingsFormat.VERSION_IMPACT_SKIP_DATA) { + // no skip data + return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); + } + + final boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean indexHasOffsets = 
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + final boolean indexHasPayloads = fieldInfo.hasPayloads(); + + if (indexHasPositions + && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) + && (indexHasOffsets == false || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) + && (indexHasPayloads == false || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { + return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); + } + + return new BlockImpactsEverythingEnum(fieldInfo, (IntBlockTermState) state, flags); + } + + final class BlockDocsEnum extends PostingsEnum { + private final byte[] encoded; + + private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE]; + private final int[] freqBuffer = new int[MAX_DATA_SIZE]; + + private int docBufferUpto; + + private Lucene50SkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + IndexInput docIn; + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // sum of freqs in this posting list (or docFreq when omitted) + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private long skipOffset; + + // docID for next skip point, we won't use skipper if + // target docID is not larger than this + private int nextSkipDoc; + + private boolean needsFreq; // true if the caller actually needs frequencies + // as we read freqs lazily, isFreqsRead shows if freqs are read for the current block + // always true when we don't have freqs (indexHasFreq=false) or don't need freqs + // (needsFreq=false) + private boolean isFreqsRead; + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + BlockDocsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = Lucene50PostingsReader.this.docIn; + this.docIn = null; + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + encoded = new byte[MAX_ENCODED_SIZE]; + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn + && indexHasFreq == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) + && indexHasPos == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) + && indexHasPayloads == fieldInfo.hasPayloads(); + } + + public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + docFreq = termState.docFreq; + totalTermFreq = indexHasFreq ? 
termState.totalTermFreq : docFreq; + docTermStartFP = termState.docStartFP; + skipOffset = termState.skipOffset; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } + + doc = -1; + this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + this.isFreqsRead = true; + if (indexHasFreq == false || needsFreq == false) { + Arrays.fill(freqBuffer, 1); + } + accum = 0; + docUpto = 0; + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + docBufferUpto = BLOCK_SIZE; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + if (isFreqsRead == false) { + forUtil.readBlock(docIn, encoded, freqBuffer); // read freqs for this block + isFreqsRead = true; + } + return freqBuffer[docBufferUpto - 1]; + } + + @Override + public int nextPosition() throws IOException { + return -1; + } + + @Override + public int startOffset() throws IOException { + return -1; + } + + @Override + public int endOffset() throws IOException { + return -1; + } + + @Override + public BytesRef getPayload() throws IOException { + return null; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + // Check if we skipped reading the previous block of freqs, and if yes, position docIn after + // it + if (isFreqsRead == false) { + forUtil.skipBlock(docIn); + isFreqsRead = true; + } + + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= BLOCK_SIZE) { + forUtil.readBlock(docIn, encoded, docDeltaBuffer); + + if (indexHasFreq) { + if (needsFreq) { + isFreqsRead = false; + } else { + forUtil.skipBlock(docIn); // skip over freqs if we don't need them at all + } + } + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; + } else { + // Read vInts: + readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, indexHasFreq); + } + docBufferUpto = 0; + } + + @Override + public int nextDoc() throws IOException { + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); // we don't need to load freqs for now (will be loaded later if necessary) + } + + accum += docDeltaBuffer[docBufferUpto]; + docUpto++; + + doc = accum; + docBufferUpto++; + return doc; + } + + @Override + public int advance(int target) throws IOException { + // current skip docID < docIDs generated from current buffer <= next skip docID + // we don't need to skip if target is buffered already + if (docFreq > BLOCK_SIZE && target > nextSkipDoc) { + + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = new Lucene50SkipReader( + version, + docIn.clone(), + MAX_SKIP_LEVELS, + indexHasPos, + indexHasOffsets, + indexHasPayloads + ); + } + + if (skipped == false) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP + skipOffset, docTermStartFP, 0, 0, docFreq); + skipped = true; + } + + // always plus one to fix the result, since skip position in Lucene50SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + + // Force to read next block + docBufferUpto = 
BLOCK_SIZE; + accum = skipper.getDoc(); // actually, this is just lastSkipEntry + docIn.seek(skipper.getDocPointer()); // now point to the block we want to search + // even if freqs were not read from the previous block, we will mark them as read, + // as we don't need to skip the previous block freqs in refillDocs, + // as we have already positioned docIn where in needs to be. + isFreqsRead = true; + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + // Now scan... this is an inlined/pared down version + // of nextDoc(): + while (true) { + accum += docDeltaBuffer[docBufferUpto]; + docUpto++; + + if (accum >= target) { + break; + } + docBufferUpto++; + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + } + + docBufferUpto++; + return doc = accum; + } + + @Override + public long cost() { + return docFreq; + } + } + + // Also handles payloads + offsets + final class EverythingEnum extends PostingsEnum { + + private final byte[] encoded; + + private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE]; + private final int[] freqBuffer = new int[MAX_DATA_SIZE]; + private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE]; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset; + private int endOffset; + + private int docBufferUpto; + private int posBufferUpto; + + private Lucene50SkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + IndexInput docIn; + final IndexInput posIn; + final IndexInput payIn; + final BytesRef payload; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Lazy pay seek: if != -1 then we must seek to this FP + // before reading payloads/offsets: + private long payPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private long skipOffset; + + private int nextSkipDoc; + + private boolean needsOffsets; // true if we actually need offsets + private boolean needsPayloads; // true if we actually need payloads + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + EverythingEnum(FieldInfo fieldInfo) throws IOException { + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + + this.startDocIn = Lucene50PostingsReader.this.docIn; + this.docIn = null; + this.posIn = Lucene50PostingsReader.this.posIn.clone(); + if (indexHasOffsets || indexHasPayloads) { + this.payIn = Lucene50PostingsReader.this.payIn.clone(); + } else { + this.payIn = null; + } + encoded = new byte[MAX_ENCODED_SIZE]; + if (indexHasOffsets) { + offsetStartDeltaBuffer = new int[MAX_DATA_SIZE]; + offsetLengthBuffer = new int[MAX_DATA_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new int[MAX_DATA_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn + && indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) + && indexHasPayloads == fieldInfo.hasPayloads(); + } + + public EverythingEnum reset(IntBlockTermState termState, int flags) throws IOException { + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + skipOffset = termState.skipOffset; + totalTermFreq = termState.totalTermFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } + posPendingFP = posTermStartFP; + payPendingFP = payTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + this.needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + this.needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + + doc = -1; + accum = 0; + docUpto = 0; + if (docFreq > BLOCK_SIZE) { + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + } else { + nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping + } + docBufferUpto = BLOCK_SIZE; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= BLOCK_SIZE) { + forUtil.readBlock(docIn, encoded, docDeltaBuffer); + forUtil.readBlock(docIn, encoded, freqBuffer); + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + 
freqBuffer[0] = (int) totalTermFreq; + } else { + readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true); + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + payloadLengthBuffer[i] = payloadLength; + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; + } + } + payloadByteUpto = 0; + } else { + forUtil.readBlock(posIn, encoded, posDeltaBuffer); + + if (indexHasPayloads) { + if (needsPayloads) { + forUtil.readBlock(payIn, encoded, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, numBytes); + } + payIn.readBytes(payloadBytes, 0, numBytes); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + forUtil.skipBlock(payIn); // skip over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets) { + if (needsOffsets) { + forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer); + forUtil.readBlock(payIn, encoded, offsetLengthBuffer); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + forUtil.skipBlock(payIn); // skip over starts + forUtil.skipBlock(payIn); // skip over lengths + } + } + } + } + + @Override + public int nextDoc() throws IOException { + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + doc = accum; + position = 0; + lastStartOffset = 0; + return doc; + } + + @Override + public int advance(int target) throws IOException { + // TODO: make frq block load lazy/skippable + + if (target > nextSkipDoc) { + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = new Lucene50SkipReader(version, docIn.clone(), MAX_SKIP_LEVELS, true, indexHasOffsets, indexHasPayloads); + } + + if (skipped == false) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP + skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + 
accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + payPendingFP = skipper.getPayPointer(); + posPendingCount = skipper.getPosBufferUpto(); + lastStartOffset = 0; // new document + payloadByteUpto = skipper.getPayloadByteUpto(); + } + nextSkipDoc = skipper.getNextSkipDoc(); + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + // Now scan: + while (true) { + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (accum >= target) { + break; + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + } + + position = 0; + lastStartOffset = 0; + return doc = accum; + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + while (posBufferUpto < end) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + forUtil.skipBlock(posIn); + + if (indexHasPayloads) { + // Skip payloadLength block: + forUtil.skipBlock(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + forUtil.skipBlock(payIn); + forUtil.skipBlock(payIn); + } + toSkip -= BLOCK_SIZE; + } + refillPositions(); + payloadByteUpto = 0; + posBufferUpto = 0; + while (posBufferUpto < toSkip) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } + + position = 0; + lastStartOffset = 0; + } + + @Override + public int nextPosition() throws IOException { + assert posPendingCount > 0; + + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + if (payPendingFP != -1 && payIn != null) { + payIn.seek(payPendingFP); + payPendingFP = -1; + } + + // Force buffer refill: + posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + if (indexHasPayloads) { + payloadLength = payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (indexHasOffsets) { + startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + + posBufferUpto++; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (payloadLength == 0) { + return null; + } else { + return payload; + } + } + + @Override + public long 
cost() { + return docFreq; + } + } + + final class BlockImpactsPostingsEnum extends ImpactsEnum { + + private final byte[] encoded; + + private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE]; + private final int[] freqBuffer = new int[MAX_DATA_SIZE]; + private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE]; + + private int docBufferUpto; + private int posBufferUpto; + + private final Lucene50ScoreSkipReader skipper; + + final IndexInput docIn; + final IndexInput posIn; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + private int nextSkipDoc = -1; + + private long seekTo = -1; + + BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) throws IOException { + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + + this.docIn = Lucene50PostingsReader.this.docIn.clone(); + + encoded = new byte[MAX_ENCODED_SIZE]; + + this.posIn = Lucene50PostingsReader.this.posIn.clone(); + + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + totalTermFreq = termState.totalTermFreq; + docIn.seek(docTermStartFP); + posPendingFP = posTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = BLOCK_SIZE; + + skipper = new Lucene50ScoreSkipReader(version, docIn.clone(), MAX_SKIP_LEVELS, true, indexHasOffsets, indexHasPayloads); + skipper.init(docTermStartFP + termState.skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= BLOCK_SIZE) { + forUtil.readBlock(docIn, encoded, docDeltaBuffer); + forUtil.readBlock(docIn, encoded, freqBuffer); + } else { + readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true); + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) 
(totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + posIn.seek(posIn.getFilePointer() + payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + if (indexHasOffsets) { + if ((posIn.readVInt() & 1) != 0) { + // offset length changed + posIn.readVInt(); + } + } + } + } else { + forUtil.readBlock(posIn, encoded, posDeltaBuffer); + } + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > nextSkipDoc) { + // always plus one to fix the result, since skip position in Lucene50SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); + posPendingFP = skipper.getPosPointer(); + posPendingCount = skipper.getPosBufferUpto(); + seekTo = skipper.getDocPointer(); // delay the seek + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + assert nextSkipDoc >= target; + } + + @Override + public Impacts getImpacts() throws IOException { + advanceShallow(doc); + return skipper.getImpacts(); + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target > nextSkipDoc) { + advanceShallow(target); + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + if (seekTo >= 0) { + docIn.seek(seekTo); + seekTo = -1; + } + refillDocs(); + } + + // Now scan: + while (true) { + accum += docDeltaBuffer[docBufferUpto]; + freq = freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + docUpto++; + + if (accum >= target) { + break; + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + } + position = 0; + + return doc = accum; + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... 
instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + posBufferUpto += toSkip; + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + forUtil.skipBlock(posIn); + toSkip -= BLOCK_SIZE; + } + refillPositions(); + posBufferUpto = toSkip; + } + + position = 0; + } + + @Override + public int nextPosition() throws IOException { + assert posPendingCount > 0; + + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + // Force buffer refill: + posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto++]; + + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public BytesRef getPayload() { + return null; + } + + @Override + public long cost() { + return docFreq; + } + } + + final class BlockImpactsEverythingEnum extends ImpactsEnum { + + private final byte[] encoded; + + private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE]; + private final int[] freqBuffer = new int[MAX_DATA_SIZE]; + private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE]; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset = -1; + private int endOffset = -1; + + private int docBufferUpto; + private int posBufferUpto; + + private final Lucene50ScoreSkipReader skipper; + + final IndexInput docIn; + final IndexInput posIn; + final IndexInput payIn; + final BytesRef payload; + + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int docUpto; // how many docs we've read + private int posDocUpTo; // for how many docs we've read positions, offsets, and payloads + private int doc; // doc we last read + private int accum; // accumulator for doc deltas + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Lazy pay seek: if != -1 then we must seek to this FP + // before reading payloads/offsets: + private long payPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + private int nextSkipDoc = -1; + + private final boolean needsPositions; + private final boolean needsOffsets; // true if we actually need offsets + private final boolean needsPayloads; // true if we actually need payloads + + private boolean isFreqsRead; // shows if freqs for the current doc block are read into freqBuffer + + private long seekTo = -1; + + BlockImpactsEverythingEnum(FieldInfo fieldInfo, IntBlockTermState termState, int flags) throws IOException { + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + + needsPositions = PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS); + needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + + this.docIn = Lucene50PostingsReader.this.docIn.clone(); + + encoded = new byte[MAX_ENCODED_SIZE]; + + if (indexHasPos && needsPositions) { + this.posIn = Lucene50PostingsReader.this.posIn.clone(); + } else { + this.posIn = null; + } + + if ((indexHasOffsets && needsOffsets) || (indexHasPayloads && needsPayloads)) { + this.payIn = Lucene50PostingsReader.this.payIn.clone(); + } else { + this.payIn = null; + } + + if (indexHasOffsets) { + offsetStartDeltaBuffer = new int[MAX_DATA_SIZE]; + offsetLengthBuffer = new int[MAX_DATA_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new int[MAX_DATA_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + totalTermFreq = termState.totalTermFreq; + docIn.seek(docTermStartFP); + posPendingFP = posTermStartFP; + payPendingFP = payTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + posDocUpTo = 0; + isFreqsRead = true; + docBufferUpto = BLOCK_SIZE; + + skipper = new Lucene50ScoreSkipReader(version, docIn.clone(), MAX_SKIP_LEVELS, indexHasPos, indexHasOffsets, indexHasPayloads); + skipper.init(docTermStartFP + termState.skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + + if (indexHasFreq == false) { + Arrays.fill(freqBuffer, 1); + } + } + + @Override + public int freq() throws IOException { + if (indexHasFreq && (isFreqsRead == false)) { + forUtil.readBlock(docIn, encoded, freqBuffer); // read freqs for this block + isFreqsRead = true; + } + return freqBuffer[docBufferUpto - 1]; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + if (indexHasFreq) { + if (isFreqsRead == false) { // previous freq block was not read + // check if we need to load the previous freq block to catch up on 
positions or we can + // skip it + if (indexHasPos && needsPositions && (posDocUpTo < docUpto)) { + forUtil.readBlock(docIn, encoded, freqBuffer); // load the previous freq block + } else { + forUtil.skipBlock(docIn); // skip it + } + isFreqsRead = true; + } + if (indexHasPos && needsPositions) { + while (posDocUpTo < docUpto) { // catch on positions, bring posPendingCount upto the current doc + posPendingCount += freqBuffer[docBufferUpto - (docUpto - posDocUpTo)]; + posDocUpTo++; + } + } + } + + final int left = docFreq - docUpto; + assert left > 0; + + if (left >= BLOCK_SIZE) { + forUtil.readBlock(docIn, encoded, docDeltaBuffer); + if (indexHasFreq) { + isFreqsRead = false; // freq block will be loaded lazily when necessary, we don't load it here + } + } else { + readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, indexHasFreq); + } + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + payloadLengthBuffer[i] = payloadLength; + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; + } + } + payloadByteUpto = 0; + } else { + forUtil.readBlock(posIn, encoded, posDeltaBuffer); + + if (indexHasPayloads && payIn != null) { + if (needsPayloads) { + forUtil.readBlock(payIn, encoded, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, numBytes); + } + payIn.readBytes(payloadBytes, 0, numBytes); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + forUtil.skipBlock(payIn); // skip over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets && payIn != null) { + if (needsOffsets) { + forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer); + forUtil.readBlock(payIn, encoded, offsetLengthBuffer); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + forUtil.skipBlock(payIn); // skip over starts + forUtil.skipBlock(payIn); // skip over lengths + } + } + } + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > nextSkipDoc) { + // always plus one to fix the result, since skip position in Lucene50SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + posDocUpTo = docUpto; + + // Force to read next block + docBufferUpto = 
BLOCK_SIZE; + accum = skipper.getDoc(); + posPendingFP = skipper.getPosPointer(); + payPendingFP = skipper.getPayPointer(); + posPendingCount = skipper.getPosBufferUpto(); + lastStartOffset = 0; // new document + payloadByteUpto = skipper.getPayloadByteUpto(); // actually, this is just lastSkipEntry + seekTo = skipper.getDocPointer(); // delay the seek + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + assert nextSkipDoc >= target; + } + + @Override + public Impacts getImpacts() throws IOException { + advanceShallow(doc); + return skipper.getImpacts(); + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target > nextSkipDoc) { + advanceShallow(target); + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + if (docBufferUpto == BLOCK_SIZE) { + if (seekTo >= 0) { + docIn.seek(seekTo); + seekTo = -1; + isFreqsRead = true; // reset isFreqsRead + } + refillDocs(); + } + + // Now scan: + while (true) { + accum += docDeltaBuffer[docBufferUpto]; + docBufferUpto++; + docUpto++; + + if (accum >= target) { + break; + } + if (docUpto == docFreq) { + return doc = NO_MORE_DOCS; + } + } + position = 0; + lastStartOffset = 0; + + return doc = accum; + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freqBuffer[docBufferUpto - 1]; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + while (posBufferUpto < end) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + forUtil.skipBlock(posIn); + + if (indexHasPayloads && payIn != null) { + // Skip payloadLength block: + forUtil.skipBlock(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets && payIn != null) { + forUtil.skipBlock(payIn); + forUtil.skipBlock(payIn); + } + toSkip -= BLOCK_SIZE; + } + refillPositions(); + payloadByteUpto = 0; + posBufferUpto = 0; + while (posBufferUpto < toSkip) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } + + position = 0; + lastStartOffset = 0; + } + + @Override + public int nextPosition() throws IOException { + if (indexHasPos == false || needsPositions == false) { + return -1; + } + + if (isFreqsRead == false) { + forUtil.readBlock(docIn, encoded, freqBuffer); // read freqs for this docs block + isFreqsRead = true; + } + while (posDocUpTo < docUpto) { // bring posPendingCount upto the current doc + posPendingCount += freqBuffer[docBufferUpto - (docUpto - posDocUpTo)]; + posDocUpTo++; + } + + assert posPendingCount > 0; + + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + if (payPendingFP != -1 && payIn != null) { + payIn.seek(payPendingFP); + payPendingFP = -1; + } + + // Force buffer refill: + 
posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freqBuffer[docBufferUpto - 1]) { + skipPositions(); + posPendingCount = freqBuffer[docBufferUpto - 1]; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + if (indexHasPayloads) { + payloadLength = payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (indexHasOffsets && needsOffsets) { + startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + + posBufferUpto++; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (payloadLength == 0) { + return null; + } else { + return payload; + } + } + + @Override + public long cost() { + return docFreq; + } + } + + @Override + public void checkIntegrity() throws IOException { + if (docIn != null) { + CodecUtil.checksumEntireFile(docIn); + } + if (posIn != null) { + CodecUtil.checksumEntireFile(posIn); + } + if (payIn != null) { + CodecUtil.checksumEntireFile(payIn); + } + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(positions=" + (posIn != null) + ",payloads=" + (payIn != null) + ")"; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50ScoreSkipReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50ScoreSkipReader.java new file mode 100644 index 0000000000000..e27e95f2601a2 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50ScoreSkipReader.java @@ -0,0 +1,167 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; + +import java.io.IOException; +import java.util.AbstractList; +import java.util.Arrays; +import java.util.List; +import java.util.RandomAccess; + +final class Lucene50ScoreSkipReader extends Lucene50SkipReader { + + private final byte[][] impactData; + private final int[] impactDataLength; + private final ByteArrayDataInput badi = new ByteArrayDataInput(); + private final Impacts impacts; + private int numLevels = 1; + private final MutableImpactList[] perLevelImpacts; + + Lucene50ScoreSkipReader( + int version, + IndexInput skipStream, + int maxSkipLevels, + boolean hasPos, + boolean hasOffsets, + boolean hasPayloads + ) { + super(version, skipStream, maxSkipLevels, hasPos, hasOffsets, hasPayloads); + if (version < BWCLucene50PostingsFormat.VERSION_IMPACT_SKIP_DATA) { + throw new IllegalStateException("Cannot skip based on scores if impacts are not indexed"); + } + this.impactData = new byte[maxSkipLevels][]; + Arrays.fill(impactData, new byte[0]); + this.impactDataLength = new int[maxSkipLevels]; + this.perLevelImpacts = new MutableImpactList[maxSkipLevels]; + for (int i = 0; i < perLevelImpacts.length; ++i) { + perLevelImpacts[i] = new MutableImpactList(); + } + impacts = new Impacts() { + + @Override + public int numLevels() { + return numLevels; + } + + @Override + public int getDocIdUpTo(int level) { + return skipDoc[level]; + } + + @Override + public List getImpacts(int level) { + assert level < numLevels; + if (impactDataLength[level] > 0) { + badi.reset(impactData[level], 0, impactDataLength[level]); + perLevelImpacts[level] = readImpacts(badi, perLevelImpacts[level]); + impactDataLength[level] = 0; + } + return perLevelImpacts[level]; + } + }; + } + + @Override + public int skipTo(int target) throws IOException { + int result = super.skipTo(target); + if (numberOfSkipLevels > 0) { + numLevels = numberOfSkipLevels; + } else { + // End of postings don't have skip data anymore, so we fill with dummy data + // like SlowImpactsEnum. 
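+ // A single impact of (freq = Integer.MAX_VALUE, norm = 1) is the most competitive impact possible,
+ // so callers cannot skip any further documents based on it.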
+ numLevels = 1; + perLevelImpacts[0].length = 1; + perLevelImpacts[0].impacts[0].freq = Integer.MAX_VALUE; + perLevelImpacts[0].impacts[0].norm = 1L; + impactDataLength[0] = 0; + } + return result; + } + + Impacts getImpacts() { + return impacts; + } + + @Override + protected void readImpacts(int level, IndexInput skipStream) throws IOException { + int length = skipStream.readVInt(); + if (impactData[level].length < length) { + impactData[level] = new byte[ArrayUtil.oversize(length, Byte.BYTES)]; + } + skipStream.readBytes(impactData[level], 0, length); + impactDataLength[level] = length; + } + + static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList reuse) { + int maxNumImpacts = in.length(); // at most one impact per byte + if (reuse.impacts.length < maxNumImpacts) { + int oldLength = reuse.impacts.length; + reuse.impacts = ArrayUtil.grow(reuse.impacts, maxNumImpacts); + for (int i = oldLength; i < reuse.impacts.length; ++i) { + reuse.impacts[i] = new Impact(Integer.MAX_VALUE, 1L); + } + } + + int freq = 0; + long norm = 0; + int length = 0; + while (in.getPosition() < in.length()) { + int freqDelta = in.readVInt(); + if ((freqDelta & 0x01) != 0) { + freq += 1 + (freqDelta >>> 1); + try { + norm += 1 + in.readZLong(); + } catch (IOException e) { + throw new RuntimeException(e); // cannot happen on a BADI + } + } else { + freq += 1 + (freqDelta >>> 1); + norm++; + } + Impact impact = reuse.impacts[length]; + impact.freq = freq; + impact.norm = norm; + length++; + } + reuse.length = length; + return reuse; + } + + static class MutableImpactList extends AbstractList implements RandomAccess { + int length = 1; + Impact[] impacts = new Impact[] { new Impact(Integer.MAX_VALUE, 1L) }; + + @Override + public Impact get(int index) { + return impacts[index]; + } + + @Override + public int size() { + return length; + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipReader.java new file mode 100644 index 0000000000000..369d7e103d839 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipReader.java @@ -0,0 +1,210 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.codecs.MultiLevelSkipListReader; +import org.apache.lucene.store.IndexInput; + +import java.io.IOException; +import java.util.Arrays; + +import static org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE; + +/** + * Implements the skip list reader for block postings format that stores positions and payloads. + * + *

Although this skipper uses MultiLevelSkipListReader as an interface, its definition of skip + * position will be a little different. + * + *

For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6, + * + *

+ * 0 1 2 3 4 5
+ * d d d d d d    (posting list)
+ *     ^     ^    (skip point in MultiLevelSkipWriter)
+ *       ^        (skip point in Lucene50SkipWriter)
+ * 
+ * + * In this case, MultiLevelSkipListReader will use the last document as a skip point, while + * Lucene50SkipReader should assume no skip point will come. + * + *

If we use the interface directly in Lucene50SkipReader, it may naively try to read more skip + * data after the only skip point is loaded. + * + *

To illustrate this, we can call skipTo(d[5]): since skip point d[3] has a smaller docId and + * numSkipped + blockSize == df, the MultiLevelSkipListReader will assume the skip list isn't exhausted + * yet and try to load a non-existent skip point. + * + *
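To make the trimming that the next paragraph introduces concrete, here is a minimal standalone sketch (illustrative only, not part of the patch) mirroring the trim(int) method defined further below. It uses the hypothetical blockSize = 3, df = 6 example from this Javadoc; the class name and tiny BLOCK_SIZE constant are invented for the sketch, while the real format uses the much larger Lucene50PostingsFormat.BLOCK_SIZE.

class SkipTrimSketch {
    // Illustrative block size matching the example above; the real format uses a
    // far larger constant (Lucene50PostingsFormat.BLOCK_SIZE).
    static final int BLOCK_SIZE = 3;

    // Mirrors Lucene50SkipReader#trim(int): if df lands exactly on a block
    // boundary, drop one so MultiLevelSkipListReader does not look for a
    // skip point that was never written.
    static int trim(int df) {
        return df % BLOCK_SIZE == 0 ? df - 1 : df;
    }

    public static void main(String[] args) {
        System.out.println(trim(6)); // 5 -> the phantom skip point at d[5] is ignored
        System.out.println(trim(7)); // 7 -> df off a block boundary is left as-is
    }
}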

Therefore, we'll trim df before passing it to the interface. see trim(int) + */ +class Lucene50SkipReader extends MultiLevelSkipListReader { + private final int version; + private long[] docPointer; + private long[] posPointer; + private long[] payPointer; + private int[] posBufferUpto; + private int[] payloadByteUpto; + + private long lastPosPointer; + private long lastPayPointer; + private int lastPayloadByteUpto; + private long lastDocPointer; + private int lastPosBufferUpto; + + Lucene50SkipReader(int version, IndexInput skipStream, int maxSkipLevels, boolean hasPos, boolean hasOffsets, boolean hasPayloads) { + super(skipStream, maxSkipLevels, BLOCK_SIZE, 8); + this.version = version; + docPointer = new long[maxSkipLevels]; + if (hasPos) { + posPointer = new long[maxSkipLevels]; + posBufferUpto = new int[maxSkipLevels]; + if (hasPayloads) { + payloadByteUpto = new int[maxSkipLevels]; + } else { + payloadByteUpto = null; + } + if (hasOffsets || hasPayloads) { + payPointer = new long[maxSkipLevels]; + } else { + payPointer = null; + } + } else { + posPointer = null; + } + } + + /** + * Trim original docFreq to tell skipReader read proper number of skip points. + * + *

Since our definition in Lucene50Skip* is a little different from MultiLevelSkip* This + * trimmed docFreq will prevent skipReader from: 1. silly reading a non-existed skip point after + * the last block boundary 2. moving into the vInt block + */ + protected int trim(int df) { + return df % BLOCK_SIZE == 0 ? df - 1 : df; + } + + public void init(long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) throws IOException { + super.init(skipPointer, trim(df)); + lastDocPointer = docBasePointer; + lastPosPointer = posBasePointer; + lastPayPointer = payBasePointer; + + Arrays.fill(docPointer, docBasePointer); + if (posPointer != null) { + Arrays.fill(posPointer, posBasePointer); + if (payPointer != null) { + Arrays.fill(payPointer, payBasePointer); + } + } else { + assert posBasePointer == 0; + } + } + + /** + * Returns the doc pointer of the doc to which the last call of {@link + * MultiLevelSkipListReader#skipTo(int)} has skipped. + */ + public long getDocPointer() { + return lastDocPointer; + } + + public long getPosPointer() { + return lastPosPointer; + } + + public int getPosBufferUpto() { + return lastPosBufferUpto; + } + + public long getPayPointer() { + return lastPayPointer; + } + + public int getPayloadByteUpto() { + return lastPayloadByteUpto; + } + + public int getNextSkipDoc() { + return skipDoc[0]; + } + + @Override + protected void seekChild(int level) throws IOException { + super.seekChild(level); + docPointer[level] = lastDocPointer; + if (posPointer != null) { + posPointer[level] = lastPosPointer; + posBufferUpto[level] = lastPosBufferUpto; + if (payloadByteUpto != null) { + payloadByteUpto[level] = lastPayloadByteUpto; + } + if (payPointer != null) { + payPointer[level] = lastPayPointer; + } + } + } + + @Override + protected void setLastSkipData(int level) { + super.setLastSkipData(level); + lastDocPointer = docPointer[level]; + + if (posPointer != null) { + lastPosPointer = posPointer[level]; + lastPosBufferUpto = posBufferUpto[level]; + if (payPointer != null) { + lastPayPointer = payPointer[level]; + } + if (payloadByteUpto != null) { + lastPayloadByteUpto = payloadByteUpto[level]; + } + } + } + + @Override + protected int readSkipData(int level, IndexInput skipStream) throws IOException { + int delta = skipStream.readVInt(); + docPointer[level] += skipStream.readVLong(); + + if (posPointer != null) { + posPointer[level] += skipStream.readVLong(); + posBufferUpto[level] = skipStream.readVInt(); + + if (payloadByteUpto != null) { + payloadByteUpto[level] = skipStream.readVInt(); + } + + if (payPointer != null) { + payPointer[level] += skipStream.readVLong(); + } + } + readImpacts(level, skipStream); + return delta; + } + + // The default impl skips impacts + protected void readImpacts(int level, IndexInput skipStream) throws IOException { + if (version >= BWCLucene50PostingsFormat.VERSION_IMPACT_SKIP_DATA) { + // The base implementation skips impacts, they are not used + skipStream.skipBytes(skipStream.readVInt()); + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java index 43a24574297c3..55fe5c3b98f64 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java +++ 
b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java @@ -27,10 +27,13 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec; +import org.elasticsearch.xpack.lucene.bwc.codecs.LegacyAdaptingPerFieldPostingsFormat; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.Lucene50SegmentInfoFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat; @@ -55,6 +58,21 @@ public DocValuesFormat getDocValuesFormatForField(String field) { return defaultDocValuesFormat; } }; + private final PostingsFormat postingsFormat = new LegacyAdaptingPerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + throw new IllegalStateException("This codec should only be used for reading, not writing"); + } + + @Override + protected PostingsFormat getPostingsFormat(String formatName) { + if (formatName.equals("Lucene50")) { + return new BWCLucene50PostingsFormat(); + } else { + return super.getPostingsFormat(formatName); + } + } + }; /** * Instantiates a new codec. @@ -104,4 +122,8 @@ public DocValuesFormat docValuesFormat() { return docValuesFormat; } + @Override + public PostingsFormat postingsFormat() { + return postingsFormat; + } } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java index 2f805a4881744..e3317a1c00c8c 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java @@ -27,10 +27,13 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec; +import org.elasticsearch.xpack.lucene.bwc.codecs.LegacyAdaptingPerFieldPostingsFormat; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat; import java.util.Objects; @@ -54,6 +57,21 @@ public DocValuesFormat getDocValuesFormatForField(String field) { return defaultDocValuesFormat; } }; + private final PostingsFormat postingsFormat = new LegacyAdaptingPerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + throw new IllegalStateException("This codec should only be used for reading, not writing"); + } + + @Override + protected PostingsFormat getPostingsFormat(String formatName) { + if (formatName.equals("Lucene50")) { + return new BWCLucene50PostingsFormat(); + 
} else { + return super.getPostingsFormat(formatName); + } + } + }; public Lucene62Codec() { this(Lucene50StoredFieldsFormat.Mode.BEST_SPEED); @@ -93,4 +111,9 @@ public final CompoundFormat compoundFormat() { public DocValuesFormat docValuesFormat() { return docValuesFormat; } + + @Override + public PostingsFormat postingsFormat() { + return postingsFormat; + } } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java index bc9fa098476c1..90739206b5643 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java @@ -18,9 +18,11 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec; public class BWCLucene70Codec extends BWCCodec { @@ -37,6 +39,12 @@ public DocValuesFormat getDocValuesFormatForField(String field) { return defaultDVFormat; } }; + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + throw new IllegalStateException("This codec should only be used for reading, not writing"); + } + }; public BWCLucene70Codec() { super("BWCLucene70Codec"); @@ -72,4 +80,9 @@ public CompoundFormat compoundFormat() { public final DocValuesFormat docValuesFormat() { return docValuesFormat; } + + @Override + public PostingsFormat postingsFormat() { + return postingsFormat; + } } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java new file mode 100644 index 0000000000000..728191932763c --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java @@ -0,0 +1,176 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import java.io.IOException; + +/** + * Static helper methods for {@link FST.Arc.BitTable}. + * + * @lucene.experimental + */ +class BitTableUtil { + + /** + * Returns whether the bit at given zero-based index is set.
+ * Example: bitIndex 10 means the third bit on the right of the second byte. + * + * @param bitIndex The bit zero-based index. It must be greater than or equal to 0, and strictly + * less than {@code number of bit-table bytes * Byte.SIZE}. + * @param reader The {@link FST.BytesReader} to read. It must be positioned at the beginning of + * the bit-table. + */ + static boolean isBitSet(int bitIndex, FST.BytesReader reader) throws IOException { + assert bitIndex >= 0 : "bitIndex=" + bitIndex; + reader.skipBytes(bitIndex >> 3); + return (readByte(reader) & (1L << (bitIndex & (Byte.SIZE - 1)))) != 0; + } + + /** + * Counts all bits set in the bit-table. + * + * @param bitTableBytes The number of bytes in the bit-table. + * @param reader The {@link FST.BytesReader} to read. It must be positioned at the beginning of + * the bit-table. + */ + static int countBits(int bitTableBytes, FST.BytesReader reader) throws IOException { + assert bitTableBytes >= 0 : "bitTableBytes=" + bitTableBytes; + int bitCount = 0; + for (int i = bitTableBytes >> 3; i > 0; i--) { + // Count the bits set for all plain longs. + bitCount += bitCount8Bytes(reader); + } + int numRemainingBytes; + if ((numRemainingBytes = bitTableBytes & (Long.BYTES - 1)) != 0) { + bitCount += Long.bitCount(readUpTo8Bytes(numRemainingBytes, reader)); + } + return bitCount; + } + + /** + * Counts the bits set up to the given bit zero-based index, exclusive.
+ * In other words, how many 1s there are up to the bit at the given index excluded.
+ * Example: bitIndex 10 means the third bit on the right of the second byte. + * + * @param bitIndex The bit zero-based index, exclusive. It must be greater than or equal to 0, and + * less than or equal to {@code number of bit-table bytes * Byte.SIZE}. + * @param reader The {@link FST.BytesReader} to read. It must be positioned at the beginning of + * the bit-table. + */ + static int countBitsUpTo(int bitIndex, FST.BytesReader reader) throws IOException { + assert bitIndex >= 0 : "bitIndex=" + bitIndex; + int bitCount = 0; + for (int i = bitIndex >> 6; i > 0; i--) { + // Count the bits set for all plain longs. + bitCount += bitCount8Bytes(reader); + } + int remainingBits; + if ((remainingBits = bitIndex & (Long.SIZE - 1)) != 0) { + int numRemainingBytes = (remainingBits + (Byte.SIZE - 1)) >> 3; + // Prepare a mask with 1s on the right up to bitIndex exclusive. + long mask = (1L << bitIndex) - 1L; // Shifts are mod 64. + // Count the bits set only within the mask part, so up to bitIndex exclusive. + bitCount += Long.bitCount(readUpTo8Bytes(numRemainingBytes, reader) & mask); + } + return bitCount; + } + + /** + * Returns the index of the next bit set following the given bit zero-based index.
+ * For example with bits 100011: the next bit set after index=-1 is at index=0; the next bit set + * after index=0 is at index=1; the next bit set after index=1 is at index=5; there is no next bit + * set after index=5. + * + * @param bitIndex The bit zero-based index. It must be greater than or equal to -1, and strictly + * less than {@code number of bit-table bytes * Byte.SIZE}. + * @param bitTableBytes The number of bytes in the bit-table. + * @param reader The {@link FST.BytesReader} to read. It must be positioned at the beginning of + * the bit-table. + * @return The zero-based index of the next bit set after the provided {@code bitIndex}; or -1 if + * none. + */ + static int nextBitSet(int bitIndex, int bitTableBytes, FST.BytesReader reader) throws IOException { + assert bitIndex >= -1 && bitIndex < bitTableBytes * Byte.SIZE : "bitIndex=" + bitIndex + " bitTableBytes=" + bitTableBytes; + int byteIndex = bitIndex / Byte.SIZE; + int mask = -1 << ((bitIndex + 1) & (Byte.SIZE - 1)); + int i; + if (mask == -1 && bitIndex != -1) { + reader.skipBytes(byteIndex + 1); + i = 0; + } else { + reader.skipBytes(byteIndex); + i = (reader.readByte() & 0xFF) & mask; + } + while (i == 0) { + if (++byteIndex == bitTableBytes) { + return -1; + } + i = reader.readByte() & 0xFF; + } + return Integer.numberOfTrailingZeros(i) + (byteIndex << 3); + } + + /** + * Returns the index of the previous bit set preceding the given bit zero-based index.
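The "bits 100011" examples in the nextBitSet Javadoc above and the previousBitSet Javadoc that continues below can be reproduced with plain long arithmetic. The sketch below is illustrative only and not part of the patch: it assumes the whole bit-table fits in one long and that indices stay well below 64, whereas the real helpers stream bytes from an FST.BytesReader.

class BitTableSketch {
    // Bit at bitIndex, e.g. bitIndex 10 is the third bit (from the right) of the second byte.
    static boolean isBitSet(long bits, int bitIndex) {
        return (bits >>> bitIndex & 1L) != 0;
    }

    // Next set bit strictly after bitIndex (bitIndex may be -1), or -1 if none.
    static int nextBitSet(long bits, int bitIndex) {
        long masked = bits & (-1L << (bitIndex + 1));
        return masked == 0 ? -1 : Long.numberOfTrailingZeros(masked);
    }

    // Previous set bit strictly before bitIndex, or -1 if none.
    static int previousBitSet(long bits, int bitIndex) {
        long masked = bits & ((1L << bitIndex) - 1);
        return masked == 0 ? -1 : 63 - Long.numberOfLeadingZeros(masked);
    }

    public static void main(String[] args) {
        long bits = 0b100011L; // bits set at indices 0, 1 and 5
        System.out.println(nextBitSet(bits, -1));    // 0
        System.out.println(nextBitSet(bits, 1));     // 5
        System.out.println(nextBitSet(bits, 5));     // -1
        System.out.println(previousBitSet(bits, 5)); // 1
        System.out.println(previousBitSet(bits, 0)); // -1
        System.out.println(isBitSet(bits, 1));       // true
    }
}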
+ * For example with bits 100011: there is no previous bit set before index=0. the previous bit set + * before index=1 is at index=0; the previous bit set before index=5 is at index=1; the previous + * bit set before index=64 is at index=5; + * + * @param bitIndex The bit zero-based index. It must be greater than or equal to 0, and less than + * or equal to {@code number of bit-table bytes * Byte.SIZE}. + * @param reader The {@link FST.BytesReader} to read. It must be positioned at the beginning of + * the bit-table. + * @return The zero-based index of the previous bit set before the provided {@code bitIndex}; or + * -1 if none. + */ + static int previousBitSet(int bitIndex, FST.BytesReader reader) throws IOException { + assert bitIndex >= 0 : "bitIndex=" + bitIndex; + int byteIndex = bitIndex >> 3; + reader.skipBytes(byteIndex); + int mask = (1 << (bitIndex & (Byte.SIZE - 1))) - 1; + int i = (reader.readByte() & 0xFF) & mask; + while (i == 0) { + if (byteIndex-- == 0) { + return -1; + } + reader.skipBytes(-2); // FST.BytesReader implementations support negative skip. + i = reader.readByte() & 0xFF; + } + return (Integer.SIZE - 1) - Integer.numberOfLeadingZeros(i) + (byteIndex << 3); + } + + private static long readByte(FST.BytesReader reader) throws IOException { + return reader.readByte() & 0xFFL; + } + + private static long readUpTo8Bytes(int numBytes, FST.BytesReader reader) throws IOException { + assert numBytes > 0 && numBytes <= 8 : "numBytes=" + numBytes; + long l = readByte(reader); + int shift = 0; + while (--numBytes != 0) { + l |= readByte(reader) << (shift += 8); + } + return l; + } + + private static int bitCount8Bytes(FST.BytesReader reader) throws IOException { + return Long.bitCount(reader.readLong()); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java new file mode 100644 index 0000000000000..7a58a350fcab1 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java @@ -0,0 +1,164 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.StringHelper; + +import java.io.IOException; + +/** + * An FST {@link Outputs} implementation where each output is a sequence of bytes. 
+ * + * @lucene.experimental + */ +public final class ByteSequenceOutputs extends Outputs { + + private static final BytesRef NO_OUTPUT = new BytesRef(); + private static final ByteSequenceOutputs singleton = new ByteSequenceOutputs(); + + private ByteSequenceOutputs() {} + + public static ByteSequenceOutputs getSingleton() { + return singleton; + } + + @Override + public BytesRef common(BytesRef output1, BytesRef output2) { + assert output1 != null; + assert output2 != null; + + int pos1 = output1.offset; + int pos2 = output2.offset; + int stopAt1 = pos1 + Math.min(output1.length, output2.length); + while (pos1 < stopAt1) { + if (output1.bytes[pos1] != output2.bytes[pos2]) { + break; + } + pos1++; + pos2++; + } + + if (pos1 == output1.offset) { + // no common prefix + return NO_OUTPUT; + } else if (pos1 == output1.offset + output1.length) { + // output1 is a prefix of output2 + return output1; + } else if (pos2 == output2.offset + output2.length) { + // output2 is a prefix of output1 + return output2; + } else { + return new BytesRef(output1.bytes, output1.offset, pos1 - output1.offset); + } + } + + @Override + public BytesRef subtract(BytesRef output, BytesRef inc) { + assert output != null; + assert inc != null; + if (inc == NO_OUTPUT) { + // no prefix removed + return output; + } else { + assert StringHelper.startsWith(output, inc); + if (inc.length == output.length) { + // entire output removed + return NO_OUTPUT; + } else { + assert inc.length < output.length : "inc.length=" + inc.length + " vs output.length=" + output.length; + assert inc.length > 0; + return new BytesRef(output.bytes, output.offset + inc.length, output.length - inc.length); + } + } + } + + @Override + public BytesRef add(BytesRef prefix, BytesRef output) { + assert prefix != null; + assert output != null; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + assert prefix.length > 0; + assert output.length > 0; + BytesRef result = new BytesRef(prefix.length + output.length); + System.arraycopy(prefix.bytes, prefix.offset, result.bytes, 0, prefix.length); + System.arraycopy(output.bytes, output.offset, result.bytes, prefix.length, output.length); + result.length = prefix.length + output.length; + return result; + } + } + + @Override + public void write(BytesRef prefix, DataOutput out) throws IOException { + assert prefix != null; + out.writeVInt(prefix.length); + out.writeBytes(prefix.bytes, prefix.offset, prefix.length); + } + + @Override + public BytesRef read(DataInput in) throws IOException { + final int len = in.readVInt(); + if (len == 0) { + return NO_OUTPUT; + } else { + final BytesRef output = new BytesRef(len); + in.readBytes(output.bytes, 0, len); + output.length = len; + return output; + } + } + + @Override + public void skipOutput(DataInput in) throws IOException { + final int len = in.readVInt(); + if (len != 0) { + in.skipBytes(len); + } + } + + @Override + public BytesRef getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(BytesRef output) { + return output.toString(); + } + + private static final long BASE_NUM_BYTES = RamUsageEstimator.shallowSizeOf(NO_OUTPUT); + + @Override + public long ramBytesUsed(BytesRef output) { + return BASE_NUM_BYTES + RamUsageEstimator.sizeOf(output.bytes); + } + + @Override + public String toString() { + return "ByteSequenceOutputs"; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java 
b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java new file mode 100644 index 0000000000000..955327af17ba0 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java @@ -0,0 +1,129 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** + * Enumerates all input (BytesRef) + output pairs in an FST. + * + * @lucene.experimental + */ +public final class BytesRefFSTEnum extends FSTEnum { + private final BytesRef current = new BytesRef(10); + private final InputOutput result = new InputOutput<>(); + private BytesRef target; + + /** Holds a single input (BytesRef) + output pair. */ + public static class InputOutput { + public BytesRef input; + public T output; + } + + /** + * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to + * the biggest term before target. + */ + public BytesRefFSTEnum(FST fst) { + super(fst); + result.input = current; + current.offset = 1; + } + + public InputOutput current() { + return result; + } + + public InputOutput next() throws IOException { + // System.out.println(" enum.next"); + doNext(); + return setResult(); + } + + /** Seeks to smallest term that's >= target. */ + public InputOutput seekCeil(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekCeil(); + return setResult(); + } + + /** Seeks to biggest term that's <= target. */ + public InputOutput seekFloor(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + super.doSeekFloor(); + return setResult(); + } + + /** + * Seeks to exactly this term, returning null if the term doesn't exist. This is faster than using + * {@link #seekFloor} or {@link #seekCeil} because it short-circuits as soon the match is not + * found. 
+ */ + public InputOutput seekExact(BytesRef target) throws IOException { + this.target = target; + targetLength = target.length; + if (doSeekExact()) { + assert upto == 1 + target.length; + return setResult(); + } else { + return null; + } + } + + @Override + protected int getTargetLabel() { + if (upto - 1 == target.length) { + return FST.END_LABEL; + } else { + return target.bytes[target.offset + upto - 1] & 0xFF; + } + } + + @Override + protected int getCurrentLabel() { + // current.offset fixed at 1 + return current.bytes[upto] & 0xFF; + } + + @Override + protected void setCurrentLabel(int label) { + current.bytes[upto] = (byte) label; + } + + @Override + protected void grow() { + current.bytes = ArrayUtil.grow(current.bytes, upto + 1); + } + + private InputOutput setResult() { + if (upto == 0) { + return null; + } else { + current.length = upto - 1; + result.output = output[upto]; + return result; + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesStore.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesStore.java new file mode 100644 index 0000000000000..b2aaa9894466e --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesStore.java @@ -0,0 +1,520 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +// TODO: merge with PagedBytes, except PagedBytes doesn't +// let you read while writing which FST needs + +class BytesStore extends DataOutput implements Accountable { + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(BytesStore.class) + RamUsageEstimator + .shallowSizeOfInstance(ArrayList.class); + + private final List blocks = new ArrayList<>(); + + private final int blockSize; + private final int blockBits; + private final int blockMask; + + private byte[] current; + private int nextWrite; + + BytesStore(int blockBits) { + this.blockBits = blockBits; + blockSize = 1 << blockBits; + blockMask = blockSize - 1; + nextWrite = blockSize; + } + + /** Pulls bytes from the provided IndexInput. 
*/ + BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException { + int blockSize = 2; + int blockBits = 1; + while (blockSize < numBytes && blockSize < maxBlockSize) { + blockSize *= 2; + blockBits++; + } + this.blockBits = blockBits; + this.blockSize = blockSize; + this.blockMask = blockSize - 1; + long left = numBytes; + while (left > 0) { + final int chunk = (int) Math.min(blockSize, left); + byte[] block = new byte[chunk]; + in.readBytes(block, 0, block.length); + blocks.add(block); + left -= chunk; + } + + // So .getPosition still works + nextWrite = blocks.get(blocks.size() - 1).length; + } + + /** Absolute write byte; you must ensure dest is < max position written so far. */ + public void writeByte(long dest, byte b) { + int blockIndex = (int) (dest >> blockBits); + byte[] block = blocks.get(blockIndex); + block[(int) (dest & blockMask)] = b; + } + + @Override + public void writeByte(byte b) { + if (nextWrite == blockSize) { + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + current[nextWrite++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int len) { + while (len > 0) { + int chunk = blockSize - nextWrite; + if (len <= chunk) { + assert b != null; + assert current != null; + System.arraycopy(b, offset, current, nextWrite, len); + nextWrite += len; + break; + } else { + if (chunk > 0) { + System.arraycopy(b, offset, current, nextWrite, chunk); + offset += chunk; + len -= chunk; + } + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + } + } + + int getBlockBits() { + return blockBits; + } + + /** + * Absolute writeBytes without changing the current position. Note: this cannot "grow" the bytes, + * so you must only call it on already written parts. + */ + void writeBytes(long dest, byte[] b, int offset, int len) { + // System.out.println(" BS.writeBytes dest=" + dest + " offset=" + offset + " len=" + len); + assert dest + len <= getPosition() : "dest=" + dest + " pos=" + getPosition() + " len=" + len; + + // Note: weird: must go "backwards" because copyBytes + // calls us with overlapping src/dest. If we + // go forwards then we overwrite bytes before we can + // copy them: + + /* + int blockIndex = dest >> blockBits; + int upto = dest & blockMask; + byte[] block = blocks.get(blockIndex); + while (len > 0) { + int chunk = blockSize - upto; + System.out.println(" cycle chunk=" + chunk + " len=" + len); + if (len <= chunk) { + System.arraycopy(b, offset, block, upto, len); + break; + } else { + System.arraycopy(b, offset, block, upto, chunk); + offset += chunk; + len -= chunk; + blockIndex++; + block = blocks.get(blockIndex); + upto = 0; + } + } + */ + + final long end = dest + len; + int blockIndex = (int) (end >> blockBits); + int downTo = (int) (end & blockMask); + if (downTo == 0) { + blockIndex--; + downTo = blockSize; + } + byte[] block = blocks.get(blockIndex); + + while (len > 0) { + // System.out.println(" cycle downTo=" + downTo + " len=" + len); + if (len <= downTo) { + // System.out.println(" final: offset=" + offset + " len=" + len + " dest=" + + // (downTo-len)); + System.arraycopy(b, offset, block, downTo - len, len); + break; + } else { + len -= downTo; + // System.out.println(" partial: offset=" + (offset + len) + " len=" + downTo + " + // dest=0"); + System.arraycopy(b, offset + len, block, 0, downTo); + blockIndex--; + block = blocks.get(blockIndex); + downTo = blockSize; + } + } + } + + /** + * Absolute copy bytes self to self, without changing the position. 
Note: this cannot "grow" the + * bytes, so must only call it on already written parts. + */ + public void copyBytes(long src, long dest, int len) { + // System.out.println("BS.copyBytes src=" + src + " dest=" + dest + " len=" + len); + assert src < dest; + + // Note: weird: must go "backwards" because copyBytes + // calls us with overlapping src/dest. If we + // go forwards then we overwrite bytes before we can + // copy them: + + /* + int blockIndex = src >> blockBits; + int upto = src & blockMask; + byte[] block = blocks.get(blockIndex); + while (len > 0) { + int chunk = blockSize - upto; + System.out.println(" cycle: chunk=" + chunk + " len=" + len); + if (len <= chunk) { + writeBytes(dest, block, upto, len); + break; + } else { + writeBytes(dest, block, upto, chunk); + blockIndex++; + block = blocks.get(blockIndex); + upto = 0; + len -= chunk; + dest += chunk; + } + } + */ + + long end = src + len; + + int blockIndex = (int) (end >> blockBits); + int downTo = (int) (end & blockMask); + if (downTo == 0) { + blockIndex--; + downTo = blockSize; + } + byte[] block = blocks.get(blockIndex); + + while (len > 0) { + // System.out.println(" cycle downTo=" + downTo); + if (len <= downTo) { + // System.out.println(" finish"); + writeBytes(dest, block, downTo - len, len); + break; + } else { + // System.out.println(" partial"); + len -= downTo; + writeBytes(dest + len, block, 0, downTo); + blockIndex--; + block = blocks.get(blockIndex); + downTo = blockSize; + } + } + } + + /** Copies bytes from this store to a target byte array. */ + public void copyBytes(long src, byte[] dest, int offset, int len) { + int blockIndex = (int) (src >> blockBits); + int upto = (int) (src & blockMask); + byte[] block = blocks.get(blockIndex); + while (len > 0) { + int chunk = blockSize - upto; + if (len <= chunk) { + System.arraycopy(block, upto, dest, offset, len); + break; + } else { + System.arraycopy(block, upto, dest, offset, chunk); + blockIndex++; + block = blocks.get(blockIndex); + upto = 0; + len -= chunk; + offset += chunk; + } + } + } + + /** Writes an int at the absolute position without changing the current pointer. */ + public void writeInt(long pos, int value) { + int blockIndex = (int) (pos >> blockBits); + int upto = (int) (pos & blockMask); + byte[] block = blocks.get(blockIndex); + int shift = 24; + for (int i = 0; i < 4; i++) { + block[upto++] = (byte) (value >> shift); + shift -= 8; + if (upto == blockSize) { + upto = 0; + blockIndex++; + block = blocks.get(blockIndex); + } + } + } + + /** Reverse from srcPos, inclusive, to destPos, inclusive. 
*/ + public void reverse(long srcPos, long destPos) { + assert srcPos < destPos; + assert destPos < getPosition(); + // System.out.println("reverse src=" + srcPos + " dest=" + destPos); + + int srcBlockIndex = (int) (srcPos >> blockBits); + int src = (int) (srcPos & blockMask); + byte[] srcBlock = blocks.get(srcBlockIndex); + + int destBlockIndex = (int) (destPos >> blockBits); + int dest = (int) (destPos & blockMask); + byte[] destBlock = blocks.get(destBlockIndex); + // System.out.println(" srcBlock=" + srcBlockIndex + " destBlock=" + destBlockIndex); + + int limit = (int) (destPos - srcPos + 1) / 2; + for (int i = 0; i < limit; i++) { + // System.out.println(" cycle src=" + src + " dest=" + dest); + byte b = srcBlock[src]; + srcBlock[src] = destBlock[dest]; + destBlock[dest] = b; + src++; + if (src == blockSize) { + srcBlockIndex++; + srcBlock = blocks.get(srcBlockIndex); + // System.out.println(" set destBlock=" + destBlock + " srcBlock=" + srcBlock); + src = 0; + } + + dest--; + if (dest == -1) { + destBlockIndex--; + destBlock = blocks.get(destBlockIndex); + // System.out.println(" set destBlock=" + destBlock + " srcBlock=" + srcBlock); + dest = blockSize - 1; + } + } + } + + public void skipBytes(int len) { + while (len > 0) { + int chunk = blockSize - nextWrite; + if (len <= chunk) { + nextWrite += len; + break; + } else { + len -= chunk; + current = new byte[blockSize]; + blocks.add(current); + nextWrite = 0; + } + } + } + + public long getPosition() { + return ((long) blocks.size() - 1) * blockSize + nextWrite; + } + + /** + * Pos must be less than the max position written so far! Ie, you cannot "grow" the file with + * this! + */ + public void truncate(long newLen) { + assert newLen <= getPosition(); + assert newLen >= 0; + int blockIndex = (int) (newLen >> blockBits); + nextWrite = (int) (newLen & blockMask); + if (nextWrite == 0) { + blockIndex--; + nextWrite = blockSize; + } + blocks.subList(blockIndex + 1, blocks.size()).clear(); + if (newLen == 0) { + current = null; + } else { + current = blocks.get(blockIndex); + } + assert newLen == getPosition(); + } + + public void finish() { + if (current != null) { + byte[] lastBuffer = new byte[nextWrite]; + System.arraycopy(current, 0, lastBuffer, 0, nextWrite); + blocks.set(blocks.size() - 1, lastBuffer); + current = null; + } + } + + /** Writes all of our bytes to the target {@link DataOutput}. 
*/ + public void writeTo(DataOutput out) throws IOException { + for (byte[] block : blocks) { + out.writeBytes(block, 0, block.length); + } + } + + public FST.BytesReader getForwardReader() { + if (blocks.size() == 1) { + return new ForwardBytesReader(blocks.get(0)); + } + return new FST.BytesReader() { + private byte[] current; + private int nextBuffer; + private int nextRead = blockSize; + + @Override + public byte readByte() { + if (nextRead == blockSize) { + current = blocks.get(nextBuffer++); + nextRead = 0; + } + return current[nextRead++]; + } + + @Override + public void skipBytes(long count) { + setPosition(getPosition() + count); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + while (len > 0) { + int chunkLeft = blockSize - nextRead; + if (len <= chunkLeft) { + System.arraycopy(current, nextRead, b, offset, len); + nextRead += len; + break; + } else { + if (chunkLeft > 0) { + System.arraycopy(current, nextRead, b, offset, chunkLeft); + offset += chunkLeft; + len -= chunkLeft; + } + current = blocks.get(nextBuffer++); + nextRead = 0; + } + } + } + + @Override + public long getPosition() { + return ((long) nextBuffer - 1) * blockSize + nextRead; + } + + @Override + public void setPosition(long pos) { + int bufferIndex = (int) (pos >> blockBits); + if (nextBuffer != bufferIndex + 1) { + nextBuffer = bufferIndex + 1; + current = blocks.get(bufferIndex); + } + nextRead = (int) (pos & blockMask); + assert getPosition() == pos; + } + + @Override + public boolean reversed() { + return false; + } + }; + } + + public FST.BytesReader getReverseReader() { + return getReverseReader(true); + } + + FST.BytesReader getReverseReader(boolean allowSingle) { + if (allowSingle && blocks.size() == 1) { + return new ReverseBytesReader(blocks.get(0)); + } + return new FST.BytesReader() { + private byte[] current = blocks.size() == 0 ? null : blocks.get(0); + private int nextBuffer = -1; + private int nextRead = 0; + + @Override + public byte readByte() { + if (nextRead == -1) { + current = blocks.get(nextBuffer--); + nextRead = blockSize - 1; + } + return current[nextRead--]; + } + + @Override + public void skipBytes(long count) { + setPosition(getPosition() - count); + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for (int i = 0; i < len; i++) { + b[offset + i] = readByte(); + } + } + + @Override + public long getPosition() { + return ((long) nextBuffer + 1) * blockSize + nextRead; + } + + @Override + public void setPosition(long pos) { + // NOTE: a little weird because if you + // setPosition(0), the next byte you read is + // bytes[0] ... but I would expect bytes[-1] (ie, + // EOF)...? 
+ int bufferIndex = (int) (pos >> blockBits); + if (nextBuffer != bufferIndex - 1) { + nextBuffer = bufferIndex - 1; + current = blocks.get(bufferIndex); + } + nextRead = (int) (pos & blockMask); + assert getPosition() == pos : "pos=" + pos + " getPos()=" + getPosition(); + } + + @Override + public boolean reversed() { + return true; + } + }; + } + + @Override + public long ramBytesUsed() { + long size = BASE_RAM_BYTES_USED; + for (byte[] block : blocks) { + size += RamUsageEstimator.sizeOf(block); + } + return size; + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(numBlocks=" + blocks.size() + ")"; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java new file mode 100644 index 0000000000000..9fb73edb5a118 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java @@ -0,0 +1,1569 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Constants; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +// TODO: break this into WritableFST and ReadOnlyFST.. then +// we can have subclasses of ReadOnlyFST to handle the +// different byte[] level encodings (packed or +// not)... and things like nodeCount, arcCount are read only + +// TODO: if FST is pure prefix trie we can do a more compact +// job, ie, once we are at a 'suffix only', just store the +// completion labels as a string not as a series of arcs. + +// NOTE: while the FST is able to represent a non-final +// dead-end state (NON_FINAL_END_NODE=0), the layers above +// (FSTEnum, Util) have problems with this!! + +/** + * Represents an finite state machine (FST), using a compact byte[] format. + * + *

The format is similar to what's used by Morfologik + * (https://github.com/morfologik/morfologik-stemming). + * + *
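As one such simple example (illustrative only, not part of the patch), the sketch below builds and queries a small FST mapping byte sequences to long outputs, written against the upstream org.apache.lucene.util.fst API roughly as it stood around Lucene 7/8 (Builder, PositiveIntOutputs, Util); the class and variable names are invented for the sketch, and the copy vendored by this patch is read-only, so it only needs the loading and traversal half of this API.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class FstUsageSketch {
    public static void main(String[] args) throws Exception {
        // Inputs must be added in sorted order; shared prefixes and suffixes are
        // what make the resulting byte[] representation compact.
        String[] keys = { "cat", "dog", "dogs" };
        long[] values = { 5, 7, 12 };

        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratch = new IntsRefBuilder();
        for (int i = 0; i < keys.length; i++) {
            builder.add(Util.toIntsRef(new BytesRef(keys[i]), scratch), values[i]);
        }
        FST<Long> fst = builder.finish();

        System.out.println(Util.get(fst, new BytesRef("dogs"))); // 12
        System.out.println(Util.get(fst, new BytesRef("cow")));  // null (not in the FST)
    }
}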

See the {@link org.apache.lucene.util.fst package documentation} for some simple examples. + * + * @lucene.experimental + */ +public final class FST implements Accountable { + + /** Specifies allowed range of each int input label for this FST. */ + public enum INPUT_TYPE { + BYTE1, + BYTE2, + BYTE4 + } + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FST.class); + + private static final int BIT_FINAL_ARC = 1 << 0; + static final int BIT_LAST_ARC = 1 << 1; + static final int BIT_TARGET_NEXT = 1 << 2; + + // TODO: we can free up a bit if we can nuke this: + private static final int BIT_STOP_NODE = 1 << 3; + + /** This flag is set if the arc has an output. */ + public static final int BIT_ARC_HAS_OUTPUT = 1 << 4; + + private static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + /** Value of the arc flags to declare a node with fixed length arcs designed for binary search. */ + // We use this as a marker because this one flag is illegal by itself. + public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT; + + /** + * Value of the arc flags to declare a node with fixed length arcs and bit table designed for + * direct addressing. + */ + static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; + + /** @see #shouldExpandNodeWithFixedLengthArcs */ + static final int FIXED_LENGTH_ARC_SHALLOW_DEPTH = 3; // 0 => only root node. + + /** @see #shouldExpandNodeWithFixedLengthArcs */ + static final int FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS = 5; + + /** @see #shouldExpandNodeWithFixedLengthArcs */ + static final int FIXED_LENGTH_ARC_DEEP_NUM_ARCS = 10; + + /** + * Maximum oversizing factor allowed for direct addressing compared to binary search when + * expansion credits allow the oversizing. This factor prevents expansions that are obviously too + * costly even if there are sufficient credits. + * + * @see #shouldExpandNodeWithDirectAddressing + */ + private static final float DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR = 1.66f; + + // Increment version to change it + private static final String FILE_FORMAT_NAME = "FST"; + private static final int VERSION_START = 0; + /** Changed numBytesPerArc for array'd case from byte to int. */ + private static final int VERSION_INT_NUM_BYTES_PER_ARC = 1; + + /** Write BYTE2 labels as 2-byte short, not vInt. */ + private static final int VERSION_SHORT_BYTE2_LABELS = 2; + + /** Added optional packed format. */ + private static final int VERSION_PACKED = 3; + + /** Changed from int to vInt for encoding arc targets. + * Also changed maxBytesPerArc from int to vInt in the array case. 
*/ + private static final int VERSION_VINT_TARGET = 4; + + /** Don't store arcWithOutputCount anymore */ + private static final int VERSION_NO_NODE_ARC_COUNTS = 5; + + private static final int VERSION_PACKED_REMOVED = 6; + + private static final int VERSION_LITTLE_ENDIAN = 8; + private static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + private static final long FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + private static final long NON_FINAL_END_NODE = 0; + + /** If arc has this label then that arc is final/accepted */ + public static final int END_LABEL = -1; + + final INPUT_TYPE inputType; + + // if non-null, this FST accepts the empty string and + // produces this output + T emptyOutput; + + /** + * A {@link BytesStore}, used during building, or during reading when the FST is very large (more + * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. + */ + final BytesStore bytes; + + private final FSTStore fstStore; + + private long startNode = -1; + + public final Outputs outputs; + + private final int version; + + /** Represents a single arc. */ + public static final class Arc { + + // *** Arc fields. + + private int label; + + private T output; + + private long target; + + private byte flags; + + private T nextFinalOutput; + + private long nextArc; + + private byte nodeFlags; + + // *** Fields for arcs belonging to a node with fixed length arcs. + // So only valid when bytesPerArc != 0. + // nodeFlags == ARCS_FOR_BINARY_SEARCH || nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. + + private int bytesPerArc; + + private long posArcsStart; + + private int arcIdx; + + private int numArcs; + + // *** Fields for a direct addressing node. nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. + + /** + * Start position in the {@link BytesReader} of the presence bits for a direct addressing + * node, aka the bit-table + */ + private long bitTableStart; + + /** First label of a direct addressing node. */ + private int firstLabel; + + /** + * Index of the current label of a direct addressing node. While {@link #arcIdx} is the current + * index in the label range, {@link #presenceIndex} is its corresponding index in the list of + * actually present labels. It is equal to the number of bits set before the bit at {@link + * #arcIdx} in the bit-table. This field is a cache to avoid to count bits set repeatedly when + * iterating the next arcs. + */ + private int presenceIndex; + + /** Returns this */ + public Arc copyFrom(Arc other) { + label = other.label(); + target = other.target(); + flags = other.flags(); + output = other.output(); + nextFinalOutput = other.nextFinalOutput(); + nextArc = other.nextArc(); + nodeFlags = other.nodeFlags(); + bytesPerArc = other.bytesPerArc(); + + // Fields for arcs belonging to a node with fixed length arcs. + // We could avoid copying them if bytesPerArc() == 0 (this was the case with previous code, + // and the current code + // still supports that), but it may actually help external uses of FST to have consistent arc + // state, and debugging + // is easier. 
+ posArcsStart = other.posArcsStart(); + arcIdx = other.arcIdx(); + numArcs = other.numArcs(); + bitTableStart = other.bitTableStart; + firstLabel = other.firstLabel(); + presenceIndex = other.presenceIndex; + + return this; + } + + boolean flag(int flag) { + return FST.flag(flags, flag); + } + + public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + public boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append(" target=").append(target()); + b.append(" label=0x").append(Integer.toHexString(label())); + if (flag(BIT_FINAL_ARC)) { + b.append(" final"); + } + if (flag(BIT_LAST_ARC)) { + b.append(" last"); + } + if (flag(BIT_TARGET_NEXT)) { + b.append(" targetNext"); + } + if (flag(BIT_STOP_NODE)) { + b.append(" stop"); + } + if (flag(BIT_ARC_HAS_OUTPUT)) { + b.append(" output=").append(output()); + } + if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + b.append(" nextFinalOutput=").append(nextFinalOutput()); + } + if (bytesPerArc() != 0) { + b.append(" arcArray(idx=") + .append(arcIdx()) + .append(" of ") + .append(numArcs()) + .append(")") + .append("(") + .append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs") + .append(")"); + } + return b.toString(); + } + + public int label() { + return label; + } + + public T output() { + return output; + } + + /** Ord/address to target node. */ + public long target() { + return target; + } + + public byte flags() { + return flags; + } + + public T nextFinalOutput() { + return nextFinalOutput; + } + + /** + * Address (into the byte[]) of the next arc - only for list of variable length arc. Or + * ord/address to the next node if label == {@link #END_LABEL}. + */ + long nextArc() { + return nextArc; + } + + /** Where we are in the array; only valid if bytesPerArc != 0. */ + public int arcIdx() { + return arcIdx; + } + + /** + * Node header flags. Only meaningful to check if the value is either {@link + * #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc + * == 0). + */ + public byte nodeFlags() { + return nodeFlags; + } + + /** Where the first arc in the array starts; only valid if bytesPerArc != 0 */ + public long posArcsStart() { + return posArcsStart; + } + + /** + * Non-zero if this arc is part of a node with fixed length arcs, which means all arcs for the + * node are encoded with a fixed number of bytes so that we binary search or direct address. We + * do when there are enough arcs leaving one node. It wastes some bytes but gives faster + * lookups. + */ + public int bytesPerArc() { + return bytesPerArc; + } + + /** + * How many arcs; only valid if bytesPerArc != 0 (fixed length arcs). For a node designed for + * binary search this is the array size. For a node designed for direct addressing, this is the + * label range. + */ + public int numArcs() { + return numArcs; + } + + /** + * First label of a direct addressing node. Only valid if nodeFlags == {@link + * #ARCS_FOR_DIRECT_ADDRESSING}. + */ + int firstLabel() { + return firstLabel; + } + + /** + * Helper methods to read the bit-table of a direct addressing node. Only valid for {@link Arc} + * with {@link Arc#nodeFlags()} == {@code ARCS_FOR_DIRECT_ADDRESSING}. + */ + static class BitTable { + + /** See {@link BitTableUtil#isBitSet(int, BytesReader)}. 
*/ + static boolean isBitSet(int bitIndex, Arc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.isBitSet(bitIndex, in); + } + + /** + * See {@link BitTableUtil#countBits(int, BytesReader)}. The count of bit set is the + * number of arcs of a direct addressing node. + */ + static int countBits(Arc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBits(getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#countBitsUpTo(int, BytesReader)}. */ + static int countBitsUpTo(int bitIndex, Arc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.countBitsUpTo(bitIndex, in); + } + + /** See {@link BitTableUtil#nextBitSet(int, int, BytesReader)}. */ + static int nextBitSet(int bitIndex, Arc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.nextBitSet(bitIndex, getNumPresenceBytes(arc.numArcs()), in); + } + + /** See {@link BitTableUtil#previousBitSet(int, BytesReader)}. */ + static int previousBitSet(int bitIndex, Arc arc, BytesReader in) throws IOException { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + in.setPosition(arc.bitTableStart); + return BitTableUtil.previousBitSet(bitIndex, in); + } + + /** Asserts the bit-table of the provided {@link Arc} is valid. */ + static boolean assertIsValid(Arc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + // First bit must be set. + assert isBitSet(0, arc, in); + // Last bit must be set. + assert isBitSet(arc.numArcs() - 1, arc, in); + // No bit set after the last arc. + assert nextBitSet(arc.numArcs() - 1, arc, in) == -1; + return true; + } + } + } + + private static boolean flag(int flags, int bit) { + return (flags & bit) != 0; + } + + // make a new empty FST, for building; Builder invokes this + FST(INPUT_TYPE inputType, Outputs outputs, int bytesPageBits) { + this.inputType = inputType; + this.outputs = outputs; + fstStore = null; + bytes = new BytesStore(bytesPageBits); + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + bytes.writeByte((byte) 0); + emptyOutput = null; + this.version = VERSION_CURRENT; + } + + private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28; + + /** Load a previously saved FST. */ + public FST(DataInput metaIn, DataInput in, Outputs outputs) throws IOException { + this(metaIn, in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS)); + } + + /** + * Load a previously saved FST; maxBlockBits allows you to control the size of the byte[] pages + * used to hold the FST bytes. 
+ */ + public FST(DataInput metaIn, DataInput in, Outputs outputs, FSTStore fstStore) throws IOException { + bytes = null; + this.fstStore = fstStore; + this.outputs = outputs; + + // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have + // back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it + this.version = CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT); + if (version < VERSION_PACKED_REMOVED) { + if (in.readByte() == 1) { + throw new CorruptIndexException("Cannot read packed FSTs anymore", in); + } + } + if (metaIn.readByte() == 1) { + // accepts empty string + // 1 KB blocks: + BytesStore emptyBytes = new BytesStore(10); + int numBytes = metaIn.readVInt(); + emptyBytes.copyBytes(metaIn, numBytes); + + // De-serialize empty-string output: + BytesReader reader = emptyBytes.getReverseReader(); + // NoOutputs uses 0 bytes when writing its output, + // so we have to check here else BytesStore gets + // angry: + if (numBytes > 0) { + reader.setPosition(numBytes - 1); + } + emptyOutput = outputs.readFinalOutput(reader); + } else { + emptyOutput = null; + } + final byte t = metaIn.readByte(); + switch (t) { + case 0: + inputType = INPUT_TYPE.BYTE1; + break; + case 1: + inputType = INPUT_TYPE.BYTE2; + break; + case 2: + inputType = INPUT_TYPE.BYTE4; + break; + default: + throw new CorruptIndexException("invalid input type " + t, in); + } + startNode = metaIn.readVLong(); + if (version < VERSION_NO_NODE_ARC_COUNTS) { + metaIn.readVLong(); + metaIn.readVLong(); + metaIn.readVLong(); + } + + long numBytes = metaIn.readVLong(); + this.fstStore.init(in, numBytes); + } + + @Override + public long ramBytesUsed() { + long size = BASE_RAM_BYTES_USED; + if (this.fstStore != null) { + size += this.fstStore.ramBytesUsed(); + } else { + size += bytes.ramBytesUsed(); + } + + return size; + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs; + } + + void finish(long newStartNode) throws IOException { + assert newStartNode <= bytes.getPosition(); + if (startNode != -1) { + throw new IllegalStateException("already finished"); + } + if (newStartNode == FINAL_END_NODE && emptyOutput != null) { + newStartNode = 0; + } + startNode = newStartNode; + bytes.finish(); + } + + public T getEmptyOutput() { + return emptyOutput; + } + + void setEmptyOutput(T v) { + if (emptyOutput != null) { + emptyOutput = outputs.merge(emptyOutput, v); + } else { + emptyOutput = v; + } + } + + public void save(DataOutput metaOut, DataOutput out) throws IOException { + if (startNode == -1) { + throw new IllegalStateException("call finish first"); + } + CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); + // TODO: really we should encode this as an arc, arriving + // to the root node, instead of special casing here: + if (emptyOutput != null) { + // Accepts empty string + metaOut.writeByte((byte) 1); + + // Serialize empty-string output: + ByteBuffersDataOutput ros = new ByteBuffersDataOutput(); + outputs.writeFinalOutput(emptyOutput, ros); + byte[] emptyOutputBytes = ros.toArrayCopy(); + int emptyLen = emptyOutputBytes.length; + + // reverse + final int stopAt = emptyLen / 2; + int upto = 0; + while (upto < stopAt) { + final byte b = emptyOutputBytes[upto]; + emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; + emptyOutputBytes[emptyLen - upto - 1] = b; + upto++; + } + metaOut.writeVInt(emptyLen); + metaOut.writeBytes(emptyOutputBytes, 0, 
emptyLen); + } else { + metaOut.writeByte((byte) 0); + } + final byte t; + if (inputType == FST.INPUT_TYPE.BYTE1) { + t = 0; + } else if (inputType == FST.INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + metaOut.writeByte(t); + metaOut.writeVLong(startNode); + if (bytes != null) { + long numBytes = bytes.getPosition(); + metaOut.writeVLong(numBytes); + bytes.writeTo(out); + } else { + assert fstStore != null; + fstStore.writeTo(out); + } + } + + /** Writes an automaton to a file. */ + public void save(final Path path) throws IOException { + try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { + DataOutput out = new OutputStreamDataOutput(os); + save(out, out); + } + } + + /** Reads an automaton from a file. */ + public static FST read(Path path, Outputs outputs) throws IOException { + try (InputStream is = Files.newInputStream(path)) { + DataInput in = new InputStreamDataInput(new BufferedInputStream(is)); + return new FST<>(in, in, outputs); + } + } + + private void writeLabel(DataOutput out, int v) throws IOException { + assert v >= 0 : "v=" + v; + if (inputType == FST.INPUT_TYPE.BYTE1) { + assert v <= 255 : "v=" + v; + out.writeByte((byte) v); + } else if (inputType == FST.INPUT_TYPE.BYTE2) { + assert v <= 65535 : "v=" + v; + out.writeShort((short) v); + } else { + out.writeVInt(v); + } + } + + /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */ + public int readLabel(DataInput in) throws IOException { + final int v; + if (inputType == INPUT_TYPE.BYTE1) { + // Unsigned byte: + v = in.readByte() & 0xFF; + } else if (inputType == INPUT_TYPE.BYTE2) { + // Unsigned short: + if (version < VERSION_LITTLE_ENDIAN) { + v = Short.reverseBytes(in.readShort()) & 0xFFFF; + } else { + v = in.readShort() & 0xFFFF; + } + } else { + v = in.readVInt(); + } + return v; + } + + /** returns true if the node at this address has any outgoing arcs */ + public static boolean targetHasArcs(Arc arc) { + return arc.target() > 0; + } + + // serializes new node by appending its bytes to the end + // of the current byte[] + long addNode(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn) throws IOException { + T NO_OUTPUT = outputs.getNoOutput(); + + // System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); + if (nodeIn.numArcs == 0) { + if (nodeIn.isFinal) { + return FINAL_END_NODE; + } else { + return NON_FINAL_END_NODE; + } + } + final long startAddress = fstCompiler.bytes.getPosition(); + // System.out.println(" startAddr=" + startAddress); + + final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn); + if (doFixedLengthArcs) { + // System.out.println(" fixed length arcs"); + if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) { + fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)]; + fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length]; + } + } + + fstCompiler.arcCount += nodeIn.numArcs; + + final int lastArc = nodeIn.numArcs - 1; + + long lastArcStart = fstCompiler.bytes.getPosition(); + int maxBytesPerArc = 0; + int maxBytesPerArcWithoutLabel = 0; + for (int arcIdx = 0; arcIdx < nodeIn.numArcs; arcIdx++) { + final FSTCompiler.Arc arc = nodeIn.arcs[arcIdx]; + final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target; + int flags = 0; + // System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + + // target.node); + + if (arcIdx == lastArc) { + flags += BIT_LAST_ARC; + } + + if 
(fstCompiler.lastFrozenNode == target.node && doFixedLengthArcs == false) { + // TODO: for better perf (but more RAM used) we + // could avoid this except when arc is "near" the + // last arc: + flags += BIT_TARGET_NEXT; + } + + if (arc.isFinal) { + flags += BIT_FINAL_ARC; + if (arc.nextFinalOutput != NO_OUTPUT) { + flags += BIT_ARC_HAS_FINAL_OUTPUT; + } + } else { + assert arc.nextFinalOutput == NO_OUTPUT; + } + + boolean targetHasArcs = target.node > 0; + + if (targetHasArcs == false) { + flags += BIT_STOP_NODE; + } + + if (arc.output != NO_OUTPUT) { + flags += BIT_ARC_HAS_OUTPUT; + } + + fstCompiler.bytes.writeByte((byte) flags); + long labelStart = fstCompiler.bytes.getPosition(); + writeLabel(fstCompiler.bytes, arc.label); + int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart); + + // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " + // target=" + target.node + " pos=" + bytes.getPosition() + " output=" + + // outputs.outputToString(arc.output)); + + if (arc.output != NO_OUTPUT) { + outputs.write(arc.output, fstCompiler.bytes); + // System.out.println(" write output"); + } + + if (arc.nextFinalOutput != NO_OUTPUT) { + // System.out.println(" write final output"); + outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes); + } + + if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { + assert target.node > 0; + // System.out.println(" write target"); + fstCompiler.bytes.writeVLong(target.node); + } + + // just write the arcs "like normal" on first pass, but record how many bytes each one took + // and max byte size: + if (doFixedLengthArcs) { + int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart); + fstCompiler.numBytesPerArc[arcIdx] = numArcBytes; + fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes; + lastArcStart = fstCompiler.bytes.getPosition(); + maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes); + maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes); + // System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes); + } + } + + // TODO: try to avoid wasteful cases: disable doFixedLengthArcs in that case + /* + * + * LUCENE-4682: what is a fair heuristic here? + * It could involve some of these: + * 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount? + * 2. how much binSearch saves over scan: nodeIn.numArcs + * 3. waste: numBytes vs numBytesExpanded + * + * the one below just looks at #3 + if (doFixedLengthArcs) { + // rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor???? 
+ int numBytes = lastArcStart - startAddress; + int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs; + if (numBytesExpanded > numBytes*1.25) { + doFixedLengthArcs = false; + } + } + */ + + if (doFixedLengthArcs) { + assert maxBytesPerArc > 0; + // 2nd pass just "expands" all arcs to take up a fixed byte size + + int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1; + assert labelRange > 0; + if (shouldExpandNodeWithDirectAddressing(fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { + writeNodeForDirectAddressing(fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange); + fstCompiler.directAddressingNodeCount++; + } else { + writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc); + fstCompiler.binarySearchNodeCount++; + } + } + + final long thisNodeAddress = fstCompiler.bytes.getPosition() - 1; + fstCompiler.bytes.reverse(startAddress, thisNodeAddress); + fstCompiler.nodeCount++; + return thisNodeAddress; + } + + /** + * Returns whether the given node should be expanded with fixed length arcs. Nodes will be + * expanded depending on their depth (distance from the root node) and their number of arcs. + * + *
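+ * (In practice this favors shallow nodes, which most lookups traverse, by expanding them at a
+ * smaller fan-out than deep nodes; see the FIXED_LENGTH_ARC_* thresholds used below.)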
<p>
Nodes with fixed length arcs use more space, because they encode all arcs with a fixed + * number of bytes, but they allow either binary search or direct addressing on the arcs (instead + * of linear scan) on lookup by arc label. + */ + private boolean shouldExpandNodeWithFixedLengthArcs(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode node) { + return fstCompiler.allowFixedLengthArcs + && ((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) + || node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS); + } + + /** + * Returns whether the given node should be expanded with direct addressing instead of binary + * search. + * + *
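+ * (Hypothetical example: a node with 20 arcs of at most 5 bytes each, one-byte labels and a label
+ * range of 64 costs 20 * 5 = 100 bytes for binary search, but only (64 + 7) / 8 = 8 presence bytes
+ * plus 1 first-label byte plus 20 * 4 = 80 arc bytes = 89 bytes with direct addressing, so direct
+ * addressing is chosen there.)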
<p>
Prefer direct addressing for performance if it does not oversize binary search byte size too + * much, so that the arcs can be directly addressed by label. + * + * @see FSTCompiler#getDirectAddressingMaxOversizingFactor() + */ + private boolean shouldExpandNodeWithDirectAddressing( + FSTCompiler fstCompiler, + FSTCompiler.UnCompiledNode nodeIn, + int numBytesPerArc, + int maxBytesPerArcWithoutLabel, + int labelRange + ) { + // Anticipate precisely the size of the encodings. + int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs; + int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + fstCompiler.numLabelBytesPerArc[0] + maxBytesPerArcWithoutLabel + * nodeIn.numArcs; + + // Determine the allowed oversize compared to binary search. + // This is defined by a parameter of FST Builder (default 1: no oversize). + int allowedOversize = (int) (sizeForBinarySearch * fstCompiler.getDirectAddressingMaxOversizingFactor()); + int expansionCost = sizeForDirectAddressing - allowedOversize; + + // Select direct addressing if either: + // - Direct addressing size is smaller than binary search. + // In this case, increment the credit by the reduced size (to use it later). + // - Direct addressing size is larger than binary search, but the positive credit allows the + // oversizing. + // In this case, decrement the credit by the oversize. + // In addition, do not try to oversize to a clearly too large node size + // (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter). + if (expansionCost <= 0 + || (fstCompiler.directAddressingExpansionCredit >= expansionCost + && sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) { + fstCompiler.directAddressingExpansionCredit -= expansionCost; + return true; + } + return false; + } + + private void writeNodeForBinarySearch( + FSTCompiler fstCompiler, + FSTCompiler.UnCompiledNode nodeIn, + long startAddress, + int maxBytesPerArc + ) { + // Build the header in a buffer. + // It is a false/special arc which is in fact a node header with node flags followed by node + // metadata. + fstCompiler.fixedLengthArcsBuffer.resetPosition() + .writeByte(ARCS_FOR_BINARY_SEARCH) + .writeVInt(nodeIn.numArcs) + .writeVInt(maxBytesPerArc); + int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition(); + + // Expand the arcs in place, backwards. + long srcPos = fstCompiler.bytes.getPosition(); + long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc; + assert destPos >= srcPos; + if (destPos > srcPos) { + fstCompiler.bytes.skipBytes((int) (destPos - srcPos)); + for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { + destPos -= maxBytesPerArc; + int arcLen = fstCompiler.numBytesPerArc[arcIdx]; + srcPos -= arcLen; + if (srcPos != destPos) { + assert destPos > srcPos + : "destPos=" + + destPos + + " srcPos=" + + srcPos + + " arcIdx=" + + arcIdx + + " maxBytesPerArc=" + + maxBytesPerArc + + " arcLen=" + + arcLen + + " nodeIn.numArcs=" + + nodeIn.numArcs; + fstCompiler.bytes.copyBytes(srcPos, destPos, arcLen); + } + } + } + + // Write the header. + fstCompiler.bytes.writeBytes(startAddress, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen); + } + + private void writeNodeForDirectAddressing( + FSTCompiler fstCompiler, + FSTCompiler.UnCompiledNode nodeIn, + long startAddress, + int maxBytesPerArcWithoutLabel, + int labelRange + ) { + // Expand the arcs backwards in a buffer because we remove the labels. + // So the obtained arcs might occupy less space. 
This is the reason why this + // whole method is more complex. + // Drop the label bytes since we can infer the label based on the arc index, + // the presence bits, and the first label. Keep the first label. + int headerMaxLen = 11; + int numPresenceBytes = getNumPresenceBytes(labelRange); + long srcPos = fstCompiler.bytes.getPosition(); + int totalArcBytes = fstCompiler.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; + int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes; + byte[] buffer = fstCompiler.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes(); + // Copy the arcs to the buffer, dropping all labels except first one. + for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { + bufferOffset -= maxBytesPerArcWithoutLabel; + int srcArcLen = fstCompiler.numBytesPerArc[arcIdx]; + srcPos -= srcArcLen; + int labelLen = fstCompiler.numLabelBytesPerArc[arcIdx]; + // Copy the flags. + fstCompiler.bytes.copyBytes(srcPos, buffer, bufferOffset, 1); + // Skip the label, copy the remaining. + int remainingArcLen = srcArcLen - 1 - labelLen; + if (remainingArcLen != 0) { + fstCompiler.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); + } + if (arcIdx == 0) { + // Copy the label of the first arc only. + bufferOffset -= labelLen; + fstCompiler.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); + } + } + assert bufferOffset == headerMaxLen + numPresenceBytes; + + // Build the header in the buffer. + // It is a false/special arc which is in fact a node header with node flags followed by node + // metadata. + fstCompiler.fixedLengthArcsBuffer.resetPosition() + .writeByte(ARCS_FOR_DIRECT_ADDRESSING) + .writeVInt(labelRange) // labelRange instead of numArcs. + .writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. + int headerLen = fstCompiler.fixedLengthArcsBuffer.getPosition(); + + // Prepare the builder byte store. Enlarge or truncate if needed. + long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; + long currentPosition = fstCompiler.bytes.getPosition(); + if (nodeEnd >= currentPosition) { + fstCompiler.bytes.skipBytes((int) (nodeEnd - currentPosition)); + } else { + fstCompiler.bytes.truncate(nodeEnd); + } + assert fstCompiler.bytes.getPosition() == nodeEnd; + + // Write the header. + long writeOffset = startAddress; + fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), 0, headerLen); + writeOffset += headerLen; + + // Write the presence bits + writePresenceBits(fstCompiler, nodeIn, writeOffset, numPresenceBytes); + writeOffset += numPresenceBytes; + + // Write the first label and the arcs. + fstCompiler.bytes.writeBytes(writeOffset, fstCompiler.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); + } + + private void writePresenceBits(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn, long dest, int numPresenceBytes) { + long bytePos = dest; + byte presenceBits = 1; // The first arc is always present. + int presenceIndex = 0; + int previousLabel = nodeIn.arcs[0].label; + for (int arcIdx = 1; arcIdx < nodeIn.numArcs; arcIdx++) { + int label = nodeIn.arcs[arcIdx].label; + assert label > previousLabel; + presenceIndex += label - previousLabel; + while (presenceIndex >= Byte.SIZE) { + fstCompiler.bytes.writeByte(bytePos++, presenceBits); + presenceBits = 0; + presenceIndex -= Byte.SIZE; + } + // Set the bit at presenceIndex to flag that the corresponding arc is present. 
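+ // (Hypothetical example: arcs with labels {10, 12, 15} span the label range [10..15], so bits
+ // 0, 2 and 5 get set and a single presence byte 0b00100101 is written; the absent labels
+ // 11, 13 and 14 stay 0.)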
+ presenceBits |= 1 << presenceIndex; + previousLabel = label; + } + assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; + assert presenceBits != 0; // The last byte is not 0. + assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. + fstCompiler.bytes.writeByte(bytePos++, presenceBits); + assert bytePos - dest == numPresenceBytes; + } + + /** + * Gets the number of bytes required to flag the presence of each arc in the given label range, + * one bit per arc. + */ + private static int getNumPresenceBytes(int labelRange) { + assert labelRange >= 0; + return (labelRange + 7) >> 3; + } + + /** + * Reads the presence bits of a direct-addressing node. Actually we don't read them here, we just + * keep the pointer to the bit-table start and we skip them. + */ + private void readPresenceBytes(Arc arc, BytesReader in) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + arc.bitTableStart = in.getPosition(); + in.skipBytes(getNumPresenceBytes(arc.numArcs())); + } + + /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */ + public Arc getFirstArc(Arc arc) { + T NO_OUTPUT = outputs.getNoOutput(); + + if (emptyOutput != null) { + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; + arc.nextFinalOutput = emptyOutput; + if (emptyOutput != NO_OUTPUT) { + arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); + } + } else { + arc.flags = BIT_LAST_ARC; + arc.nextFinalOutput = NO_OUTPUT; + } + arc.output = NO_OUTPUT; + + // If there are no nodes, ie, the FST only accepts the + // empty string, then startNode is 0 + arc.target = startNode; + return arc; + } + + /** + * Follows the follow arc and reads the last arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). + */ + Arc readLastTargetArc(Arc follow, Arc arc, BytesReader in) throws IOException { + // System.out.println("readLast"); + if (targetHasArcs(follow) == false) { + // System.out.println(" end node"); + assert follow.isFinal(); + arc.label = END_LABEL; + arc.target = FINAL_END_NODE; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_LAST_ARC; + arc.nodeFlags = arc.flags; + return arc; + } else { + in.setPosition(follow.target()); + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { + // Special arc which is actually a node header for fixed length arcs. + // Jump straight to end to find the last arc. 
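+ // For a binary-search node every arc occupies a fixed slot of bytesPerArc bytes starting at
+ // posArcsStart - arcIdx * bytesPerArc, so the last arc sits at posArcsStart - (numArcs - 1) * bytesPerArc;
+ // arcIdx is primed to numArcs - 2 below so that readNextRealArc advances onto it. For a direct
+ // addressing node the last present arc is located by counting the presence bits instead
+ // (readLastArcByDirectAddressing).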
+ arc.numArcs = in.readVInt(); + if (version >= VERSION_VINT_TARGET) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } + // System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + readLastArcByDirectAddressing(arc, in); + } else { + arc.arcIdx = arc.numArcs() - 2; + arc.posArcsStart = in.getPosition(); + readNextRealArc(arc, in); + } + } else { + arc.flags = flags; + // non-array: linear scan + arc.bytesPerArc = 0; + // System.out.println(" scan"); + while (arc.isLast() == false) { + // skip this arc: + readLabel(in); + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + if (arc.flag(BIT_STOP_NODE)) {} else if (arc.flag(BIT_TARGET_NEXT)) {} else { + readUnpackedNodeTarget(in); + } + arc.flags = in.readByte(); + } + // Undo the byte flags we read: + in.skipBytes(-1); + arc.nextArc = in.getPosition(); + readNextRealArc(arc, in); + } + assert arc.isLast(); + return arc; + } + } + + private long readUnpackedNodeTarget(BytesReader in) throws IOException { + if (version < VERSION_VINT_TARGET) { + return in.readInt(); + } else { + return in.readVLong(); + } + } + + /** + * Follow the follow arc and read the first arc of its target; this changes the + * provided arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument (arc). + */ + public Arc readFirstTargetArc(Arc follow, Arc arc, BytesReader in) throws IOException { + // int pos = address; + // System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + + // follow.isFinal()); + if (follow.isFinal()) { + // Insert "fake" final first arc: + arc.label = END_LABEL; + arc.output = follow.nextFinalOutput(); + arc.flags = BIT_FINAL_ARC; + if (follow.target() <= 0) { + arc.flags |= BIT_LAST_ARC; + } else { + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.target = FINAL_END_NODE; + arc.nodeFlags = arc.flags; + // System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + + // arc.isLast() + " output=" + outputs.outputToString(arc.output)); + return arc; + } else { + return readFirstRealTargetArc(follow.target(), arc, in); + } + } + + public Arc readFirstRealTargetArc(long nodeAddress, Arc arc, final BytesReader in) throws IOException { + in.setPosition(nodeAddress); + // System.out.println(" flags=" + arc.flags); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { + // System.out.println(" fixed length arc"); + // Special arc which is actually a node header for fixed length arcs. + arc.numArcs = in.readVInt(); + if (version >= VERSION_VINT_TARGET) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } + arc.arcIdx = -1; + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.presenceIndex = -1; + } + arc.posArcsStart = in.getPosition(); + // System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " + // arcsStart=" + pos); + } else { + arc.nextArc = nodeAddress; + arc.bytesPerArc = 0; + } + + return readNextRealArc(arc, in); + } + + /** + * Returns whether arc's target points to a node in expanded format (fixed length + * arcs). 
+ */ + boolean isExpandedTarget(Arc follow, BytesReader in) throws IOException { + if (targetHasArcs(follow) == false) { + return false; + } else { + in.setPosition(follow.target()); + byte flags = in.readByte(); + return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING; + } + } + + /** In-place read; returns the arc. */ + public Arc readNextArc(Arc arc, BytesReader in) throws IOException { + if (arc.label() == END_LABEL) { + // This was a fake inserted "final" arc + if (arc.nextArc() <= 0) { + throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); + } + return readFirstRealTargetArc(arc.nextArc(), arc, in); + } else { + return readNextRealArc(arc, in); + } + } + + /** Peeks at next arc's label; does not alter arc. Do not call this if arc.isLast()! */ + int readNextArcLabel(Arc arc, BytesReader in) throws IOException { + assert arc.isLast() == false; + + if (arc.label() == END_LABEL) { + // System.out.println(" nextArc fake " + arc.nextArc); + // Next arc is the first arc of a node. + // Position to read the first arc label. + + in.setPosition(arc.nextArc()); + byte flags = in.readByte(); + if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { + // System.out.println(" nextArc fixed length arc"); + // Special arc which is actually a node header for fixed length arcs. + int numArcs = in.readVInt(); + if (version >= VERSION_VINT_TARGET) { + in.readVInt(); // Skip bytesPerArc. + } else { + in.readInt(); // Skip bytesPerArc. + } + if (flags == ARCS_FOR_BINARY_SEARCH) { + in.readByte(); // Skip arc flags. + } else { + in.skipBytes(getNumPresenceBytes(numArcs)); + } + } + } else { + if (arc.bytesPerArc() != 0) { + // System.out.println(" nextArc real array"); + // Arcs have fixed length. + if (arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH) { + // Point to next arc, -1 to skip arc flags. + in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * arc.bytesPerArc() - 1); + } else { + assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; + // Direct addressing node. The label is not stored but rather inferred + // based on first label and arc index in the range. + assert Arc.BitTable.assertIsValid(arc, in); + assert Arc.BitTable.isBitSet(arc.arcIdx(), arc, in); + int nextIndex = Arc.BitTable.nextBitSet(arc.arcIdx(), arc, in); + assert nextIndex != -1; + return arc.firstLabel() + nextIndex; + } + } else { + // Arcs have variable length. + // System.out.println(" nextArc real list"); + // Position to next arc, -1 to skip flags. + in.setPosition(arc.nextArc() - 1); + } + } + return readLabel(in); + } + + public Arc readArcByIndex(Arc arc, final BytesReader in, int idx) throws IOException { + assert arc.bytesPerArc() > 0; + assert arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH; + assert idx >= 0 && idx < arc.numArcs(); + in.setPosition(arc.posArcsStart() - idx * arc.bytesPerArc()); + arc.arcIdx = idx; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range. + * + * @param rangeIndex The index of the arc in the label range. It must be present. The real arc + * offset is computed based on the presence bits of the direct addressing node. 
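+ * (Hypothetical example: if the first label is 'a' and the present labels are {'a', 'c', 'f'},
+ * the arc for 'f' has rangeIndex 5 but presence index 2, i.e. it is the third encoded arc.)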
+ */ + public Arc readArcByDirectAddressing(Arc arc, final BytesReader in, int rangeIndex) throws IOException { + assert Arc.BitTable.assertIsValid(arc, in); + assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); + assert Arc.BitTable.isBitSet(rangeIndex, arc, in); + int presenceIndex = Arc.BitTable.countBitsUpTo(rangeIndex, arc, in); + return readArcByDirectAddressing(arc, in, rangeIndex, presenceIndex); + } + + /** + * Reads a present direct addressing node arc, with the provided index in the label range and its + * corresponding presence index (which is the count of presence bits before it). + */ + private Arc readArcByDirectAddressing(Arc arc, final BytesReader in, int rangeIndex, int presenceIndex) throws IOException { + in.setPosition(arc.posArcsStart() - presenceIndex * arc.bytesPerArc()); + arc.arcIdx = rangeIndex; + arc.presenceIndex = presenceIndex; + arc.flags = in.readByte(); + return readArc(arc, in); + } + + /** + * Reads the last arc of a direct addressing node. This method is equivalent to call {@link + * #readArcByDirectAddressing(Arc, BytesReader, int)} with {@code rangeIndex} equal to {@code + * arc.numArcs() - 1}, but it is faster. + */ + public Arc readLastArcByDirectAddressing(Arc arc, final BytesReader in) throws IOException { + assert Arc.BitTable.assertIsValid(arc, in); + int presenceIndex = Arc.BitTable.countBits(arc, in) - 1; + return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex); + } + + /** Never returns null, but you should never call this if arc.isLast() is true. */ + public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { + + // TODO: can't assert this because we call from readFirstArc + // assert !flag(arc.flags, BIT_LAST_ARC); + + switch (arc.nodeFlags()) { + case ARCS_FOR_BINARY_SEARCH: + assert arc.bytesPerArc() > 0; + arc.arcIdx++; + assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.arcIdx() * arc.bytesPerArc()); + arc.flags = in.readByte(); + break; + + case ARCS_FOR_DIRECT_ADDRESSING: + assert Arc.BitTable.assertIsValid(arc, in); + assert arc.arcIdx() == -1 || Arc.BitTable.isBitSet(arc.arcIdx(), arc, in); + int nextIndex = Arc.BitTable.nextBitSet(arc.arcIdx(), arc, in); + return readArcByDirectAddressing(arc, in, nextIndex, arc.presenceIndex + 1); + + default: + // Variable length arcs - linear search. + assert arc.bytesPerArc() == 0; + in.setPosition(arc.nextArc()); + arc.flags = in.readByte(); + } + return readArc(arc, in); + } + + /** + * Reads an arc.
+ * Precondition: The arc flags byte has already been read and set; the given BytesReader is + * positioned just after the arc flags byte. + */ + private Arc readArc(Arc arc, BytesReader in) throws IOException { + if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) { + arc.label = arc.firstLabel() + arc.arcIdx(); + } else { + arc.label = readLabel(in); + } + + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + arc.output = outputs.read(in); + } else { + arc.output = outputs.getNoOutput(); + } + + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + arc.nextFinalOutput = outputs.readFinalOutput(in); + } else { + arc.nextFinalOutput = outputs.getNoOutput(); + } + + if (arc.flag(BIT_STOP_NODE)) { + if (arc.flag(BIT_FINAL_ARC)) { + arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } + arc.nextArc = in.getPosition(); // Only useful for list. + } else if (arc.flag(BIT_TARGET_NEXT)) { + arc.nextArc = in.getPosition(); // Only useful for list. + // TODO: would be nice to make this lazy -- maybe + // caller doesn't need the target and is scanning arcs... + if (arc.flag(BIT_LAST_ARC) == false) { + if (arc.bytesPerArc() == 0) { + // must scan + seekToNextNode(in); + } else { + int numArcs = arc.nodeFlags == ARCS_FOR_DIRECT_ADDRESSING ? Arc.BitTable.countBits(arc, in) : arc.numArcs(); + in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * numArcs); + } + } + arc.target = in.getPosition(); + } else { + arc.target = readUnpackedNodeTarget(in); + arc.nextArc = in.getPosition(); // Only useful for list. + } + return arc; + } + + static Arc readEndArc(Arc follow, Arc arc) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = FST.BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = FST.END_LABEL; + return arc; + } else { + return null; + } + } + + // TODO: could we somehow [partially] tableize arc lookups + // like automaton? + + /** + * Finds an arc leaving the incoming arc, replacing the arc in place. This returns null if the arc + * was not found, else the incoming arc. + */ + public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in) throws IOException { + + if (labelToMatch == END_LABEL) { + if (follow.isFinal()) { + if (follow.target() <= 0) { + arc.flags = BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target(); + } + arc.output = follow.nextFinalOutput(); + arc.label = END_LABEL; + arc.nodeFlags = arc.flags; + return arc; + } else { + return null; + } + } + + if (targetHasArcs(follow) == false) { + return null; + } + + in.setPosition(follow.target()); + + // System.out.println("fta label=" + (char) labelToMatch); + + byte flags = arc.nodeFlags = in.readByte(); + if (flags == ARCS_FOR_DIRECT_ADDRESSING) { + arc.numArcs = in.readVInt(); // This is in fact the label range. + if (version >= VERSION_VINT_TARGET) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } + readPresenceBytes(arc, in); + arc.firstLabel = readLabel(in); + arc.posArcsStart = in.getPosition(); + + int arcIndex = labelToMatch - arc.firstLabel(); + if (arcIndex < 0 || arcIndex >= arc.numArcs()) { + return null; // Before or after label range. + } else if (Arc.BitTable.isBitSet(arcIndex, arc, in) == false) { + return null; // Arc missing in the range. 
+ } + return readArcByDirectAddressing(arc, in, arcIndex); + } else if (flags == ARCS_FOR_BINARY_SEARCH) { + arc.numArcs = in.readVInt(); + if (version >= VERSION_VINT_TARGET) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } + arc.posArcsStart = in.getPosition(); + + // Array is sparse; do binary search: + int low = 0; + int high = arc.numArcs() - 1; + while (low <= high) { + // System.out.println(" cycle"); + int mid = (low + high) >>> 1; + // +1 to skip over flags + in.setPosition(arc.posArcsStart() - (arc.bytesPerArc() * mid + 1)); + int midLabel = readLabel(in); + final int cmp = midLabel - labelToMatch; + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + arc.arcIdx = mid - 1; + // System.out.println(" found!"); + return readNextRealArc(arc, in); + } + } + return null; + } + + // Linear scan + readFirstRealTargetArc(follow.target(), arc, in); + + while (true) { + // System.out.println(" non-bs cycle"); + // TODO: we should fix this code to not have to create + // object for the output of every arc we scan... only + // for the matching arc, if found + if (arc.label() == labelToMatch) { + // System.out.println(" found!"); + return arc; + } else if (arc.label() > labelToMatch) { + return null; + } else if (arc.isLast()) { + return null; + } else { + readNextRealArc(arc, in); + } + } + } + + private void seekToNextNode(BytesReader in) throws IOException { + + while (true) { + + final int flags = in.readByte(); + readLabel(in); + + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.skipOutput(in); + } + + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.skipFinalOutput(in); + } + + if (flag(flags, BIT_STOP_NODE) == false && flag(flags, BIT_TARGET_NEXT) == false) { + readUnpackedNodeTarget(in); + } + + if (flag(flags, BIT_LAST_ARC)) { + return; + } + } + } + + /** Returns a {@link BytesReader} for this FST, positioned at position 0. */ + public BytesReader getBytesReader() { + if (this.fstStore != null) { + return this.fstStore.getReverseBytesReader(); + } else { + return bytes.getReverseReader(); + } + } + + /** Reads bytes stored in an FST. */ + public abstract static class BytesReader extends DataInput { + /** Get current read position. */ + public abstract long getPosition(); + + /** Set current read position. */ + public abstract void setPosition(long pos); + + /** Returns true if this reader uses reversed bytes under-the-hood. */ + public abstract boolean reversed(); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java new file mode 100644 index 0000000000000..7ee6eaa5f7ba4 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java @@ -0,0 +1,804 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST.INPUT_TYPE; + +import java.io.IOException; + +// TODO: could we somehow stream an FST to disk while we +// build it? + +/** + * Builds a minimal FST (maps an IntsRef term to an arbitrary output) from pre-sorted terms with + * outputs. The FST becomes an FSA if you use NoOutputs. The FST is written on-the-fly into a + * compact serialized format byte array, which can be saved to / loaded from a Directory or used + * directly for traversal. The FST is always finite (no cycles). + * + *
<p>
NOTE: The algorithm is described at + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698 + * + *
<p>
The parameterized type T is the output type. See the subclasses of {@link Outputs}. + * + *
<p>
FSTs larger than 2.1GB are now possible (as of Lucene 4.2). FSTs containing more than 2.1B + * nodes are also now possible, however they cannot be packed. + * + * @lucene.experimental + */ +public class FSTCompiler { + + static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1f; + + private final NodeHash dedupHash; + final FST fst; + private final T NO_OUTPUT; + + // private static final boolean DEBUG = true; + + // simplistic pruning: we prune node (and all following + // nodes) if less than this number of terms go through it: + private final int minSuffixCount1; + + // better pruning: we prune node (and all following + // nodes) if the prior node has less than this number of + // terms go through it: + private final int minSuffixCount2; + + private final boolean doShareNonSingletonNodes; + private final int shareMaxTailLength; + + private final IntsRefBuilder lastInput = new IntsRefBuilder(); + + // NOTE: cutting this over to ArrayList instead loses ~6% + // in build performance on 9.8M Wikipedia terms; so we + // left this as an array: + // current "frontier" + private UnCompiledNode[] frontier; + + // Used for the BIT_TARGET_NEXT optimization (whereby + // instead of storing the address of the target node for + // a given arc, we mark a single bit noting that the next + // node in the byte[] is the target node): + long lastFrozenNode; + + // Reused temporarily while building the FST: + int[] numBytesPerArc = new int[4]; + int[] numLabelBytesPerArc = new int[numBytesPerArc.length]; + final FixedLengthArcsBuffer fixedLengthArcsBuffer = new FixedLengthArcsBuffer(); + + long arcCount; + long nodeCount; + long binarySearchNodeCount; + long directAddressingNodeCount; + + final boolean allowFixedLengthArcs; + final float directAddressingMaxOversizingFactor; + long directAddressingExpansionCredit; + + final BytesStore bytes; + + /** + * Instantiates an FST/FSA builder with default settings and pruning options turned off. For more + * tuning and tweaking, see {@link Builder}. + */ + public FSTCompiler(INPUT_TYPE inputType, Outputs outputs) { + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f); + } + + private FSTCompiler( + INPUT_TYPE inputType, + int minSuffixCount1, + int minSuffixCount2, + boolean doShareSuffix, + boolean doShareNonSingletonNodes, + int shareMaxTailLength, + Outputs outputs, + boolean allowFixedLengthArcs, + int bytesPageBits, + float directAddressingMaxOversizingFactor + ) { + this.minSuffixCount1 = minSuffixCount1; + this.minSuffixCount2 = minSuffixCount2; + this.doShareNonSingletonNodes = doShareNonSingletonNodes; + this.shareMaxTailLength = shareMaxTailLength; + this.allowFixedLengthArcs = allowFixedLengthArcs; + this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; + fst = new FST<>(inputType, outputs, bytesPageBits); + bytes = fst.bytes; + assert bytes != null; + if (doShareSuffix) { + dedupHash = new NodeHash<>(fst, bytes.getReverseReader(false)); + } else { + dedupHash = null; + } + NO_OUTPUT = outputs.getNoOutput(); + + @SuppressWarnings({ "rawtypes", "unchecked" }) + final UnCompiledNode[] f = (UnCompiledNode[]) new UnCompiledNode[10]; + frontier = f; + for (int idx = 0; idx < frontier.length; idx++) { + frontier[idx] = new UnCompiledNode<>(this, idx); + } + } + + /** + * Fluent-style constructor for FST {@link FSTCompiler}. + * + *
<p>
Creates an FST/FSA builder with all the possible tuning and construction tweaks. Read + * parameter documentation carefully. + */ + public static class Builder { + + private final INPUT_TYPE inputType; + private final Outputs outputs; + private int minSuffixCount1; + private int minSuffixCount2; + private boolean shouldShareSuffix = true; + private boolean shouldShareNonSingletonNodes = true; + private int shareMaxTailLength = Integer.MAX_VALUE; + private boolean allowFixedLengthArcs = true; + private int bytesPageBits = 15; + private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; + + /** + * @param inputType The input type (transition labels). Can be anything from {@link INPUT_TYPE} + * enumeration. Shorter types will consume less memory. Strings (character sequences) are + * represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints). + * @param outputs The output type for each input sequence. Applies only if building an FST. + */ + public Builder(INPUT_TYPE inputType, Outputs outputs) { + this.inputType = inputType; + this.outputs = outputs; + } + + /** + * If pruning the input graph during construction, this threshold is used for telling if a node + * is kept or pruned. If transition_count(node) >= minSuffixCount1, the node is kept. + * + *
<p>
Default = 0. + */ + public Builder minSuffixCount1(int minSuffixCount1) { + this.minSuffixCount1 = minSuffixCount1; + return this; + } + + /** + * Better pruning: we prune node (and all following nodes) if the prior node has less than this + * number of terms go through it. + * + *
<p>
Default = 0. + */ + public Builder minSuffixCount2(int minSuffixCount2) { + this.minSuffixCount2 = minSuffixCount2; + return this; + } + + /** + * If {@code true}, the shared suffixes will be compacted into unique paths. This requires an + * additional RAM-intensive hash map for lookups in memory. Setting this parameter to {@code + * false} creates a single suffix path for all input sequences. This will result in a larger + * FST, but requires substantially less memory and CPU during building. + * + *
<p>
Default = {@code true}. + */ + public Builder shouldShareSuffix(boolean shouldShareSuffix) { + this.shouldShareSuffix = shouldShareSuffix; + return this; + } + + /** + * Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully + * minimal, at cost of more CPU and more RAM during building. + * + *
<p>
Default = {@code true}. + */ + public Builder shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) { + this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes; + return this; + } + + /** + * Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST + * is fully minimal, at cost of more CPU and more RAM during building. + * + *
<p>
Default = {@link Integer#MAX_VALUE}. + */ + public Builder shareMaxTailLength(int shareMaxTailLength) { + this.shareMaxTailLength = shareMaxTailLength; + return this; + } + + /** + * Pass {@code false} to disable the fixed length arc optimization (binary search or direct + * addressing) while building the FST; this will make the resulting FST smaller but slower to + * traverse. + * + *
<p>
Default = {@code true}. + */ + public Builder allowFixedLengthArcs(boolean allowFixedLengthArcs) { + this.allowFixedLengthArcs = allowFixedLengthArcs; + return this; + } + + /** + * How many bits wide to make each byte[] block in the BytesStore; if you know the FST will be + * large then make this larger. For example 15 bits = 32768 byte pages. + * + *
<p>
Default = 15. + */ + public Builder bytesPageBits(int bytesPageBits) { + this.bytesPageBits = bytesPageBits; + return this; + } + + /** + * Overrides the default the maximum oversizing of fixed array allowed to enable direct + * addressing of arcs instead of binary search. + * + *
<p>
Setting this factor to a negative value (e.g. -1) effectively disables direct addressing, + * only binary search nodes will be created. + * + *
<p>
This factor does not determine whether to encode a node with a list of variable length + * arcs or with fixed length arcs. It only determines the effective encoding of a node that is + * already known to be encoded with fixed length arcs. + * + *
<p>
Default = 1. + */ + public Builder directAddressingMaxOversizingFactor(float factor) { + this.directAddressingMaxOversizingFactor = factor; + return this; + } + + /** Creates a new {@link FSTCompiler}. */ + public FSTCompiler build() { + FSTCompiler fstCompiler = new FSTCompiler<>( + inputType, + minSuffixCount1, + minSuffixCount2, + shouldShareSuffix, + shouldShareNonSingletonNodes, + shareMaxTailLength, + outputs, + allowFixedLengthArcs, + bytesPageBits, + directAddressingMaxOversizingFactor + ); + return fstCompiler; + } + } + + public float getDirectAddressingMaxOversizingFactor() { + return directAddressingMaxOversizingFactor; + } + + public long getTermCount() { + return frontier[0].inputCount; + } + + public long getNodeCount() { + // 1+ in order to count the -1 implicit final node + return 1 + nodeCount; + } + + public long getArcCount() { + return arcCount; + } + + public long getMappedStateCount() { + return dedupHash == null ? 0 : nodeCount; + } + + private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throws IOException { + final long node; + long bytesPosStart = bytes.getPosition(); + if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) { + if (nodeIn.numArcs == 0) { + node = fst.addNode(this, nodeIn); + lastFrozenNode = node; + } else { + node = dedupHash.add(this, nodeIn); + } + } else { + node = fst.addNode(this, nodeIn); + } + assert node != -2; + + long bytesPosEnd = bytes.getPosition(); + if (bytesPosEnd != bytesPosStart) { + // The FST added a new node: + assert bytesPosEnd > bytesPosStart; + lastFrozenNode = node; + } + + nodeIn.clear(); + + final CompiledNode fn = new CompiledNode(); + fn.node = node; + return fn; + } + + private void freezeTail(int prefixLenPlus1) throws IOException { + // System.out.println(" compileTail " + prefixLenPlus1); + final int downTo = Math.max(1, prefixLenPlus1); + for (int idx = lastInput.length(); idx >= downTo; idx--) { + + boolean doPrune = false; + boolean doCompile = false; + + final UnCompiledNode node = frontier[idx]; + final UnCompiledNode parent = frontier[idx - 1]; + + if (node.inputCount < minSuffixCount1) { + doPrune = true; + doCompile = true; + } else if (idx > prefixLenPlus1) { + // prune if parent's inputCount is less than suffixMinCount2 + if (parent.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1)) { + // my parent, about to be compiled, doesn't make the cut, so + // I'm definitely pruned + + // if minSuffixCount2 is 1, we keep only up + // until the 'distinguished edge', ie we keep only the + // 'divergent' part of the FST. if my parent, about to be + // compiled, has inputCount 1 then we are already past the + // distinguished edge. NOTE: this only works if + // the FST outputs are not "compressible" (simple + // ords ARE compressible). 
+ doPrune = true; + } else { + // my parent, about to be compiled, does make the cut, so + // I'm definitely not pruned + doPrune = false; + } + doCompile = true; + } else { + // if pruning is disabled (count is 0) we can always + // compile current node + doCompile = minSuffixCount2 == 0; + } + + // System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + // + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + // + doPrune); + + if (node.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1)) { + // drop all arcs + for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) { + @SuppressWarnings({ "rawtypes", "unchecked" }) + final UnCompiledNode target = (UnCompiledNode) node.arcs[arcIdx].target; + target.clear(); + } + node.numArcs = 0; + } + + if (doPrune) { + // this node doesn't make it -- deref it + node.clear(); + parent.deleteLast(lastInput.intAt(idx - 1), node); + } else { + + if (minSuffixCount2 != 0) { + compileAllTargets(node, lastInput.length() - idx); + } + final T nextFinalOutput = node.output; + + // We "fake" the node as being final if it has no + // outgoing arcs; in theory we could leave it + // as non-final (the FST can represent this), but + // FSTEnum, Util, etc., have trouble w/ non-final + // dead-end states: + final boolean isFinal = node.isFinal || node.numArcs == 0; + + if (doCompile) { + // this node makes it and we now compile it. first, + // compile any targets that were previously + // undecided: + parent.replaceLast(lastInput.intAt(idx - 1), compileNode(node, 1 + lastInput.length() - idx), nextFinalOutput, isFinal); + } else { + // replaceLast just to install + // nextFinalOutput/isFinal onto the arc + parent.replaceLast(lastInput.intAt(idx - 1), node, nextFinalOutput, isFinal); + // this node will stay in play for now, since we are + // undecided on whether to prune it. later, it + // will be either compiled or pruned, so we must + // allocate a new node: + frontier[idx] = new UnCompiledNode<>(this, idx); + } + } + } + } + + // for debugging + /* + private String toString(BytesRef b) { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + return b.toString(); + } + } + */ + + /** + * Add the next input/output pair. The provided input must be sorted after the previous one + * according to {@link IntsRef#compareTo}. It's also OK to add the same input twice in a row with + * different outputs, as long as {@link Outputs} implements the {@link Outputs#merge} method. Note + * that input is fully consumed after this method is returned (so caller is free to reuse), but + * output is not. So if your outputs are changeable (eg {@link ByteSequenceOutputs}) + * then you cannot reuse across calls. + */ + public void add(IntsRef input, T output) throws IOException { + /* + if (DEBUG) { + BytesRef b = new BytesRef(input.length); + for(int x=0;x n = (UnCompiledNode) arc.target; + if (n.numArcs == 0) { + // System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label); + arc.isFinal = n.isFinal = true; + } + arc.target = compileNode(n, tailLength - 1); + } + } + } + + /** Expert: holds a pending (seen but not yet serialized) arc. 
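+ * Unlike {@link FST.Arc}, its {@code target} still points at a builder {@link Node}; a byte
+ * address only exists once the target has been frozen via {@code compileNode}/{@code FST#addNode}.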
*/ + static class Arc { + int label; // really an "unsigned" byte + Node target; + boolean isFinal; + T output; + T nextFinalOutput; + } + + // NOTE: not many instances of Node or CompiledNode are in + // memory while the FST is being built; it's only the + // current "frontier": + + interface Node { + boolean isCompiled(); + } + + public long fstRamBytesUsed() { + return fst.ramBytesUsed(); + } + + static final class CompiledNode implements Node { + long node; + + @Override + public boolean isCompiled() { + return true; + } + } + + /** Expert: holds a pending (seen but not yet serialized) Node. */ + static final class UnCompiledNode implements Node { + final FSTCompiler owner; + int numArcs; + Arc[] arcs; + // TODO: instead of recording isFinal/output on the + // node, maybe we should use -1 arc to mean "end" (like + // we do when reading the FST). Would simplify much + // code here... + T output; + boolean isFinal; + long inputCount; + + /** This node's depth, starting from the automaton root. */ + final int depth; + + /** + * @param depth The node's depth starting from the automaton root. Needed for LUCENE-2934 (node + * expansion based on conditions other than the fanout size). + */ + @SuppressWarnings({ "rawtypes", "unchecked" }) + UnCompiledNode(FSTCompiler owner, int depth) { + this.owner = owner; + arcs = (Arc[]) new Arc[1]; + arcs[0] = new Arc<>(); + output = owner.NO_OUTPUT; + this.depth = depth; + } + + @Override + public boolean isCompiled() { + return false; + } + + void clear() { + numArcs = 0; + isFinal = false; + output = owner.NO_OUTPUT; + inputCount = 0; + + // We don't clear the depth here because it never changes + // for nodes on the frontier (even when reused). + } + + T getLastOutput(int labelToMatch) { + assert numArcs > 0; + assert arcs[numArcs - 1].label == labelToMatch; + return arcs[numArcs - 1].output; + } + + void addArc(int label, Node target) { + assert label >= 0; + assert numArcs == 0 || label > arcs[numArcs - 1].label + : "arc[numArcs-1].label=" + arcs[numArcs - 1].label + " new label=" + label + " numArcs=" + numArcs; + if (numArcs == arcs.length) { + final Arc[] newArcs = ArrayUtil.grow(arcs); + for (int arcIdx = numArcs; arcIdx < newArcs.length; arcIdx++) { + newArcs[arcIdx] = new Arc<>(); + } + arcs = newArcs; + } + final Arc arc = arcs[numArcs++]; + arc.label = label; + arc.target = target; + arc.output = arc.nextFinalOutput = owner.NO_OUTPUT; + arc.isFinal = false; + } + + void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { + assert numArcs > 0; + final Arc arc = arcs[numArcs - 1]; + assert arc.label == labelToMatch : "arc.label=" + arc.label + " vs " + labelToMatch; + arc.target = target; + // assert target.node != -2; + arc.nextFinalOutput = nextFinalOutput; + arc.isFinal = isFinal; + } + + void deleteLast(int label, Node target) { + assert numArcs > 0; + assert label == arcs[numArcs - 1].label; + assert target == arcs[numArcs - 1].target; + numArcs--; + } + + void setLastOutput(int labelToMatch, T newOutput) { + assert owner.validOutput(newOutput); + assert numArcs > 0; + final Arc arc = arcs[numArcs - 1]; + assert arc.label == labelToMatch; + arc.output = newOutput; + } + + // pushes an output prefix forward onto all arcs + void prependOutput(T outputPrefix) { + assert owner.validOutput(outputPrefix); + + for (int arcIdx = 0; arcIdx < numArcs; arcIdx++) { + arcs[arcIdx].output = owner.fst.outputs.add(outputPrefix, arcs[arcIdx].output); + assert owner.validOutput(arcs[arcIdx].output); + } + + if (isFinal) { + 
output = owner.fst.outputs.add(outputPrefix, output); + assert owner.validOutput(output); + } + } + } + + /** + * Reusable buffer for building nodes with fixed length arcs (binary search or direct addressing). + */ + static class FixedLengthArcsBuffer { + + // Initial capacity is the max length required for the header of a node with fixed length arcs: + // header(byte) + numArcs(vint) + numBytes(vint) + private byte[] bytes = new byte[11]; + private final ByteArrayDataOutput bado = new ByteArrayDataOutput(bytes); + + /** Ensures the capacity of the internal byte array. Enlarges it if needed. */ + FixedLengthArcsBuffer ensureCapacity(int capacity) { + if (bytes.length < capacity) { + bytes = new byte[ArrayUtil.oversize(capacity, Byte.BYTES)]; + bado.reset(bytes); + } + return this; + } + + FixedLengthArcsBuffer resetPosition() { + bado.reset(bytes); + return this; + } + + FixedLengthArcsBuffer writeByte(byte b) { + bado.writeByte(b); + return this; + } + + FixedLengthArcsBuffer writeVInt(int i) { + try { + bado.writeVInt(i); + } catch (IOException e) { // Never thrown. + throw new RuntimeException(e); + } + return this; + } + + int getPosition() { + return bado.getPosition(); + } + + /** Gets the internal byte array. */ + byte[] getBytes() { + return bytes; + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java new file mode 100644 index 0000000000000..789c216df6f95 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java @@ -0,0 +1,660 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST.Arc.BitTable; + +import java.io.IOException; + +/** + * Can next() and advance() through the terms in an FST + * + * @lucene.experimental + */ +abstract class FSTEnum { + protected final FST fst; + + @SuppressWarnings({ "rawtypes", "unchecked" }) + protected FST.Arc[] arcs = new FST.Arc[10]; + // outputs are cumulative + @SuppressWarnings({ "rawtypes", "unchecked" }) + protected T[] output = (T[]) new Object[10]; + + protected final T NO_OUTPUT; + protected final FST.BytesReader fstReader; + + protected int upto; + int targetLength; + + /** + * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to + * the biggest term before target. 
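+ * The enum keeps one {@link FST.Arc} per input position in {@code arcs[]}, and {@code output[]}
+ * holds the cumulative output up to that position.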
+ */ + FSTEnum(FST fst) { + this.fst = fst; + fstReader = fst.getBytesReader(); + NO_OUTPUT = fst.outputs.getNoOutput(); + fst.getFirstArc(getArc(0)); + output[0] = NO_OUTPUT; + } + + protected abstract int getTargetLabel(); + + protected abstract int getCurrentLabel(); + + protected abstract void setCurrentLabel(int label); + + protected abstract void grow(); + + /** Rewinds enum state to match the shared prefix between current term and target term */ + private void rewindPrefix() throws IOException { + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + return; + } + // System.out.println(" rewind upto=" + upto + " vs targetLength=" + targetLength); + + final int currentLimit = upto; + upto = 1; + while (upto < currentLimit && upto <= targetLength + 1) { + final int cmp = getCurrentLabel() - getTargetLabel(); + if (cmp < 0) { + // seek forward + // System.out.println(" seek fwd"); + break; + } else if (cmp > 0) { + // seek backwards -- reset this arc to the first arc + final FST.Arc arc = getArc(upto); + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + // System.out.println(" seek first arc"); + break; + } + upto++; + } + // System.out.println(" fall through upto=" + upto); + } + + protected void doNext() throws IOException { + // System.out.println("FE: next upto=" + upto); + if (upto == 0) { + // System.out.println(" init"); + upto = 1; + fst.readFirstTargetArc(getArc(0), getArc(1), fstReader); + } else { + // pop + // System.out.println(" check pop curArc target=" + arcs[upto].target + " label=" + + // arcs[upto].label + " isLast?=" + arcs[upto].isLast()); + while (arcs[upto].isLast()) { + upto--; + if (upto == 0) { + // System.out.println(" eof"); + return; + } + } + fst.readNextArc(arcs[upto], fstReader); + } + + pushFirst(); + } + + // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + + /** Seeks to smallest term that's >= target. */ + protected void doSeekCeil() throws IOException { + + // System.out.println(" advance len=" + target.length + " curlen=" + current.length); + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // System.out.println("FE.seekCeil upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + // System.out.println(" after rewind upto=" + upto); + + FST.Arc arc = getArc(upto); + // System.out.println(" init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + int targetLabel = getTargetLabel(); + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") vs targetLabel=" + targetLabel); + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH; + arc = doSeekCeilArrayPacked(arc, targetLabel, in); + } + } else { + arc = doSeekCeilList(arc, targetLabel); + } + } + } + + private FST.Arc doSeekCeilArrayDirectAddressing(final FST.Arc arc, final int targetLabel, final FST.BytesReader in) + throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex >= arc.numArcs()) { + // Target is beyond the last arc, out of label range. + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final FST.Arc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (prevArc.isLast() == false) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + if (targetIndex < 0) { + targetIndex = -1; + } else if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Not found, return the next arc (ceil). + int ceilIndex = BitTable.nextBitSet(targetIndex, arc, in); + assert ceilIndex != -1; + fst.readArcByDirectAddressing(arc, in, ceilIndex); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private FST.Arc doSeekCeilArrayPacked(final FST.Arc arc, final int targetLabel, final FST.BytesReader in) throws IOException { + // The array is packed -- use binary search to find the target. 
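A minimal standalone sketch of the two fixed-length-arc lookups used here (the direct-addressing branch above and the binary-search branch that follows); plain arrays and a java.util.BitSet stand in for the encoded arcs, so names and layout are illustrative only, not the codec's byte format:

import java.util.Arrays;
import java.util.BitSet;

final class ArcLookupSketch {
    /** Binary-search node: labels stored sorted; returns the smallest label >= target, or -1. */
    static int ceilPacked(int[] sortedLabels, int targetLabel) {
        int idx = Arrays.binarySearch(sortedLabels, targetLabel);
        if (idx >= 0) {
            return sortedLabels[idx];              // exact hit
        }
        int insertion = -1 - idx;                  // first label greater than target
        return insertion == sortedLabels.length ? -1 : sortedLabels[insertion];
    }

    /** Direct-addressing node: slot = label - firstLabel; a presence bit marks real arcs. */
    static int ceilDirectAddressing(int firstLabel, int numSlots, BitSet presence, int targetLabel) {
        int targetIndex = targetLabel - firstLabel;
        if (targetIndex >= numSlots) {
            return -1;                             // past the last arc: dead end
        }
        int ceilIndex = presence.nextSetBit(Math.max(targetIndex, 0));
        return ceilIndex == -1 ? -1 : firstLabel + ceilIndex;
    }
}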
+ int idx = Util.binarySearch(fst, arc, targetLabel); + if (idx >= 0) { + // Match + fst.readArcByIndex(arc, in, idx); + assert arc.arcIdx() == idx; + assert arc.label() == targetLabel : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx; + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + // Dead end + fst.readArcByIndex(arc, in, idx - 1); + assert arc.isLast(); + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final FST.Arc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (prevArc.isLast() == false) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // Ceiling - arc with least higher label + fst.readArcByIndex(arc, in, idx); + assert arc.label() > targetLabel; + pushFirst(); + return null; + } + } + + private FST.Arc doSeekCeilList(final FST.Arc arc, final int targetLabel) throws IOException { + // Arcs are not array'd -- must do linear scan: + if (arc.label() == targetLabel) { + // recurse + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (arc.label() > targetLabel) { + pushFirst(); + return null; + } else if (arc.isLast()) { + // Dead end (target is after the last arc); + // rollback to last fork then push + upto--; + while (true) { + if (upto == 0) { + return null; + } + final FST.Arc prevArc = getArc(upto); + // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + " + // isLast?=" + prevArc.isLast()); + if (prevArc.isLast() == false) { + fst.readNextArc(prevArc, fstReader); + pushFirst(); + return null; + } + upto--; + } + } else { + // keep scanning + // System.out.println(" next scan"); + fst.readNextArc(arc, fstReader); + } + return arc; + } + + // Todo: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND / + // SEEK_END)? saves the eq check above? + /** Seeks to largest term that's <= target. */ + void doSeekFloor() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? 
ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + // System.out.println("FE: seek floor upto=" + upto); + + // Save CPU by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + // System.out.println("FE: after rewind upto=" + upto); + + FST.Arc arc = getArc(upto); + + // System.out.println("FE: init targetLabel=" + targetLabel); + + // Now scan forward, matching the new suffix of the target + while (arc != null) { + // System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) + // arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + + // arc.bytesPerArc); + int targetLabel = getTargetLabel(); + + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + // Arcs are in an array + final FST.BytesReader in = fst.getBytesReader(); + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH; + arc = doSeekFloorArrayPacked(arc, targetLabel, in); + } + } else { + arc = doSeekFloorList(arc, targetLabel); + } + } + } + + private FST.Arc doSeekFloorArrayDirectAddressing(FST.Arc arc, int targetLabel, FST.BytesReader in) throws IOException { + // The array is addressed directly by label, with presence bits to compute the actual arc + // offset. + + int targetIndex = targetLabel - arc.firstLabel(); + if (targetIndex < 0) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else if (targetIndex >= arc.numArcs()) { + // After last arc. + fst.readLastArcByDirectAddressing(arc, in); + assert arc.label() < targetLabel; + assert arc.isLast(); + pushLast(); + return null; + } else { + // Within label range. + if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == targetLabel; + // found -- copy pasta from below + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } + // Scan backwards to find a floor arc. + int floorIndex = BitTable.previousBitSet(targetIndex, arc, in); + assert floorIndex != -1; + fst.readArcByDirectAddressing(arc, in, floorIndex); + assert arc.label() < targetLabel; + assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; + pushLast(); + return null; + } + } + + /** + * Backtracks until it finds a node which first arc is before our target label.` Then on the node, + * finds the arc just before the targetLabel. + * + * @return null to continue the seek floor recursion loop. + */ + private FST.Arc backtrackToFloorArc(FST.Arc arc, int targetLabel, final FST.BytesReader in) throws IOException { + while (true) { + // First, walk backwards until we find a node which first arc is before our target label. + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + if (arc.label() < targetLabel) { + // Then on this node, find the arc just before the targetLabel. 
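For intuition only, seekCeil/seekFloor/seekExact over the FST follow the same contract as NavigableSet's ceiling/floor/contains; a tiny analogy on plain strings, with no FST classes involved:

import java.util.TreeSet;

final class SeekSemanticsSketch {
    public static void main(String[] args) {
        TreeSet<String> terms = new TreeSet<>(java.util.List.of("bar", "baz", "foo"));
        System.out.println(terms.ceiling("bb"));   // "foo"  -> smallest term >= target (doSeekCeil)
        System.out.println(terms.floor("bb"));     // "baz"  -> largest term <= target (doSeekFloor)
        System.out.println(terms.contains("baz")); // true   -> exact match (doSeekExact)
    }
}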
+ if (arc.isLast() == false) { + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + findNextFloorArcBinarySearch(arc, targetLabel, in); + } else { + assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING; + findNextFloorArcDirectAddressing(arc, targetLabel, in); + } + } else { + while (arc.isLast() == false && fst.readNextArcLabel(arc, in) < targetLabel) { + fst.readNextArc(arc, fstReader); + } + } + } + assert arc.label() < targetLabel; + assert arc.isLast() || fst.readNextArcLabel(arc, in) >= targetLabel; + pushLast(); + return null; + } + upto--; + if (upto == 0) { + return null; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } + + /** + * Finds and reads an arc on the current node which label is strictly less than the given label. + * Skips the first arc, finds next floor arc; or none if the floor arc is the first arc itself (in + * this case it has already been read). + * + *

Precondition: the given arc is the first arc of the node. + */ + private void findNextFloorArcDirectAddressing(FST.Arc arc, int targetLabel, final FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING; + assert arc.label() != FST.END_LABEL; + assert arc.label() == arc.firstLabel(); + if (arc.numArcs() > 1) { + int targetIndex = targetLabel - arc.firstLabel(); + assert targetIndex >= 0; + if (targetIndex >= arc.numArcs()) { + // Beyond last arc. Take last arc. + fst.readLastArcByDirectAddressing(arc, in); + } else { + // Take the preceding arc, even if the target is present. + int floorIndex = BitTable.previousBitSet(targetIndex, arc, in); + if (floorIndex > 0) { + fst.readArcByDirectAddressing(arc, in, floorIndex); + } + } + } + } + + /** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. */ + private void findNextFloorArcBinarySearch(FST.Arc arc, int targetLabel, FST.BytesReader in) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH; + assert arc.label() != FST.END_LABEL; + assert arc.arcIdx() == 0; + if (arc.numArcs() > 1) { + int idx = Util.binarySearch(fst, arc, targetLabel); + assert idx != -1; + if (idx > 1) { + fst.readArcByIndex(arc, in, idx - 1); + } else if (idx < -2) { + fst.readArcByIndex(arc, in, -2 - idx); + } + } + } + + private FST.Arc doSeekFloorArrayPacked(FST.Arc arc, int targetLabel, final FST.BytesReader in) throws IOException { + // Arcs are fixed array -- use binary search to find the target. + int idx = Util.binarySearch(fst, arc, targetLabel); + + if (idx >= 0) { + // Match -- recurse + // System.out.println(" match! arcIdx=" + idx); + fst.readArcByIndex(arc, in, idx); + assert arc.arcIdx() == idx; + assert arc.label() == targetLabel : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel + " mid=" + idx; + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (idx == -1) { + // Before first arc. + return backtrackToFloorArc(arc, targetLabel, in); + } else { + // There is a floor arc; idx will be (-1 - (floor + 1)). + fst.readArcByIndex(arc, in, -2 - idx); + assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel; + assert arc.label() < targetLabel : "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel; + pushLast(); + return null; + } + } + + private FST.Arc doSeekFloorList(FST.Arc arc, int targetLabel) throws IOException { + if (arc.label() == targetLabel) { + // Match -- recurse + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (targetLabel == FST.END_LABEL) { + return null; + } + setCurrentLabel(arc.label()); + incr(); + return fst.readFirstTargetArc(arc, getArc(upto), fstReader); + } else if (arc.label() > targetLabel) { + // TODO: if each arc could somehow read the arc just + // before, we can save this re-scan. 
The ceil case + // doesn't need this because it reads the next arc + // instead: + while (true) { + // First, walk backwards until we find a first arc + // that's before our target label: + fst.readFirstTargetArc(getArc(upto - 1), arc, fstReader); + if (arc.label() < targetLabel) { + // Then, scan forwards to the arc just before + // the targetLabel: + while (arc.isLast() == false && fst.readNextArcLabel(arc, fstReader) < targetLabel) { + fst.readNextArc(arc, fstReader); + } + pushLast(); + return null; + } + upto--; + if (upto == 0) { + return null; + } + targetLabel = getTargetLabel(); + arc = getArc(upto); + } + } else if (arc.isLast() == false) { + // System.out.println(" check next label=" + fst.readNextArcLabel(arc) + " (" + (char) + // fst.readNextArcLabel(arc) + ")"); + if (fst.readNextArcLabel(arc, fstReader) > targetLabel) { + pushLast(); + return null; + } else { + // keep scanning + return fst.readNextArc(arc, fstReader); + } + } else { + pushLast(); + return null; + } + } + + /** Seeks to exactly target term. */ + boolean doSeekExact() throws IOException { + + // TODO: possibly caller could/should provide common + // prefix length? ie this work may be redundant if + // caller is in fact intersecting against its own + // automaton + + // System.out.println("FE: seek exact upto=" + upto); + + // Save time by starting at the end of the shared prefix + // b/w our current term & the target: + rewindPrefix(); + + // System.out.println("FE: after rewind upto=" + upto); + FST.Arc arc = getArc(upto - 1); + int targetLabel = getTargetLabel(); + + final FST.BytesReader fstReader = fst.getBytesReader(); + + while (true) { + // System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); + final FST.Arc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader); + if (nextArc == null) { + // short circuit + // upto--; + // upto = 0; + fst.readFirstTargetArc(arc, getArc(upto), fstReader); + // System.out.println(" no match upto=" + upto); + return false; + } + // Match -- recurse: + output[upto] = fst.outputs.add(output[upto - 1], nextArc.output()); + if (targetLabel == FST.END_LABEL) { + // System.out.println(" return found; upto=" + upto + " output=" + output[upto] + " + // nextArc=" + nextArc.isLast()); + return true; + } + setCurrentLabel(targetLabel); + incr(); + targetLabel = getTargetLabel(); + arc = nextArc; + } + } + + private void incr() { + upto++; + grow(); + if (arcs.length <= upto) { + @SuppressWarnings({ "rawtypes", "unchecked" }) + final FST.Arc[] newArcs = new FST.Arc[ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(arcs, 0, newArcs, 0, arcs.length); + arcs = newArcs; + } + if (output.length <= upto) { + @SuppressWarnings({ "rawtypes", "unchecked" }) + final T[] newOutput = (T[]) new Object[ArrayUtil.oversize(1 + upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(output, 0, newOutput, 0, output.length); + output = newOutput; + } + } + + // Appends current arc, and then recurses from its target, + // appending first arc all the way to the final node + private void pushFirst() throws IOException { + + FST.Arc arc = arcs[upto]; + assert arc != null; + + while (true) { + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + // System.out.println(" pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + + // fst.outputs.outputToString(output[upto])); + setCurrentLabel(arc.label()); + incr(); + 
+ final FST.Arc nextArc = getArc(upto); + fst.readFirstTargetArc(arc, nextArc, fstReader); + arc = nextArc; + } + } + + // Recurses from current arc, appending last arc all the + // way to the first final node + private void pushLast() throws IOException { + + FST.Arc arc = arcs[upto]; + assert arc != null; + + while (true) { + setCurrentLabel(arc.label()); + output[upto] = fst.outputs.add(output[upto - 1], arc.output()); + if (arc.label() == FST.END_LABEL) { + // Final node + break; + } + incr(); + + arc = fst.readLastTargetArc(arc, getArc(upto), fstReader); + } + } + + private FST.Arc getArc(int idx) { + if (arcs[idx] == null) { + arcs[idx] = new FST.Arc<>(); + } + return arcs[idx]; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTStore.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTStore.java new file mode 100644 index 0000000000000..c0648ac6b6a83 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTStore.java @@ -0,0 +1,37 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.Accountable; + +import java.io.IOException; + +/** Abstraction for reading/writing bytes necessary for FST. */ +public interface FSTStore extends Accountable { + void init(DataInput in, long numBytes) throws IOException; + + long size(); + + FST.BytesReader getReverseBytesReader(); + + void writeTo(DataOutput out) throws IOException; +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ForwardBytesReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ForwardBytesReader.java new file mode 100644 index 0000000000000..dcabb3a4d68f2 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ForwardBytesReader.java @@ -0,0 +1,64 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +// TODO: can we use just ByteArrayDataInput...? need to +// add a .skipBytes to DataInput.. hmm and .setPosition + +/** Reads from a single byte[]. */ +final class ForwardBytesReader extends FST.BytesReader { + private final byte[] bytes; + private int pos; + + ForwardBytesReader(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public byte readByte() { + return bytes[pos++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + System.arraycopy(bytes, pos, b, offset, len); + pos += len; + } + + @Override + public void skipBytes(long count) { + pos += count; + } + + @Override + public long getPosition() { + return pos; + } + + @Override + public void setPosition(long pos) { + this.pos = (int) pos; + } + + @Override + public boolean reversed() { + return false; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/NodeHash.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/NodeHash.java new file mode 100644 index 0000000000000..f0b8364e9f1bf --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/NodeHash.java @@ -0,0 +1,192 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.util.packed.PackedInts; +import org.apache.lucene.util.packed.PagedGrowableWriter; + +import java.io.IOException; + +// Used to dedup states (lookup already-frozen states) +final class NodeHash { + + private PagedGrowableWriter table; + private long count; + private long mask; + private final FST fst; + private final FST.Arc scratchArc = new FST.Arc<>(); + private final FST.BytesReader in; + + NodeHash(FST fst, FST.BytesReader in) { + table = new PagedGrowableWriter(16, 1 << 27, 8, PackedInts.COMPACT); + mask = 15; + this.fst = fst; + this.in = in; + } + + private boolean nodesEqual(FSTCompiler.UnCompiledNode node, long address) throws IOException { + fst.readFirstRealTargetArc(address, scratchArc, in); + + // Fail fast for a node with fixed length arcs. + if (scratchArc.bytesPerArc() != 0) { + if (scratchArc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { + if (node.numArcs != scratchArc.numArcs()) { + return false; + } + } else { + assert scratchArc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING; + if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs() + || node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) { + return false; + } + } + } + + for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) { + final FSTCompiler.Arc arc = node.arcs[arcUpto]; + if (arc.label != scratchArc.label() + || arc.output.equals(scratchArc.output()) == false + || ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target() + || arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false + || arc.isFinal != scratchArc.isFinal()) { + return false; + } + + if (scratchArc.isLast()) { + if (arcUpto == node.numArcs - 1) { + return true; + } else { + return false; + } + } + fst.readNextRealArc(scratchArc, in); + } + + return false; + } + + // hash code for an unfrozen node. This must be identical + // to the frozen case (below)!! + private long hash(FSTCompiler.UnCompiledNode node) { + final int PRIME = 31; + // System.out.println("hash unfrozen"); + long h = 0; + // TODO: maybe if number of arcs is high we can safely subsample? 
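The add()/rehash() logic below dedups frozen nodes with open addressing: quadratic probing over a power-of-two table and a rehash at roughly 2/3 occupancy. A compact standalone model of just that probing scheme, using a plain long[] instead of the PagedGrowableWriter and letting the value double as its own hash:

final class ProbingSketch {
    private long[] table = new long[16];    // 0 means "empty slot"
    private long mask = table.length - 1;
    private long count;

    /** Inserts a non-zero value unless an equal value is already present. */
    long addIfAbsent(long value) {
        long pos = value & mask;
        int c = 0;
        while (true) {
            long v = table[(int) pos];
            if (v == 0) {
                table[(int) pos] = value;
                if (++count > 2L * table.length / 3) {
                    rehash();               // keep occupancy below ~2/3
                }
                return value;
            } else if (v == value) {
                return v;                   // already present
            }
            pos = (pos + (++c)) & mask;     // quadratic probe
        }
    }

    private void rehash() {
        long[] old = table;
        table = new long[old.length * 2];
        mask = table.length - 1;
        for (long v : old) {
            if (v != 0) {
                long pos = v & mask;
                int c = 0;
                while (table[(int) pos] != 0) {
                    pos = (pos + (++c)) & mask;
                }
                table[(int) pos] = v;
            }
        }
    }
}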
+ for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) { + final FSTCompiler.Arc arc = node.arcs[arcIdx]; + // System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) + // arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " + // isFinal?=" + arc.isFinal); + h = PRIME * h + arc.label; + long n = ((FSTCompiler.CompiledNode) arc.target).node; + h = PRIME * h + (int) (n ^ (n >> 32)); + h = PRIME * h + arc.output.hashCode(); + h = PRIME * h + arc.nextFinalOutput.hashCode(); + if (arc.isFinal) { + h += 17; + } + } + // System.out.println(" ret " + (h&Integer.MAX_VALUE)); + return h & Long.MAX_VALUE; + } + + // hash code for a frozen node + private long hash(long node) throws IOException { + final int PRIME = 31; + // System.out.println("hash frozen node=" + node); + long h = 0; + fst.readFirstRealTargetArc(node, scratchArc, in); + while (true) { + // System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + + // h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + + // scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition()); + h = PRIME * h + scratchArc.label(); + h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 32)); + h = PRIME * h + scratchArc.output().hashCode(); + h = PRIME * h + scratchArc.nextFinalOutput().hashCode(); + if (scratchArc.isFinal()) { + h += 17; + } + if (scratchArc.isLast()) { + break; + } + fst.readNextRealArc(scratchArc, in); + } + // System.out.println(" ret " + (h&Integer.MAX_VALUE)); + return h & Long.MAX_VALUE; + } + + public long add(FSTCompiler fstCompiler, FSTCompiler.UnCompiledNode nodeIn) throws IOException { + // System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask); + final long h = hash(nodeIn); + long pos = h & mask; + int c = 0; + while (true) { + final long v = table.get(pos); + if (v == 0) { + // freeze & add + final long node = fst.addNode(fstCompiler, nodeIn); + // System.out.println(" now freeze node=" + node); + assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; + count++; + table.set(pos, node); + // Rehash at 2/3 occupancy: + if (count > 2 * table.size() / 3) { + rehash(); + } + return node; + } else if (nodesEqual(nodeIn, v)) { + // same node is already here + return v; + } + + // quadratic probe + pos = (pos + (++c)) & mask; + } + } + + // called only by rehash + private void addNew(long address) throws IOException { + long pos = hash(address) & mask; + int c = 0; + while (true) { + if (table.get(pos) == 0) { + table.set(pos, address); + break; + } + + // quadratic probe + pos = (pos + (++c)) & mask; + } + } + + private void rehash() throws IOException { + final PagedGrowableWriter oldTable = table; + + table = new PagedGrowableWriter(2 * oldTable.size(), 1 << 30, PackedInts.bitsRequired(count), PackedInts.COMPACT); + mask = table.size() - 1; + for (long idx = 0; idx < oldTable.size(); idx++) { + final long address = oldTable.get(idx); + if (address != 0) { + addNew(address); + } + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java new file mode 100644 index 0000000000000..f0246cbf5c862 --- /dev/null +++ 
b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java @@ -0,0 +1,79 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; + +/** + * Provides off heap storage of finite state machine (FST), using underlying index input instead of + * byte store on heap + * + * @lucene.experimental + */ +public final class OffHeapFSTStore implements FSTStore { + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OffHeapFSTStore.class); + + private IndexInput in; + private long offset; + private long numBytes; + + @Override + public void init(DataInput in, long numBytes) throws IOException { + if (in instanceof IndexInput) { + this.in = (IndexInput) in; + this.numBytes = numBytes; + this.offset = this.in.getFilePointer(); + } else { + throw new IllegalArgumentException( + "parameter:in should be an instance of IndexInput for using OffHeapFSTStore, not a " + in.getClass().getName() + ); + } + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED; + } + + @Override + public long size() { + return numBytes; + } + + @Override + public FST.BytesReader getReverseBytesReader() { + try { + return new ReverseRandomAccessReader(in.randomAccessSlice(offset, numBytes)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void writeTo(DataOutput out) throws IOException { + throw new UnsupportedOperationException("writeToOutput operation is not supported for OffHeapFSTStore"); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java new file mode 100644 index 0000000000000..646e56f095d9a --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java @@ -0,0 +1,103 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.RamUsageEstimator; + +import java.io.IOException; + +/** + * Provides storage of finite state machine (FST), using byte array or byte store allocated on heap. + * + * @lucene.experimental + */ +public final class OnHeapFSTStore implements FSTStore { + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OnHeapFSTStore.class); + + /** + * A {@link BytesStore}, used during building, or during reading when the FST is very large (more + * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. + */ + private BytesStore bytes; + + /** Used at read time when the FST fits into a single byte[]. */ + private byte[] bytesArray; + + private final int maxBlockBits; + + public OnHeapFSTStore(int maxBlockBits) { + if (maxBlockBits < 1 || maxBlockBits > 30) { + throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits); + } + + this.maxBlockBits = maxBlockBits; + } + + @Override + public void init(DataInput in, long numBytes) throws IOException { + if (numBytes > 1 << this.maxBlockBits) { + // FST is big: we need multiple pages + bytes = new BytesStore(in, numBytes, 1 << this.maxBlockBits); + } else { + // FST fits into a single block: use ByteArrayBytesStoreReader for less overhead + bytesArray = new byte[(int) numBytes]; + in.readBytes(bytesArray, 0, bytesArray.length); + } + } + + @Override + public long size() { + if (bytesArray != null) { + return bytesArray.length; + } else { + return bytes.ramBytesUsed(); + } + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + size(); + } + + @Override + public FST.BytesReader getReverseBytesReader() { + if (bytesArray != null) { + return new ReverseBytesReader(bytesArray); + } else { + return bytes.getReverseReader(); + } + } + + @Override + public void writeTo(DataOutput out) throws IOException { + if (bytes != null) { + long numBytes = bytes.getPosition(); + out.writeVLong(numBytes); + bytes.writeTo(out); + } else { + assert bytesArray != null; + out.writeVLong(bytesArray.length); + out.writeBytes(bytesArray, 0, bytesArray.length); + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java new file mode 100644 index 0000000000000..a7c5ed8933fed --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java @@ -0,0 +1,108 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
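A hedged sketch of choosing between the two FSTStore implementations above and loading the FST body through their shared init contract; the DataInput, the byte count, the 2^15 block size, and the assumption that these types are accessible from the caller's package are all illustrative, not values or guarantees mandated by the codec:

import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FSTStore;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.OffHeapFSTStore;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.OnHeapFSTStore;

final class FstStoreSketch {
    /** Loads numBytes of FST body either on heap (paged at 2^15 bytes) or off heap. */
    static FST.BytesReader load(DataInput in, long numBytes, boolean offHeap) throws IOException {
        FSTStore store = offHeap ? new OffHeapFSTStore() : new OnHeapFSTStore(15);
        store.init(in, numBytes);             // OffHeapFSTStore rejects anything but an IndexInput
        return store.getReverseBytesReader(); // FST bytes are consumed in reverse
    }
}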
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.Accountable; + +import java.io.IOException; + +/** + * Represents the outputs for an FST, providing the basic algebra required for building and + * traversing the FST. + * + *

Note that any operation that returns NO_OUTPUT must return the same singleton object from + * {@link #getNoOutput}. + * + * @lucene.experimental + */ +public abstract class Outputs { + + // TODO: maybe change this API to allow for re-use of the + // output instances -- this is an insane amount of garbage + // (new object per byte/char/int) if eg used during + // analysis + + /** Eg common("foobar", "food") -> "foo" */ + public abstract T common(T output1, T output2); + + /** Eg subtract("foobar", "foo") -> "bar" */ + public abstract T subtract(T output, T inc); + + /** Eg add("foo", "bar") -> "foobar" */ + public abstract T add(T prefix, T output); + + /** Encode an output value into a {@link DataOutput}. */ + public abstract void write(T output, DataOutput out) throws IOException; + + /** + * Encode an final node output value into a {@link DataOutput}. By default this just calls {@link + * #write(Object, DataOutput)}. + */ + public void writeFinalOutput(T output, DataOutput out) throws IOException { + write(output, out); + } + + /** Decode an output value previously written with {@link #write(Object, DataOutput)}. */ + public abstract T read(DataInput in) throws IOException; + + /** Skip the output; defaults to just calling {@link #read} and discarding the result. */ + public void skipOutput(DataInput in) throws IOException { + read(in); + } + + /** + * Decode an output value previously written with {@link #writeFinalOutput(Object, DataOutput)}. + * By default this just calls {@link #read(DataInput)}. + */ + public T readFinalOutput(DataInput in) throws IOException { + return read(in); + } + + /** + * Skip the output previously written with {@link #writeFinalOutput}; defaults to just calling + * {@link #readFinalOutput} and discarding the result. + */ + public void skipFinalOutput(DataInput in) throws IOException { + skipOutput(in); + } + + /** + * NOTE: this output is compared with == so you must ensure that all methods return the single + * object if it's really no output + */ + public abstract T getNoOutput(); + + public abstract String outputToString(T output); + + // TODO: maybe make valid(T output) public...? for asserts + + public T merge(T first, T second) { + throw new UnsupportedOperationException(); + } + + /** + * Return memory usage for the provided output. + * + * @see Accountable + */ + public abstract long ramBytesUsed(T output); +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseBytesReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseBytesReader.java new file mode 100644 index 0000000000000..8d22cc77694dd --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseBytesReader.java @@ -0,0 +1,62 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
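The common/subtract/add contract above can be pictured with plain strings, reusing the "foobar"/"food" examples from the javadoc; this sketch shows only the output algebra, not the serialization half that a real Outputs implementation must also provide:

final class StringOutputsSketch {
    static String common(String a, String b) {            // longest shared prefix
        int i = 0;
        while (i < a.length() && i < b.length() && a.charAt(i) == b.charAt(i)) {
            i++;
        }
        return a.substring(0, i);
    }

    static String subtract(String output, String prefix) { // strip a known prefix
        assert output.startsWith(prefix);
        return output.substring(prefix.length());
    }

    static String add(String prefix, String suffix) {      // re-attach a prefix
        return prefix + suffix;
    }

    public static void main(String[] args) {
        String c = common("foobar", "food");               // "foo"
        System.out.println(c);
        System.out.println(subtract("foobar", c));         // "bar"
        System.out.println(add(c, "bar"));                 // "foobar"
    }
}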
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +/** Reads in reverse from a single byte[]. */ +final class ReverseBytesReader extends FST.BytesReader { + private final byte[] bytes; + private int pos; + + ReverseBytesReader(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public byte readByte() { + return bytes[pos--]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for (int i = 0; i < len; i++) { + b[offset + i] = bytes[pos--]; + } + } + + @Override + public void skipBytes(long count) { + pos -= count; + } + + @Override + public long getPosition() { + return pos; + } + + @Override + public void setPosition(long pos) { + this.pos = (int) pos; + } + + @Override + public boolean reversed() { + return true; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseRandomAccessReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseRandomAccessReader.java new file mode 100644 index 0000000000000..55eca99aaeb1e --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ReverseRandomAccessReader.java @@ -0,0 +1,67 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.store.RandomAccessInput; + +import java.io.IOException; + +/** Implements reverse read from a RandomAccessInput. 
*/ +final class ReverseRandomAccessReader extends FST.BytesReader { + private final RandomAccessInput in; + private long pos; + + ReverseRandomAccessReader(RandomAccessInput in) { + this.in = in; + } + + @Override + public byte readByte() throws IOException { + return in.readByte(pos--); + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + int i = offset, end = offset + len; + while (i < end) { + b[i++] = in.readByte(pos--); + } + } + + @Override + public void skipBytes(long count) { + pos -= count; + } + + @Override + public long getPosition() { + return pos; + } + + @Override + public void setPosition(long pos) { + this.pos = pos; + } + + @Override + public boolean reversed() { + return true; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java new file mode 100644 index 0000000000000..ce2ac82d478b6 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java @@ -0,0 +1,903 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST.Arc; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST.Arc.BitTable; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST.BytesReader; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.TreeSet; + +/** + * Static helper methods. + * + * @lucene.experimental + */ +public final class Util { + private Util() {} + + /** Looks up the output for this input, or null if the input is not accepted. 
*/ + public static T get(FST fst, IntsRef input) throws IOException { + + // TODO: would be nice not to alloc this on every lookup + final Arc arc = fst.getFirstArc(new Arc<>()); + + final BytesReader fstReader = fst.getBytesReader(); + + // Accumulate output as we go + T output = fst.outputs.getNoOutput(); + for (int i = 0; i < input.length; i++) { + if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) { + return null; + } + output = fst.outputs.add(output, arc.output()); + } + + if (arc.isFinal()) { + return fst.outputs.add(output, arc.nextFinalOutput()); + } else { + return null; + } + } + + // TODO: maybe a CharsRef version for BYTE2 + + /** Looks up the output for this input, or null if the input is not accepted */ + public static T get(FST fst, BytesRef input) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE1; + + final BytesReader fstReader = fst.getBytesReader(); + + // TODO: would be nice not to alloc this on every lookup + final Arc arc = fst.getFirstArc(new Arc<>()); + + // Accumulate output as we go + T output = fst.outputs.getNoOutput(); + for (int i = 0; i < input.length; i++) { + if (fst.findTargetArc(input.bytes[i + input.offset] & 0xFF, arc, arc, fstReader) == null) { + return null; + } + output = fst.outputs.add(output, arc.output()); + } + + if (arc.isFinal()) { + return fst.outputs.add(output, arc.nextFinalOutput()); + } else { + return null; + } + } + + /** + * Represents a path in TopNSearcher. + * + * @lucene.experimental + */ + public static class FSTPath { + /** Holds the last arc appended to this path */ + public Arc arc; + /** Holds cost plus any usage-specific output: */ + public T output; + + public final IntsRefBuilder input; + public final float boost; + public final CharSequence context; + + // Custom int payload for consumers; the NRT suggester uses this to record if this path has + // already enumerated a surface form + public int payload; + + FSTPath(T output, Arc arc, IntsRefBuilder input, float boost, CharSequence context, int payload) { + this.arc = new Arc().copyFrom(arc); + this.output = output; + this.input = input; + this.boost = boost; + this.context = context; + this.payload = payload; + } + + FSTPath newPath(T output, IntsRefBuilder input) { + return new FSTPath<>(output, this.arc, input, this.boost, this.context, this.payload); + } + + @Override + public String toString() { + return "input=" + input.get() + " output=" + output + " context=" + context + " boost=" + boost + " payload=" + payload; + } + } + + /** Compares first by the provided comparator, and then tie breaks by path.input. */ + private static class TieBreakByInputComparator implements Comparator> { + private final Comparator comparator; + + TieBreakByInputComparator(Comparator comparator) { + this.comparator = comparator; + } + + @Override + public int compare(FSTPath a, FSTPath b) { + int cmp = comparator.compare(a.output, b.output); + if (cmp == 0) { + return a.input.get().compareTo(b.input.get()); + } else { + return cmp; + } + } + } + + /** Utility class to find top N shortest paths from start point(s). 
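A hedged usage sketch for the lookups above: given an already-opened FST<Long> (how it is loaded is outside this snippet), Util.get returns the accumulated output for a term, or null if the term is not accepted. The fst parameter and the Long output type are assumptions for illustration:

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.Util;

final class FstLookupSketch {
    /** Returns the output stored for term, or null if the FST does not accept it. */
    static Long lookup(FST<Long> fst, String term) throws IOException {
        return Util.get(fst, new BytesRef(term));
    }
}

For inputs that are not BYTE1, the IntsRef overload of Util.get above serves the same purpose.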
*/ + public static class TopNSearcher { + + private final FST fst; + private final BytesReader bytesReader; + private final int topN; + private final int maxQueueDepth; + + private final Arc scratchArc = new Arc<>(); + + private final Comparator comparator; + private final Comparator> pathComparator; + + TreeSet> queue; + + /** + * Creates an unbounded TopNSearcher + * + * @param fst the {@link FST} to search on + * @param topN the number of top scoring entries to retrieve + * @param maxQueueDepth the maximum size of the queue of possible top entries + * @param comparator the comparator to select the top N + */ + public TopNSearcher(FST fst, int topN, int maxQueueDepth, Comparator comparator) { + this(fst, topN, maxQueueDepth, comparator, new TieBreakByInputComparator<>(comparator)); + } + + public TopNSearcher(FST fst, int topN, int maxQueueDepth, Comparator comparator, Comparator> pathComparator) { + this.fst = fst; + this.bytesReader = fst.getBytesReader(); + this.topN = topN; + this.maxQueueDepth = maxQueueDepth; + this.comparator = comparator; + this.pathComparator = pathComparator; + queue = new TreeSet<>(pathComparator); + } + + // If back plus this arc is competitive then add to queue: + protected void addIfCompetitive(FSTPath path) { + + assert queue != null; + + T output = fst.outputs.add(path.output, path.arc.output()); + + if (queue.size() == maxQueueDepth) { + FSTPath bottom = queue.last(); + int comp = pathComparator.compare(path, bottom); + if (comp > 0) { + // Doesn't compete + return; + } else if (comp == 0) { + // Tie break by alpha sort on the input: + path.input.append(path.arc.label()); + final int cmp = bottom.input.get().compareTo(path.input.get()); + path.input.setLength(path.input.length() - 1); + + // We should never see dups: + assert cmp != 0; + + if (cmp < 0) { + // Doesn't compete + return; + } + } + // Competes + } + // else ... Queue isn't full yet, so any path we hit competes: + + // copy over the current input to the new input + // and add the arc.label to the end + IntsRefBuilder newInput = new IntsRefBuilder(); + newInput.copyInts(path.input.get()); + newInput.append(path.arc.label()); + + FSTPath newPath = path.newPath(output, newInput); + if (acceptPartialPath(newPath)) { + queue.add(newPath); + if (queue.size() == maxQueueDepth + 1) { + queue.pollLast(); + } + } + } + + public void addStartPaths(Arc node, T startOutput, boolean allowEmptyString, IntsRefBuilder input) throws IOException { + addStartPaths(node, startOutput, allowEmptyString, input, 0, null, -1); + } + + /** + * Adds all leaving arcs, including 'finished' arc, if the node is final, from this node into + * the queue. 
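A hedged sketch of driving TopNSearcher from the root arc, using the four-argument addStartPaths above and then search(); the FST<Long>, its weight-like outputs, the noOutput start value, and the natural-order comparator are illustrative assumptions:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IntsRefBuilder;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.Util;

final class TopNSketch {
    /** Collects the n smallest-output paths starting from the FST root. */
    static List<Util.Result<Long>> topN(FST<Long> fst, Long noOutput, int n) throws IOException {
        Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<>(fst, n, n, Comparator.<Long>naturalOrder());
        searcher.addStartPaths(fst.getFirstArc(new FST.Arc<>()), noOutput, true, new IntsRefBuilder());
        List<Util.Result<Long>> results = new ArrayList<>();
        for (Util.Result<Long> result : searcher.search()) {
            results.add(result);
        }
        return results;
    }
}

Util.shortestPaths, defined further below in this file, wraps the same setup with maxQueueDepth equal to topN so the pruning stays admissible.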
+ */ + public void addStartPaths( + Arc node, + T startOutput, + boolean allowEmptyString, + IntsRefBuilder input, + float boost, + CharSequence context, + int payload + ) throws IOException { + + // De-dup NO_OUTPUT since it must be a singleton: + if (startOutput.equals(fst.outputs.getNoOutput())) { + startOutput = fst.outputs.getNoOutput(); + } + + FSTPath path = new FSTPath<>(startOutput, node, input, boost, context, payload); + fst.readFirstTargetArc(node, path.arc, bytesReader); + + // Bootstrap: find the min starting arc + while (true) { + if (allowEmptyString || path.arc.label() != FST.END_LABEL) { + addIfCompetitive(path); + } + if (path.arc.isLast()) { + break; + } + fst.readNextArc(path.arc, bytesReader); + } + } + + public TopResults search() throws IOException { + + final List> results = new ArrayList<>(); + + final BytesReader fstReader = fst.getBytesReader(); + final T NO_OUTPUT = fst.outputs.getNoOutput(); + + // TODO: we could enable FST to sorting arcs by weight + // as it freezes... can easily do this on first pass + // (w/o requiring rewrite) + + // TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!? + // (nibbles) + int rejectCount = 0; + + // For each top N path: + while (results.size() < topN) { + + FSTPath path; + + if (queue == null) { + // Ran out of paths + break; + } + + // Remove top path since we are now going to + // pursue it: + path = queue.pollFirst(); + + if (path == null) { + // There were less than topN paths available: + break; + } + // System.out.println("pop path=" + path + " arc=" + path.arc.output); + + if (acceptPartialPath(path) == false) { + continue; + } + + if (path.arc.label() == FST.END_LABEL) { + // Empty string! + path.input.setLength(path.input.length() - 1); + results.add(new Result<>(path.input.get(), path.output)); + continue; + } + + if (results.size() == topN - 1 && maxQueueDepth == topN) { + // Last path -- don't bother w/ queue anymore: + queue = null; + } + + // We take path and find its "0 output completion", + // ie, just keep traversing the first arc with + // NO_OUTPUT that we can find, since this must lead + // to the minimum path that completes from + // path.arc. 
+ + // For each input letter: + while (true) { + + fst.readFirstTargetArc(path.arc, path.arc, fstReader); + + // For each arc leaving this node: + boolean foundZero = false; + boolean arcCopyIsPending = false; + while (true) { + // tricky: instead of comparing output == 0, we must + // express it via the comparator compare(output, 0) == 0 + if (comparator.compare(NO_OUTPUT, path.arc.output()) == 0) { + if (queue == null) { + foundZero = true; + break; + } else if (foundZero == false) { + arcCopyIsPending = true; + foundZero = true; + } else { + addIfCompetitive(path); + } + } else if (queue != null) { + addIfCompetitive(path); + } + if (path.arc.isLast()) { + break; + } + if (arcCopyIsPending) { + scratchArc.copyFrom(path.arc); + arcCopyIsPending = false; + } + fst.readNextArc(path.arc, fstReader); + } + + assert foundZero; + + if (queue != null && arcCopyIsPending == false) { + path.arc.copyFrom(scratchArc); + } + + if (path.arc.label() == FST.END_LABEL) { + // Add final output: + path.output = fst.outputs.add(path.output, path.arc.output()); + if (acceptResult(path)) { + results.add(new Result<>(path.input.get(), path.output)); + } else { + rejectCount++; + } + break; + } else { + path.input.append(path.arc.label()); + path.output = fst.outputs.add(path.output, path.arc.output()); + if (acceptPartialPath(path) == false) { + break; + } + } + } + } + return new TopResults<>(rejectCount + topN <= maxQueueDepth, results); + } + + protected boolean acceptResult(FSTPath path) { + return acceptResult(path.input.get(), path.output); + } + + /** Override this to prevent considering a path before it's complete */ + protected boolean acceptPartialPath(FSTPath path) { + return true; + } + + protected boolean acceptResult(IntsRef input, T output) { + return true; + } + } + + /** + * Holds a single input (IntsRef) + output, returned by {@link #shortestPaths shortestPaths()}. + */ + public static final class Result { + public final IntsRef input; + public final T output; + + public Result(IntsRef input, T output) { + this.input = input; + this.output = output; + } + } + + /** Holds the results for a top N search using {@link TopNSearcher} */ + public static final class TopResults implements Iterable> { + + /** + * true iff this is a complete result ie. if the specified queue size was large + * enough to find the complete list of results. This might be false if the {@link + * TopNSearcher} rejected too many results. + */ + public final boolean isComplete; + /** The top results */ + public final List> topN; + + TopResults(boolean isComplete, List> topN) { + this.topN = topN; + this.isComplete = isComplete; + } + + @Override + public Iterator> iterator() { + return topN.iterator(); + } + } + + /** Starting from node, find the top N min cost completions to a final node. */ + public static TopResults shortestPaths( + FST fst, + Arc fromNode, + T startOutput, + Comparator comparator, + int topN, + boolean allowEmptyString + ) throws IOException { + + // All paths are kept, so we can pass topN for + // maxQueueDepth and the pruning is admissible: + TopNSearcher searcher = new TopNSearcher<>(fst, topN, topN, comparator); + + // since this search is initialized with a single start node + // it is okay to start with an empty input path here + searcher.addStartPaths(fromNode, startOutput, allowEmptyString, new IntsRefBuilder()); + return searcher.search(); + } + + /** + * Dumps an {@link FST} to a GraphViz's dot language description for visualization. + * Example of use: + * + *

+     * <pre>
+     * PrintWriter pw = new PrintWriter("out.dot");
+     * Util.toDot(fst, pw, true, true);
+     * pw.close();
+     * </pre>
+     *
+     * and then, from command line:
+     *
+     * <pre>
+     * dot -Tpng -o out.png out.dot
+     * </pre>
+     *

Note: larger FSTs (a few thousand nodes) won't even render, don't bother. + * + * @param sameRank If true, the resulting dot file will try to order + * states in layers of breadth-first traversal. This may mess up arcs, but makes the output + * FST's structure a bit clearer. + * @param labelStates If true states will have labels equal to their offsets in their + * binary format. Expands the graph considerably. + * @see graphviz project + */ + public static void toDot(FST fst, Writer out, boolean sameRank, boolean labelStates) throws IOException { + final String expandedNodeColor = "blue"; + + // This is the start arc in the automaton (from the epsilon state to the first state + // with outgoing transitions. + final Arc startArc = fst.getFirstArc(new Arc<>()); + + // A queue of transitions to consider for the next level. + final List> thisLevelQueue = new ArrayList<>(); + + // A queue of transitions to consider when processing the next level. + final List> nextLevelQueue = new ArrayList<>(); + nextLevelQueue.add(startArc); + // System.out.println("toDot: startArc: " + startArc); + + // A list of states on the same level (for ranking). + final List sameLevelStates = new ArrayList<>(); + + // A bitset of already seen states (target offset). + final BitSet seen = new BitSet(); + seen.set((int) startArc.target()); + + // Shape for states. + final String stateShape = "circle"; + final String finalStateShape = "doublecircle"; + + // Emit DOT prologue. + out.write("digraph FST {\n"); + out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n"); + + if (labelStates == false) { + out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n"); + } + + emitDotState(out, "initial", "point", "white", ""); + + final T NO_OUTPUT = fst.outputs.getNoOutput(); + final BytesReader r = fst.getBytesReader(); + + // final FST.Arc scratchArc = new FST.Arc<>(); + + { + final String stateColor; + if (fst.isExpandedTarget(startArc, r)) { + stateColor = expandedNodeColor; + } else { + stateColor = null; + } + + final boolean isFinal; + final T finalOutput; + if (startArc.isFinal()) { + isFinal = true; + finalOutput = startArc.nextFinalOutput() == NO_OUTPUT ? null : startArc.nextFinalOutput(); + } else { + isFinal = false; + finalOutput = null; + } + + emitDotState( + out, + Long.toString(startArc.target()), + isFinal ? finalStateShape : stateShape, + stateColor, + finalOutput == null ? "" : fst.outputs.outputToString(finalOutput) + ); + } + + out.write(" initial -> " + startArc.target() + "\n"); + + int level = 0; + + while (nextLevelQueue.isEmpty() == false) { + // we could double buffer here, but it doesn't matter probably. + // System.out.println("next level=" + level); + thisLevelQueue.addAll(nextLevelQueue); + nextLevelQueue.clear(); + + level++; + out.write("\n // Transitions and states at level: " + level + "\n"); + while (thisLevelQueue.isEmpty() == false) { + final Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); + // System.out.println(" pop: " + arc); + if (FST.targetHasArcs(arc)) { + // scan all target arcs + // System.out.println(" readFirstTarget..."); + + final long node = arc.target(); + + fst.readFirstRealTargetArc(arc.target(), arc, r); + + // System.out.println(" firstTarget: " + arc); + + while (true) { + + // System.out.println(" cycle arc=" + arc); + // Emit the unseen state and add it to the queue for the next level. 
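+ // The "seen" bitset is keyed by target offset so each state is emitted only once;
+ // newly discovered states go onto nextLevelQueue and into sameLevelStates, which is
+ // used below to pin the states of one BFS level to the same rank when sameRank is set.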
+ if (arc.target() >= 0 && seen.get((int) arc.target()) == false) { + + /* + boolean isFinal = false; + T finalOutput = null; + fst.readFirstTargetArc(arc, scratchArc); + if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) { + // target is final + isFinal = true; + finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output; + System.out.println("dot hit final label=" + (char) scratchArc.label); + } + */ + final String stateColor; + if (fst.isExpandedTarget(arc, r)) { + stateColor = expandedNodeColor; + } else { + stateColor = null; + } + + final String finalOutput; + if (arc.nextFinalOutput() != null && arc.nextFinalOutput() != NO_OUTPUT) { + finalOutput = fst.outputs.outputToString(arc.nextFinalOutput()); + } else { + finalOutput = ""; + } + + emitDotState(out, Long.toString(arc.target()), stateShape, stateColor, finalOutput); + // To see the node address, use this instead: + // emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, + // String.valueOf(arc.target)); + seen.set((int) arc.target()); + nextLevelQueue.add(new Arc().copyFrom(arc)); + sameLevelStates.add((int) arc.target()); + } + + String outs; + if (arc.output() != NO_OUTPUT) { + outs = "/" + fst.outputs.outputToString(arc.output()); + } else { + outs = ""; + } + + if (FST.targetHasArcs(arc) == false && arc.isFinal() && arc.nextFinalOutput() != NO_OUTPUT) { + // Tricky special case: sometimes, due to + // pruning, the builder can [sillily] produce + // an FST with an arc into the final end state + // (-1) but also with a next final output; in + // this case we pull that output up onto this + // arc + outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput()) + "]"; + } + + final String arcColor; + if (arc.flag(FST.BIT_TARGET_NEXT)) { + arcColor = "red"; + } else { + arcColor = "black"; + } + + assert arc.label() != FST.END_LABEL; + out.write( + " " + + node + + " -> " + + arc.target() + + " [label=\"" + + printableLabel(arc.label()) + + outs + + "\"" + + (arc.isFinal() ? " style=\"bold\"" : "") + + " color=\"" + + arcColor + + "\"]\n" + ); + + // Break the loop if we're on the last arc of this state. + if (arc.isLast()) { + // System.out.println(" break"); + break; + } + fst.readNextRealArc(arc, r); + } + } + } + + // Emit state ranking information. + if (sameRank && sameLevelStates.size() > 1) { + out.write(" {rank=same; "); + for (int state : sameLevelStates) { + out.write(state + "; "); + } + out.write(" }\n"); + } + sameLevelStates.clear(); + } + + // Emit terminating state (always there anyway). + out.write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n"); + out.write(" {rank=sink; -1 }\n"); + + out.write("}\n"); + out.flush(); + } + + /** Emit a single state in the dot language. */ + private static void emitDotState(Writer out, String name, String shape, String color, String label) throws IOException { + out.write( + " " + + name + + " [" + + (shape != null ? "shape=" + shape : "") + + " " + + (color != null ? "color=" + color : "") + + " " + + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + + " " + + "]\n" + ); + } + + /** Ensures an arc's label is indeed printable (dot uses US-ASCII). 
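+ * Labels outside the range 0x20-0x7d, as well as the quote and backslash characters, are rendered as hex codes instead.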
*/ + private static String printableLabel(int label) { + // Any ordinary ascii character, except for " or \, are + // printed as the character; else, as a hex string: + if (label >= 0x20 && label <= 0x7d && label != 0x22 && label != 0x5c) { // " OR \ + return Character.toString((char) label); + } + return "0x" + Integer.toHexString(label); + } + + /** Just maps each UTF16 unit (char) to the ints in an IntsRef. */ + public static IntsRef toUTF16(CharSequence s, IntsRefBuilder scratch) { + final int charLimit = s.length(); + scratch.setLength(charLimit); + scratch.grow(charLimit); + for (int idx = 0; idx < charLimit; idx++) { + scratch.setIntAt(idx, s.charAt(idx)); + } + return scratch.get(); + } + + /** + * Decodes the Unicode codepoints from the provided CharSequence and places them in the provided + * scratch IntsRef, which must not be null, returning it. + */ + public static IntsRef toUTF32(CharSequence s, IntsRefBuilder scratch) { + int charIdx = 0; + int intIdx = 0; + final int charLimit = s.length(); + while (charIdx < charLimit) { + scratch.grow(intIdx + 1); + final int utf32 = Character.codePointAt(s, charIdx); + scratch.setIntAt(intIdx, utf32); + charIdx += Character.charCount(utf32); + intIdx++; + } + scratch.setLength(intIdx); + return scratch.get(); + } + + /** + * Decodes the Unicode codepoints from the provided char[] and places them in the provided scratch + * IntsRef, which must not be null, returning it. + */ + public static IntsRef toUTF32(char[] s, int offset, int length, IntsRefBuilder scratch) { + int charIdx = offset; + int intIdx = 0; + final int charLimit = offset + length; + while (charIdx < charLimit) { + scratch.grow(intIdx + 1); + final int utf32 = Character.codePointAt(s, charIdx, charLimit); + scratch.setIntAt(intIdx, utf32); + charIdx += Character.charCount(utf32); + intIdx++; + } + scratch.setLength(intIdx); + return scratch.get(); + } + + /** Just takes unsigned byte values from the BytesRef and converts into an IntsRef. */ + public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) { + scratch.clear(); + for (int i = 0; i < input.length; i++) { + scratch.append(input.bytes[i + input.offset] & 0xFF); + } + return scratch.get(); + } + + /** Just converts IntsRef to BytesRef; you must ensure the int values fit into a byte. */ + public static BytesRef toBytesRef(IntsRef input, BytesRefBuilder scratch) { + scratch.grow(input.length); + for (int i = 0; i < input.length; i++) { + int value = input.ints[i + input.offset]; + // NOTE: we allow -128 to 255 + assert value >= Byte.MIN_VALUE && value <= 255 : "value " + value + " doesn't fit into byte"; + scratch.setByteAt(i, (byte) value); + } + scratch.setLength(input.length); + return scratch.get(); + } + + // Uncomment for debugging: + + /* + public static void dotToFile(FST fst, String filePath) throws IOException { + Writer w = new OutputStreamWriter(new FileOutputStream(filePath)); + toDot(fst, w, true, true); + w.close(); + } + */ + + /** + * Reads the first arc greater or equal than the given label into the provided arc in place and + * returns it iff found, otherwise return null. 
+ * + * @param label the label to ceil on + * @param fst the fst to operate on + * @param follow the arc to follow reading the label from + * @param arc the arc to read into in place + * @param in the fst's {@link BytesReader} + */ + public static Arc readCeilArc(int label, FST fst, Arc follow, Arc arc, BytesReader in) throws IOException { + if (label == FST.END_LABEL) { + return FST.readEndArc(follow, arc); + } + if (FST.targetHasArcs(follow) == false) { + return null; + } + fst.readFirstTargetArc(follow, arc, in); + if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { + if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { + // Fixed length arcs in a direct addressing node. + int targetIndex = label - arc.label(); + if (targetIndex >= arc.numArcs()) { + return null; + } else if (targetIndex < 0) { + return arc; + } else { + if (BitTable.isBitSet(targetIndex, arc, in)) { + fst.readArcByDirectAddressing(arc, in, targetIndex); + assert arc.label() == label; + } else { + int ceilIndex = BitTable.nextBitSet(targetIndex, arc, in); + assert ceilIndex != -1; + fst.readArcByDirectAddressing(arc, in, ceilIndex); + assert arc.label() > label; + } + return arc; + } + } + // Fixed length arcs in a binary search node. + int idx = binarySearch(fst, arc, label); + if (idx >= 0) { + return fst.readArcByIndex(arc, in, idx); + } + idx = -1 - idx; + if (idx == arc.numArcs()) { + // DEAD END! + return null; + } + return fst.readArcByIndex(arc, in, idx); + } + + // Variable length arcs in a linear scan list, + // or special arc with label == FST.END_LABEL. + fst.readFirstRealTargetArc(follow.target(), arc, in); + + while (true) { + // System.out.println(" non-bs cycle"); + if (arc.label() >= label) { + // System.out.println(" found!"); + return arc; + } else if (arc.isLast()) { + return null; + } else { + fst.readNextRealArc(arc, in); + } + } + } + + /** + * Perform a binary search of Arcs encoded as a packed array + * + * @param fst the FST from which to read + * @param arc the starting arc; sibling arcs greater than this will be searched. Usually the first + * arc in the array. + * @param targetLabel the label to search for + * @param the output type of the FST + * @return the index of the Arc having the target label, or if no Arc has the matching label, + * {@code -1 - idx)}, where {@code idx} is the index of the Arc with the next highest label, + * or the total number of arcs if the target label exceeds the maximum. 
+ * @throws IOException when the FST reader does + */ + static int binarySearch(FST fst, Arc arc, int targetLabel) throws IOException { + assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH + : "Arc is not encoded as packed array for binary search (nodeFlags=" + arc.nodeFlags() + ")"; + BytesReader in = fst.getBytesReader(); + int low = arc.arcIdx(); + int mid; + int high = arc.numArcs() - 1; + while (low <= high) { + mid = (low + high) >>> 1; + in.setPosition(arc.posArcsStart()); + in.skipBytes((long) arc.bytesPerArc() * mid + 1); + final int midLabel = fst.readLabel(in); + final int cmp = midLabel - targetLabel; + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + return -1 - low; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java new file mode 100644 index 0000000000000..e6435dae4c12b --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java @@ -0,0 +1,1124 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree; + +import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.compress.LZ4; +import org.apache.lucene.util.compress.LowercaseAsciiCompression; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.ByteSequenceOutputs; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.BytesRefFSTEnum; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FST; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.FSTCompiler; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.Util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +/* + TODO: + + - Currently there is a one-to-one mapping of indexed + term to term block, but we could decouple the two, ie, + put more terms into the index than there are blocks. + The index would take up more RAM but then it'd be able + to avoid seeking more often and could make PK/FuzzyQ + faster if the additional indexed terms could store + the offset into the terms block. + + - The blocks are not written in true depth-first + order, meaning if you just next() the file pointer will + sometimes jump backwards. For example, block foo* will + be written before block f* because it finished before. + This could possibly hurt performance if the terms dict is + not hot, since OSs anticipate sequential file access. We + could fix the writer to re-order the blocks as a 2nd + pass. + + - Each block encodes the term suffixes packed + sequentially using a separate vInt per term, which is + 1) wasteful and 2) slow (must linear scan to find a + particular suffix). We should instead 1) make + random-access array so we can directly access the Nth + suffix, and 2) bulk-encode this array using bulk int[] + codecs; then at search time we can binary search when + we seek a particular term. +*/ + +/** + * Block-based terms index and dictionary writer. + * + *

+ * Writes terms dict and index, block-encoding (column stride) each term's metadata for each set
+ * of terms between two index terms.
+ *
+ * Files:
+ *
+ *   • .tim: Term Dictionary
+ *   • .tip: Term Index
+ *
+ * Term Dictionary
+ *
+ * The .tim file contains the list of terms in each field along with per-term statistics (such as
+ * docfreq) and per-term metadata (typically pointers to the postings list for that term in the
+ * inverted index).
+ *
+ * The .tim is arranged in blocks: with blocks containing a variable number of entries (by
+ * default 25-48), where each entry is either a term or a reference to a sub-block.
+ *
+ * NOTE: The term dictionary can plug into different postings implementations: the postings
+ * writer/reader are actually responsible for encoding and decoding the Postings Metadata and Term
+ * Metadata sections.
+ *
+ *   • TermsDict (.tim) --> Header, PostingsHeader, NodeBlock^NumBlocks, FieldSummary, DirOffset, Footer
+ *   • NodeBlock --> (OuterNode | InnerNode)
+ *   • OuterNode --> EntryCount, SuffixLength, Byte^SuffixLength, StatsLength,
+ *     <TermStats>^EntryCount, MetaLength, <TermMetadata>^EntryCount
+ *   • InnerNode --> EntryCount, SuffixLength[,Sub?], Byte^SuffixLength, StatsLength,
+ *     <TermStats ?>^EntryCount, MetaLength, <TermMetadata ?>^EntryCount
+ *   • TermStats --> DocFreq, TotalTermFreq
+ *   • FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength, Byte^RootCodeLength,
+ *     SumTotalTermFreq?, SumDocFreq, DocCount, LongsSize, MinTerm, MaxTerm>^NumFields
+ *   • Header --> {@link CodecUtil#writeHeader CodecHeader}
+ *   • DirOffset --> {@link DataOutput#writeLong Uint64}
+ *   • MinTerm,MaxTerm --> {@link DataOutput#writeVInt VInt} length followed by the byte[]
+ *   • EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength,NumFields,
+ *     FieldNumber,RootCodeLength,DocCount,LongsSize --> {@link DataOutput#writeVInt VInt}
+ *   • TotalTermFreq,NumTerms,SumTotalTermFreq,SumDocFreq --> {@link DataOutput#writeVLong VLong}
+ *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ * Notes:
+ *
+ *   • Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information for
+ *     the BlockTree implementation.
+ *   • DirOffset is a pointer to the FieldSummary section.
+ *   • DocFreq is the count of documents which contain the term.
+ *   • TotalTermFreq is the total number of occurrences of the term. This is encoded as the
+ *     difference between the total number of occurrences and the DocFreq.
+ *   • FieldNumber is the field's number from {@link FieldInfos}. (.fnm)
+ *   • NumTerms is the number of unique terms for the field.
+ *   • RootCode points to the root block for the field.
+ *   • SumDocFreq is the total number of postings, the number of term-document pairs across the
+ *     entire field.
+ *   • DocCount is the number of documents that have at least one posting for this field.
+ *   • LongsSize records how many long values the postings writer/reader record per term (e.g., to
+ *     hold freq/prox/doc file offsets).
+ *   • MinTerm, MaxTerm are the lowest and highest term in this field.
+ *   • PostingsHeader and TermMetadata are plugged into by the specific postings implementation:
+ *     these contain arbitrary per-file data (such as parameters or versioning information) and
+ *     per-term data (such as pointers to inverted files).
+ *   • For inner nodes of the tree, every entry will steal one bit to mark whether it points to
+ *     child nodes (sub-block). If so, the corresponding TermStats and TermMetaData are omitted.
+ *
+ * Term Index
+ *
+ * The .tip file contains an index into the term dictionary, so that it can be accessed randomly.
+ * The index is also used to determine when a given term cannot exist on disk (in the .tim file),
+ * saving a disk seek.
+ *
+ *   • TermsIndex (.tip) --> Header, FSTIndex^NumFields <IndexStartFP>^NumFields, DirOffset, Footer
+ *   • Header --> {@link CodecUtil#writeHeader CodecHeader}
+ *   • DirOffset --> {@link DataOutput#writeLong Uint64}
+ *   • IndexStartFP --> {@link DataOutput#writeVLong VLong}
+ *   • FSTIndex --> {@link FST FST<byte[]>}
+ *   • Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ * Notes:
+ *
+ *   • The .tip file contains a separate FST for each field. The FST maps a term prefix to the
+ *     on-disk block that holds all terms starting with that prefix. Each field's IndexStartFP
+ *     points to its FST.
+ *   • DirOffset is a pointer to the start of the IndexStartFPs for all fields.
+ *   • It's possible that an on-disk block would contain too many terms (more than the allowed
+ *     maximum (default: 48)). When this happens, the block is sub-divided into new blocks (called
+ *     "floor blocks"), and then the output in the FST for the block's prefix encodes the leading
+ *     byte of each sub-block, and its file pointer.
+ * + * @see Lucene40BlockTreeTermsReader + * @lucene.experimental + */ +public final class Lucene40BlockTreeTermsWriter extends FieldsConsumer { + + /** + * Suggested default value for the {@code minItemsInBlock} parameter to {@link + * #Lucene40BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + */ + public static final int DEFAULT_MIN_BLOCK_SIZE = 25; + + /** + * Suggested default value for the {@code maxItemsInBlock} parameter to {@link + * #Lucene40BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + */ + public static final int DEFAULT_MAX_BLOCK_SIZE = 48; + + // public static boolean DEBUG = false; + // public static boolean DEBUG2 = false; + + // private final static boolean SAVE_DOT_FILES = false; + + private final IndexOutput metaOut; + private final IndexOutput termsOut; + private final IndexOutput indexOut; + final int maxDoc; + final int minItemsInBlock; + final int maxItemsInBlock; + + final PostingsWriterBase postingsWriter; + final FieldInfos fieldInfos; + + private final List fields = new ArrayList<>(); + + /** + * Create a new writer. The number of items (terms or sub-blocks) per block will aim to be between + * minItemsPerBlock and maxItemsPerBlock, though in some cases the blocks may be smaller than the + * min. + */ + public Lucene40BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock + ) throws IOException { + validateSettings(minItemsInBlock, maxItemsInBlock); + + this.minItemsInBlock = minItemsInBlock; + this.maxItemsInBlock = maxItemsInBlock; + + this.maxDoc = state.segmentInfo.maxDoc(); + this.fieldInfos = state.fieldInfos; + this.postingsWriter = postingsWriter; + + final String termsName = IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene40BlockTreeTermsReader.TERMS_EXTENSION + ); + termsOut = EndiannessReverserUtil.createOutput(state.directory, termsName, state.context); + boolean success = false; + IndexOutput metaOut = null, indexOut = null; + try { + CodecUtil.writeIndexHeader( + termsOut, + Lucene40BlockTreeTermsReader.TERMS_CODEC_NAME, + Lucene40BlockTreeTermsReader.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + + final String indexName = IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene40BlockTreeTermsReader.TERMS_INDEX_EXTENSION + ); + indexOut = EndiannessReverserUtil.createOutput(state.directory, indexName, state.context); + CodecUtil.writeIndexHeader( + indexOut, + Lucene40BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, + Lucene40BlockTreeTermsReader.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + // segment = state.segmentInfo.name; + + final String metaName = IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene40BlockTreeTermsReader.TERMS_META_EXTENSION + ); + metaOut = EndiannessReverserUtil.createOutput(state.directory, metaName, state.context); + CodecUtil.writeIndexHeader( + metaOut, + Lucene40BlockTreeTermsReader.TERMS_META_CODEC_NAME, + Lucene40BlockTreeTermsReader.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + + postingsWriter.init(metaOut, state); // have consumer write its format/header + + this.metaOut = metaOut; + this.indexOut = indexOut; + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut); + } + } + } + + /** Throws {@code IllegalArgumentException} if any of 
these settings is invalid. */ + public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) { + if (minItemsInBlock <= 1) { + throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); + } + if (minItemsInBlock > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock + ); + } + if (2 * (minItemsInBlock - 1) > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + + maxItemsInBlock + + " minItemsInBlock=" + + minItemsInBlock + ); + } + } + + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment); + + String lastField = null; + for (String field : fields) { + assert lastField == null || lastField.compareTo(field) < 0; + lastField = field; + + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment + " field=" + field); + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(); + TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); + while (true) { + BytesRef term = termsEnum.next(); + // if (DEBUG) System.out.println("BTTW: next term " + term); + + if (term == null) { + break; + } + + // if (DEBUG) System.out.println("write field=" + fieldInfo.name + " term=" + + // brToString(term)); + termsWriter.write(term, termsEnum, norms); + } + + termsWriter.finish(); + + // if (DEBUG) System.out.println("\nBTTW.write done seg=" + segment + " field=" + field); + } + } + + static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) { + assert fp < (1L << 62); + return (fp << 2) | (hasTerms ? Lucene40BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor + ? 
Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR + : 0); + } + + private static class PendingEntry { + public final boolean isTerm; + + protected PendingEntry(boolean isTerm) { + this.isTerm = isTerm; + } + } + + private static final class PendingTerm extends PendingEntry { + public final byte[] termBytes; + // stats + metadata + public final BlockTermState state; + + PendingTerm(BytesRef term, BlockTermState state) { + super(true); + this.termBytes = new byte[term.length]; + System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length); + this.state = state; + } + + @Override + public String toString() { + return "TERM: " + brToString(termBytes); + } + } + + // for debugging + @SuppressWarnings("unused") + static String brToString(BytesRef b) { + if (b == null) { + return "(null)"; + } else { + try { + return b.utf8ToString() + " " + b; + } catch (Throwable t) { + // If BytesRef isn't actually UTF8, or it's eg a + // prefix of UTF8 that ends mid-unicode-char, we + // fallback to hex: + return b.toString(); + } + } + } + + // for debugging + @SuppressWarnings("unused") + static String brToString(byte[] b) { + return brToString(new BytesRef(b)); + } + + private static final class PendingBlock extends PendingEntry { + public final BytesRef prefix; + public final long fp; + public FST index; + public List> subIndices; + public final boolean hasTerms; + public final boolean isFloor; + public final int floorLeadByte; + + PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List> subIndices) { + super(false); + this.prefix = prefix; + this.fp = fp; + this.hasTerms = hasTerms; + this.isFloor = isFloor; + this.floorLeadByte = floorLeadByte; + this.subIndices = subIndices; + } + + @Override + public String toString() { + return "BLOCK: prefix=" + brToString(prefix); + } + + public void compileIndex(List blocks, ByteBuffersDataOutput scratchBytes, IntsRefBuilder scratchIntsRef) + throws IOException { + + assert (isFloor && blocks.size() > 1) || (isFloor == false && blocks.size() == 1) : "isFloor=" + isFloor + " blocks=" + blocks; + assert this == blocks.get(0); + + assert scratchBytes.size() == 0; + + // TODO: try writing the leading vLong in MSB order + // (opposite of what Lucene does today), for better + // outputs sharing in the FST + scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor)); + if (isFloor) { + scratchBytes.writeVInt(blocks.size() - 1); + for (int i = 1; i < blocks.size(); i++) { + PendingBlock sub = blocks.get(i); + assert sub.floorLeadByte != -1; + // if (DEBUG) { + // System.out.println(" write floorLeadByte=" + + // Integer.toHexString(sub.floorLeadByte&0xff)); + // } + scratchBytes.writeByte((byte) sub.floorLeadByte); + assert sub.fp > fp; + scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 
1 : 0)); + } + } + + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).shouldShareNonSingletonNodes( + false + ).build(); + // if (DEBUG) { + // System.out.println(" compile index for prefix=" + prefix); + // } + // indexBuilder.DEBUG = false; + final byte[] bytes = scratchBytes.toArrayCopy(); + assert bytes.length > 0; + fstCompiler.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); + scratchBytes.reset(); + + // Copy over index for all sub-blocks + for (PendingBlock block : blocks) { + if (block.subIndices != null) { + for (FST subIndex : block.subIndices) { + append(fstCompiler, subIndex, scratchIntsRef); + } + block.subIndices = null; + } + } + + index = fstCompiler.compile(); + + assert subIndices == null; + + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); + Util.toDot(index, w, false, false); + System.out.println("SAVED to out.dot"); + w.close(); + */ + } + + // TODO: maybe we could add bulk-add method to + // Builder? Takes FST and unions it w/ current + // FST. + private void append(FSTCompiler fstCompiler, FST subIndex, IntsRefBuilder scratchIntsRef) throws IOException { + final BytesRefFSTEnum subIndexEnum = new BytesRefFSTEnum<>(subIndex); + BytesRefFSTEnum.InputOutput indexEnt; + while ((indexEnt = subIndexEnum.next()) != null) { + // if (DEBUG) { + // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + // + indexEnt.output); + // } + fstCompiler.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); + } + } + } + + private final ByteBuffersDataOutput scratchBytes = ByteBuffersDataOutput.newResettableInstance(); + private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); + + static final BytesRef EMPTY_BYTES_REF = new BytesRef(); + + private static class StatsWriter { + + private final DataOutput out; + private final boolean hasFreqs; + private int singletonCount; + + StatsWriter(DataOutput out, boolean hasFreqs) { + this.out = out; + this.hasFreqs = hasFreqs; + } + + void add(int df, long ttf) throws IOException { + // Singletons (DF==1, TTF==1) are run-length encoded + if (df == 1 && (hasFreqs == false || ttf == 1)) { + singletonCount++; + } else { + finish(); + out.writeVInt(df << 1); + if (hasFreqs) { + out.writeVLong(ttf - df); + } + } + } + + void finish() throws IOException { + if (singletonCount > 0) { + out.writeVInt(((singletonCount - 1) << 1) | 1); + singletonCount = 0; + } + } + } + + class TermsWriter { + private final FieldInfo fieldInfo; + private long numTerms; + final FixedBitSet docsSeen; + long sumTotalTermFreq; + long sumDocFreq; + + // Records index into pending where the current prefix at that + // length "started"; for example, if current term starts with 't', + // startsByPrefix[0] is the index into pending for the first + // term/sub-block starting with 't'. We use this to figure out when + // to write a new block: + private final BytesRefBuilder lastTerm = new BytesRefBuilder(); + private int[] prefixStarts = new int[8]; + + // Pending stack of terms and blocks. 
As terms arrive (in sorted order) + // we append to this stack, and once the top of the stack has enough + // terms starting with a common prefix, we write a new block with + // those terms and replace those terms in the stack with a new block: + private final List pending = new ArrayList<>(); + + // Reused in writeBlocks: + private final List newBlocks = new ArrayList<>(); + + private PendingTerm firstPendingTerm; + private PendingTerm lastPendingTerm; + + /** Writes the top count entries in pending, using prevTerm to compute the prefix. */ + void writeBlocks(int prefixLength, int count) throws IOException { + + assert count > 0; + + // if (DEBUG2) { + // BytesRef br = new BytesRef(lastTerm.bytes()); + // br.length = prefixLength; + // System.out.println("writeBlocks: seg=" + segment + " prefix=" + brToString(br) + " count=" + // + count); + // } + + // Root block better write all remaining pending entries: + assert prefixLength > 0 || count == pending.size(); + + int lastSuffixLeadLabel = -1; + + // True if we saw at least one term in this block (we record if a block + // only points to sub-blocks in the terms index so we can avoid seeking + // to it when we are looking for a term): + boolean hasTerms = false; + boolean hasSubBlocks = false; + + int start = pending.size() - count; + int end = pending.size(); + int nextBlockStart = start; + int nextFloorLeadLabel = -1; + + for (int i = start; i < end; i++) { + + PendingEntry ent = pending.get(i); + + int suffixLeadLabel; + + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + if (term.termBytes.length == prefixLength) { + // Suffix is 0, i.e. prefix 'foo' and term is + // 'foo' so the term has empty string suffix + // in this block + assert lastSuffixLeadLabel == -1 : "i=" + i + " lastSuffixLeadLabel=" + lastSuffixLeadLabel; + suffixLeadLabel = -1; + } else { + suffixLeadLabel = term.termBytes[prefixLength] & 0xff; + } + } else { + PendingBlock block = (PendingBlock) ent; + assert block.prefix.length > prefixLength; + suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff; + } + // if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + + // suffixLeadLabel); + + if (suffixLeadLabel != lastSuffixLeadLabel) { + int itemsInBlock = i - nextBlockStart; + if (itemsInBlock >= minItemsInBlock && end - nextBlockStart > maxItemsInBlock) { + // The count is too large for one block, so we must break it into "floor" blocks, where + // we record + // the leading label of the suffix of the first term in each floor block, so at search + // time we can + // jump to the right floor block. We just use a naive greedy segmenter here: make a new + // floor + // block as soon as we have at least minItemsInBlock. 
This is not always best: it often + // produces + // a too-small block as the final block: + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasSubBlocks)); + + hasTerms = false; + hasSubBlocks = false; + nextFloorLeadLabel = suffixLeadLabel; + nextBlockStart = i; + } + + lastSuffixLeadLabel = suffixLeadLabel; + } + + if (ent.isTerm) { + hasTerms = true; + } else { + hasSubBlocks = true; + } + } + + // Write last block, if any: + if (nextBlockStart < end) { + int itemsInBlock = end - nextBlockStart; + boolean isFloor = itemsInBlock < count; + newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasSubBlocks)); + } + + assert newBlocks.isEmpty() == false; + + PendingBlock firstBlock = newBlocks.get(0); + + assert firstBlock.isFloor || newBlocks.size() == 1; + + firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef); + + // Remove slice from the top of the pending stack, that we just wrote: + pending.subList(pending.size() - count, pending.size()).clear(); + + // Append new block + pending.add(firstBlock); + + newBlocks.clear(); + } + + private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) { + Objects.checkFromToIndex(startOffset, endOffset, b.length); + for (int i = startOffset; i < endOffset; ++i) { + if (b[i] != value) { + return false; + } + } + return true; + } + + /** + * Writes the specified slice (start is inclusive, end is exclusive) from pending stack as a new + * block. If isFloor is true, there were too many (more than maxItemsInBlock) entries sharing + * the same prefix, and so we broke it into multiple floor blocks where we record the starting + * label of the suffix of each floor block. + */ + private PendingBlock writeBlock( + int prefixLength, + boolean isFloor, + int floorLeadLabel, + int start, + int end, + boolean hasTerms, + boolean hasSubBlocks + ) throws IOException { + + assert end > start; + + long startFP = termsOut.getFilePointer(); + + boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1; + + final BytesRef prefix = new BytesRef(prefixLength + (hasFloorLeadLabel ? 1 : 0)); + System.arraycopy(lastTerm.get().bytes, 0, prefix.bytes, 0, prefixLength); + prefix.length = prefixLength; + + // if (DEBUG2) System.out.println(" writeBlock field=" + fieldInfo.name + " prefix=" + + // brToString(prefix) + " fp=" + startFP + " isFloor=" + isFloor + " isLastInFloor=" + (end == + // pending.size()) + " floorLeadLabel=" + floorLeadLabel + " start=" + start + " end=" + end + + // " hasTerms=" + hasTerms + " hasSubBlocks=" + hasSubBlocks); + + // Write block header: + int numEntries = end - start; + int code = numEntries << 1; + if (end == pending.size()) { + // Last block: + code |= 1; + } + termsOut.writeVInt(code); + + // 1st pass: pack term suffix bytes into byte[] blob + // TODO: cutover to bulk int codec... simple64? 
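+ // The block body is assembled from separate scratch buffers that are flushed to
+ // termsOut in order: packed suffix bytes (possibly compressed), suffix lengths,
+ // term stats, and term metadata, each preceded by a length header.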
+ + // We optimize the leaf block case (block has only terms), writing a more + // compact format in this case: + boolean isLeafBlock = hasSubBlocks == false; + + // System.out.println(" isLeaf=" + isLeafBlock); + + final List> subIndices; + + boolean absolute = true; + + if (isLeafBlock) { + // Block contains only ordinary terms: + subIndices = null; + StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + assert ent.isTerm : "i=" + i; + + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + brToString(suffixBytes)); + // } + + // For leaf block we write suffix straight + suffixLengthsWriter.writeVInt(suffix); + suffixWriter.append(term.termBytes, prefixLength, suffix); + assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel; + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } + statsWriter.finish(); + } else { + // Block has at least one prefix term or a sub block: + subIndices = new ArrayList<>(); + StatsWriter statsWriter = new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + brToString(suffixBytes)); + // } + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block, and 1 bit to record if + // it's a prefix term. Terms cannot be larger than ~32 KB + // so we won't run out of bits: + + suffixLengthsWriter.writeVInt(suffix << 1); + suffixWriter.append(term.termBytes, prefixLength, suffix); + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // TODO: now that terms dict "sees" these longs, + // we can explore better column-stride encodings + // to encode all long[0]s for this block at + // once, all long[1]s, etc., e.g. using + // Simple64. Alternatively, we could interleave + // stats + meta ... 
no reason to have them + // separate anymore: + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } else { + PendingBlock block = (PendingBlock) ent; + assert StringHelper.startsWith(block.prefix, prefix); + final int suffix = block.prefix.length - prefixLength; + assert StringHelper.startsWith(block.prefix, prefix); + + assert suffix > 0; + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block:f + suffixLengthsWriter.writeVInt((suffix << 1) | 1); + suffixWriter.append(block.prefix.bytes, prefixLength, suffix); + + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write sub-block suffix=" + brToString(suffixBytes) + " + // subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor); + // } + + assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel + : "floorLeadLabel=" + floorLeadLabel + " suffixLead=" + (block.prefix.bytes[prefixLength] & 0xff); + assert block.fp < startFP; + + suffixLengthsWriter.writeVLong(startFP - block.fp); + subIndices.add(block.index); + } + } + statsWriter.finish(); + + assert subIndices.size() != 0; + } + + // Write suffixes byte[] blob to terms dict output, either uncompressed, compressed with LZ4 + // or with LowercaseAsciiCompression. + CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION; + // If there are 2 suffix bytes or less per term, then we don't bother compressing as suffix + // are unlikely what + // makes the terms dictionary large, and it also tends to be frequently the case for dense IDs + // like + // auto-increment IDs, so not compressing in that case helps not hurt ID lookups by too much. + // We also only start compressing when the prefix length is greater than 2 since blocks whose + // prefix length is + // 1 or 2 always all get visited when running a fuzzy query whose max number of edits is 2. + if (suffixWriter.length() > 2L * numEntries && prefixLength > 2) { + // LZ4 inserts references whenever it sees duplicate strings of 4 chars or more, so only try + // it out if the + // average suffix length is greater than 6. 
+ if (suffixWriter.length() > 6L * numEntries) { + LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable); + if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) { + // LZ4 saved more than 25%, go for it + compressionAlg = CompressionAlgorithm.LZ4; + } + } + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + spareWriter.reset(); + if (spareBytes.length < suffixWriter.length()) { + spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)]; + } + if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) { + compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII; + } + } + } + long token = ((long) suffixWriter.length()) << 3; + if (isLeafBlock) { + token |= 0x04; + } + token |= compressionAlg.code; + termsOut.writeVLong(token); + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + termsOut.writeBytes(suffixWriter.bytes(), suffixWriter.length()); + } else { + spareWriter.copyTo(termsOut); + } + suffixWriter.setLength(0); + spareWriter.reset(); + + // Write suffix lengths + final int numSuffixBytes = Math.toIntExact(suffixLengthsWriter.size()); + spareBytes = ArrayUtil.grow(spareBytes, numSuffixBytes); + suffixLengthsWriter.copyTo(new ByteArrayDataOutput(spareBytes)); + suffixLengthsWriter.reset(); + if (allEqual(spareBytes, 1, numSuffixBytes, spareBytes[0])) { + // Structured fields like IDs often have most values of the same length + termsOut.writeVInt((numSuffixBytes << 1) | 1); + termsOut.writeByte(spareBytes[0]); + } else { + termsOut.writeVInt(numSuffixBytes << 1); + termsOut.writeBytes(spareBytes, numSuffixBytes); + } + + // Stats + final int numStatsBytes = Math.toIntExact(statsWriter.size()); + termsOut.writeVInt(numStatsBytes); + statsWriter.copyTo(termsOut); + statsWriter.reset(); + + // Write term meta data byte[] blob + termsOut.writeVInt((int) metaWriter.size()); + metaWriter.copyTo(termsOut); + metaWriter.reset(); + + // if (DEBUG) { + // System.out.println(" fpEnd=" + out.getFilePointer()); + // } + + if (hasFloorLeadLabel) { + // We already allocated to length+1 above: + prefix.bytes[prefix.length++] = (byte) floorLeadLabel; + } + + return new PendingBlock(prefix, startFP, hasTerms, isFloor, floorLeadLabel, subIndices); + } + + TermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + docsSeen = new FixedBitSet(maxDoc); + postingsWriter.setField(fieldInfo); + } + + /** Writes one term's worth of postings. */ + public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException { + + BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms); + if (state != null) { + + assert state.docFreq != 0; + assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq + : "postingsWriter=" + postingsWriter; + pushTerm(text); + + PendingTerm term = new PendingTerm(text, state); + pending.add(term); + // if (DEBUG) System.out.println(" add pending term = " + text + " pending.size()=" + + // pending.size()); + + sumDocFreq += state.docFreq; + sumTotalTermFreq += state.totalTermFreq; + numTerms++; + if (firstPendingTerm == null) { + firstPendingTerm = term; + } + lastPendingTerm = term; + } + } + + /** Pushes the new term to the top of the stack, and writes new blocks. 
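+ * Whenever the shared prefix with the previous term shrinks, any abandoned prefix that has
+ * accumulated at least minItemsInBlock pending entries is flushed as a block by writeBlocks.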
*/ + private void pushTerm(BytesRef text) throws IOException { + // Find common prefix between last term and current term: + int prefixLength = Arrays.mismatch(lastTerm.bytes(), 0, lastTerm.length(), text.bytes, text.offset, text.offset + text.length); + if (prefixLength == -1) { // Only happens for the first term, if it is empty + assert lastTerm.length() == 0; + prefixLength = 0; + } + + // if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length); + + // Close the "abandoned" suffix now: + for (int i = lastTerm.length() - 1; i >= prefixLength; i--) { + + // How many items on top of the stack share the current suffix + // we are closing: + int prefixTopSize = pending.size() - prefixStarts[i]; + if (prefixTopSize >= minItemsInBlock) { + // if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " + // minItemsInBlock=" + minItemsInBlock); + writeBlocks(i + 1, prefixTopSize); + prefixStarts[i] -= prefixTopSize - 1; + } + } + + if (prefixStarts.length < text.length) { + prefixStarts = ArrayUtil.grow(prefixStarts, text.length); + } + + // Init new tail: + for (int i = prefixLength; i < text.length; i++) { + prefixStarts[i] = pending.size(); + } + + lastTerm.copyBytes(text); + } + + // Finishes all terms in this field + public void finish() throws IOException { + if (numTerms > 0) { + // if (DEBUG) System.out.println("BTTW: finish prefixStarts=" + + // Arrays.toString(prefixStarts)); + + // Add empty term to force closing of all final blocks: + pushTerm(new BytesRef()); + + // TODO: if pending.size() is already 1 with a non-zero prefix length + // we can save writing a "degenerate" root block, but we have to + // fix all the places that assume the root block's prefix is the empty string: + pushTerm(new BytesRef()); + writeBlocks(0, pending.size()); + + // We better have one final "root" block: + assert pending.size() == 1 && pending.get(0).isTerm == false : "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + assert root.prefix.length == 0; + final BytesRef rootCode = root.index.getEmptyOutput(); + assert rootCode != null; + + ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput(); + fields.add(metaOut); + + metaOut.writeVInt(fieldInfo.number); + metaOut.writeVLong(numTerms); + metaOut.writeVInt(rootCode.length); + metaOut.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length); + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) { + metaOut.writeVLong(sumTotalTermFreq); + } + metaOut.writeVLong(sumDocFreq); + metaOut.writeVInt(docsSeen.cardinality()); + writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes)); + writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes)); + metaOut.writeVLong(indexOut.getFilePointer()); + // Write FST to index + root.index.save(metaOut, indexOut); + // System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name); + + /* + if (DEBUG) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(root.index, w, false, false); + System.out.println("SAVED to " + dotFileName); + w.close(); + } + */ + + } else { + assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1; + assert sumDocFreq == 0; + assert docsSeen.cardinality() == 0; + } + } + + private final ByteBuffersDataOutput 
suffixLengthsWriter = ByteBuffersDataOutput.newResettableInstance(); + private final BytesRefBuilder suffixWriter = new BytesRefBuilder(); + private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput spareWriter = ByteBuffersDataOutput.newResettableInstance(); + private byte[] spareBytes = BytesRef.EMPTY_BYTES; + private final LZ4.HighCompressionHashTable compressionHashTable = new LZ4.HighCompressionHashTable(); + } + + private boolean closed; + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + + boolean success = false; + try { + metaOut.writeVInt(fields.size()); + for (ByteBuffersDataOutput fieldMeta : fields) { + fieldMeta.copyTo(metaOut); + } + CodecUtil.writeFooter(indexOut); + metaOut.writeLong(indexOut.getFilePointer()); + CodecUtil.writeFooter(termsOut); + metaOut.writeLong(termsOut.getFilePointer()); + CodecUtil.writeFooter(metaOut); + success = true; + } finally { + if (success) { + IOUtils.close(metaOut, termsOut, indexOut, postingsWriter); + } else { + IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter); + } + } + } + + private static void writeBytesRef(DataOutput out, BytesRef bytes) throws IOException { + out.writeVInt(bytes.length); + out.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat2Tests.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat2Tests.java new file mode 100644 index 0000000000000..a5422a03c1f22 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat2Tests.java @@ -0,0 +1,149 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; + +/** Tests special cases of BlockPostingsFormat */ +public class BlockPostingsFormat2Tests extends LuceneTestCase { + Directory dir; + RandomIndexWriter iw; + + @Override + public void setUp() throws Exception { + super.setUp(); + dir = newFSDirectory(createTempDir("testDFBlockSize")); + IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50RWPostingsFormat())); + iw = new RandomIndexWriter(random(), dir, iwc); + iw.setDoRandomForceMerge(false); // we will ourselves + } + + @Override + public void tearDown() throws Exception { + iw.close(); + TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge + IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50RWPostingsFormat())); + iwc.setOpenMode(OpenMode.APPEND); + IndexWriter iw = new IndexWriter(dir, iwc); + iw.forceMerge(1); + iw.close(); + dir.close(); // just force a checkindex for now + super.tearDown(); + } + + private Document newDocument() { + Document doc = new Document(); + for (IndexOptions option : IndexOptions.values()) { + if (option == IndexOptions.NONE) { + continue; + } + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + // turn on tvs for a cross-check, since we rely upon checkindex in this test (for now) + ft.setStoreTermVectors(true); + ft.setStoreTermVectorOffsets(true); + ft.setStoreTermVectorPositions(true); + ft.setStoreTermVectorPayloads(true); + ft.setIndexOptions(option); + doc.add(new Field(option.toString(), "", ft)); + } + return doc; + } + + /** tests terms with df = blocksize */ + public void testDFBlockSize() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE; i++) { + for (IndexableField f : doc.getFields()) { + ((Field) f).setStringValue(f.name() + " " + f.name() + "_2"); + } + iw.addDocument(doc); + } + } + + /** tests terms with df % blocksize = 0 */ + public void testDFBlockSizeMultiple() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE * 16; i++) { + for (IndexableField f : doc.getFields()) { + ((Field) f).setStringValue(f.name() + " " + f.name() + "_2"); + } + iw.addDocument(doc); + } + } + + /** tests terms with ttf = blocksize */ + public void testTTFBlockSize() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE / 2; i++) { + for (IndexableField f : doc.getFields()) { + ((Field) f).setStringValue(f.name() + " " + f.name() + " " + f.name() + "_2 " + f.name() + "_2"); + } + iw.addDocument(doc); + } + } + + /** tests terms with ttf % 
blocksize = 0 */ + public void testTTFBlockSizeMultiple() throws Exception { + Document doc = newDocument(); + for (int i = 0; i < Lucene50PostingsFormat.BLOCK_SIZE / 2; i++) { + for (IndexableField f : doc.getFields()) { + String proto = (f.name() + + " " + + f.name() + + " " + + f.name() + + " " + + f.name() + + " " + + f.name() + + "_2 " + + f.name() + + "_2 " + + f.name() + + "_2 " + + f.name() + + "_2"); + StringBuilder val = new StringBuilder(); + for (int j = 0; j < 16; j++) { + val.append(proto); + val.append(" "); + } + ((Field) f).setStringValue(val.toString()); + } + iw.addDocument(doc); + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat3Tests.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat3Tests.java new file mode 100644 index 0000000000000..805fd67b03893 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormat3Tests.java @@ -0,0 +1,477 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockFixedLengthPayloadFilter; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.analysis.MockVariableLengthPayloadFilter; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.English; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.Random; + +/** Tests partial enumeration (only pulling a subset of the indexed data) */ +public class BlockPostingsFormat3Tests extends LuceneTestCase { + private final int MAXDOC = TEST_NIGHTLY ? Lucene50PostingsFormat.BLOCK_SIZE * 20 : Lucene50PostingsFormat.BLOCK_SIZE * 3; + + // creates 8 fields with different options and does "duels" of fields against each other + public void test() throws Exception { + Directory dir = newDirectory(); + Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(); + if (fieldName.contains("payloadsFixed")) { + TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1); + return new TokenStreamComponents(tokenizer, filter); + } else if (fieldName.contains("payloadsVariable")) { + TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } else { + return new TokenStreamComponents(tokenizer); + } + } + }; + IndexWriterConfig iwc = newIndexWriterConfig(analyzer); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50RWPostingsFormat())); + // TODO we could actually add more fields implemented with different PFs + // or, just put this test into the usual rotation? 
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + Document doc = new Document(); + FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED); + // turn this on for a cross-check + docsOnlyType.setStoreTermVectors(true); + docsOnlyType.setIndexOptions(IndexOptions.DOCS); + + FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED); + // turn this on for a cross-check + docsAndFreqsType.setStoreTermVectors(true); + docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); + + FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED); + // turn these on for a cross-check + positionsType.setStoreTermVectors(true); + positionsType.setStoreTermVectorPositions(true); + positionsType.setStoreTermVectorOffsets(true); + positionsType.setStoreTermVectorPayloads(true); + FieldType offsetsType = new FieldType(positionsType); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field field1 = new Field("field1docs", "", docsOnlyType); + Field field2 = new Field("field2freqs", "", docsAndFreqsType); + Field field3 = new Field("field3positions", "", positionsType); + Field field4 = new Field("field4offsets", "", offsetsType); + Field field5 = new Field("field5payloadsFixed", "", positionsType); + Field field6 = new Field("field6payloadsVariable", "", positionsType); + Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType); + Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType); + doc.add(field1); + doc.add(field2); + doc.add(field3); + doc.add(field4); + doc.add(field5); + doc.add(field6); + doc.add(field7); + doc.add(field8); + for (int i = 0; i < MAXDOC; i++) { + String stringValue = Integer.toString(i) + + " verycommon " + + English.intToEnglish(i).replace('-', ' ') + + " " + + TestUtil.randomSimpleString(random()); + field1.setStringValue(stringValue); + field2.setStringValue(stringValue); + field3.setStringValue(stringValue); + field4.setStringValue(stringValue); + field5.setStringValue(stringValue); + field6.setStringValue(stringValue); + field7.setStringValue(stringValue); + field8.setStringValue(stringValue); + iw.addDocument(doc); + } + iw.close(); + verify(dir); + TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge + iwc = newIndexWriterConfig(analyzer); + iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50RWPostingsFormat())); + iwc.setOpenMode(OpenMode.APPEND); + IndexWriter iw2 = new IndexWriter(dir, iwc); + iw2.forceMerge(1); + iw2.close(); + verify(dir); + dir.close(); + } + + private void verify(Directory dir) throws Exception { + DirectoryReader ir = DirectoryReader.open(dir); + for (LeafReaderContext leaf : ir.leaves()) { + LeafReader leafReader = leaf.reader(); + assertTerms(leafReader.terms("field1docs"), leafReader.terms("field2freqs"), true); + assertTerms(leafReader.terms("field3positions"), leafReader.terms("field4offsets"), true); + assertTerms(leafReader.terms("field4offsets"), leafReader.terms("field5payloadsFixed"), true); + assertTerms(leafReader.terms("field5payloadsFixed"), leafReader.terms("field6payloadsVariable"), true); + assertTerms(leafReader.terms("field6payloadsVariable"), leafReader.terms("field7payloadsFixedOffsets"), true); + assertTerms(leafReader.terms("field7payloadsFixedOffsets"), leafReader.terms("field8payloadsVariableOffsets"), true); + } + ir.close(); + } + + // following code is almost an exact dup of code from TestDuelingCodecs: sorry! 
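+    // The helpers below walk the two fields' TermsEnums in lockstep: the same terms in the same order,
+    // the same docFreq/totalTermFreq where both fields index freqs, and the same doc IDs, positions and
+    // advance() behaviour, so any inconsistency between two fields that index the same content with
+    // different options fails an assertion rather than going unnoticed.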
+ + public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception { + if (leftTerms == null || rightTerms == null) { + assertNull(leftTerms); + assertNull(rightTerms); + return; + } + assertTermsStatistics(leftTerms, rightTerms); + + // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be + // different + + boolean bothHaveFreqs = leftTerms.hasFreqs() && rightTerms.hasFreqs(); + boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions(); + TermsEnum leftTermsEnum = leftTerms.iterator(); + TermsEnum rightTermsEnum = rightTerms.iterator(); + assertTermsEnum(leftTermsEnum, rightTermsEnum, true, bothHaveFreqs, bothHavePositions); + + assertTermsSeeking(leftTerms, rightTerms); + + if (deep) { + int numIntersections = atLeast(3); + for (int i = 0; i < numIntersections; i++) { + String re = AutomatonTestUtil.randomRegexp(random()); + CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton()); + if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + // TODO: test start term too + TermsEnum leftIntersection = leftTerms.intersect(automaton, null); + TermsEnum rightIntersection = rightTerms.intersect(automaton, null); + assertTermsEnum(leftIntersection, rightIntersection, rarely(), bothHaveFreqs, bothHavePositions); + } + } + } + } + + private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception { + TermsEnum leftEnum = null; + TermsEnum rightEnum = null; + + // just an upper bound + int numTests = atLeast(20); + Random random = random(); + + // collect this number of terms from the left side + HashSet tests = new HashSet<>(); + int numPasses = 0; + while (numPasses < 10 && tests.size() < numTests) { + leftEnum = leftTerms.iterator(); + BytesRef term = null; + while ((term = leftEnum.next()) != null) { + int code = random.nextInt(10); + if (code == 0) { + // the term + tests.add(BytesRef.deepCopyOf(term)); + } else if (code == 1) { + // truncated subsequence of term + term = BytesRef.deepCopyOf(term); + if (term.length > 0) { + // truncate it + term.length = random.nextInt(term.length); + } + } else if (code == 2) { + // term, but ensure a non-zero offset + byte[] newbytes = new byte[term.length + 5]; + System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length); + tests.add(new BytesRef(newbytes, 5, term.length)); + } + } + numPasses++; + } + + ArrayList shuffledTests = new ArrayList<>(tests); + Collections.shuffle(shuffledTests, random); + + for (BytesRef b : shuffledTests) { + leftEnum = leftTerms.iterator(); + rightEnum = rightTerms.iterator(); + + assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); + assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); + + SeekStatus leftStatus; + SeekStatus rightStatus; + + leftStatus = leftEnum.seekCeil(b); + rightStatus = rightEnum.seekCeil(b); + assertEquals(leftStatus, rightStatus); + if (leftStatus != SeekStatus.END) { + assertEquals(leftEnum.term(), rightEnum.term()); + } + + leftStatus = leftEnum.seekCeil(b); + rightStatus = rightEnum.seekCeil(b); + assertEquals(leftStatus, rightStatus); + if (leftStatus != SeekStatus.END) { + assertEquals(leftEnum.term(), rightEnum.term()); + } + } + } + + /** checks collection-level statistics on Terms */ + public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception { + assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount()); + assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq()); + if 
(leftTerms.hasFreqs() && rightTerms.hasFreqs()) { + assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq()); + } + if (leftTerms.size() != -1 && rightTerms.size() != -1) { + assertEquals(leftTerms.size(), rightTerms.size()); + } + } + + /** + * checks the terms enum sequentially if deep is false, it does a 'shallow' test that doesnt go + * down to the docsenums + */ + public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep, boolean hasFreqs, boolean hasPositions) + throws Exception { + BytesRef term; + PostingsEnum leftPositions = null; + PostingsEnum rightPositions = null; + PostingsEnum leftDocs = null; + PostingsEnum rightDocs = null; + + while ((term = leftTermsEnum.next()) != null) { + assertEquals(term, rightTermsEnum.next()); + assertTermStats(leftTermsEnum, rightTermsEnum, hasFreqs); + if (deep) { + if (hasPositions) { + // with payloads + off + assertDocsAndPositionsEnum( + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL) + ); + + assertPositionsSkipping( + leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL) + ); + // with payloads only + assertDocsAndPositionsEnum( + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS) + ); + + assertPositionsSkipping( + leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS) + ); + + // with offsets only + assertDocsAndPositionsEnum( + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS) + ); + + assertPositionsSkipping( + leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS) + ); + + // with positions only + assertDocsAndPositionsEnum( + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS) + ); + + assertPositionsSkipping( + leftTermsEnum.docFreq(), + leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS), + rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS) + ); + } + + // with freqs: + assertDocsEnum(leftDocs = leftTermsEnum.postings(leftDocs), rightDocs = rightTermsEnum.postings(rightDocs)); + + // w/o freqs: + assertDocsEnum( + leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE), + rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE) + ); + + // with freqs: + assertDocsSkipping( + leftTermsEnum.docFreq(), + leftDocs = leftTermsEnum.postings(leftDocs), + rightDocs = rightTermsEnum.postings(rightDocs) + ); + + // w/o freqs: + assertDocsSkipping( + leftTermsEnum.docFreq(), + leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE), + rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE) + ); + } + } + assertNull(rightTermsEnum.next()); + } + + /** checks term-level statistics */ + public void assertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean bothHaveFreqs) throws Exception 
{ + assertEquals(leftTermsEnum.docFreq(), rightTermsEnum.docFreq()); + if (bothHaveFreqs) { + assertEquals(leftTermsEnum.totalTermFreq(), rightTermsEnum.totalTermFreq()); + } + } + + /** checks docs + freqs + positions + payloads, sequentially */ + public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { + assertNotNull(leftDocs); + assertNotNull(rightDocs); + assertEquals(-1, leftDocs.docID()); + assertEquals(-1, rightDocs.docID()); + int docid; + while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + assertEquals(docid, rightDocs.nextDoc()); + int freq = leftDocs.freq(); + assertEquals(freq, rightDocs.freq()); + for (int i = 0; i < freq; i++) { + assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition()); + // we don't assert offsets/payloads, they are allowed to be different + } + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc()); + } + + /** checks docs + freqs, sequentially */ + public void assertDocsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { + if (leftDocs == null) { + assertNull(rightDocs); + return; + } + assertEquals(-1, leftDocs.docID()); + assertEquals(-1, rightDocs.docID()); + int docid; + while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + assertEquals(docid, rightDocs.nextDoc()); + // we don't assert freqs, they are allowed to be different + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc()); + } + + /** checks advancing docs */ + public void assertDocsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { + if (leftDocs == null) { + assertNull(rightDocs); + return; + } + int docid = -1; + int averageGap = MAXDOC / (1 + docFreq); + int skipInterval = 16; + + while (true) { + if (random().nextBoolean()) { + // nextDoc() + docid = leftDocs.nextDoc(); + assertEquals(docid, rightDocs.nextDoc()); + } else { + // advance() + int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap)); + docid = leftDocs.advance(skip); + assertEquals(docid, rightDocs.advance(skip)); + } + + if (docid == DocIdSetIterator.NO_MORE_DOCS) { + return; + } + // we don't assert freqs, they are allowed to be different + } + } + + /** checks advancing docs + positions */ + public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception { + if (leftDocs == null || rightDocs == null) { + assertNull(leftDocs); + assertNull(rightDocs); + return; + } + + int docid = -1; + int averageGap = MAXDOC / (1 + docFreq); + int skipInterval = 16; + + while (true) { + if (random().nextBoolean()) { + // nextDoc() + docid = leftDocs.nextDoc(); + assertEquals(docid, rightDocs.nextDoc()); + } else { + // advance() + int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap)); + docid = leftDocs.advance(skip); + assertEquals(docid, rightDocs.advance(skip)); + } + + if (docid == DocIdSetIterator.NO_MORE_DOCS) { + return; + } + int freq = leftDocs.freq(); + assertEquals(freq, rightDocs.freq()); + for (int i = 0; i < freq; i++) { + assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition()); + // we don't compare the payloads, it's allowed that one is empty etc + } + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormatTests.java 
b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormatTests.java new file mode 100644 index 0000000000000..bc04dceea30cd --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BlockPostingsFormatTests.java @@ -0,0 +1,138 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.index.BasePostingsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree.FieldReader; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree.Stats; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.Lucene50ScoreSkipReader.MutableImpactList; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** Tests BlockPostingsFormat */ +public class BlockPostingsFormatTests extends BasePostingsFormatTestCase { + private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene50RWPostingsFormat()); + + @Override + protected Codec getCodec() { + return codec; + } + + /** Make sure the final sub-block(s) are not skipped. 
*/ + public void testFinalBlock() throws Exception { + Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random()))); + for (int i = 0; i < 25; i++) { + Document doc = new Document(); + doc.add(newStringField("field", Character.toString((char) (97 + i)), Field.Store.NO)); + doc.add(newStringField("field", "z" + Character.toString((char) (97 + i)), Field.Store.NO)); + w.addDocument(doc); + } + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + assertEquals(1, r.leaves().size()); + FieldReader field = (FieldReader) r.leaves().get(0).reader().terms("field"); + // We should see exactly two blocks: one root block (prefix empty string) and one block for z* + // terms (prefix z): + Stats stats = field.getStats(); + assertEquals(0, stats.floorBlockCount); + assertEquals(2, stats.nonFloorBlockCount); + r.close(); + w.close(); + d.close(); + } + + public void testImpactSerialization() throws IOException { + // omit norms and omit freqs + doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L))); + + // omit freqs + doTestImpactSerialization(Collections.singletonList(new Impact(1, 42L))); + // omit freqs with very large norms + doTestImpactSerialization(Collections.singletonList(new Impact(1, -100L))); + + // omit norms + doTestImpactSerialization(Collections.singletonList(new Impact(30, 1L))); + // omit norms with large freq + doTestImpactSerialization(Collections.singletonList(new Impact(500, 1L))); + + // freqs and norms, basic + doTestImpactSerialization( + Arrays.asList( + new Impact(1, 7L), + new Impact(3, 9L), + new Impact(7, 10L), + new Impact(15, 11L), + new Impact(20, 13L), + new Impact(28, 14L) + ) + ); + + // freqs and norms, high values + doTestImpactSerialization( + Arrays.asList( + new Impact(2, 2L), + new Impact(10, 10L), + new Impact(12, 50L), + new Impact(50, -100L), + new Impact(1000, -80L), + new Impact(1005, -3L) + ) + ); + } + + private void doTestImpactSerialization(List impacts) throws IOException { + CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator(); + for (Impact impact : impacts) { + acc.add(impact.freq, impact.norm); + } + try (Directory dir = newDirectory()) { + try (IndexOutput out = EndiannessReverserUtil.createOutput(dir, "foo", IOContext.DEFAULT)) { + Lucene50SkipWriter.writeImpacts(acc, out); + } + try (IndexInput in = EndiannessReverserUtil.openInput(dir, "foo", IOContext.DEFAULT)) { + byte[] b = new byte[Math.toIntExact(in.length())]; + in.readBytes(b, 0, b.length); + List impacts2 = Lucene50ScoreSkipReader.readImpacts(new ByteArrayDataInput(b), new MutableImpactList()); + assertEquals(impacts, impacts2); + } + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java new file mode 100644 index 0000000000000..7e3a92acc4682 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java @@ -0,0 +1,513 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.PushPostingsWriterBase; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.IntBlockTermState; + +import java.io.IOException; + +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.BLOCK_SIZE; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.DOC_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.DOC_EXTENSION; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.MAX_SKIP_LEVELS; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.PAY_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.PAY_EXTENSION; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.POS_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.POS_EXTENSION; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.TERMS_CODEC; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat.VERSION_CURRENT; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.ForUtil.MAX_DATA_SIZE; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.ForUtil.MAX_ENCODED_SIZE; + +/** + * Concrete class that writes docId(maybe frq,pos,offset,payloads) list + * with postings format. + * + * Postings list for each term will be stored separately. + * + * @see Lucene50SkipWriter for details about skipping setting and postings layout. 
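+ * In short: doc deltas (and freqs, where indexed) go to the DOC_EXTENSION stream, position deltas to the
+ * POS_EXTENSION stream for fields with positions, and payload bytes / offset deltas to the PAY_EXTENSION
+ * stream when payloads or offsets are present; each stream is written in packed blocks of BLOCK_SIZE
+ * entries via ForUtil, with any remainder vInt-encoded in finishTerm.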
+ * @lucene.experimental + */ +public final class Lucene50PostingsWriter extends PushPostingsWriterBase { + + IndexOutput docOut; + IndexOutput posOut; + IndexOutput payOut; + + static final IntBlockTermState emptyState = new IntBlockTermState(); + IntBlockTermState lastState; + + // Holds starting file pointers for current term: + private long docStartFP; + private long posStartFP; + private long payStartFP; + + final int[] docDeltaBuffer; + final int[] freqBuffer; + private int docBufferUpto; + + final int[] posDeltaBuffer; + final int[] payloadLengthBuffer; + final int[] offsetStartDeltaBuffer; + final int[] offsetLengthBuffer; + private int posBufferUpto; + + private byte[] payloadBytes; + private int payloadByteUpto; + + private int lastBlockDocID; + private long lastBlockPosFP; + private long lastBlockPayFP; + private int lastBlockPosBufferUpto; + private int lastBlockPayloadByteUpto; + + private int lastDocID; + private int lastPosition; + private int lastStartOffset; + private int docCount; + + final byte[] encoded; + + private final ForUtil forUtil; + private final Lucene50SkipWriter skipWriter; + + private boolean fieldHasNorms; + private NumericDocValues norms; + private final CompetitiveImpactAccumulator competitiveFreqNormAccumulator = new CompetitiveImpactAccumulator(); + + /** Creates a postings writer */ + public Lucene50PostingsWriter(SegmentWriteState state) throws IOException { + final float acceptableOverheadRatio = PackedInts.COMPACT; + + String docFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DOC_EXTENSION); + docOut = state.directory.createOutput(docFileName, state.context); + IndexOutput posOut = null; + IndexOutput payOut = null; + boolean success = false; + try { + CodecUtil.writeIndexHeader(docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + forUtil = new ForUtil(acceptableOverheadRatio, docOut); + if (state.fieldInfos.hasProx()) { + posDeltaBuffer = new int[MAX_DATA_SIZE]; + String posFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, POS_EXTENSION); + posOut = state.directory.createOutput(posFileName, state.context); + CodecUtil.writeIndexHeader(posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + + if (state.fieldInfos.hasPayloads()) { + payloadBytes = new byte[128]; + payloadLengthBuffer = new int[MAX_DATA_SIZE]; + } else { + payloadBytes = null; + payloadLengthBuffer = null; + } + + if (state.fieldInfos.hasOffsets()) { + offsetStartDeltaBuffer = new int[MAX_DATA_SIZE]; + offsetLengthBuffer = new int[MAX_DATA_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + } + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, PAY_EXTENSION); + payOut = state.directory.createOutput(payFileName, state.context); + CodecUtil.writeIndexHeader(payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + } + } else { + posDeltaBuffer = null; + payloadLengthBuffer = null; + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + payloadBytes = null; + } + this.payOut = payOut; + this.posOut = posOut; + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(docOut, posOut, payOut); + } + } + + docDeltaBuffer = new int[MAX_DATA_SIZE]; + freqBuffer = new int[MAX_DATA_SIZE]; + + // TODO: should we try skipping every 2/4 
blocks...? + skipWriter = new Lucene50SkipWriter(MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut); + + encoded = new byte[MAX_ENCODED_SIZE]; + } + + @Override + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { + CodecUtil.writeIndexHeader(termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + termsOut.writeVInt(BLOCK_SIZE); + } + + @Override + public void setField(FieldInfo fieldInfo) { + super.setField(fieldInfo); + skipWriter.setField(writePositions, writeOffsets, writePayloads); + lastState = emptyState; + fieldHasNorms = fieldInfo.hasNorms(); + } + + @Override + public void startTerm(NumericDocValues norms) { + docStartFP = docOut.getFilePointer(); + if (writePositions) { + posStartFP = posOut.getFilePointer(); + if (writePayloads || writeOffsets) { + payStartFP = payOut.getFilePointer(); + } + } + lastDocID = 0; + lastBlockDocID = -1; + skipWriter.resetSkip(); + this.norms = norms; + competitiveFreqNormAccumulator.clear(); + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + // Have collected a block of docs, and get a new doc. + // Should write skip data as well as postings list for + // current block. + if (lastBlockDocID != -1 && docBufferUpto == 0) { + skipWriter.bufferSkip( + lastBlockDocID, + competitiveFreqNormAccumulator, + docCount, + lastBlockPosFP, + lastBlockPayFP, + lastBlockPosBufferUpto, + lastBlockPayloadByteUpto + ); + competitiveFreqNormAccumulator.clear(); + } + + final int docDelta = docID - lastDocID; + + if (docID < 0 || (docCount > 0 && docDelta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )", docOut); + } + + docDeltaBuffer[docBufferUpto] = docDelta; + if (writeFreqs) { + freqBuffer[docBufferUpto] = termDocFreq; + } + + docBufferUpto++; + docCount++; + + if (docBufferUpto == BLOCK_SIZE) { + forUtil.writeBlock(docDeltaBuffer, encoded, docOut); + if (writeFreqs) { + forUtil.writeBlock(freqBuffer, encoded, docOut); + } + // NOTE: don't set docBufferUpto back to 0 here; + // finishDoc will do so (because it needs to see that + // the block was filled so it can save skip data) + } + + lastDocID = docID; + lastPosition = 0; + lastStartOffset = 0; + + long norm; + if (fieldHasNorms) { + boolean found = norms.advanceExact(docID); + if (found == false) { + // This can happen if indexing hits a problem after adding a doc to the + // postings but before buffering the norm. Such documents are written + // deleted and will go away on the first merge. + norm = 1L; + } else { + norm = norms.longValue(); + assert norm != 0 : docID; + } + } else { + norm = 1L; + } + + competitiveFreqNormAccumulator.add(writeFreqs ? 
termDocFreq : 1, norm); + } + + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { + if (position > IndexWriter.MAX_POSITION) { + throw new CorruptIndexException( + "position=" + position + " is too large (> IndexWriter.MAX_POSITION=" + IndexWriter.MAX_POSITION + ")", + docOut + ); + } + if (position < 0) { + throw new CorruptIndexException("position=" + position + " is < 0", docOut); + } + posDeltaBuffer[posBufferUpto] = position - lastPosition; + if (writePayloads) { + if (payload == null || payload.length == 0) { + // no payload + payloadLengthBuffer[posBufferUpto] = 0; + } else { + payloadLengthBuffer[posBufferUpto] = payload.length; + if (payloadByteUpto + payload.length > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); + } + System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); + payloadByteUpto += payload.length; + } + } + + if (writeOffsets) { + assert startOffset >= lastStartOffset; + assert endOffset >= startOffset; + offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; + offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; + lastStartOffset = startOffset; + } + + posBufferUpto++; + lastPosition = position; + if (posBufferUpto == BLOCK_SIZE) { + forUtil.writeBlock(posDeltaBuffer, encoded, posOut); + + if (writePayloads) { + forUtil.writeBlock(payloadLengthBuffer, encoded, payOut); + payOut.writeVInt(payloadByteUpto); + payOut.writeBytes(payloadBytes, 0, payloadByteUpto); + payloadByteUpto = 0; + } + if (writeOffsets) { + forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut); + forUtil.writeBlock(offsetLengthBuffer, encoded, payOut); + } + posBufferUpto = 0; + } + } + + @Override + public void finishDoc() throws IOException { + // Since we don't know df for current term, we had to buffer + // those skip data for each block, and when a new doc comes, + // write them to skip file. + if (docBufferUpto == BLOCK_SIZE) { + lastBlockDocID = lastDocID; + if (posOut != null) { + if (payOut != null) { + lastBlockPayFP = payOut.getFilePointer(); + } + lastBlockPosFP = posOut.getFilePointer(); + lastBlockPosBufferUpto = posBufferUpto; + lastBlockPayloadByteUpto = payloadByteUpto; + } + docBufferUpto = 0; + } + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(BlockTermState _state) throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + assert state.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? + assert state.docFreq == docCount : state.docFreq + " vs " + docCount; + + // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it. 
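+        // Instead, the single doc ID is pulsed into the term metadata as singletonDocID below, and its freq is
+        // implicitly the totalTermFreq. For terms with more than one doc, whatever remains in the buffer after
+        // the last full block is vInt-encoded: when freqs are written, a freq of 1 is folded into the low bit of
+        // the delta, e.g. docDelta=5, freq=1 -> writeVInt(11); docDelta=5, freq=3 -> writeVInt(10), writeVInt(3).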
+ final int singletonDocID; + if (state.docFreq == 1) { + // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq + singletonDocID = docDeltaBuffer[0]; + } else { + singletonDocID = -1; + // vInt encode the remaining doc deltas and freqs: + for (int i = 0; i < docBufferUpto; i++) { + final int docDelta = docDeltaBuffer[i]; + final int freq = freqBuffer[i]; + if (writeFreqs == false) { + docOut.writeVInt(docDelta); + } else if (freqBuffer[i] == 1) { + docOut.writeVInt((docDelta << 1) | 1); + } else { + docOut.writeVInt(docDelta << 1); + docOut.writeVInt(freq); + } + } + } + + final long lastPosBlockOffset; + + if (writePositions) { + // totalTermFreq is just total number of positions(or payloads, or offsets) + // associated with current term. + assert state.totalTermFreq != -1; + if (state.totalTermFreq > BLOCK_SIZE) { + // record file offset for last pos in last block + lastPosBlockOffset = posOut.getFilePointer() - posStartFP; + } else { + lastPosBlockOffset = -1; + } + if (posBufferUpto > 0) { + // TODO: should we send offsets/payloads to + // .pay...? seems wasteful (have to store extra + // vLong for low (< BLOCK_SIZE) DF terms = vast vast + // majority) + + // vInt encode the remaining positions/payloads/offsets: + int lastPayloadLength = -1; // force first payload length to be written + int lastOffsetLength = -1; // force first offset length to be written + int payloadBytesReadUpto = 0; + for (int i = 0; i < posBufferUpto; i++) { + final int posDelta = posDeltaBuffer[i]; + if (writePayloads) { + final int payloadLength = payloadLengthBuffer[i]; + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + posOut.writeVInt((posDelta << 1) | 1); + posOut.writeVInt(payloadLength); + } else { + posOut.writeVInt(posDelta << 1); + } + + if (payloadLength != 0) { + posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength); + payloadBytesReadUpto += payloadLength; + } + } else { + posOut.writeVInt(posDelta); + } + + if (writeOffsets) { + int delta = offsetStartDeltaBuffer[i]; + int length = offsetLengthBuffer[i]; + if (length == lastOffsetLength) { + posOut.writeVInt(delta << 1); + } else { + posOut.writeVInt(delta << 1 | 1); + posOut.writeVInt(length); + lastOffsetLength = length; + } + } + } + + if (writePayloads) { + assert payloadBytesReadUpto == payloadByteUpto; + payloadByteUpto = 0; + } + } + } else { + lastPosBlockOffset = -1; + } + + long skipOffset; + if (docCount > BLOCK_SIZE) { + skipOffset = skipWriter.writeSkip(docOut) - docStartFP; + } else { + skipOffset = -1; + } + + state.docStartFP = docStartFP; + state.posStartFP = posStartFP; + state.payStartFP = payStartFP; + state.singletonDocID = singletonDocID; + state.skipOffset = skipOffset; + state.lastPosBlockOffset = lastPosBlockOffset; + docBufferUpto = 0; + posBufferUpto = 0; + lastDocID = 0; + docCount = 0; + } + + @Override + public void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + if (absolute) { + lastState = emptyState; + } + out.writeVLong(state.docStartFP - lastState.docStartFP); + if (writePositions) { + out.writeVLong(state.posStartFP - lastState.posStartFP); + if (writePayloads || writeOffsets) { + out.writeVLong(state.payStartFP - lastState.payStartFP); + } + } + if (state.singletonDocID != -1) { + out.writeVInt(state.singletonDocID); + } + if (writePositions) { + if (state.lastPosBlockOffset != -1) { + 
out.writeVLong(state.lastPosBlockOffset); + } + } + if (state.skipOffset != -1) { + out.writeVLong(state.skipOffset); + } + lastState = state; + } + + @Override + public void close() throws IOException { + // TODO: add a finish() at least to PushBase? DV too...? + boolean success = false; + try { + if (docOut != null) { + CodecUtil.writeFooter(docOut); + } + if (posOut != null) { + CodecUtil.writeFooter(posOut); + } + if (payOut != null) { + CodecUtil.writeFooter(payOut); + } + success = true; + } finally { + if (success) { + IOUtils.close(docOut, posOut, payOut); + } else { + IOUtils.closeWhileHandlingException(docOut, posOut, payOut); + } + docOut = posOut = payOut = null; + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50RWPostingsFormat.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50RWPostingsFormat.java new file mode 100644 index 0000000000000..11ed11e46d6b4 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50RWPostingsFormat.java @@ -0,0 +1,56 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.SegmentWriteState; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriter; + +import java.io.IOException; + +public class Lucene50RWPostingsFormat extends BWCLucene50PostingsFormat { + + public Lucene50RWPostingsFormat() { + super("Lucene50RW"); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene50PostingsWriter(state); + boolean success = false; + try { + FieldsConsumer ret = new Lucene40BlockTreeTermsWriter( + state, + postingsWriter, + Lucene40BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene40BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE + ); + success = true; + return ret; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipWriter.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipWriter.java new file mode 100644 index 0000000000000..9555f266e0611 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50SkipWriter.java @@ -0,0 +1,233 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene50; + +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.MultiLevelSkipListWriter; +import org.apache.lucene.index.Impact; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; + +/** + * Write skip lists with multiple levels, and support skip within block ints. + * + *

Assume that docFreq = 28, skipInterval = blockSize = 12 + * + *

+ *  |       block#0       | |      block#1        | |vInts|
+ *  d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list)
+ *                          ^                       ^       (level 0 skip point)
+ * 
+ * + * Note that skipWriter will ignore the first document in block#0, since it is useless as a skip point. + * Also, we'll never skip into the vInts block; we only record skip data at its start point (if it exists). + *
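+ * Worked example for the diagram above: the 28 docs fill two complete 12-doc blocks (block#0 and block#1)
+ * and leave 4 docs for the vInts tail, so there are floor(28 / 12) = 2 level-0 skip points (the two carets):
+ * one at the start of block#1 and one at the start of the vInts tail, which is itself never skipped into;
+ * per the list below, those entries record docID[11] and docID[23] respectively.
+ *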

For each skip point, we will record: 1. docID in former position, i.e. for position 12, record + * docID[11], etc. 2. its related file points(position, payload), 3. related numbers or + * uptos(position, payload). 4. start offset. + */ +final class Lucene50SkipWriter extends MultiLevelSkipListWriter { + private int[] lastSkipDoc; + private long[] lastSkipDocPointer; + private long[] lastSkipPosPointer; + private long[] lastSkipPayPointer; + private int[] lastPayloadByteUpto; + + private final IndexOutput docOut; + private final IndexOutput posOut; + private final IndexOutput payOut; + + private int curDoc; + private long curDocPointer; + private long curPosPointer; + private long curPayPointer; + private int curPosBufferUpto; + private int curPayloadByteUpto; + private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms; + private boolean fieldHasPositions; + private boolean fieldHasOffsets; + private boolean fieldHasPayloads; + + Lucene50SkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) { + super(blockSize, 8, maxSkipLevels, docCount); + this.docOut = docOut; + this.posOut = posOut; + this.payOut = payOut; + + lastSkipDoc = new int[maxSkipLevels]; + lastSkipDocPointer = new long[maxSkipLevels]; + if (posOut != null) { + lastSkipPosPointer = new long[maxSkipLevels]; + if (payOut != null) { + lastSkipPayPointer = new long[maxSkipLevels]; + } + lastPayloadByteUpto = new int[maxSkipLevels]; + } + curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels]; + for (int i = 0; i < maxSkipLevels; ++i) { + curCompetitiveFreqNorms[i] = new CompetitiveImpactAccumulator(); + } + } + + public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) { + this.fieldHasPositions = fieldHasPositions; + this.fieldHasOffsets = fieldHasOffsets; + this.fieldHasPayloads = fieldHasPayloads; + } + + // tricky: we only skip data for blocks (terms with more than 128 docs), but re-init'ing the + // skipper + // is pretty slow for rare terms in large segments as we have to fill O(log #docs in segment) of + // junk. + // this is the vast majority of terms (worst case: ID field or similar). so in resetSkip() we + // save + // away the previous pointers, and lazy-init only if we need to buffer skip data for the term. + private boolean initialized; + long lastDocFP; + long lastPosFP; + long lastPayFP; + + @Override + public void resetSkip() { + lastDocFP = docOut.getFilePointer(); + if (fieldHasPositions) { + lastPosFP = posOut.getFilePointer(); + if (fieldHasOffsets || fieldHasPayloads) { + lastPayFP = payOut.getFilePointer(); + } + } + if (initialized) { + for (CompetitiveImpactAccumulator acc : curCompetitiveFreqNorms) { + acc.clear(); + } + } + initialized = false; + } + + private void initSkip() { + if (initialized == false) { + super.resetSkip(); + Arrays.fill(lastSkipDoc, 0); + Arrays.fill(lastSkipDocPointer, lastDocFP); + if (fieldHasPositions) { + Arrays.fill(lastSkipPosPointer, lastPosFP); + if (fieldHasPayloads) { + Arrays.fill(lastPayloadByteUpto, 0); + } + if (fieldHasOffsets || fieldHasPayloads) { + Arrays.fill(lastSkipPayPointer, lastPayFP); + } + } + // sets of competitive freq,norm pairs should be empty at this point + assert Arrays.stream(curCompetitiveFreqNorms) + .map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs) + .mapToInt(Collection::size) + .sum() == 0; + initialized = true; + } + } + + /** Sets the values for the current skip data. 
*/ + public void bufferSkip( + int doc, + CompetitiveImpactAccumulator competitiveFreqNorms, + int numDocs, + long posFP, + long payFP, + int posBufferUpto, + int payloadByteUpto + ) throws IOException { + initSkip(); + this.curDoc = doc; + this.curDocPointer = docOut.getFilePointer(); + this.curPosPointer = posFP; + this.curPayPointer = payFP; + this.curPosBufferUpto = posBufferUpto; + this.curPayloadByteUpto = payloadByteUpto; + this.curCompetitiveFreqNorms[0].addAll(competitiveFreqNorms); + bufferSkip(numDocs); + } + + private final ByteBuffersDataOutput freqNormOut = ByteBuffersDataOutput.newResettableInstance(); + + @Override + protected void writeSkipData(int level, DataOutput skipBuffer) throws IOException { + + int delta = curDoc - lastSkipDoc[level]; + + skipBuffer.writeVInt(delta); + lastSkipDoc[level] = curDoc; + + skipBuffer.writeVLong(curDocPointer - lastSkipDocPointer[level]); + lastSkipDocPointer[level] = curDocPointer; + + if (fieldHasPositions) { + + skipBuffer.writeVLong(curPosPointer - lastSkipPosPointer[level]); + lastSkipPosPointer[level] = curPosPointer; + skipBuffer.writeVInt(curPosBufferUpto); + + if (fieldHasPayloads) { + skipBuffer.writeVInt(curPayloadByteUpto); + } + + if (fieldHasOffsets || fieldHasPayloads) { + skipBuffer.writeVLong(curPayPointer - lastSkipPayPointer[level]); + lastSkipPayPointer[level] = curPayPointer; + } + } + + CompetitiveImpactAccumulator competitiveFreqNorms = curCompetitiveFreqNorms[level]; + assert competitiveFreqNorms.getCompetitiveFreqNormPairs().size() > 0; + if (level + 1 < numberOfSkipLevels) { + curCompetitiveFreqNorms[level + 1].addAll(competitiveFreqNorms); + } + writeImpacts(competitiveFreqNorms, freqNormOut); + skipBuffer.writeVInt(Math.toIntExact(freqNormOut.size())); + freqNormOut.copyTo(skipBuffer); + freqNormOut.reset(); + competitiveFreqNorms.clear(); + } + + static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out) throws IOException { + Collection impacts = acc.getCompetitiveFreqNormPairs(); + Impact previous = new Impact(0, 0); + for (Impact impact : impacts) { + assert impact.freq > previous.freq; + assert Long.compareUnsigned(impact.norm, previous.norm) > 0; + int freqDelta = impact.freq - previous.freq - 1; + long normDelta = impact.norm - previous.norm - 1; + if (normDelta == 0) { + // most of time, norm only increases by 1, so we can fold everything in a single byte + out.writeVInt(freqDelta << 1); + } else { + out.writeVInt((freqDelta << 1) | 1); + out.writeZLong(normDelta); + } + previous = impact; + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/test/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/x-pack/plugin/old-lucene-versions/src/test/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat new file mode 100644 index 0000000000000..b2c1a7ca06a52 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.Lucene50RWPostingsFormat diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java index 8bb8cb6b08fe4..d63ea5301fcdb 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java @@ -16,6 +16,8 @@ import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus; import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusRequest; import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse; +import org.elasticsearch.action.get.GetRequest; +import org.elasticsearch.action.get.GetResponse; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.client.Request; @@ -453,6 +455,7 @@ private void assertDocs( XContentBuilder mappingBuilder = JsonXContent.contentBuilder(); mappingBuilder.startObject().startObject("properties"); mappingBuilder.startObject("val").field("type", "long").endObject(); + mappingBuilder.startObject("test").field("type", "text").endObject(); mappingBuilder.endObject().endObject(); assertTrue( client.indices().putMapping(new PutMappingRequest(index).source(mappingBuilder), RequestOptions.DEFAULT).isAcknowledged() @@ -474,6 +477,22 @@ private void assertDocs( Arrays.stream(searchResponse.getHits().getHits()).map(SearchHit::getId).collect(Collectors.toList()) ); + // look up by id (only 6.0+ as we would otherwise need ability to specify _type in GET API) + if (oldVersion.onOrAfter(Version.fromString("6.0.0"))) { + GetResponse getResponse = client.get(new GetRequest(index, id), RequestOptions.DEFAULT); + assertTrue(getResponse.isExists()); + assertEquals(sourceForDoc(getIdAsNumeric(id)), getResponse.getSourceAsString()); + } + + // look up postings + searchResponse = client.search( + new SearchRequest(index).source(SearchSourceBuilder.searchSource().query(QueryBuilders.matchQuery("test", "test" + num))), + randomRequestOptions + ); + logger.info(searchResponse); + // check match + assertEquals(List.of(id), Arrays.stream(searchResponse.getHits().getHits()).map(SearchHit::getId).collect(Collectors.toList())); + if (oldVersion.before(Version.fromString("6.0.0"))) { // search on _type and check that results contain _type information String randomType = getType(oldVersion, randomFrom(expectedIds)); From 56d0e7b28ea3893a3093390c42b040f7c2d05248 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Thu, 24 Mar 2022 10:48:36 +0100 Subject: [PATCH 02/19] javadoc --- .../lucene/bwc/codecs/lucene40/blocktree/FieldReader.java | 2 -- .../lucene40/blocktree/Lucene40BlockTreeTermsReader.java | 1 - .../xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java | 2 -- .../lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java | 4 ---- 
.../lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java | 2 -- .../xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java | 1 - .../xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java | 1 - .../lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java | 1 - .../xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java | 2 -- .../xpack/lucene/bwc/codecs/lucene70/fst/FST.java | 1 - .../xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java | 1 - .../xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java | 2 -- .../xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java | 1 - .../xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java | 1 - .../xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java | 1 - .../xpack/lucene/bwc/codecs/lucene70/fst/Util.java | 2 -- .../lucene40/blocktree/Lucene40BlockTreeTermsWriter.java | 1 - .../lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java | 1 - 18 files changed, 27 deletions(-) diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java index 3d24e82edd18b..71b90bc71bc4a 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/FieldReader.java @@ -35,8 +35,6 @@ /** * BlockTree's implementation of {@link Terms}. - * - * @lucene.internal */ public final class FieldReader extends Terms { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java index 807b821d8d145..3237da73cf830 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java @@ -64,7 +64,6 @@ * *
<p>
See {@code BlockTreeTermsWriter}. * - * @lucene.experimental */ public final class Lucene40BlockTreeTermsReader extends FieldsProducer { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java index 90ee6d1115a57..6ae18c70f3ca9 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Stats.java @@ -31,8 +31,6 @@ /** * BlockTree statistics for a single field returned by {@link FieldReader#getStats()}. - * - * @lucene.internal */ public class Stats { /** Byte size of the index. */ diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java index fd04a28ce23fb..4ff6199a52577 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/BWCLucene50PostingsFormat.java @@ -330,8 +330,6 @@ * current position. * * - * - * @lucene.experimental */ public class BWCLucene50PostingsFormat extends PostingsFormat { @@ -407,8 +405,6 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException /** * Holds all state required for {@link Lucene50PostingsReader} to produce a {@link * org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict. - * - * @lucene.internal */ public static final class IntBlockTermState extends BlockTermState { /** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */ diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java index 206f5e1ae943b..a3c91c7d3ec44 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsReader.java @@ -54,8 +54,6 @@ /** * Concrete class that reads docId(maybe frq,pos,offset,payloads) list with postings format. - * - * @lucene.experimental */ public final class Lucene50PostingsReader extends PostingsReaderBase { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java index 50e5cde04ead3..3f58b1bb417ae 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java @@ -24,7 +24,6 @@ /** * Legacy methods for manipulating strings. 
* - * @lucene.internal * @deprecated This is only used for backwards compatibility codecs (they * don't work with the Java9-based replacement methods). */ diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java index 728191932763c..56ba113a1abbb 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BitTableUtil.java @@ -24,7 +24,6 @@ /** * Static helper methods for {@link FST.Arc.BitTable}. * - * @lucene.experimental */ class BitTableUtil { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java index 7a58a350fcab1..23db4618bffc0 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/ByteSequenceOutputs.java @@ -30,7 +30,6 @@ /** * An FST {@link Outputs} implementation where each output is a sequence of bytes. * - * @lucene.experimental */ public final class ByteSequenceOutputs extends Outputs { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java index 955327af17ba0..609e419232043 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/BytesRefFSTEnum.java @@ -26,8 +26,6 @@ /** * Enumerates all input (BytesRef) + output pairs in an FST. - * - * @lucene.experimental */ public final class BytesRefFSTEnum extends FSTEnum { private final BytesRef current = new BytesRef(10); diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java index 9fb73edb5a118..e5e684e08cd87 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FST.java @@ -60,7 +60,6 @@ * *
<p>
See the {@link org.apache.lucene.util.fst package documentation} for some simple examples. * - * @lucene.experimental */ public final class FST implements Accountable { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java index 7ee6eaa5f7ba4..6fcd4b82b7174 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTCompiler.java @@ -44,7 +44,6 @@ *
<p>
FSTs larger than 2.1GB are now possible (as of Lucene 4.2). FSTs containing more than 2.1B * nodes are also now possible, however they cannot be packed. * - * @lucene.experimental */ public class FSTCompiler { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java index 789c216df6f95..3da2100cf79b7 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/FSTEnum.java @@ -27,8 +27,6 @@ /** * Can next() and advance() through the terms in an FST - * - * @lucene.experimental */ abstract class FSTEnum { protected final FST fst; diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java index f0246cbf5c862..310098bbc255a 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OffHeapFSTStore.java @@ -30,7 +30,6 @@ * Provides off heap storage of finite state machine (FST), using underlying index input instead of * byte store on heap * - * @lucene.experimental */ public final class OffHeapFSTStore implements FSTStore { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java index 646e56f095d9a..436a1ac7f1d40 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/OnHeapFSTStore.java @@ -28,7 +28,6 @@ /** * Provides storage of finite state machine (FST), using byte array or byte store allocated on heap. * - * @lucene.experimental */ public final class OnHeapFSTStore implements FSTStore { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java index a7c5ed8933fed..cb273182a20ea 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Outputs.java @@ -32,7 +32,6 @@ *
<p>
Note that any operation that returns NO_OUTPUT must return the same singleton object from * {@link #getNoOutput}. * - * @lucene.experimental */ public abstract class Outputs { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java index ce2ac82d478b6..2711e9c3f5110 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/fst/Util.java @@ -39,7 +39,6 @@ /** * Static helper methods. * - * @lucene.experimental */ public final class Util { private Util() {} @@ -98,7 +97,6 @@ public static T get(FST fst, BytesRef input) throws IOException { /** * Represents a path in TopNSearcher. * - * @lucene.experimental */ public static class FSTPath { /** Holds the last arc appended to this path */ diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java index e6435dae4c12b..eaf35139bd146 100644 --- a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsWriter.java @@ -198,7 +198,6 @@ order, meaning if you just next() the file pointer will * * * @see Lucene40BlockTreeTermsReader - * @lucene.experimental */ public final class Lucene40BlockTreeTermsWriter extends FieldsConsumer { diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java index 7e3a92acc4682..7bae5453196f9 100644 --- a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene50/Lucene50PostingsWriter.java @@ -59,7 +59,6 @@ * Postings list for each term will be stored separately. * * @see Lucene50SkipWriter for details about skipping setting and postings layout. 
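A minimal, self-contained sketch of the (freq, norm) delta scheme used by the writeImpacts method at the top of this patch, assuming plain Java lists stand in for the real writeVInt/writeZLong stream and that impacts arrive strictly increasing in both values, as the assertions there require:

import java.util.ArrayList;
import java.util.List;

// Sketch of the impact delta encoding: pairs must be strictly increasing in freq and norm.
// Assumption: a List<Long> stands in for the DataOutput the real writer targets.
public class ImpactDeltaSketch {

    static List<Long> encode(List<int[]> impacts) {
        List<Long> out = new ArrayList<>();
        int prevFreq = 0;
        long prevNorm = 0;
        for (int[] impact : impacts) {
            int freqDelta = impact[0] - prevFreq - 1;
            long normDelta = impact[1] - prevNorm - 1;
            if (normDelta == 0) {
                out.add((long) (freqDelta << 1));        // low bit 0: norm advanced by exactly 1
            } else {
                out.add((long) ((freqDelta << 1) | 1));  // low bit 1: an explicit norm delta follows
                out.add(normDelta);
            }
            prevFreq = impact[0];
            prevNorm = impact[1];
        }
        return out;
    }

    public static void main(String[] args) {
        List<int[]> impacts = List.of(new int[] { 1, 1 }, new int[] { 3, 2 }, new int[] { 7, 9 });
        System.out.println(encode(impacts)); // prints [0, 2, 7, 6]
    }
}

The low bit of the leading value plays the role of the flag written above: 0 means the norm advanced by exactly one and nothing else is stored, 1 means an explicit norm delta follows.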
- * @lucene.experimental */ public final class Lucene50PostingsWriter extends PushPostingsWriterBase { From bbaa535eae3148b9fb742a9904ff2737def64abc Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Mon, 28 Mar 2022 09:40:55 +0200 Subject: [PATCH 03/19] review comments o --- .../core/internal/io/IOUtils.java | 41 ---------------- .../LegacyAdaptingPerFieldPostingsFormat.java | 47 +++++++++++++------ .../Lucene40BlockTreeTermsReader.java | 8 +++- .../bwc/codecs/lucene60/Lucene60Codec.java | 5 -- .../bwc/codecs/lucene62/Lucene62Codec.java | 5 -- .../bwc/codecs/lucene70/BWCLucene70Codec.java | 2 - .../oldrepos/OldRepositoryAccessIT.java | 2 +- 7 files changed, 40 insertions(+), 70 deletions(-) diff --git a/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java b/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java index 183ff4111b693..5699180285746 100644 --- a/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java +++ b/libs/core/src/main/java/org/elasticsearch/core/internal/io/IOUtils.java @@ -317,45 +317,4 @@ public static void fsync(final Path fileToSync, final boolean isDir, final boole } } - /** - * This utility method takes a previously caught (non-null) {@code Throwable} and rethrows either - * the original argument if it was a subclass of the {@code IOException} or an {@code - * RuntimeException} with the cause set to the argument. - * - *
<p>
This method never returns any value, even though it declares a return value - * of type {@link Error}. The return value declaration is very useful to let the compiler know - * that the code path following the invocation of this method is unreachable. So in most cases the - * invocation of this method will be guarded by an {@code if} and used together with a {@code - * throw} statement, as in: - * - *
<pre>{@code
-     * if (t != null) throw IOUtils.rethrowAlways(t)
-     * }</pre>
- * - * @param th The throwable to rethrow, must not be null. - * @return This method always results in an exception, it never returns any value. See method - * documentation for details and usage example. - * @throws IOException if the argument was an instance of IOException - * @throws RuntimeException with the {@link RuntimeException#getCause()} set to the argument, if - * it was not an instance of IOException. - */ - public static Error rethrowAlways(Throwable th) throws IOException, RuntimeException { - if (th == null) { - throw new AssertionError("rethrow argument must not be null."); - } - - if (th instanceof IOException) { - throw (IOException) th; - } - - if (th instanceof RuntimeException) { - throw (RuntimeException) th; - } - - if (th instanceof Error) { - throw (Error) th; - } - - throw new RuntimeException(th); - } } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java index 8aefcd875834c..4ee7456ae6993 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/LegacyAdaptingPerFieldPostingsFormat.java @@ -1,10 +1,22 @@ /* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. */ - package org.elasticsearch.xpack.lucene.bwc.codecs; import org.apache.lucene.codecs.FieldsConsumer; @@ -32,6 +44,19 @@ import java.util.Map; import java.util.TreeMap; +/** + * Modified version of {@link PerFieldPostingsFormat} that allows swapping in + * {@link org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat} instead of + * {@link org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat} when reading from older + * codecs. The former has full support for older Lucene versions (going back to Lucene 5) while the + * latter only supports Lucene 7 and above (as it was shipped with backwards-codecs of Lucene 9 that + * only has support for N-2). + * + * This class can probably be removed once we are on Lucene 10 and Lucene50PostingsFormat is no longer + * shipped as part of bwc jars. + * + * Swapping out formats can be done via the {@link #getPostingsFormat(String) method}. 
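A minimal sketch of that hook, mirroring the anonymous-subclass wiring the legacy codecs later in this patch use; the no-arg BWCLucene50PostingsFormat constructor and the exception branch are assumptions rather than the exact shipped code:

import org.apache.lucene.codecs.PostingsFormat;
import org.elasticsearch.xpack.lucene.bwc.codecs.LegacyAdaptingPerFieldPostingsFormat;
import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.BWCLucene50PostingsFormat;

// Illustrative wiring only: maps the on-disk per-field format name "Lucene50" to the
// BWC reader described in the javadoc above. The exception message is an assumption.
class LegacyPostingsWiring {
    static final PostingsFormat POSTINGS_FORMAT = new LegacyAdaptingPerFieldPostingsFormat() {
        @Override
        protected PostingsFormat getPostingsFormat(String formatName) {
            if (formatName.equals("Lucene50")) {
                return new BWCLucene50PostingsFormat();
            }
            throw new IllegalArgumentException("unsupported legacy postings format [" + formatName + "]");
        }
    };
}

The Lucene60Codec and Lucene62Codec hunks below use this same pattern for their per-field postings format.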
+ */ public abstract class LegacyAdaptingPerFieldPostingsFormat extends PostingsFormat { /** Name of this {@link PostingsFormat}. */ public static final String PER_FIELD_NAME = "PerField40"; @@ -65,12 +90,12 @@ private class FieldsWriter extends FieldsConsumer { @Override public void write(Fields fields, NormsProducer norms) throws IOException { - throw new UnsupportedOperationException(); + throw new IllegalStateException("This codec should only be used for reading, not writing"); } @Override public void merge(MergeState mergeState, NormsProducer norms) throws IOException { - throw new UnsupportedOperationException(); + throw new IllegalStateException("This codec should only be used for reading, not writing"); } @Override @@ -189,12 +214,4 @@ public final FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOExc public final FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { return new FieldsReader(state, this); } - - /** - * Returns the postings format that should be used for writing new segments of field. - * - *
<p>
The field to format mapping is written to the index, so this method is only invoked when - * writing, not when reading. - */ - public abstract PostingsFormat getPostingsFormatForField(String field); } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java index 3237da73cf830..44690566c6acc 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene40/blocktree/Lucene40BlockTreeTermsReader.java @@ -32,6 +32,7 @@ import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.core.SuppressForbidden; import org.elasticsearch.core.internal.io.IOUtils; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.ByteSequenceOutputs; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.fst.Outputs; @@ -280,7 +281,7 @@ public Lucene40BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentRe if (metaIn != null) { CodecUtil.checkFooter(metaIn, priorE); } else if (priorE != null) { - IOUtils.rethrowAlways(priorE); + rethrowAlways(priorE); } } } @@ -327,6 +328,11 @@ private static void seekDir(IndexInput input) throws IOException { input.seek(offset); } + @SuppressForbidden(reason = "Lucene class") + private static Error rethrowAlways(Throwable th) throws IOException, RuntimeException { + return org.apache.lucene.util.IOUtils.rethrowAlways(th); + } + // for debugging // private static String toHex(int v) { // return "0x" + Integer.toHexString(v); diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java index 55fe5c3b98f64..d507d49907433 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java @@ -59,11 +59,6 @@ public DocValuesFormat getDocValuesFormatForField(String field) { } }; private final PostingsFormat postingsFormat = new LegacyAdaptingPerFieldPostingsFormat() { - @Override - public PostingsFormat getPostingsFormatForField(String field) { - throw new IllegalStateException("This codec should only be used for reading, not writing"); - } - @Override protected PostingsFormat getPostingsFormat(String formatName) { if (formatName.equals("Lucene50")) { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java index e3317a1c00c8c..85084317977b3 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java @@ -58,11 +58,6 @@ public DocValuesFormat getDocValuesFormatForField(String field) { } }; 
private final PostingsFormat postingsFormat = new LegacyAdaptingPerFieldPostingsFormat() { - @Override - public PostingsFormat getPostingsFormatForField(String field) { - throw new IllegalStateException("This codec should only be used for reading, not writing"); - } - @Override protected PostingsFormat getPostingsFormat(String formatName) { if (formatName.equals("Lucene50")) { diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java index 90739206b5643..8e52baa9a73c5 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java @@ -3,8 +3,6 @@ * or more contributor license agreements. Licensed under the Elastic License * 2.0; you may not use this file except in compliance with the Elastic License * 2.0. - * - * Modifications copyright (C) 2021 Elasticsearch B.V. */ package org.elasticsearch.xpack.lucene.bwc.codecs.lucene70; diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java index d63ea5301fcdb..094fdca692081 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java @@ -491,7 +491,7 @@ private void assertDocs( ); logger.info(searchResponse); // check match - assertEquals(List.of(id), Arrays.stream(searchResponse.getHits().getHits()).map(SearchHit::getId).collect(Collectors.toList())); + ElasticsearchAssertions.assertSearchHits(searchResponse, id); if (oldVersion.before(Version.fromString("6.0.0"))) { // search on _type and check that results contain _type information From 0d40083f68ffd01448d3261e6235d90d23acfb23 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 27 Apr 2022 11:27:24 +0200 Subject: [PATCH 04/19] Verify / rewrite mappings using full analysis service --- .../metadata/IndexMetadataVerifier.java | 11 ++--------- .../java/org/elasticsearch/node/Node.java | 3 ++- .../snapshots/RestoreService.java | 19 +++++++++++++++---- .../snapshots/SnapshotResiliencyTests.java | 3 ++- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadataVerifier.java b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadataVerifier.java index 2d07a5abc6cca..f8f77409db89d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadataVerifier.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadataVerifier.java @@ -18,15 +18,12 @@ import org.elasticsearch.common.settings.IndexScopedSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.LoggingDeprecationHandler; -import org.elasticsearch.core.Nullable; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; -import org.elasticsearch.index.mapper.DocumentMapper; import 
org.elasticsearch.index.mapper.MapperRegistry; import org.elasticsearch.index.mapper.MapperService; -import org.elasticsearch.index.mapper.Mapping; import org.elasticsearch.index.similarity.SimilarityService; import org.elasticsearch.script.ScriptCompiler; import org.elasticsearch.script.ScriptService; @@ -92,7 +89,7 @@ public IndexMetadata verifyIndexMetadata(IndexMetadata indexMetadata, Version mi // Next we have to run this otherwise if we try to create IndexSettings // with broken settings it would fail in checkMappingsCompatibility newMetadata = archiveBrokenIndexSettings(newMetadata); - createAndValidateMapping(newMetadata); + checkMappingsCompatibility(newMetadata); return newMetadata; } @@ -129,10 +126,8 @@ private static void checkSupportedVersion(IndexMetadata indexMetadata, Version m * Note that we don't expect users to encounter mapping incompatibilities, since our index compatibility * policy guarantees we can read mappings from previous compatible index versions. A failure here would * indicate a compatibility bug (which are unfortunately not that uncommon). - * @return the mapping */ - @Nullable - public Mapping createAndValidateMapping(IndexMetadata indexMetadata) { + private void checkMappingsCompatibility(IndexMetadata indexMetadata) { try { // We cannot instantiate real analysis server or similarity service at this point because the node @@ -199,8 +194,6 @@ public Set> entrySet() { scriptService ); mapperService.merge(indexMetadata, MapperService.MergeReason.MAPPING_RECOVERY); - DocumentMapper documentMapper = mapperService.documentMapper(); - return documentMapper == null ? null : documentMapper.mapping(); } } catch (Exception ex) { // Wrap the inner exception so we have the index name in the exception message diff --git a/server/src/main/java/org/elasticsearch/node/Node.java b/server/src/main/java/org/elasticsearch/node/Node.java index 6e7c52523131b..92df7e81caf82 100644 --- a/server/src/main/java/org/elasticsearch/node/Node.java +++ b/server/src/main/java/org/elasticsearch/node/Node.java @@ -793,7 +793,8 @@ protected Node( clusterModule.getMetadataDeleteIndexService(), indexMetadataVerifier, shardLimitValidator, - systemIndices + systemIndices, + indicesService ); final DiskThresholdMonitor diskThresholdMonitor = new DiskThresholdMonitor( settings, diff --git a/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java b/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java index 1568cfd82a3ce..e0490b9d6dfb8 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java @@ -61,10 +61,12 @@ import org.elasticsearch.core.Tuple; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.Mapping; import org.elasticsearch.index.shard.IndexLongFieldRange; import org.elasticsearch.index.shard.IndexShard; import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.indices.IndicesService; import org.elasticsearch.indices.ShardLimitValidator; import org.elasticsearch.indices.SystemDataStreamDescriptor; import org.elasticsearch.indices.SystemIndices; @@ -176,6 +178,8 @@ public class RestoreService implements ClusterStateApplier { private final SystemIndices systemIndices; + private final IndicesService indicesService; + private volatile boolean refreshRepositoryUuidOnRestore; public RestoreService( @@ -186,7 +190,8 @@ public 
RestoreService( MetadataDeleteIndexService metadataDeleteIndexService, IndexMetadataVerifier indexMetadataVerifier, ShardLimitValidator shardLimitValidator, - SystemIndices systemIndices + SystemIndices systemIndices, + IndicesService indicesService ) { this.clusterService = clusterService; this.repositoriesService = repositoriesService; @@ -200,6 +205,7 @@ public RestoreService( this.clusterSettings = clusterService.getClusterSettings(); this.shardLimitValidator = shardLimitValidator; this.systemIndices = systemIndices; + this.indicesService = indicesService; this.refreshRepositoryUuidOnRestore = REFRESH_REPO_UUID_ON_RESTORE_SETTING.get(clusterService.getSettings()); clusterService.getClusterSettings() .addSettingsUpdateConsumer(REFRESH_REPO_UUID_ON_RESTORE_SETTING, this::setRefreshRepositoryUuidOnRestore); @@ -1286,7 +1292,7 @@ public ClusterState execute(ClusterState currentState) { ); if (snapshotIndexMetadata.getCompatibilityVersion().before(minIndexCompatibilityVersion)) { // adapt index metadata so that it can be understood by current version - snapshotIndexMetadata = convertLegacyIndex(snapshotIndexMetadata, currentState, indexMetadataVerifier); + snapshotIndexMetadata = convertLegacyIndex(snapshotIndexMetadata, currentState, indicesService); } try { snapshotIndexMetadata = indexMetadataVerifier.verifyIndexMetadata(snapshotIndexMetadata, minIndexCompatibilityVersion); @@ -1579,7 +1585,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState) private static IndexMetadata convertLegacyIndex( IndexMetadata snapshotIndexMetadata, ClusterState clusterState, - IndexMetadataVerifier indexMetadataVerifier + IndicesService indicesService ) { if (snapshotIndexMetadata.getCreationVersion().before(Version.fromString("5.0.0"))) { throw new IllegalArgumentException("can't restore an index created before version 5.0.0"); @@ -1668,7 +1674,12 @@ private static IndexMetadata convertLegacyIndex( IndexMetadata convertedIndexMetadata = convertedIndexMetadataBuilder.build(); try { - Mapping mapping = indexMetadataVerifier.createAndValidateMapping(convertedIndexMetadata); + Mapping mapping; + try (MapperService mapperService = indicesService.createIndexMapperService(convertedIndexMetadata)) { + // create and validate in-memory mapping + mapperService.merge(convertedIndexMetadata, MapperService.MergeReason.MAPPING_RECOVERY); + mapping = mapperService.documentMapper().mapping(); + } if (mapping != null) { convertedIndexMetadataBuilder = IndexMetadata.builder(convertedIndexMetadata); // using the recomputed mapping allows stripping some fields that we no longer support (e.g. 
include_in_all) diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java index d607492ac0d6e..07a2d6c42df7b 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java +++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java @@ -1918,7 +1918,8 @@ protected void assertSnapshotOrGenericThread() { new MetadataDeleteIndexService(settings, clusterService, allocationService), new IndexMetadataVerifier(settings, namedXContentRegistry, mapperRegistry, indexScopedSettings, ScriptCompiler.NONE), shardLimitValidator, - EmptySystemIndices.INSTANCE + EmptySystemIndices.INSTANCE, + indicesService ); actions.put( PutMappingAction.INSTANCE, From b7dd42143946cd2e54b6473a39e44fca1c50fcc8 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 27 Apr 2022 11:45:24 +0200 Subject: [PATCH 05/19] allow queries on text field type --- .../java/org/elasticsearch/index/mapper/TextFieldMapper.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index f0b8b6de41493..e60bb4bf64bc7 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -453,7 +453,10 @@ public TextFieldMapper build(MapperBuilderContext context) { } } - public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers())); + private static final Version MINIMUM_COMPATIBILITY_VERSION = Version.fromString("5.0.0"); + + public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers()), + MINIMUM_COMPATIBILITY_VERSION); private static class PhraseWrappedAnalyzer extends AnalyzerWrapper { From 7f590addfbed9c9f1a7e4e532db2ad4f97ea6b47 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 27 Apr 2022 12:06:27 +0200 Subject: [PATCH 06/19] make analyzer lenient and updateable --- .../extras/MatchOnlyTextFieldMapper.java | 3 +- .../extras/SearchAsYouTypeFieldMapper.java | 13 ++++--- .../AnnotatedTextFieldMapper.java | 13 ++++--- .../AnnotatedTextFieldTypeTests.java | 3 +- .../index/mapper/FieldMapper.java | 34 +++++++++++++++++-- .../index/mapper/TextFieldMapper.java | 9 +++-- .../index/mapper/TextParams.java | 19 ++++++++--- 7 files changed, 75 insertions(+), 19 deletions(-) diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java index 7e5f300f78814..edc8c8241f95f 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java @@ -97,7 +97,8 @@ public Builder(String name, Version indexCreatedVersion, IndexAnalyzers indexAna this.analyzers = new TextParams.Analyzers( indexAnalyzers, m -> ((MatchOnlyTextFieldMapper) m).indexAnalyzer, - m -> ((MatchOnlyTextFieldMapper) m).positionIncrementGap + m -> ((MatchOnlyTextFieldMapper) m).positionIncrementGap, + indexCreatedVersion ); } diff --git 
a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SearchAsYouTypeFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SearchAsYouTypeFieldMapper.java index 41b494367d91d..03bbc01a3a0b5 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SearchAsYouTypeFieldMapper.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SearchAsYouTypeFieldMapper.java @@ -35,6 +35,7 @@ import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.Version; import org.elasticsearch.common.collect.Iterators; import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.IndexAnalyzers; @@ -92,7 +93,7 @@ public static class Defaults { public static final int MAX_SHINGLE_SIZE = 3; } - public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.getIndexAnalyzers())); + public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers())); private static Builder builder(FieldMapper in) { return ((SearchAsYouTypeFieldMapper) in).builder; @@ -141,12 +142,16 @@ public static class Builder extends FieldMapper.Builder { private final Parameter> meta = Parameter.metaParam(); - public Builder(String name, IndexAnalyzers indexAnalyzers) { + private final Version indexCreatedVersion; + + public Builder(String name, Version indexCreatedVersion, IndexAnalyzers indexAnalyzers) { super(name); + this.indexCreatedVersion = indexCreatedVersion; this.analyzers = new TextParams.Analyzers( indexAnalyzers, m -> builder(m).analyzers.getIndexAnalyzer(), - m -> builder(m).analyzers.positionIncrementGap.getValue() + m -> builder(m).analyzers.positionIncrementGap.getValue(), + indexCreatedVersion ); } @@ -702,7 +707,7 @@ protected String contentType() { } public FieldMapper.Builder getMergeBuilder() { - return new Builder(simpleName(), builder.analyzers.indexAnalyzers).init(this); + return new Builder(simpleName(), builder.indexCreatedVersion, builder.analyzers.indexAnalyzers).init(this); } public static String getShingleFieldName(String parentField, int shingleSize) { diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java index 04aaa10e90f84..43ade660ebe5d 100644 --- a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java @@ -21,6 +21,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.Version; import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; @@ -86,12 +87,16 @@ public static class Builder extends FieldMapper.Builder { private final Parameter> meta = Parameter.metaParam(); - public Builder(String name, IndexAnalyzers indexAnalyzers) { + private final Version indexCreatedVersion; + + public Builder(String name, Version indexCreatedVersion, IndexAnalyzers indexAnalyzers) 
{ super(name); + this.indexCreatedVersion = indexCreatedVersion; this.analyzers = new TextParams.Analyzers( indexAnalyzers, m -> builder(m).analyzers.getIndexAnalyzer(), - m -> builder(m).analyzers.positionIncrementGap.getValue() + m -> builder(m).analyzers.positionIncrementGap.getValue(), + indexCreatedVersion ); } @@ -145,7 +150,7 @@ public AnnotatedTextFieldMapper build(MapperBuilderContext context) { } } - public static TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.getIndexAnalyzers())); + public static TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers())); /** * Parses markdown-like syntax into plain text and AnnotationTokens with offsets for @@ -519,6 +524,6 @@ protected String contentType() { @Override public FieldMapper.Builder getMergeBuilder() { - return new Builder(simpleName(), builder.analyzers.indexAnalyzers).init(this); + return new Builder(simpleName(), builder.indexCreatedVersion, builder.analyzers.indexAnalyzers).init(this); } } diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldTypeTests.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldTypeTests.java index d9d28d34f88d5..0ead11b1e2ae9 100644 --- a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldTypeTests.java +++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldTypeTests.java @@ -11,6 +11,7 @@ import org.apache.lucene.queries.intervals.Intervals; import org.apache.lucene.queries.intervals.IntervalsSource; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.Version; import org.elasticsearch.index.mapper.FieldTypeTestCase; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperBuilderContext; @@ -28,7 +29,7 @@ public void testIntervals() throws IOException { } public void testFetchSourceValue() throws IOException { - MappedFieldType fieldType = new AnnotatedTextFieldMapper.Builder("field", createDefaultIndexAnalyzers()).build( + MappedFieldType fieldType = new AnnotatedTextFieldMapper.Builder("field", Version.CURRENT, createDefaultIndexAnalyzers()).build( MapperBuilderContext.ROOT ).fieldType(); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java index f1db921bd452f..39e757d5b9fad 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java @@ -8,6 +8,9 @@ package org.elasticsearch.index.mapper; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.index.LeafReaderContext; import org.elasticsearch.Version; import org.elasticsearch.common.Explicit; @@ -48,6 +51,8 @@ import java.util.function.Supplier; public abstract class FieldMapper extends Mapper implements Cloneable { + private static final Logger logger = LogManager.getLogger(FieldMapper.class); + public static final Setting IGNORE_MALFORMED_SETTING = Setting.boolSetting( "index.mapping.ignore_malformed", false, @@ -1042,23 +1047,48 @@ public static > Parameter restrictedEnumParam( * @param updateable whether the parameter can be changed by a mapping update * @param initializer a 
function that reads the parameter value from an existing mapper * @param defaultAnalyzer the default value, to be used if the parameter is undefined in a mapping + * @param indexCreatedVersion the version on which this index was created */ public static Parameter analyzerParam( String name, boolean updateable, Function initializer, - Supplier defaultAnalyzer + Supplier defaultAnalyzer, + Version indexCreatedVersion ) { return new Parameter<>(name, updateable, defaultAnalyzer, (n, c, o) -> { String analyzerName = o.toString(); NamedAnalyzer a = c.getIndexAnalyzers().get(analyzerName); if (a == null) { - throw new IllegalArgumentException("analyzer [" + analyzerName + "] has not been configured in mappings"); + if (indexCreatedVersion.isLegacyIndexVersion()) { + logger.warn( + new ParameterizedMessage("Could not find analyzer [{}] of legacy index, falling back to default", analyzerName) + ); + a = defaultAnalyzer.get(); + } else { + throw new IllegalArgumentException("analyzer [" + analyzerName + "] has not been configured in mappings"); + } } return a; }, initializer, (b, n, v) -> b.field(n, v.name()), NamedAnalyzer::name); } + /** + * Defines a parameter that takes an analyzer name + * @param name the parameter name + * @param updateable whether the parameter can be changed by a mapping update + * @param initializer a function that reads the parameter value from an existing mapper + * @param defaultAnalyzer the default value, to be used if the parameter is undefined in a mapping + */ + public static Parameter analyzerParam( + String name, + boolean updateable, + Function initializer, + Supplier defaultAnalyzer + ) { + return analyzerParam(name, updateable, initializer, defaultAnalyzer, Version.CURRENT); + } + /** * Declares a metadata parameter */ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index e60bb4bf64bc7..5bdf7e6e1230c 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -278,7 +278,8 @@ public Builder(String name, Version indexCreatedVersion, IndexAnalyzers indexAna this.analyzers = new TextParams.Analyzers( indexAnalyzers, m -> ((TextFieldMapper) m).indexAnalyzer, - m -> (((TextFieldMapper) m).positionIncrementGap) + m -> (((TextFieldMapper) m).positionIncrementGap), + indexCreatedVersion ); } @@ -455,8 +456,10 @@ public TextFieldMapper build(MapperBuilderContext context) { private static final Version MINIMUM_COMPATIBILITY_VERSION = Version.fromString("5.0.0"); - public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers()), - MINIMUM_COMPATIBILITY_VERSION); + public static final TypeParser PARSER = new TypeParser( + (n, c) -> new Builder(n, c.indexVersionCreated(), c.getIndexAnalyzers()), + MINIMUM_COMPATIBILITY_VERSION + ); private static class PhraseWrappedAnalyzer extends AnalyzerWrapper { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextParams.java b/server/src/main/java/org/elasticsearch/index/mapper/TextParams.java index 56410b778d197..b94387e0d59b4 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextParams.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextParams.java @@ -10,6 +10,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; +import org.elasticsearch.Version; import 
org.elasticsearch.index.analysis.AnalysisMode; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.IndexAnalyzers; @@ -38,9 +39,17 @@ public static final class Analyzers { public Analyzers( IndexAnalyzers indexAnalyzers, Function analyzerInitFunction, - Function positionGapInitFunction + Function positionGapInitFunction, + Version indexCreatedVersion ) { - this.indexAnalyzer = Parameter.analyzerParam("analyzer", false, analyzerInitFunction, indexAnalyzers::getDefaultIndexAnalyzer) + + this.indexAnalyzer = Parameter.analyzerParam( + "analyzer", + indexCreatedVersion.isLegacyIndexVersion(), + analyzerInitFunction, + indexAnalyzers::getDefaultIndexAnalyzer, + indexCreatedVersion + ) .setSerializerCheck( (id, ic, a) -> id || ic @@ -60,7 +69,8 @@ public Analyzers( } } return indexAnalyzer.get(); - } + }, + indexCreatedVersion ) .setSerializerCheck((id, ic, a) -> id || ic || Objects.equals(a, getSearchQuoteAnalyzer()) == false) .addValidator(a -> a.checkAllowedInMode(AnalysisMode.SEARCH_TIME)); @@ -76,7 +86,8 @@ public Analyzers( } } return searchAnalyzer.get(); - } + }, + indexCreatedVersion ).addValidator(a -> a.checkAllowedInMode(AnalysisMode.SEARCH_TIME)); this.positionIncrementGap = Parameter.intParam( "position_increment_gap", From c33e835a3fdb05107347169d2317e6dd29835ee3 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 27 Apr 2022 15:18:37 +0200 Subject: [PATCH 07/19] fix tests --- .../java/org/elasticsearch/index/mapper/ObjectMapperTests.java | 2 +- .../elasticsearch/index/mapper/TextFieldAnalyzerModeTests.java | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/ObjectMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/ObjectMapperTests.java index b0a2c5b8b87cb..6d864b69dbd9a 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/ObjectMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/ObjectMapperTests.java @@ -340,7 +340,7 @@ public void testUnknownLegacyFields() throws Exception { public void testUnmappedLegacyFields() throws Exception { MapperService service = createMapperService(Version.fromString("5.0.0"), Settings.EMPTY, () -> false, mapping(b -> { b.startObject("name"); - b.field("type", "text"); + b.field("type", CompletionFieldMapper.CONTENT_TYPE); b.field("unknown_setting", 5); b.endObject(); })); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldAnalyzerModeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldAnalyzerModeTests.java index 5b2d7eea2153b..cdd3a6480983a 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldAnalyzerModeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldAnalyzerModeTests.java @@ -69,6 +69,7 @@ public void testParseTextFieldCheckAnalyzerAnalysisMode() { Map fieldNode = new HashMap<>(); fieldNode.put("analyzer", "my_analyzer"); MappingParserContext parserContext = mock(MappingParserContext.class); + when(parserContext.indexVersionCreated()).thenReturn(Version.CURRENT); // check AnalysisMode.ALL works Map analyzers = defaultAnalyzers(); @@ -103,6 +104,7 @@ public void testParseTextFieldCheckSearchAnalyzerAnalysisMode() { fieldNode.put("search_analyzer", "standard"); } MappingParserContext parserContext = mock(MappingParserContext.class); + when(parserContext.indexVersionCreated()).thenReturn(Version.CURRENT); // check AnalysisMode.ALL and AnalysisMode.SEARCH_TIME works Map analyzers = 
defaultAnalyzers(); @@ -143,6 +145,7 @@ public void testParseTextFieldCheckAnalyzerWithSearchAnalyzerAnalysisMode() { Map fieldNode = new HashMap<>(); fieldNode.put("analyzer", "my_analyzer"); MappingParserContext parserContext = mock(MappingParserContext.class); + when(parserContext.indexVersionCreated()).thenReturn(Version.CURRENT); // check that "analyzer" set to AnalysisMode.INDEX_TIME is blocked if there is no search analyzer AnalysisMode mode = AnalysisMode.INDEX_TIME; From ac09d0247bd00072da810f6aab9f94587fc6c07c Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 27 Apr 2022 15:47:38 +0200 Subject: [PATCH 08/19] fix test --- .../java/org/elasticsearch/index/mapper/MultiFieldTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldTests.java b/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldTests.java index 29c00cf6b0a5d..d8fbe18a4ac78 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/MultiFieldTests.java @@ -227,7 +227,7 @@ public void testUnmappedLegacyFieldsUnderKnownRootField() throws Exception { b.startObject("name"); b.field("type", "keyword"); b.startObject("fields"); - b.startObject("subfield").field("type", "text").endObject(); + b.startObject("subfield").field("type", CompletionFieldMapper.CONTENT_TYPE).endObject(); b.endObject(); b.endObject(); })); @@ -250,7 +250,7 @@ public void testFieldsUnderUnknownRootField() throws Exception { public void testFieldsUnderUnmappedRootField() throws Exception { MapperService service = createMapperService(Version.fromString("5.0.0"), Settings.EMPTY, () -> false, mapping(b -> { b.startObject("name"); - b.field("type", "text"); + b.field("type", CompletionFieldMapper.CONTENT_TYPE); b.startObject("fields"); b.startObject("subfield").field("type", "keyword").endObject(); b.endObject(); From e99a176fae5421b28ea2d881a9bf488fdeb60620 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 27 Apr 2022 17:21:01 +0200 Subject: [PATCH 09/19] test fixes --- .../java/org/elasticsearch/oldrepos/OldMappingsIT.java | 8 ++++---- .../resources/org/elasticsearch/oldrepos/filebeat.json | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java index a672925a0328c..334e48f904925 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java @@ -197,7 +197,7 @@ public void testSearchOnPlaceHolderField() throws IOException { .startObject() .startObject("query") .startObject("match") - .startObject("apache2.access.agent") + .startObject("completion") .field("query", "some-agent") .endObject() .endObject() @@ -207,7 +207,7 @@ public void testSearchOnPlaceHolderField() throws IOException { ResponseException re = expectThrows(ResponseException.class, () -> entityAsMap(client().performRequest(search))); assertThat( re.getMessage(), - containsString("Field [apache2.access.agent] of type [text] in legacy index does not support match queries") + containsString("Field [completion] of type [completion] in legacy index does not support match queries") ); } @@ -218,14 +218,14 @@ public void testAggregationOnPlaceholderField() throws 
IOException { .startObject("aggs") .startObject("agents") .startObject("terms") - .field("field", "apache2.access.agent") + .field("field", "completion") .endObject() .endObject() .endObject() .endObject(); search.setJsonEntity(Strings.toString(query)); ResponseException re = expectThrows(ResponseException.class, () -> entityAsMap(client().performRequest(search))); - assertThat(re.getMessage(), containsString("can't run aggregation or sorts on field type text of legacy index")); + assertThat(re.getMessage(), containsString("can't run aggregation or sorts on field type completion of legacy index")); } } diff --git a/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json index 6fa22f1c36ef9..a5debfb988386 100644 --- a/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json +++ b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json @@ -676,6 +676,9 @@ "type": { "ignore_above": 1024, "type": "keyword" + }, + "completion": { + "type": "completion" } } } From c0e508fea8bef59c9fb9d6de1647431cd626cf6e Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Thu, 28 Apr 2022 08:40:29 +0200 Subject: [PATCH 10/19] use constant scoring --- .../index/mapper/TextFieldMapper.java | 64 ++++++++++++++++++- .../elasticsearch/oldrepos/OldMappingsIT.java | 25 ++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 5bdf7e6e1230c..f08bfa37239c4 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -50,6 +50,7 @@ import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.search.AutomatonQueries; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; +import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.common.xcontent.support.XContentMapValues; import org.elasticsearch.index.analysis.AnalyzerScope; import org.elasticsearch.index.analysis.IndexAnalyzers; @@ -330,7 +331,7 @@ protected List> getParameters() { ); } - private TextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context) { + private TextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext context, Version indexCreatedVersion) { NamedAnalyzer searchAnalyzer = analyzers.getSearchAnalyzer(); NamedAnalyzer searchQuoteAnalyzer = analyzers.getSearchQuoteAnalyzer(); if (analyzers.positionIncrementGap.isConfigured()) { @@ -341,7 +342,12 @@ private TextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext c } } TextSearchInfo tsi = new TextSearchInfo(fieldType, similarity.getValue(), searchAnalyzer, searchQuoteAnalyzer); - TextFieldType ft = new TextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); + TextFieldType ft; + if (indexCreatedVersion.isLegacyIndexVersion()) { + ft = new ConstantScoreTextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); + } else { + ft = new TextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); + } ft.eagerGlobalOrdinals = eagerGlobalOrdinals.getValue(); if (fieldData.getValue()) { ft.setFielddata(true, 
freqFilter.getValue()); @@ -431,7 +437,7 @@ public Map indexAnalyzers(String name, SubFieldInfo phras @Override public TextFieldMapper build(MapperBuilderContext context) { FieldType fieldType = TextParams.buildFieldType(index, store, indexOptions, norms, termVectors); - TextFieldType tft = buildFieldType(fieldType, context); + TextFieldType tft = buildFieldType(fieldType, context, indexCreatedVersion); SubFieldInfo phraseFieldInfo = buildPhraseInfo(fieldType, tft); SubFieldInfo prefixFieldInfo = buildPrefixInfo(context, fieldType, tft); MultiFields multiFields = multiFieldsBuilder.build(this, context); @@ -903,6 +909,58 @@ public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, S } + public static class ConstantScoreTextFieldType extends TextFieldType { + + public ConstantScoreTextFieldType(String name, boolean indexed, boolean stored, TextSearchInfo tsi, Map meta) { + super(name, indexed, stored, tsi, meta); + } + + @Override + public Query termQuery(Object value, SearchExecutionContext context) { + // Disable scoring + return new ConstantScoreQuery(super.termQuery(value, context)); + } + + @Override + public Query fuzzyQuery( + Object value, + Fuzziness fuzziness, + int prefixLength, + int maxExpansions, + boolean transpositions, + SearchExecutionContext context + ) { + // Disable scoring + return new ConstantScoreQuery(super.fuzzyQuery(value, fuzziness, prefixLength, maxExpansions, transpositions, context)); + } + + @Override + public Query phraseQuery(TokenStream stream, int slop, boolean enablePosIncrements, SearchExecutionContext queryShardContext) + throws IOException { + // Disable scoring + return new ConstantScoreQuery(super.phraseQuery(stream, slop, enablePosIncrements, queryShardContext)); + } + + @Override + public Query multiPhraseQuery( + TokenStream stream, + int slop, + boolean enablePositionIncrements, + SearchExecutionContext queryShardContext + ) throws IOException { + // Disable scoring + return new ConstantScoreQuery(super.multiPhraseQuery(stream, slop, enablePositionIncrements, queryShardContext)); + } + + @Override + public Query phrasePrefixQuery(TokenStream stream, int slop, int maxExpansions, SearchExecutionContext queryShardContext) + throws IOException { + // Disable scoring + return new ConstantScoreQuery(super.phrasePrefixQuery(stream, slop, maxExpansions, queryShardContext)); + } + + } + private final Version indexCreatedVersion; private final boolean index; private final boolean store; diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java index 334e48f904925..e8550d44d9af4 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java @@ -36,6 +36,7 @@ import java.util.stream.Collectors; import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.hasKey; import static org.hamcrest.Matchers.hasSize; public class OldMappingsIT extends ESRestTestCase { @@ -98,6 +99,7 @@ public void setupIndex() throws IOException { .startObject("apache2") .startObject("access") .field("url", "myurl1") + .field("agent", "agent1") .endObject() .endObject() .endObject(); @@ -111,6 +113,7 @@ public void setupIndex() throws IOException { .startObject("apache2") .startObject("access") .field("url", "myurl2") + .field("agent", "agent2 agent2") 
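/*
 * A minimal standalone Lucene sketch, not part of this patch (class name and the in-memory
 * index are illustrative), of the behaviour that the ConstantScoreTextFieldType above relies
 * on: wrapping any query in ConstantScoreQuery discards the similarity-based score, so every
 * matching document scores exactly 1.0, which is what testConstantScoringOnTextField below
 * asserts for match queries on a legacy text field.
 */
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.ByteBuffersDirectory;

public class ConstantScoreSketch {
    public static void main(String[] args) throws Exception {
        try (ByteBuffersDirectory dir = new ByteBuffersDirectory()) {
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                Document doc = new Document();
                doc.add(new TextField("agent", "agent2 agent2", Field.Store.NO));
                writer.addDocument(doc);
            }
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                IndexSearcher searcher = new IndexSearcher(reader);
                // Plain term query: the score comes from BM25 statistics and is generally not 1.0.
                float scored = searcher.search(new TermQuery(new Term("agent", "agent2")), 1).scoreDocs[0].score;
                // Constant-score wrapper: the similarity is bypassed and every match scores 1.0.
                float constant = searcher.search(new ConstantScoreQuery(new TermQuery(new Term("agent", "agent2"))), 1).scoreDocs[0].score;
                System.out.println("bm25=" + scored + " constant=" + constant);
            }
        }
    }
}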
.endObject() .endObject() .endObject(); @@ -228,4 +231,26 @@ public void testAggregationOnPlaceholderField() throws IOException { assertThat(re.getMessage(), containsString("can't run aggregation or sorts on field type completion of legacy index")); } + public void testConstantScoringOnTextField() throws IOException { + Request search = new Request("POST", "/" + "filebeat" + "/_search"); + XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) + .startObject() + .startObject("query") + .startObject("match") + .startObject("apache2.access.agent") + .field("query", "agent2") + .endObject() + .endObject() + .endObject() + .endObject(); + search.setJsonEntity(Strings.toString(query)); + Map response = entityAsMap(client().performRequest(search)); + List hits = (List) (XContentMapValues.extractValue("hits.hits", response)); + assertThat(hits, hasSize(1)); + @SuppressWarnings("unchecked") + Map hit = (Map) hits.get(0); + assertThat(hit, hasKey("_score")); + assertEquals(1.0d, (double) hit.get("_score"), 0.01d); + } + } From d28cea8b7a11a0eec1f3e6d732e78cd9df50e9f2 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Mon, 2 May 2022 10:54:52 +0200 Subject: [PATCH 11/19] revert change --- libs/core/src/main/java/org/elasticsearch/core/IOUtils.java | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/core/src/main/java/org/elasticsearch/core/IOUtils.java b/libs/core/src/main/java/org/elasticsearch/core/IOUtils.java index afd12b8b00015..0398418e503bc 100644 --- a/libs/core/src/main/java/org/elasticsearch/core/IOUtils.java +++ b/libs/core/src/main/java/org/elasticsearch/core/IOUtils.java @@ -314,5 +314,4 @@ public static void fsync(final Path fileToSync, final boolean isDir, final boole } } } - } From 0415143a4bcc51fea8e81b076b1e69009d1992f3 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Tue, 10 May 2022 08:56:28 +0200 Subject: [PATCH 12/19] tests --- .../index/mapper/TextFieldMapper.java | 20 ++ .../ConstantScoreTextFieldTypeTests.java | 270 ++++++++++++++++++ .../index/mapper/TextFieldMapperTests.java | 18 ++ 3 files changed, 308 insertions(+) create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index f08bfa37239c4..ed6e4b178b6fa 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -915,6 +915,26 @@ public ConstantScoreTextFieldType(String name, boolean indexed, boolean stored, super(name, indexed, stored, tsi, meta); } + public ConstantScoreTextFieldType(String name) { + this( + name, + true, + false, + new TextSearchInfo(Defaults.FIELD_TYPE, null, Lucene.STANDARD_ANALYZER, Lucene.STANDARD_ANALYZER), + Collections.emptyMap() + ); + } + + public ConstantScoreTextFieldType(String name, boolean indexed, boolean stored, Map meta) { + this( + name, + indexed, + stored, + new TextSearchInfo(Defaults.FIELD_TYPE, null, Lucene.STANDARD_ANALYZER, Lucene.STANDARD_ANALYZER), + meta + ); + } + @Override public Query termQuery(Object value, SearchExecutionContext context) { // Disable scoring diff --git a/server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java new file mode 100644 index 0000000000000..fa2947d9d0b3b --- /dev/null +++ 
b/server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java @@ -0,0 +1,270 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ +package org.elasticsearch.index.mapper; + +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.intervals.Intervals; +import org.apache.lucene.queries.intervals.IntervalsSource; +import org.apache.lucene.search.AutomatonQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.TermInSetQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.common.lucene.BytesRefs; +import org.elasticsearch.common.lucene.search.AutomatonQueries; +import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.index.mapper.TextFieldMapper.ConstantScoreTextFieldType; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.apache.lucene.search.MultiTermQuery.CONSTANT_SCORE_REWRITE; +import static org.hamcrest.Matchers.equalTo; + +public class ConstantScoreTextFieldTypeTests extends FieldTypeTestCase { + + private static ConstantScoreTextFieldType createFieldType() { + return new ConstantScoreTextFieldType("field"); + } + + public void testIsAggregatableDependsOnFieldData() { + ConstantScoreTextFieldType ft = createFieldType(); + assertFalse(ft.isAggregatable()); + ft.setFielddata(true); + assertTrue(ft.isAggregatable()); + } + + public void testTermQuery() { + MappedFieldType ft = createFieldType(); + assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field", "foo"))), ft.termQuery("foo", null)); + assertEquals(AutomatonQueries.caseInsensitiveTermQuery(new Term("field", "fOo")), ft.termQueryCaseInsensitive("fOo", null)); + + MappedFieldType unsearchable = new ConstantScoreTextFieldType("field", false, false, Collections.emptyMap()); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> unsearchable.termQuery("bar", null)); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + } + + public void testTermsQuery() { + MappedFieldType ft = createFieldType(); + List terms = new ArrayList<>(); + terms.add(new BytesRef("foo")); + terms.add(new BytesRef("bar")); + assertEquals(new TermInSetQuery("field", terms), ft.termsQuery(Arrays.asList("foo", "bar"), null)); + + MappedFieldType unsearchable = new ConstantScoreTextFieldType("field", false, false, Collections.emptyMap()); + IllegalArgumentException e = expectThrows( + 
IllegalArgumentException.class, + () -> unsearchable.termsQuery(Arrays.asList("foo", "bar"), null) + ); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + } + + public void testRangeQuery() { + MappedFieldType ft = createFieldType(); + assertEquals( + new TermRangeQuery("field", BytesRefs.toBytesRef("foo"), BytesRefs.toBytesRef("bar"), true, false), + ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT) + ); + + ElasticsearchException ee = expectThrows( + ElasticsearchException.class, + () -> ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT_DISALLOW_EXPENSIVE) + ); + assertEquals( + "[range] queries on [text] or [keyword] fields cannot be executed when " + "'search.allow_expensive_queries' is set to false.", + ee.getMessage() + ); + } + + public void testRegexpQuery() { + MappedFieldType ft = createFieldType(); + assertEquals(new RegexpQuery(new Term("field", "foo.*")), ft.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT)); + + MappedFieldType unsearchable = new ConstantScoreTextFieldType("field", false, false, Collections.emptyMap()); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> unsearchable.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT) + ); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + + ElasticsearchException ee = expectThrows( + ElasticsearchException.class, + () -> ft.regexpQuery("foo.*", randomInt(10), 0, randomInt(10) + 1, null, MOCK_CONTEXT_DISALLOW_EXPENSIVE) + ); + assertEquals("[regexp] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); + } + + public void testFuzzyQuery() { + MappedFieldType ft = createFieldType(); + assertEquals( + new ConstantScoreQuery(new FuzzyQuery(new Term("field", "foo"), 2, 1, 50, true)), + ft.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true, MOCK_CONTEXT) + ); + + MappedFieldType unsearchable = new ConstantScoreTextFieldType("field", false, false, Collections.emptyMap()); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> unsearchable.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true, MOCK_CONTEXT) + ); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + + ElasticsearchException ee = expectThrows( + ElasticsearchException.class, + () -> ft.fuzzyQuery( + "foo", + Fuzziness.AUTO, + randomInt(10) + 1, + randomInt(10) + 1, + randomBoolean(), + MOCK_CONTEXT_DISALLOW_EXPENSIVE + ) + ); + assertEquals("[fuzzy] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); + } + + public void testIndexPrefixes() { + ConstantScoreTextFieldType ft = createFieldType(); + ft.setIndexPrefixes(2, 10); + + Query q = ft.prefixQuery("goin", CONSTANT_SCORE_REWRITE, false, randomMockContext()); + assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field._index_prefix", "goin"))), q); + + q = ft.prefixQuery("internationalisatio", CONSTANT_SCORE_REWRITE, false, MOCK_CONTEXT); + assertEquals(new PrefixQuery(new Term("field", "internationalisatio")), q); + + q = ft.prefixQuery("Internationalisatio", CONSTANT_SCORE_REWRITE, true, MOCK_CONTEXT); + assertEquals(AutomatonQueries.caseInsensitivePrefixQuery(new Term("field", "Internationalisatio")), q); + + ElasticsearchException ee = expectThrows( + ElasticsearchException.class, + () -> ft.prefixQuery("internationalisatio", null, false, MOCK_CONTEXT_DISALLOW_EXPENSIVE) + 
); + assertEquals( + "[prefix] queries cannot be executed when 'search.allow_expensive_queries' is set to false. " + + "For optimised prefix queries on text fields please enable [index_prefixes].", + ee.getMessage() + ); + + q = ft.prefixQuery("g", CONSTANT_SCORE_REWRITE, false, randomMockContext()); + Automaton automaton = Operations.concatenate(Arrays.asList(Automata.makeChar('g'), Automata.makeAnyChar())); + + Query expected = new ConstantScoreQuery( + new BooleanQuery.Builder().add(new AutomatonQuery(new Term("field._index_prefix", "g*"), automaton), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "g")), BooleanClause.Occur.SHOULD) + .build() + ); + + assertThat(q, equalTo(expected)); + } + + public void testFetchSourceValue() throws IOException { + ConstantScoreTextFieldType fieldType = createFieldType(); + + assertEquals(List.of("value"), fetchSourceValue(fieldType, "value")); + assertEquals(List.of("42"), fetchSourceValue(fieldType, 42L)); + assertEquals(List.of("true"), fetchSourceValue(fieldType, true)); + } + + public void testWildcardQuery() { + ConstantScoreTextFieldType ft = createFieldType(); + + // case sensitive + AutomatonQuery actual = (AutomatonQuery) ft.wildcardQuery("*Butterflies*", null, false, MOCK_CONTEXT); + AutomatonQuery expected = new WildcardQuery(new Term("field", new BytesRef("*Butterflies*"))); + assertEquals(expected, actual); + assertFalse(new CharacterRunAutomaton(actual.getAutomaton()).run("some butterflies somewhere")); + + // case insensitive + actual = (AutomatonQuery) ft.wildcardQuery("*Butterflies*", null, true, MOCK_CONTEXT); + expected = AutomatonQueries.caseInsensitiveWildcardQuery(new Term("field", new BytesRef("*Butterflies*"))); + assertEquals(expected, actual); + assertTrue(new CharacterRunAutomaton(actual.getAutomaton()).run("some butterflies somewhere")); + assertTrue(new CharacterRunAutomaton(actual.getAutomaton()).run("some Butterflies somewhere")); + + ElasticsearchException ee = expectThrows( + ElasticsearchException.class, + () -> ft.wildcardQuery("valu*", null, MOCK_CONTEXT_DISALLOW_EXPENSIVE) + ); + assertEquals("[wildcard] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); + } + + /** + * we use this e.g. 
in query string query parser to normalize terms on text fields + */ + public void testNormalizedWildcardQuery() { + ConstantScoreTextFieldType ft = createFieldType(); + + AutomatonQuery actual = (AutomatonQuery) ft.normalizedWildcardQuery("*Butterflies*", null, MOCK_CONTEXT); + AutomatonQuery expected = new WildcardQuery(new Term("field", new BytesRef("*butterflies*"))); + assertEquals(expected, actual); + assertTrue(new CharacterRunAutomaton(actual.getAutomaton()).run("some butterflies somewhere")); + assertFalse(new CharacterRunAutomaton(actual.getAutomaton()).run("some Butterflies somewhere")); + + ElasticsearchException ee = expectThrows( + ElasticsearchException.class, + () -> ft.wildcardQuery("valu*", null, MOCK_CONTEXT_DISALLOW_EXPENSIVE) + ); + assertEquals("[wildcard] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", ee.getMessage()); + } + + public void testTermIntervals() throws IOException { + MappedFieldType ft = createFieldType(); + IntervalsSource termIntervals = ft.termIntervals(new BytesRef("foo"), MOCK_CONTEXT); + assertEquals(Intervals.term(new BytesRef("foo")), termIntervals); + } + + public void testPrefixIntervals() throws IOException { + MappedFieldType ft = createFieldType(); + IntervalsSource prefixIntervals = ft.prefixIntervals(new BytesRef("foo"), MOCK_CONTEXT); + assertEquals(Intervals.prefix(new BytesRef("foo")), prefixIntervals); + } + + public void testWildcardIntervals() throws IOException { + MappedFieldType ft = createFieldType(); + IntervalsSource wildcardIntervals = ft.wildcardIntervals(new BytesRef("foo"), MOCK_CONTEXT); + assertEquals(Intervals.wildcard(new BytesRef("foo")), wildcardIntervals); + } + + public void testFuzzyIntervals() throws IOException { + MappedFieldType ft = createFieldType(); + IntervalsSource fuzzyIntervals = ft.fuzzyIntervals("foo", 1, 2, true, MOCK_CONTEXT); + FuzzyQuery fq = new FuzzyQuery(new Term("field", "foo"), 1, 2, 128, true); + IntervalsSource expectedIntervals = Intervals.multiterm(fq.getAutomata(), "foo"); + assertEquals(expectedIntervals, fuzzyIntervals); + } + + public void testPrefixIntervalsWithIndexedPrefixes() { + ConstantScoreTextFieldType ft = createFieldType(); + ft.setIndexPrefixes(1, 4); + IntervalsSource prefixIntervals = ft.prefixIntervals(new BytesRef("foo"), MOCK_CONTEXT); + assertEquals(Intervals.fixField("field._index_prefix", Intervals.term(new BytesRef("foo"))), prefixIntervals); + } + + public void testWildcardIntervalsWithIndexedPrefixes() { + ConstantScoreTextFieldType ft = createFieldType(); + ft.setIndexPrefixes(1, 4); + IntervalsSource wildcardIntervals = ft.wildcardIntervals(new BytesRef("foo"), MOCK_CONTEXT); + assertEquals(Intervals.wildcard(new BytesRef("foo")), wildcardIntervals); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java index 7684c8e695b6a..e7f83efa2e49f 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java @@ -43,6 +43,7 @@ import org.apache.lucene.tests.analysis.MockSynonymAnalyzer; import org.apache.lucene.tests.analysis.Token; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.Version; import org.elasticsearch.common.Strings; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; import org.elasticsearch.index.IndexSettings; @@ -1087,4 +1088,21 @@ protected Object 
generateRandomInputValue(MappedFieldType ft) { protected void randomFetchTestFieldConfig(XContentBuilder b) throws IOException { assumeFalse("We don't have a way to assert things here", true); } + + public void testUnknownAnalyzerOnLegacyIndex() throws IOException { + XContentBuilder startingMapping = fieldMapping(b -> b.field("type", "text").field("analyzer", "does_not_exist")); + + expectThrows(MapperParsingException.class, () -> createMapperService(startingMapping)); + + MapperService mapperService = createMapperService(Version.fromString("5.0.0"), startingMapping); + assertThat(mapperService.documentMapper().mappers().getMapper("field"), instanceOf(TextFieldMapper.class)); + + merge(mapperService, startingMapping); + assertThat(mapperService.documentMapper().mappers().getMapper("field"), instanceOf(TextFieldMapper.class)); + + // check that analyzer can be swapped out on legacy index + XContentBuilder differentAnalyzer = fieldMapping(b -> b.field("type", "text").field("analyzer", "keyword")); + merge(mapperService, differentAnalyzer); + assertThat(mapperService.documentMapper().mappers().getMapper("field"), instanceOf(TextFieldMapper.class)); + } } From 7ef03bd054cc17e520cc69071792bce79c5dd157 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Tue, 10 May 2022 09:41:15 +0200 Subject: [PATCH 13/19] tests --- .../xpack/lucene/bwc/codecs/BWCCodec.java | 64 +++++++++++++++++++ .../bwc/codecs/lucene60/Lucene60Codec.java | 2 +- .../bwc/codecs/lucene62/Lucene62Codec.java | 2 +- .../elasticsearch/oldrepos/OldMappingsIT.java | 19 +++--- .../org/elasticsearch/oldrepos/custom.json | 23 +++++++ 5 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java index be5be0bc6a965..6a85c3533c17a 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java @@ -10,20 +10,29 @@ import org.apache.lucene.backward_codecs.lucene70.Lucene70Codec; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.BWCLucene70Codec; import java.io.IOException; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; /** @@ -136,4 +145,59 @@ public static SegmentInfo wrap(SegmentInfo segmentInfo) { return segmentInfo1; 
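/*
 * Rough standalone sketch, not taken from this patch (class and field names are made up),
 * of the Lucene contract that the EmptyPostingsFormat added just below leans on: when a
 * field has no postings in a segment, LeafReader#terms returns null and term queries on
 * that field simply match nothing instead of failing.
 */
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.ByteBuffersDirectory;

public class NoPostingsSketch {
    public static void main(String[] args) throws Exception {
        try (ByteBuffersDirectory dir = new ByteBuffersDirectory()) {
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                Document doc = new Document();
                // "body" is only stored, never indexed, so the segment holds no postings for it.
                doc.add(new StoredField("body", "some text"));
                writer.addDocument(doc);
            }
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                Terms terms = reader.leaves().get(0).reader().terms("body");
                System.out.println("terms for 'body': " + terms); // null: no inverted index for the field
                long hits = new IndexSearcher(reader).search(new TermQuery(new Term("body", "some")), 1).totalHits.value;
                System.out.println("term query hits: " + hits);   // 0
            }
        }
    }
}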
} + /** + * In-memory postings format that shows no postings available. + */ + public static class EmptyPostingsFormat extends PostingsFormat { + + public EmptyPostingsFormat() { + super("EmptyPostingsFormat"); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) { + return new FieldsConsumer() { + @Override + public void write(Fields fields, NormsProducer norms) { + throw new UnsupportedOperationException(); + } + + @Override + public void close() { + + } + }; + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) { + return new FieldsProducer() { + @Override + public void close() { + + } + + @Override + public void checkIntegrity() { + + } + + @Override + public Iterator iterator() { + return null; + } + + @Override + public Terms terms(String field) { + return null; + } + + @Override + public int size() { + return 0; + } + }; + } + } + } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java index d507d49907433..39bf8d5a431a5 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java @@ -64,7 +64,7 @@ protected PostingsFormat getPostingsFormat(String formatName) { if (formatName.equals("Lucene50")) { return new BWCLucene50PostingsFormat(); } else { - return super.getPostingsFormat(formatName); + return new EmptyPostingsFormat(); } } }; diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java index 85084317977b3..a2f3225d66e70 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java @@ -63,7 +63,7 @@ protected PostingsFormat getPostingsFormat(String formatName) { if (formatName.equals("Lucene50")) { return new BWCLucene50PostingsFormat(); } else { - return super.getPostingsFormat(formatName); + return new EmptyPostingsFormat(); } } }; diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java index e5db2380f177b..f82b640681950 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java @@ -79,9 +79,9 @@ public void setupIndex() throws IOException { String snapshotName = "snap"; List indices; if (oldVersion.before(Version.fromString("6.0.0"))) { - indices = Arrays.asList("filebeat", "winlogbeat"); + indices = Arrays.asList("filebeat", "winlogbeat", "custom"); } else { - indices = Arrays.asList("filebeat"); + indices = Arrays.asList("filebeat", "custom"); } int oldEsPort = Integer.parseInt(System.getProperty("tests.es.port")); @@ -91,8 +91,9 @@ public void setupIndex() throws IOException { if (oldVersion.before(Version.fromString("6.0.0"))) { 
assertOK(oldEs.performRequest(createIndex("winlogbeat", "winlogbeat.json"))); } + assertOK(oldEs.performRequest(createIndex("custom", "custom.json"))); - Request doc1 = new Request("PUT", "/" + "filebeat" + "/" + "doc" + "/" + "1"); + Request doc1 = new Request("PUT", "/" + "custom" + "/" + "doc" + "/" + "1"); doc1.addParameter("refresh", "true"); XContentBuilder bodyDoc1 = XContentFactory.jsonBuilder() .startObject() @@ -106,7 +107,7 @@ public void setupIndex() throws IOException { doc1.setJsonEntity(Strings.toString(bodyDoc1)); assertOK(oldEs.performRequest(doc1)); - Request doc2 = new Request("PUT", "/" + "filebeat" + "/" + "doc" + "/" + "2"); + Request doc2 = new Request("PUT", "/" + "custom" + "/" + "doc" + "/" + "2"); doc2.addParameter("refresh", "true"); XContentBuilder bodyDoc2 = XContentFactory.jsonBuilder() .startObject() @@ -178,7 +179,7 @@ public void testMappingOk() throws IOException { } public void testSearchKeyword() throws IOException { - Request search = new Request("POST", "/" + "filebeat" + "/_search"); + Request search = new Request("POST", "/" + "custom" + "/_search"); XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) .startObject() .startObject("query") @@ -196,7 +197,7 @@ public void testSearchKeyword() throws IOException { } public void testSearchOnPlaceHolderField() throws IOException { - Request search = new Request("POST", "/" + "filebeat" + "/_search"); + Request search = new Request("POST", "/" + "custom" + "/_search"); XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) .startObject() .startObject("query") @@ -216,7 +217,7 @@ public void testSearchOnPlaceHolderField() throws IOException { } public void testAggregationOnPlaceholderField() throws IOException { - Request search = new Request("POST", "/" + "filebeat" + "/_search"); + Request search = new Request("POST", "/" + "custom" + "/_search"); XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) .startObject() .startObject("aggs") @@ -233,7 +234,7 @@ public void testAggregationOnPlaceholderField() throws IOException { } public void testConstantScoringOnTextField() throws IOException { - Request search = new Request("POST", "/" + "filebeat" + "/_search"); + Request search = new Request("POST", "/" + "custom" + "/_search"); XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) .startObject() .startObject("query") @@ -255,7 +256,7 @@ public void testConstantScoringOnTextField() throws IOException { } public void testSearchFieldsOnPlaceholderField() throws IOException { - Request search = new Request("POST", "/" + "filebeat" + "/_search"); + Request search = new Request("POST", "/" + "custom" + "/_search"); XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) .startObject() .startObject("query") diff --git a/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json new file mode 100644 index 0000000000000..c9c4b34179223 --- /dev/null +++ b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json @@ -0,0 +1,23 @@ +"_default_": { + "properties": { + "apache2": { + "properties": { + "access": { + "properties": { + "agent": { + "norms": false, + "type": "text" + }, + "url": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + }, + "completion": { + "type": "completion" + } + } +} From 
1da6dd1fe5896329ad6155937b324ec3a4b6b68d Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Tue, 10 May 2022 09:47:18 +0200 Subject: [PATCH 14/19] =?UTF-8?q?fix=C3=B8=20o=20x?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../elasticsearch/oldrepos/OldRepositoryAccessIT.java | 9 --------- .../resources/org/elasticsearch/oldrepos/filebeat.json | 3 --- 2 files changed, 12 deletions(-) diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java index a6f4e59e56668..a5b0472f2f845 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java @@ -11,8 +11,6 @@ import org.elasticsearch.Version; import org.elasticsearch.action.admin.cluster.repositories.put.PutRepositoryRequest; import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotRequest; -import org.elasticsearch.action.get.GetRequest; -import org.elasticsearch.action.get.GetResponse; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.client.Request; @@ -441,13 +439,6 @@ private void assertDocs( Arrays.stream(searchResponse.getHits().getHits()).map(SearchHit::getId).collect(Collectors.toList()) ); - // look up by id (only 6.0+ as we would otherwise need ability to specify _type in GET API) - if (oldVersion.onOrAfter(Version.fromString("6.0.0"))) { - GetResponse getResponse = client.get(new GetRequest(index, id), RequestOptions.DEFAULT); - assertTrue(getResponse.isExists()); - assertEquals(sourceForDoc(getIdAsNumeric(id)), getResponse.getSourceAsString()); - } - // look up postings searchResponse = client.search( new SearchRequest(index).source(SearchSourceBuilder.searchSource().query(QueryBuilders.matchQuery("test", "test" + num))), diff --git a/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json index a5debfb988386..6fa22f1c36ef9 100644 --- a/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json +++ b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/filebeat.json @@ -676,9 +676,6 @@ "type": { "ignore_above": 1024, "type": "keyword" - }, - "completion": { - "type": "completion" } } } From 948f2a95f2c0a6e2bc9cbc68dfc9b24bd81683b7 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Tue, 10 May 2022 10:00:07 +0200 Subject: [PATCH 15/19] remove --- .../org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java index 6a85c3533c17a..036cd042f8ec0 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java @@ -138,7 +138,7 @@ public static SegmentInfo wrap(SegmentInfo segmentInfo) { codec, segmentInfo.getDiagnostics(), segmentInfo.getId(), - 
segmentInfo.getAttributes(), // adapt attributes so that per-field format codecs are overriden + segmentInfo.getAttributes(), segmentInfo.getIndexSort() ); segmentInfo1.setFiles(segmentInfo.files()); From 56d391c364c486ee935aaad75d351fe241d0b040 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Tue, 10 May 2022 14:40:50 +0200 Subject: [PATCH 16/19] no fielddata --- .../index/mapper/TextFieldMapper.java | 9 ++++--- .../ConstantScoreTextFieldTypeTests.java | 7 ------ .../index/mapper/TextFieldMapperTests.java | 25 +++++++++++++++++++ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index ed6e4b178b6fa..df86e5ae05568 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -345,12 +345,13 @@ private TextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext c TextFieldType ft; if (indexCreatedVersion.isLegacyIndexVersion()) { ft = new ConstantScoreTextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); + // ignore fieldData and eagerGlobalOrdinals } else { ft = new TextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); - } - ft.eagerGlobalOrdinals = eagerGlobalOrdinals.getValue(); - if (fieldData.getValue()) { - ft.setFielddata(true, freqFilter.getValue()); + ft.eagerGlobalOrdinals = eagerGlobalOrdinals.getValue(); + if (fieldData.getValue()) { + ft.setFielddata(true, freqFilter.getValue()); + } } return ft; } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java index fa2947d9d0b3b..e63e5e816483f 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/ConstantScoreTextFieldTypeTests.java @@ -48,13 +48,6 @@ private static ConstantScoreTextFieldType createFieldType() { return new ConstantScoreTextFieldType("field"); } - public void testIsAggregatableDependsOnFieldData() { - ConstantScoreTextFieldType ft = createFieldType(); - assertFalse(ft.isAggregatable()); - ft.setFielddata(true); - assertTrue(ft.isAggregatable()); - } - public void testTermQuery() { MappedFieldType ft = createFieldType(); assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field", "foo"))), ft.termQuery("foo", null)); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java index 00567082ceac3..0edef028c8123 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java @@ -1105,4 +1105,29 @@ public void testUnknownAnalyzerOnLegacyIndex() throws IOException { merge(mapperService, differentAnalyzer); assertThat(mapperService.documentMapper().mappers().getMapper("field"), instanceOf(TextFieldMapper.class)); } + + public void testIgnoreFieldDataOnLegacyIndex() throws IOException { + XContentBuilder mapping = fieldMapping(b -> b.field("type", "text").field("fielddata", true)); + MapperService mapperService = createMapperService(mapping); + assertTrue(((TextFieldMapper) 
mapperService.documentMapper().mappers().getMapper("field")).fieldType().fielddata()); + + mapperService = createMapperService(Version.fromString("5.0.0"), mapping); + assertFalse(((TextFieldMapper) mapperService.documentMapper().mappers().getMapper("field")).fieldType().fielddata()); + + MapperService finalMapperService = mapperService; + expectThrows( + IllegalArgumentException.class, + () -> ((TextFieldMapper) finalMapperService.documentMapper().mappers().getMapper("field")).fieldType() + .fielddataBuilder("test", null) + ); + } + + public void testIgnoreEagerGlobalOrdinalsOnLegacyIndex() throws IOException { + XContentBuilder mapping = fieldMapping(b -> b.field("type", "text").field("eager_global_ordinals", true)); + MapperService mapperService = createMapperService(mapping); + assertTrue(((TextFieldMapper) mapperService.documentMapper().mappers().getMapper("field")).fieldType().eagerGlobalOrdinals()); + + mapperService = createMapperService(Version.fromString("5.0.0"), mapping); + assertFalse(((TextFieldMapper) mapperService.documentMapper().mappers().getMapper("field")).fieldType().eagerGlobalOrdinals()); + } } From 604d70c05b677de421b18d6ab4849efa24b02607 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Tue, 10 May 2022 15:24:27 +0200 Subject: [PATCH 17/19] no spans --- .../index/mapper/TextFieldMapper.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index a1d33f63ee20d..e91a8b848ce3b 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -345,7 +345,7 @@ private TextFieldType buildFieldType(FieldType fieldType, MapperBuilderContext c TextSearchInfo tsi = new TextSearchInfo(fieldType, similarity.getValue(), searchAnalyzer, searchQuoteAnalyzer); TextFieldType ft; if (indexCreatedVersion.isLegacyIndexVersion()) { - ft = new ConstantScoreTextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); + ft = new LegacyTextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); // ignore fieldData and eagerGlobalOrdinals } else { ft = new TextFieldType(context.buildFullName(name), index.getValue(), store.getValue(), tsi, meta.getValue()); @@ -983,6 +983,19 @@ public Query phrasePrefixQuery(TokenStream stream, int slop, int maxExpansions, } + static class LegacyTextFieldType extends ConstantScoreTextFieldType { + + LegacyTextFieldType(String name, boolean indexed, boolean stored, TextSearchInfo tsi, Map meta) { + super(name, indexed, stored, tsi, meta); + } + + @Override + public SpanQuery spanPrefixQuery(String value, SpanMultiTermQueryWrapper.SpanRewriteMethod method, SearchExecutionContext context) { + throw new IllegalArgumentException("Cannot use span prefix queries on text field " + name() + " of a legacy index"); + } + + } + private final Version indexCreatedVersion; private final boolean index; private final boolean store; From 817b5a3d347f21f82d4a3550682ab01c69f5bd16 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 11 May 2022 11:31:03 +0200 Subject: [PATCH 18/19] disable norms properly --- .../index/mapper/TextFieldMapper.java | 9 ++++++++- .../elasticsearch/oldrepos/OldMappingsIT.java | 16 ++++++++++++++++ .../org/elasticsearch/oldrepos/custom.json | 1 - 3 files changed, 24 insertions(+), 2 
deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index e91a8b848ce3b..5911d3abe796c 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -438,7 +438,14 @@ public Map indexAnalyzers(String name, SubFieldInfo phras @Override public TextFieldMapper build(MapperBuilderContext context) { - FieldType fieldType = TextParams.buildFieldType(index, store, indexOptions, norms, termVectors); + FieldType fieldType = TextParams.buildFieldType( + index, + store, + indexOptions, + // legacy indices do not have access to norms + indexCreatedVersion.isLegacyIndexVersion() ? () -> false : norms, + termVectors + ); TextFieldType tft = buildFieldType(fieldType, context, indexCreatedVersion); SubFieldInfo phraseFieldInfo = buildPhraseInfo(fieldType, tft); SubFieldInfo prefixFieldInfo = buildPrefixInfo(context, fieldType, tft); diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java index f82b640681950..6f49856cdcb25 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldMappingsIT.java @@ -255,6 +255,22 @@ public void testConstantScoringOnTextField() throws IOException { assertEquals(1.0d, (double) hit.get("_score"), 0.01d); } + public void testFieldsExistQueryOnTextField() throws IOException { + Request search = new Request("POST", "/" + "custom" + "/_search"); + XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) + .startObject() + .startObject("query") + .startObject("exists") + .field("field", "apache2.access.agent") + .endObject() + .endObject() + .endObject(); + search.setJsonEntity(Strings.toString(query)); + Map response = entityAsMap(client().performRequest(search)); + List hits = (List) (XContentMapValues.extractValue("hits.hits", response)); + assertThat(hits, hasSize(2)); + } + public void testSearchFieldsOnPlaceholderField() throws IOException { Request search = new Request("POST", "/" + "custom" + "/_search"); XContentBuilder query = XContentBuilder.builder(XContentType.JSON.xContent()) diff --git a/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json index c9c4b34179223..ae52ccbcce330 100644 --- a/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json +++ b/x-pack/qa/repository-old-versions/src/test/resources/org/elasticsearch/oldrepos/custom.json @@ -5,7 +5,6 @@ "access": { "properties": { "agent": { - "norms": false, "type": "text" }, "url": { From 1f96dfc911bcbde329331f6284f57c3254d5cd81 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 11 May 2022 12:51:53 +0200 Subject: [PATCH 19/19] fix existsQuery on text fields --- .../index/mapper/TextFieldMapper.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 5911d3abe796c..2b0874ba84aba 100644 --- 
a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -46,6 +46,7 @@ import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.search.AutomatonQueries; @@ -79,6 +80,8 @@ import java.util.function.IntPredicate; import java.util.function.Supplier; +import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES; + /** A {@link FieldMapper} for full-text fields. */ public class TextFieldMapper extends FieldMapper { @@ -992,8 +995,12 @@ public Query phrasePrefixQuery(TokenStream stream, int slop, int maxExpansions, static class LegacyTextFieldType extends ConstantScoreTextFieldType { + private final MappedFieldType existQueryFieldType; + LegacyTextFieldType(String name, boolean indexed, boolean stored, TextSearchInfo tsi, Map meta) { super(name, indexed, stored, tsi, meta); + // norms are not available, neither are doc-values, so fall back to _source to run exists query + existQueryFieldType = KeywordScriptFieldType.sourceOnly(name()).asMappedFieldTypes().findFirst().get(); } @Override @@ -1001,6 +1008,16 @@ public SpanQuery spanPrefixQuery(String value, SpanMultiTermQueryWrapper.SpanRew throw new IllegalArgumentException("Cannot use span prefix queries on text field " + name() + " of a legacy index"); } + @Override + public Query existsQuery(SearchExecutionContext context) { + if (context.allowExpensiveQueries() == false) { + throw new ElasticsearchException( + "runtime-computed exists query cannot be executed while [" + ALLOW_EXPENSIVE_QUERIES.getKey() + "] is set to [false]." + ); + } + return existQueryFieldType.existsQuery(context); + } + } private final Version indexCreatedVersion;
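/*
 * Standalone sketch, not part of the patch (class and field names are illustrative), of why
 * the existsQuery override above has to fall back to a _source-based runtime field: legacy
 * text fields are indexed without norms (see the "disable norms properly" commit) and carry
 * no doc values, so there is nothing on disk for a norms- or doc-values-driven exists query
 * to read.
 */
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.store.ByteBuffersDirectory;

public class LegacyTextExistsSketch {
    public static void main(String[] args) throws Exception {
        FieldType noNorms = new FieldType();
        noNorms.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        noNorms.setTokenized(true);
        noNorms.setOmitNorms(true); // what the legacy-index branch effectively does for norms
        noNorms.freeze();

        try (ByteBuffersDirectory dir = new ByteBuffersDirectory()) {
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                Document doc = new Document();
                doc.add(new Field("agent", "agent1", noNorms));
                writer.addDocument(doc);
            }
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                LeafReader leaf = reader.leaves().get(0).reader();
                // Both per-document structures an exists query would normally consult are absent.
                System.out.println("norms: " + leaf.getNormValues("agent"));              // null
                System.out.println("doc values: " + leaf.getSortedSetDocValues("agent")); // null
            }
        }
    }
}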