/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;

/**
 * Converts an incoming graph token stream, such as one from
 * {@link SynonymGraphFilter}, into a flat form so that
 * all nodes form a single linear chain with no side paths.  Every
 * path through the graph touches every node.  This is necessary
 * when indexing a graph token stream, because the index does not
 * save {@link PositionLengthAttribute} and so it cannot
 * preserve the graph structure.  However, at search time,
 * query parsers can correctly handle the graph and this token
 * filter should <b>not</b> be used.
 *
 * <p>If the graph was not already flat to start, this
 * is likely a lossy process, i.e. it will often cause the
 * graph to accept token sequences it should not, and to
 * reject token sequences it should not.
 *
 * <p>However, when applying synonyms during indexing, this
 * is necessary because Lucene already does not index a graph
 * and so the indexing process is already lossy
 * (it ignores the {@link PositionLengthAttribute}).
 *
 * @lucene.experimental
 */
public final class FlattenGraphFilter extends TokenFilter {
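  // Illustrative usage, not part of this class: at index time, apply this filter as the
  // last stage of an analysis chain that produces a graph, and leave it out of the
  // search-time analyzer, which can consume the graph directly.  The tokenizer choice
  // and the SynonymMap named "synonyms" below are assumptions for the sketch.  For
  // example, with a rule like dns => domain name service, SynonymGraphFilter emits a
  // graph; after flattening, all tokens sit on a single linear chain of positions,
  // which is what the index can store.
  //
  //   Analyzer indexAnalyzer = new Analyzer() {
  //     @Override
  //     protected TokenStreamComponents createComponents(String fieldName) {
  //       Tokenizer tokenizer = new WhitespaceTokenizer();
  //       TokenStream stream = new SynonymGraphFilter(tokenizer, synonyms, true);
  //       stream = new FlattenGraphFilter(stream);  // flatten only at index time
  //       return new TokenStreamComponents(tokenizer, stream);
  //     }
  //   };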
  /** Holds all tokens leaving a given input position. */
  private final static class InputNode implements RollingBuffer.Resettable {
    private final List<AttributeSource.State> tokens = new ArrayList<>();
    /** Our input node, or -1 if we haven't been assigned yet */
    int node = -1;
    /** Maximum to input node for all tokens leaving here; we use this
     *  to know when we can freeze. */
    int maxToNode = -1;
    /** Where we currently map to; this changes (can only
     *  increase as we see more input tokens), until we are finished
     *  with this position. */
    int outputNode = -1;
    /** Which token (index into {@link #tokens}) we will next output. */
    int nextOut;

    @Override
    public void reset() {
      tokens.clear();
      node = -1;
      outputNode = -1;
      maxToNode = -1;
      nextOut = 0;
    }
  }
/** Gathers up merged input positions into a single output position, * only for the current "frontier" of nodes we've seen but can't yet * output because they are not frozen. */
private final static class OutputNode implements RollingBuffer.Resettable { private final List<Integer> inputNodes = new ArrayList<>();
    /** Node ID for this output, or -1 if we haven't been assigned yet. */
    int node = -1;
    /** Which input node (index into {@link #inputNodes}) we will next output. */
    int nextOut;
    /** Start offset of tokens leaving this node. */
    int startOffset = -1;
    /** End offset of tokens arriving to this node. */
    int endOffset = -1;

    @Override
    public void reset() {
      inputNodes.clear();
      node = -1;
      nextOut = 0;
      startOffset = -1;
      endOffset = -1;
    }
  }

  private final RollingBuffer<InputNode> inputNodes = new RollingBuffer<InputNode>() {
    @Override
    protected InputNode newInstance() {
      return new InputNode();
    }
  };

  private final RollingBuffer<OutputNode> outputNodes = new RollingBuffer<OutputNode>() {
    @Override
    protected OutputNode newInstance() {
      return new OutputNode();
    }
  };

  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  /** Which input node the last seen token leaves from */
  private int inputFrom;
  /** We are currently releasing tokens leaving from this output node */
  private int outputFrom;

  // for debugging:
  //private int retOutputFrom;

  private boolean done;

  private int lastOutputFrom;

  private int finalOffset;

  private int finalPosInc;

  private int maxLookaheadUsed;

  private int lastStartOffset;

  public FlattenGraphFilter(TokenStream in) {
    super(in);
  }

  /** Tries to release one buffered token: if the next output node is frozen (no
   *  future input token can still arrive at it), restores that token's state with
   *  corrected posInc/posLen/offsets and returns true; otherwise returns false. */
  private boolean releaseBufferedToken() {

    // We only need the while loop (retry) if we have a hole (an output node that has no tokens leaving):
    while (outputFrom < outputNodes.getMaxPos()) {
      OutputNode output = outputNodes.get(outputFrom);
      if (output.inputNodes.isEmpty()) {
        // No tokens arrived to this node, which happens for the first node
        // after a hole:
        //System.out.println("  skip empty outputFrom=" + outputFrom);
        outputFrom++;
        continue;
      }

      int maxToNode = -1;
      for(int inputNodeID : output.inputNodes) {
        InputNode inputNode = inputNodes.get(inputNodeID);
        assert inputNode.outputNode == outputFrom;
        maxToNode = Math.max(maxToNode, inputNode.maxToNode);
      }
      //System.out.println("  release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);

      // TODO: we could shrink the frontier here somewhat if we
      // always output posLen=1 as part of our "sausagizing":
      if (maxToNode <= inputFrom || done) {
        //System.out.println("  output node merged these inputs: " + output.inputNodes);
        // These tokens are now frozen
        assert output.nextOut < output.inputNodes.size(): "output.nextOut=" + output.nextOut + " vs output.inputNodes.size()=" + output.inputNodes.size();
        InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
        if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
          return false;
        }
        if (inputNode.tokens.size() == 0) {
          assert inputNode.nextOut == 0;
          assert output.nextOut == 0;
          // Hole dest nodes should never be merged since 1) we always
          // assign them to a new output position, and 2) since they never
          // have arriving tokens they cannot be pushed:
          assert output.inputNodes.size() == 1: output.inputNodes.size();
          outputFrom++;
          inputNodes.freeBefore(output.inputNodes.get(0));
          outputNodes.freeBefore(outputFrom);
          continue;
        }

        assert inputNode.nextOut < inputNode.tokens.size();

        restoreState(inputNode.tokens.get(inputNode.nextOut));

        // Correct posInc
        assert outputFrom >= lastOutputFrom;
        posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
        int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
        InputNode toInputNode = inputNodes.get(toInputNodeID);

        // Correct posLen
        assert toInputNode.outputNode > outputFrom;
        posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
        lastOutputFrom = outputFrom;
        inputNode.nextOut++;
        //System.out.println("  ret " + this);

        OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);

        // Correct offsets

        // This is a bit messy; we must do this so offsets don't go backwards,
        // which would otherwise happen if the replacement has more tokens
        // than the input:
        int startOffset = Math.max(lastStartOffset, output.startOffset);

        // We must do this in case the incoming tokens have broken offsets:
        int endOffset = Math.max(startOffset, outputEndNode.endOffset);

        offsetAtt.setOffset(startOffset, endOffset);
        lastStartOffset = startOffset;

        if (inputNode.nextOut == inputNode.tokens.size()) {
          output.nextOut++;
          if (output.nextOut == output.inputNodes.size()) {
            outputFrom++;
            inputNodes.freeBefore(output.inputNodes.get(0));
            outputNodes.freeBefore(outputFrom);
          }
        }

        return true;
      } else {
        return false;
      }
    }

    //System.out.println("    break false");
    return false;
  }

  @Override
  public boolean incrementToken() throws IOException {
//System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom); while (true) { if (releaseBufferedToken()) { //retOutputFrom += posIncAtt.getPositionIncrement(); //System.out.println(" return buffered: " + termAtt + " " + retOutputFrom + "-" + (retOutputFrom + posLenAtt.getPositionLength())); //printStates(); return true; } else if (done) { //System.out.println(" done, return false"); return false; } if (input.incrementToken()) { // Input node this token leaves from: inputFrom += posIncAtt.getPositionIncrement(); int startOffset = offsetAtt.startOffset(); int endOffset = offsetAtt.endOffset(); // Input node this token goes to: int inputTo = inputFrom + posLenAtt.getPositionLength(); //System.out.println(" input.inc " + termAtt + ": " + inputFrom + "-" + inputTo); InputNode src = inputNodes.get(inputFrom); if (src.node == -1) { // This means the "from" node of this token was never seen as a "to" node, // which should only happen if we just crossed a hole. This is a challenging // case for us because we normally rely on the full dependencies expressed // by the arcs to assign outgoing node IDs. It would be better if tokens // were never dropped but instead just marked deleted with a new // TermDeletedAttribute (boolean valued) ... but until that future, we have // a hack here to forcefully jump the output node ID: assert src.outputNode == -1; src.node = inputFrom; src.outputNode = outputNodes.getMaxPos() + 1; //System.out.println(" hole: force to outputNode=" + src.outputNode); OutputNode outSrc = outputNodes.get(src.outputNode); // Not assigned yet: assert outSrc.node == -1; outSrc.node = src.outputNode; outSrc.inputNodes.add(inputFrom); outSrc.startOffset = startOffset; } else { OutputNode outSrc = outputNodes.get(src.outputNode); if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) { // "shrink wrap" the offsets so the original tokens (with most // restrictive offsets) win: outSrc.startOffset = Math.max(startOffset, outSrc.startOffset); } } // Buffer this token: src.tokens.add(captureState()); src.maxToNode = Math.max(src.maxToNode, inputTo); maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize()); InputNode dest = inputNodes.get(inputTo); if (dest.node == -1) { // Common case: first time a token is arriving to this input position: dest.node = inputTo; } // Always number output nodes sequentially: int outputEndNode = src.outputNode + 1; if (outputEndNode > dest.outputNode) { if (dest.outputNode != -1) { boolean removed = outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo)); assert removed; } //System.out.println(" increase output node: " + dest.outputNode + " vs " + outputEndNode); outputNodes.get(outputEndNode).inputNodes.add(inputTo); dest.outputNode = outputEndNode; // Since all we ever do is merge incoming nodes together, and then renumber // the merged nodes sequentially, we should only ever assign smaller node // numbers: assert outputEndNode <= inputTo: "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo; } OutputNode outDest = outputNodes.get(dest.outputNode); // "shrink wrap" the offsets so the original tokens (with most // restrictive offsets) win: if (outDest.endOffset == -1 || endOffset < outDest.endOffset) { outDest.endOffset = endOffset; } } else { //System.out.println(" got false from input"); input.end(); finalPosInc = posIncAtt.getPositionIncrement(); finalOffset = offsetAtt.endOffset(); done = true; // Don't return false here: we need to force release any buffered tokens now } } } 
  // Only for debugging:
  /*
  private void printStates() {
    System.out.println("states:");
    for(int i=outputFrom;i<outputNodes.getMaxPos();i++) {
      OutputNode outputNode = outputNodes.get(i);
      System.out.println("  output " + i + ": inputs " + outputNode.inputNodes);
      for(int inputNodeID : outputNode.inputNodes) {
        InputNode inputNode = inputNodes.get(inputNodeID);
        assert inputNode.outputNode == i;
      }
    }
  }
  */

  @Override
  public void end() throws IOException {
    if (done == false) {
      super.end();
    } else {
      // NOTE, shady: don't call super.end, because we did already from incrementToken
    }

    clearAttributes();
    if (done) {
      // On exc, done is false, and we will not have set these:
      posIncAtt.setPositionIncrement(finalPosInc);
      offsetAtt.setOffset(finalOffset, finalOffset);
    }
  }

  @Override
  public void reset() throws IOException {
    //System.out.println("F: reset");
    super.reset();
    inputFrom = -1;
    inputNodes.reset();
    InputNode in = inputNodes.get(0);
    in.node = 0;
    in.outputNode = 0;
    outputNodes.reset();
    OutputNode out = outputNodes.get(0);
    out.node = 0;
    out.inputNodes.add(0);
    out.startOffset = 0;
    outputFrom = 0;
    //retOutputFrom = -1;
    lastOutputFrom = -1;
    done = false;
    finalPosInc = -1;
    finalOffset = -1;
    lastStartOffset = 0;
    maxLookaheadUsed = 0;
  }
  /** For testing */
  public int getMaxLookaheadUsed() {
    return maxLookaheadUsed;
  }
}