/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest;

import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;



Dictionary with terms, weights, payload (optional) and contexts (optional) information taken from stored/indexed fields in a Lucene index.

NOTE:
  • The term field has to be stored; if it is missing, the document is skipped.
  • The payload and contexts fields are optional and are not required to be stored.
  • The weight field can be stored or can be a NumericDocValues. If the weight field is not defined, the value of the weight is 0.
/** * <p> * Dictionary with terms, weights, payload (optional) and contexts (optional) * information taken from stored/indexed fields in a Lucene index. * </p> * <b>NOTE:</b> * <ul> * <li> * The term field has to be stored; if it is missing, the document is skipped. * </li> * <li> * The payload and contexts field are optional and are not required to be stored. * </li> * <li> * The weight field can be stored or can be a {@link NumericDocValues}. * If the weight field is not defined, the value of the weight is <code>0</code> * </li> * </ul> */
public class DocumentDictionary implements Dictionary {
IndexReader to load documents from
/** {@link IndexReader} to load documents from */
protected final IndexReader reader;
Field to read payload from
/** Field to read payload from */
protected final String payloadField;
Field to read contexts from
/** Field to read contexts from */
protected final String contextsField; private final String field; private final String weightField;
Creates a new dictionary with the contents of the fields named field for the terms and weightField for the weights that will be used for the corresponding terms.
/** * Creates a new dictionary with the contents of the fields named <code>field</code> * for the terms and <code>weightField</code> for the weights that will be used for * the corresponding terms. */
public DocumentDictionary(IndexReader reader, String field, String weightField) { this(reader, field, weightField, null); }
Creates a new dictionary with the contents of the fields named field for the terms, weightField for the weights that will be used for the the corresponding terms and payloadField for the corresponding payloads for the entry.
/** * Creates a new dictionary with the contents of the fields named <code>field</code> * for the terms, <code>weightField</code> for the weights that will be used for the * the corresponding terms and <code>payloadField</code> for the corresponding payloads * for the entry. */
public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField) { this(reader, field, weightField, payloadField, null); }
Creates a new dictionary with the contents of the fields named field for the terms, weightField for the weights that will be used for the the corresponding terms, payloadField for the corresponding payloads for the entry and contextsField for associated contexts.
/** * Creates a new dictionary with the contents of the fields named <code>field</code> * for the terms, <code>weightField</code> for the weights that will be used for the * the corresponding terms, <code>payloadField</code> for the corresponding payloads * for the entry and <code>contextsField</code> for associated contexts. */
public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField, String contextsField) { this.reader = reader; this.field = field; this.weightField = weightField; this.payloadField = payloadField; this.contextsField = contextsField; } @Override public InputIterator getEntryIterator() throws IOException { return new DocumentInputIterator(payloadField!=null, contextsField!=null); }
Implements InputIterator from stored fields.
/** Implements {@link InputIterator} from stored fields. */
protected class DocumentInputIterator implements InputIterator { private final int docCount; private final Set<String> relevantFields; private final boolean hasPayloads; private final boolean hasContexts; private final Bits liveDocs; private int currentDocId = -1; private long currentWeight = 0; private BytesRef currentPayload = null; private Set<BytesRef> currentContexts; private final NumericDocValues weightValues; IndexableField[] currentDocFields = new IndexableField[0]; int nextFieldsPosition = 0;
Creates an iterator over term, weight and payload fields from the lucene index. setting withPayload to false, implies an iterator over only term and weight.
/** * Creates an iterator over term, weight and payload fields from the lucene * index. setting <code>withPayload</code> to false, implies an iterator * over only term and weight. */
public DocumentInputIterator(boolean hasPayloads, boolean hasContexts) throws IOException { this.hasPayloads = hasPayloads; this.hasContexts = hasContexts; docCount = reader.maxDoc() - 1; weightValues = (weightField != null) ? MultiDocValues.getNumericValues(reader, weightField) : null; liveDocs = (reader.leaves().size() > 0) ? MultiBits.getLiveDocs(reader) : null; relevantFields = getRelevantFields(new String [] {field, weightField, payloadField, contextsField}); } @Override public long weight() { return currentWeight; } @Override public BytesRef next() throws IOException { while (true) { if (nextFieldsPosition < currentDocFields.length) { // Still values left from the document IndexableField fieldValue = currentDocFields[nextFieldsPosition++]; if (fieldValue.binaryValue() != null) { return fieldValue.binaryValue(); } else if (fieldValue.stringValue() != null) { return new BytesRef(fieldValue.stringValue()); } else { continue; } } if (currentDocId == docCount) { // Iterated over all the documents. 
break; } currentDocId++; if (liveDocs != null && !liveDocs.get(currentDocId)) { continue; } Document doc = reader.document(currentDocId, relevantFields); BytesRef tempPayload = null; if (hasPayloads) { IndexableField payload = doc.getField(payloadField); if (payload != null) { if (payload.binaryValue() != null) { tempPayload = payload.binaryValue(); } else if (payload.stringValue() != null) { tempPayload = new BytesRef(payload.stringValue()); } } // in case that the iterator has payloads configured, use empty values // instead of null for payload if (tempPayload == null) { tempPayload = new BytesRef(); } } Set<BytesRef> tempContexts; if (hasContexts) { tempContexts = new HashSet<>(); final IndexableField[] contextFields = doc.getFields(contextsField); for (IndexableField contextField : contextFields) { if (contextField.binaryValue() != null) { tempContexts.add(contextField.binaryValue()); } else if (contextField.stringValue() != null) { tempContexts.add(new BytesRef(contextField.stringValue())); } else { continue; } } } else { tempContexts = Collections.emptySet(); } currentDocFields = doc.getFields(field); nextFieldsPosition = 0; if (currentDocFields.length == 0) { // no values in this document continue; } IndexableField fieldValue = currentDocFields[nextFieldsPosition++]; BytesRef tempTerm; if (fieldValue.binaryValue() != null) { tempTerm = fieldValue.binaryValue(); } else if (fieldValue.stringValue() != null) { tempTerm = new BytesRef(fieldValue.stringValue()); } else { continue; } currentPayload = tempPayload; currentContexts = tempContexts; currentWeight = getWeight(doc, currentDocId); return tempTerm; } return null; } @Override public BytesRef payload() { return currentPayload; } @Override public boolean hasPayloads() { return hasPayloads; }
Returns the value of the weightField for the current document. Retrieves the value for the weightField if it's stored (using doc) or if it's indexed as NumericDocValues (using docId) for the document. If no value is found, then the weight is 0.
/** * Returns the value of the <code>weightField</code> for the current document. * Retrieves the value for the <code>weightField</code> if it's stored (using <code>doc</code>) * or if it's indexed as {@link NumericDocValues} (using <code>docId</code>) for the document. * If no value is found, then the weight is 0. */
protected long getWeight(Document doc, int docId) throws IOException { IndexableField weight = doc.getField(weightField); if (weight != null) { // found weight as stored return (weight.numericValue() != null) ? weight.numericValue().longValue() : 0; } else if (weightValues != null) { // found weight as NumericDocValue if (weightValues.docID() < docId) { weightValues.advance(docId); } if (weightValues.docID() == docId) { return weightValues.longValue(); } else { // missing return 0; } } else { // fall back return 0; } } private Set<String> getRelevantFields(String... fields) { Set<String> relevantFields = new HashSet<>(); for (String relevantField : fields) { if (relevantField != null) { relevantFields.add(relevantField); } } return relevantFields; } @Override public Set<BytesRef> contexts() { if (hasContexts) { return currentContexts; } return null; } @Override public boolean hasContexts() { return hasContexts; } } }