/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.pt;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharArraySet;
import static org.apache.lucene.analysis.util.StemmerUtil.*;
Base class for stemmers that use a set of RSLP-like stemming steps.
RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
originally for stemming the Portuguese language, described in the paper
A Stemming Algorithm for the Portuguese Language, Orengo et. al.
Since this time a plural-only modification (RSLP-S) as well as a modification for the Galician language have been implemented. This class parses a configuration file that describes Step
s, where each Step contains a set of Rule
s.
The general rule format is:
{ "suffix", N, "replacement", { "exception1", "exception2", ...}}
where:
suffix
is the suffix to be removed (such as "inho").
N
is the min stem size, where stem is defined as the candidate stem
after removing the suffix (but before appending the replacement!)
replacement
is an optimal string to append after removing the suffix.
This can be the empty string.
exceptions
is an optional list of exceptions, patterns that should
not be stemmed. These patterns can be specified as whole word or suffix (ends-with)
patterns, depending upon the exceptions format flag in the step header.
A step is an ordered list of rules, with a structure in this format:
{ "name", N, B, { "cond1", "cond2", ... }
... rules ... };
where:
name
is a name for the step (such as "Plural").
N
is the min word size. Words that are less than this length bypass
the step completely, as an optimization. Note: N can be zero, in this case this
implementation will automatically calculate the appropriate value from the underlying
rules.
B
is a "boolean" flag specifying how exceptions in the rules are matched.
A value of 1 indicates whole-word pattern matching, a value of 0 indicates that
exceptions are actually suffixes and should be matched with ends-with.
conds
are an optional list of conditions to enter the step at all. If
the list is non-empty, then a word must end with one of these conditions or it will
bypass the step completely as an optimization.
See Also: @lucene.internal
/**
* Base class for stemmers that use a set of RSLP-like stemming steps.
* <p>
* RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
* originally for stemming the Portuguese language, described in the paper
* <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et. al.
* <p>
* Since this time a plural-only modification (RSLP-S) as well as a modification
* for the Galician language have been implemented. This class parses a configuration
* file that describes {@link Step}s, where each Step contains a set of {@link Rule}s.
* <p>
* The general rule format is:
* <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote>
* where:
* <ul>
* <li><code>suffix</code> is the suffix to be removed (such as "inho").
* <li><code>N</code> is the min stem size, where stem is defined as the candidate stem
* after removing the suffix (but before appending the replacement!)
* <li><code>replacement</code> is an optimal string to append after removing the suffix.
* This can be the empty string.
* <li><code>exceptions</code> is an optional list of exceptions, patterns that should
* not be stemmed. These patterns can be specified as whole word or suffix (ends-with)
* patterns, depending upon the exceptions format flag in the step header.
* </ul>
* <p>
* A step is an ordered list of rules, with a structure in this format:
* <blockquote>{ "name", N, B, { "cond1", "cond2", ... }
* ... rules ... };
* </blockquote>
* where:
* <ul>
* <li><code>name</code> is a name for the step (such as "Plural").
* <li><code>N</code> is the min word size. Words that are less than this length bypass
* the step completely, as an optimization. Note: N can be zero, in this case this
* implementation will automatically calculate the appropriate value from the underlying
* rules.
* <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched.
* A value of 1 indicates whole-word pattern matching, a value of 0 indicates that
* exceptions are actually suffixes and should be matched with ends-with.
* <li><code>conds</code> are an optional list of conditions to enter the step at all. If
* the list is non-empty, then a word must end with one of these conditions or it will
* bypass the step completely as an optimization.
* </ul>
* <p>
* @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a>
* @lucene.internal
*/
public abstract class RSLPStemmerBase {
A basic rule, with no exceptions.
/**
* A basic rule, with no exceptions.
*/
protected static class Rule {
protected final char suffix[];
protected final char replacement[];
protected final int min;
Create a rule.
Params: - suffix – suffix to remove
- min – minimum stem length
- replacement – replacement string
/**
* Create a rule.
* @param suffix suffix to remove
* @param min minimum stem length
* @param replacement replacement string
*/
public Rule(String suffix, int min, String replacement) {
this.suffix = suffix.toCharArray();
this.replacement = replacement.toCharArray();
this.min = min;
}
Returns: true if the word matches this rule.
/**
* @return true if the word matches this rule.
*/
public boolean matches(char s[], int len) {
return (len - suffix.length >= min && endsWith(s, len, suffix));
}
Returns: new valid length of the string after firing this rule.
/**
* @return new valid length of the string after firing this rule.
*/
public int replace(char s[], int len) {
if (replacement.length > 0) {
System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
}
return len - suffix.length + replacement.length;
}
}
A rule with a set of whole-word exceptions.
/**
* A rule with a set of whole-word exceptions.
*/
protected static class RuleWithSetExceptions extends Rule {
protected final CharArraySet exceptions;
public RuleWithSetExceptions(String suffix, int min, String replacement,
String[] exceptions) {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}
@Override
public boolean matches(char s[], int len) {
return super.matches(s, len) && !exceptions.contains(s, 0, len);
}
}
A rule with a set of exceptional suffixes.
/**
* A rule with a set of exceptional suffixes.
*/
protected static class RuleWithSuffixExceptions extends Rule {
// TODO: use a more efficient datastructure: automaton?
protected final char[][] exceptions;
public RuleWithSuffixExceptions(String suffix, int min, String replacement,
String[] exceptions) {
super(suffix, min, replacement);
for (int i = 0; i < exceptions.length; i++) {
if (!exceptions[i].endsWith(suffix))
throw new RuntimeException("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new char[exceptions.length][];
for (int i = 0; i < exceptions.length; i++)
this.exceptions[i] = exceptions[i].toCharArray();
}
@Override
public boolean matches(char s[], int len) {
if (!super.matches(s, len))
return false;
for (int i = 0; i < exceptions.length; i++)
if (endsWith(s, len, exceptions[i]))
return false;
return true;
}
}
A step containing a list of rules.
/**
* A step containing a list of rules.
*/
protected static class Step {
protected final String name;
protected final Rule rules[];
protected final int min;
protected final char[][] suffixes;
Create a new step
Params: - name – Step's name.
- rules – an ordered list of rules.
- min – minimum word size. if this is 0 it is automatically calculated.
- suffixes – optional list of conditional suffixes. may be null.
/**
* Create a new step
* @param name Step's name.
* @param rules an ordered list of rules.
* @param min minimum word size. if this is 0 it is automatically calculated.
* @param suffixes optional list of conditional suffixes. may be null.
*/
public Step(String name, Rule rules[], int min, String suffixes[]) {
this.name = name;
this.rules = rules;
if (min == 0) {
min = Integer.MAX_VALUE;
for (Rule r : rules)
min = Math.min(min, r.min + r.suffix.length);
}
this.min = min;
if (suffixes == null || suffixes.length == 0) {
this.suffixes = null;
} else {
this.suffixes = new char[suffixes.length][];
for (int i = 0; i < suffixes.length; i++)
this.suffixes[i] = suffixes[i].toCharArray();
}
}
Returns: new valid length of the string after applying the entire step.
/**
* @return new valid length of the string after applying the entire step.
*/
public int apply(char s[], int len) {
if (len < min)
return len;
if (suffixes != null) {
boolean found = false;
for (int i = 0; i < suffixes.length; i++)
if (endsWith(s, len, suffixes[i])) {
found = true;
break;
}
if (!found) return len;
}
for (int i = 0; i < rules.length; i++) {
if (rules[i].matches(s, len))
return rules[i].replace(s, len);
}
return len;
}
}
Parse a resource file into an RSLP stemmer description.
Returns: a Map containing the named Steps in this description.
/**
* Parse a resource file into an RSLP stemmer description.
* @return a Map containing the named Steps in this description.
*/
protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) {
// TODO: this parser is ugly, but works. use a jflex grammar instead.
try {
InputStream is = clazz.getResourceAsStream(resource);
LineNumberReader r = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
Map<String,Step> steps = new HashMap<>();
String step;
while ((step = readLine(r)) != null) {
Step s = parseStep(r, step);
steps.put(s.name, s);
}
r.close();
return steps;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private static final Pattern headerPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
private static final Pattern stripPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
private static final Pattern repPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
private static final Pattern excPattern =
Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
private static Step parseStep(LineNumberReader r, String header) throws IOException {
Matcher matcher = headerPattern.matcher(header);
if (!matcher.find()) {
throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
}
assert matcher.groupCount() == 4;
String name = matcher.group(1);
int min = Integer.parseInt(matcher.group(2));
int type = Integer.parseInt(matcher.group(3));
String suffixes[] = parseList(matcher.group(4));
Rule rules[] = parseRules(r, type);
return new Step(name, rules, min, suffixes);
}
private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
List<Rule> rules = new ArrayList<>();
String line;
while ((line = readLine(r)) != null) {
Matcher matcher = stripPattern.matcher(line);
if (matcher.matches()) {
rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
} else {
matcher = repPattern.matcher(line);
if (matcher.matches()) {
rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
} else {
matcher = excPattern.matcher(line);
if (matcher.matches()) {
if (type == 0) {
rules.add(new RuleWithSuffixExceptions(matcher.group(1),
Integer.parseInt(matcher.group(2)),
matcher.group(3),
parseList(matcher.group(4))));
} else {
rules.add(new RuleWithSetExceptions(matcher.group(1),
Integer.parseInt(matcher.group(2)),
matcher.group(3),
parseList(matcher.group(4))));
}
} else {
throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
}
}
}
if (line.endsWith(";"))
return rules.toArray(new Rule[rules.size()]);
}
return null;
}
private static String[] parseList(String s) {
if (s.length() == 0)
return null;
String list[] = s.split(",");
for (int i = 0; i < list.length; i++)
list[i] = parseString(list[i].trim());
return list;
}
private static String parseString(String s) {
return s.substring(1, s.length()-1);
}
private static String readLine(LineNumberReader r) throws IOException {
String line = null;
while ((line = r.readLine()) != null) {
line = line.trim();
if (line.length() > 0 && line.charAt(0) != '#')
return line;
}
return line;
}
}