/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pt;


import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.CharArraySet;

import static org.apache.lucene.analysis.util.StemmerUtil.*;

Base class for stemmers that use a set of RSLP-like stemming steps.

RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed originally for stemming the Portuguese language, described in the paper A Stemming Algorithm for the Portuguese Language, Orengo et. al.

Since this time a plural-only modification (RSLP-S) as well as a modification for the Galician language have been implemented. This class parses a configuration file that describes Steps, where each Step contains a set of Rules.

The general rule format is:

{ "suffix", N, "replacement", { "exception1", "exception2", ...}}
where:
  • suffix is the suffix to be removed (such as "inho").
  • N is the min stem size, where stem is defined as the candidate stem after removing the suffix (but before appending the replacement!)
  • replacement is an optimal string to append after removing the suffix. This can be the empty string.
  • exceptions is an optional list of exceptions, patterns that should not be stemmed. These patterns can be specified as whole word or suffix (ends-with) patterns, depending upon the exceptions format flag in the step header.

A step is an ordered list of rules, with a structure in this format:

{ "name", N, B, { "cond1", "cond2", ... } ... rules ... };
where:
  • name is a name for the step (such as "Plural").
  • N is the min word size. Words that are less than this length bypass the step completely, as an optimization. Note: N can be zero, in this case this implementation will automatically calculate the appropriate value from the underlying rules.
  • B is a "boolean" flag specifying how exceptions in the rules are matched. A value of 1 indicates whole-word pattern matching, a value of 0 indicates that exceptions are actually suffixes and should be matched with ends-with.
  • conds are an optional list of conditions to enter the step at all. If the list is non-empty, then a word must end with one of these conditions or it will bypass the step completely as an optimization.

See Also:
@lucene.internal
/** * Base class for stemmers that use a set of RSLP-like stemming steps. * <p> * RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed * originally for stemming the Portuguese language, described in the paper * <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et. al. * <p> * Since this time a plural-only modification (RSLP-S) as well as a modification * for the Galician language have been implemented. This class parses a configuration * file that describes {@link Step}s, where each Step contains a set of {@link Rule}s. * <p> * The general rule format is: * <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote> * where: * <ul> * <li><code>suffix</code> is the suffix to be removed (such as "inho"). * <li><code>N</code> is the min stem size, where stem is defined as the candidate stem * after removing the suffix (but before appending the replacement!) * <li><code>replacement</code> is an optimal string to append after removing the suffix. * This can be the empty string. * <li><code>exceptions</code> is an optional list of exceptions, patterns that should * not be stemmed. These patterns can be specified as whole word or suffix (ends-with) * patterns, depending upon the exceptions format flag in the step header. * </ul> * <p> * A step is an ordered list of rules, with a structure in this format: * <blockquote>{ "name", N, B, { "cond1", "cond2", ... } * ... rules ... }; * </blockquote> * where: * <ul> * <li><code>name</code> is a name for the step (such as "Plural"). * <li><code>N</code> is the min word size. Words that are less than this length bypass * the step completely, as an optimization. Note: N can be zero, in this case this * implementation will automatically calculate the appropriate value from the underlying * rules. * <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched. * A value of 1 indicates whole-word pattern matching, a value of 0 indicates that * exceptions are actually suffixes and should be matched with ends-with. * <li><code>conds</code> are an optional list of conditions to enter the step at all. If * the list is non-empty, then a word must end with one of these conditions or it will * bypass the step completely as an optimization. * </ul> * <p> * @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a> * @lucene.internal */
public abstract class RSLPStemmerBase {
A basic rule, with no exceptions.
/** * A basic rule, with no exceptions. */
protected static class Rule { protected final char suffix[]; protected final char replacement[]; protected final int min;
Create a rule.
Params:
  • suffix – suffix to remove
  • min – minimum stem length
  • replacement – replacement string
/** * Create a rule. * @param suffix suffix to remove * @param min minimum stem length * @param replacement replacement string */
public Rule(String suffix, int min, String replacement) { this.suffix = suffix.toCharArray(); this.replacement = replacement.toCharArray(); this.min = min; }
Returns:true if the word matches this rule.
/** * @return true if the word matches this rule. */
public boolean matches(char s[], int len) { return (len - suffix.length >= min && endsWith(s, len, suffix)); }
Returns:new valid length of the string after firing this rule.
/** * @return new valid length of the string after firing this rule. */
public int replace(char s[], int len) { if (replacement.length > 0) { System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length); } return len - suffix.length + replacement.length; } }
A rule with a set of whole-word exceptions.
/** * A rule with a set of whole-word exceptions. */
protected static class RuleWithSetExceptions extends Rule { protected final CharArraySet exceptions; public RuleWithSetExceptions(String suffix, int min, String replacement, String[] exceptions) { super(suffix, min, replacement); for (int i = 0; i < exceptions.length; i++) { if (!exceptions[i].endsWith(suffix)) throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'"); } this.exceptions = new CharArraySet(Arrays.asList(exceptions), false); } @Override public boolean matches(char s[], int len) { return super.matches(s, len) && !exceptions.contains(s, 0, len); } }
A rule with a set of exceptional suffixes.
/** * A rule with a set of exceptional suffixes. */
protected static class RuleWithSuffixExceptions extends Rule { // TODO: use a more efficient datastructure: automaton? protected final char[][] exceptions; public RuleWithSuffixExceptions(String suffix, int min, String replacement, String[] exceptions) { super(suffix, min, replacement); for (int i = 0; i < exceptions.length; i++) { if (!exceptions[i].endsWith(suffix)) throw new RuntimeException("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'"); } this.exceptions = new char[exceptions.length][]; for (int i = 0; i < exceptions.length; i++) this.exceptions[i] = exceptions[i].toCharArray(); } @Override public boolean matches(char s[], int len) { if (!super.matches(s, len)) return false; for (int i = 0; i < exceptions.length; i++) if (endsWith(s, len, exceptions[i])) return false; return true; } }
A step containing a list of rules.
/** * A step containing a list of rules. */
protected static class Step { protected final String name; protected final Rule rules[]; protected final int min; protected final char[][] suffixes;
Create a new step
Params:
  • name – Step's name.
  • rules – an ordered list of rules.
  • min – minimum word size. if this is 0 it is automatically calculated.
  • suffixes – optional list of conditional suffixes. may be null.
/** * Create a new step * @param name Step's name. * @param rules an ordered list of rules. * @param min minimum word size. if this is 0 it is automatically calculated. * @param suffixes optional list of conditional suffixes. may be null. */
public Step(String name, Rule rules[], int min, String suffixes[]) { this.name = name; this.rules = rules; if (min == 0) { min = Integer.MAX_VALUE; for (Rule r : rules) min = Math.min(min, r.min + r.suffix.length); } this.min = min; if (suffixes == null || suffixes.length == 0) { this.suffixes = null; } else { this.suffixes = new char[suffixes.length][]; for (int i = 0; i < suffixes.length; i++) this.suffixes[i] = suffixes[i].toCharArray(); } }
Returns:new valid length of the string after applying the entire step.
/** * @return new valid length of the string after applying the entire step. */
public int apply(char s[], int len) { if (len < min) return len; if (suffixes != null) { boolean found = false; for (int i = 0; i < suffixes.length; i++) if (endsWith(s, len, suffixes[i])) { found = true; break; } if (!found) return len; } for (int i = 0; i < rules.length; i++) { if (rules[i].matches(s, len)) return rules[i].replace(s, len); } return len; } }
Parse a resource file into an RSLP stemmer description.
Returns:a Map containing the named Steps in this description.
/** * Parse a resource file into an RSLP stemmer description. * @return a Map containing the named Steps in this description. */
protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) { // TODO: this parser is ugly, but works. use a jflex grammar instead. try { InputStream is = clazz.getResourceAsStream(resource); LineNumberReader r = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8)); Map<String,Step> steps = new HashMap<>(); String step; while ((step = readLine(r)) != null) { Step s = parseStep(r, step); steps.put(s.name, s); } r.close(); return steps; } catch (IOException e) { throw new RuntimeException(e); } } private static final Pattern headerPattern = Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$"); private static final Pattern stripPattern = Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$"); private static final Pattern repPattern = Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$"); private static final Pattern excPattern = Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$"); private static Step parseStep(LineNumberReader r, String header) throws IOException { Matcher matcher = headerPattern.matcher(header); if (!matcher.find()) { throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber()); } assert matcher.groupCount() == 4; String name = matcher.group(1); int min = Integer.parseInt(matcher.group(2)); int type = Integer.parseInt(matcher.group(3)); String suffixes[] = parseList(matcher.group(4)); Rule rules[] = parseRules(r, type); return new Step(name, rules, min, suffixes); } private static Rule[] parseRules(LineNumberReader r, int type) throws IOException { List<Rule> rules = new ArrayList<>(); String line; while ((line = readLine(r)) != null) { Matcher matcher = stripPattern.matcher(line); if (matcher.matches()) { rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), "")); } else { matcher = repPattern.matcher(line); if (matcher.matches()) { rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3))); } else { matcher = excPattern.matcher(line); if (matcher.matches()) { if (type == 0) { rules.add(new RuleWithSuffixExceptions(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3), parseList(matcher.group(4)))); } else { rules.add(new RuleWithSetExceptions(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3), parseList(matcher.group(4)))); } } else { throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber()); } } } if (line.endsWith(";")) return rules.toArray(new Rule[rules.size()]); } return null; } private static String[] parseList(String s) { if (s.length() == 0) return null; String list[] = s.split(","); for (int i = 0; i < list.length; i++) list[i] = parseString(list[i].trim()); return list; } private static String parseString(String s) { return s.substring(1, s.length()-1); } private static String readLine(LineNumberReader r) throws IOException { String line = null; while ((line = r.readLine()) != null) { line = line.trim(); if (line.length() > 0 && line.charAt(0) != '#') return line; } return line; } }