/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ckb;

import static org.apache.lucene.analysis.util.StemmerUtil.delete;

Normalizes the Unicode representation of Sorani text.

Normalization consists of:

  • Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)
  • Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)
  • Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)
  • Alternate (joining) form of 'h' (06BE) is converted to 0647
  • Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)
  • Harakat, tatweel, and formatting characters such as directional controls are removed.
/** * Normalizes the Unicode representation of Sorani text. * <p> * Normalization consists of: * <ul> * <li>Alternate forms of 'y' (0064, 0649) are converted to 06CC (FARSI YEH) * <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH) * <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE) * <li>Alternate (joining) form of 'h' (06BE) is converted to 0647 * <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW) * <li>Harakat, tatweel, and formatting characters such as directional controls are removed. * </ul> */
public class SoraniNormalizer { static final char YEH = '\u064A'; static final char DOTLESS_YEH = '\u0649'; static final char FARSI_YEH = '\u06CC'; static final char KAF = '\u0643'; static final char KEHEH = '\u06A9'; static final char HEH = '\u0647'; static final char AE = '\u06D5'; static final char ZWNJ = '\u200C'; static final char HEH_DOACHASHMEE = '\u06BE'; static final char TEH_MARBUTA = '\u0629'; static final char REH = '\u0631'; static final char RREH = '\u0695'; static final char RREH_ABOVE = '\u0692'; static final char TATWEEL = '\u0640'; static final char FATHATAN = '\u064B'; static final char DAMMATAN = '\u064C'; static final char KASRATAN = '\u064D'; static final char FATHA = '\u064E'; static final char DAMMA = '\u064F'; static final char KASRA = '\u0650'; static final char SHADDA = '\u0651'; static final char SUKUN = '\u0652';
Normalize an input buffer of Sorani text
Params:
  • s – input buffer
  • len – length of input buffer
Returns:length of input buffer after normalization
/** * Normalize an input buffer of Sorani text * * @param s input buffer * @param len length of input buffer * @return length of input buffer after normalization */
public int normalize(char s[], int len) { for (int i = 0; i < len; i++) { switch (s[i]) { case YEH: case DOTLESS_YEH: s[i] = FARSI_YEH; break; case KAF: s[i] = KEHEH; break; case ZWNJ: if (i > 0 && s[i-1] == HEH) { s[i-1] = AE; } len = delete(s, i, len); i--; break; case HEH: if (i == len-1) { s[i] = AE; } break; case TEH_MARBUTA: s[i] = AE; break; case HEH_DOACHASHMEE: s[i] = HEH; break; case REH: if (i == 0) { s[i] = RREH; } break; case RREH_ABOVE: s[i] = RREH; break; case TATWEEL: case KASRATAN: case DAMMATAN: case FATHATAN: case FATHA: case DAMMA: case KASRA: case SHADDA: case SUKUN: len = delete(s, i, len); i--; break; default: if (Character.getType(s[i]) == Character.FORMAT) { len = delete(s, i, len); i--; } } } return len; } }