/*
 * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.util.regex;

import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Pattern.CharPredicate;
import java.util.regex.Pattern.BmpCharPredicate;

class CharPredicates {

    static final CharPredicate ALPHABETIC() {
        return Character::isAlphabetic;
    }

    // \p{gc=Decimal_Number}
    static final CharPredicate DIGIT() {
        return Character::isDigit;
    }

    static final CharPredicate LETTER() {
        return Character::isLetter;
    }

    static final CharPredicate IDEOGRAPHIC() {
        return Character::isIdeographic;
    }

    static final CharPredicate LOWERCASE() {
        return Character::isLowerCase;
    }

    static final CharPredicate UPPERCASE() {
        return Character::isUpperCase;
    }

    static final CharPredicate TITLECASE() {
        return Character::isTitleCase;
    }

    // \p{Whitespace}
    static final CharPredicate WHITE_SPACE() {
        return ch ->
            ((((1 << Character.SPACE_SEPARATOR) |
               (1 << Character.LINE_SEPARATOR) |
               (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
            != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
    }

    // \p{gc=Control}
    static final CharPredicate CONTROL() {
        return ch -> Character.getType(ch) == Character.CONTROL;
    }

    // \p{gc=Punctuation}
    static final CharPredicate PUNCTUATION() {
        return ch ->
            ((((1 << Character.CONNECTOR_PUNCTUATION) |
               (1 << Character.DASH_PUNCTUATION) |
               (1 << Character.START_PUNCTUATION) |
               (1 << Character.END_PUNCTUATION) |
               (1 << Character.OTHER_PUNCTUATION) |
               (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
               (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
            != 0;
    }

    // \p{gc=Decimal_Number}
    // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
    static final CharPredicate HEX_DIGIT() {
        return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) ||
                (ch >= 0x0041 && ch <= 0x0046) ||
                (ch >= 0x0061 && ch <= 0x0066) ||
                (ch >= 0xFF10 && ch <= 0xFF19) ||
                (ch >= 0xFF21 && ch <= 0xFF26) ||
                (ch >= 0xFF41 && ch <= 0xFF46));
    }

    static final CharPredicate ASSIGNED() {
        return ch -> Character.getType(ch) != Character.UNASSIGNED;
    }

    // PropList.txt:Noncharacter_Code_Point
    static final CharPredicate NONCHARACTER_CODE_POINT() {
        return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
    }

    // \p{alpha}
    // \p{digit}
    static final CharPredicate ALNUM() {
        return ALPHABETIC().union(DIGIT());
    }

    // \p{Whitespace} --
    // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
    //  \p{gc=Line_Separator}
    //  \p{gc=Paragraph_Separator}]
    static final CharPredicate BLANK() {
        return ch ->
            Character.getType(ch) == Character.SPACE_SEPARATOR ||
            ch == 0x9; // \N{HT}
    }

    // [^
    //  \p{space}
    //  \p{gc=Control}
    //  \p{gc=Surrogate}
    //  \p{gc=Unassigned}]
    static final CharPredicate GRAPH() {
        return ch ->
            ((((1 << Character.SPACE_SEPARATOR) |
               (1 << Character.LINE_SEPARATOR) |
               (1 << Character.PARAGRAPH_SEPARATOR) |
               (1 << Character.CONTROL) |
               (1 << Character.SURROGATE) |
               (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
            == 0;
    }

    // \p{graph}
    // \p{blank}
    // -- \p{cntrl}
    static final CharPredicate PRINT() {
        return GRAPH().union(BLANK()).and(CONTROL().negate());
    }

    //  200C..200D    PropList.txt:Join_Control
    static final CharPredicate JOIN_CONTROL() {
        return ch -> ch == 0x200C || ch == 0x200D;
    }

    //  \p{alpha}
    //  \p{gc=Mark}
    //  \p{digit}
    //  \p{gc=Connector_Punctuation}
    //  \p{Join_Control}    200C..200D
    static final CharPredicate WORD() {
        return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) |
                                  (1 << Character.ENCLOSING_MARK) |
                                  (1 << Character.COMBINING_SPACING_MARK) |
                                  (1 << Character.DECIMAL_DIGIT_NUMBER) |
                                  (1 << Character.CONNECTOR_PUNCTUATION))
                                 >> Character.getType(ch)) & 1) != 0,
                         JOIN_CONTROL());
    }

    /////////////////////////////////////////////////////////////////////////////

    private static CharPredicate getPosixPredicate(String name) {
        switch (name) {
            case "ALPHA": return ALPHABETIC();
            case "LOWER": return LOWERCASE();
            case "UPPER": return UPPERCASE();
            case "SPACE": return WHITE_SPACE();
            case "PUNCT": return PUNCTUATION();
            case "XDIGIT": return HEX_DIGIT();
            case "ALNUM": return ALNUM();
            case "CNTRL": return CONTROL();
            case "DIGIT": return DIGIT();
            case "BLANK": return BLANK();
            case "GRAPH": return GRAPH();
            case "PRINT": return PRINT();
            default: return null;
        }
    }

    private static CharPredicate getUnicodePredicate(String name) {
        switch (name) {
            case "ALPHABETIC": return ALPHABETIC();
            case "ASSIGNED": return ASSIGNED();
            case "CONTROL": return CONTROL();
            case "HEXDIGIT": return HEX_DIGIT();
            case "IDEOGRAPHIC": return IDEOGRAPHIC();
            case "JOINCONTROL": return JOIN_CONTROL();
            case "LETTER": return LETTER();
            case "LOWERCASE": return LOWERCASE();
            case "NONCHARACTERCODEPOINT": return NONCHARACTER_CODE_POINT();
            case "TITLECASE": return TITLECASE();
            case "PUNCTUATION": return PUNCTUATION();
            case "UPPERCASE": return UPPERCASE();
            case "WHITESPACE": return WHITE_SPACE();
            case "WORD": return WORD();
            case "WHITE_SPACE": return WHITE_SPACE();
            case "HEX_DIGIT": return HEX_DIGIT();
            case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT();
            case "JOIN_CONTROL": return JOIN_CONTROL();
            default: return null;
        }
    }

    public static CharPredicate forUnicodeProperty(String propName) {
        propName = propName.toUpperCase(Locale.ROOT);
        CharPredicate p = getUnicodePredicate(propName);
        if (p != null)
            return p;
        return getPosixPredicate(propName);
    }

    public static CharPredicate forPOSIXName(String propName) {
        return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH));
    }

    /////////////////////////////////////////////////////////////////////////////

    
Returns a predicate matching all characters belong to a named UnicodeScript.
/** * Returns a predicate matching all characters belong to a named * UnicodeScript. */
static CharPredicate forUnicodeScript(String name) { final Character.UnicodeScript script; try { script = Character.UnicodeScript.forName(name); return ch -> script == Character.UnicodeScript.of(ch); } catch (IllegalArgumentException iae) {} return null; }
Returns a predicate matching all characters in a UnicodeBlock.
/** * Returns a predicate matching all characters in a UnicodeBlock. */
static CharPredicate forUnicodeBlock(String name) { final Character.UnicodeBlock block; try { block = Character.UnicodeBlock.forName(name); return ch -> block == Character.UnicodeBlock.of(ch); } catch (IllegalArgumentException iae) {} return null; } ///////////////////////////////////////////////////////////////////////////// // unicode categories, aliases, properties, java methods ... static CharPredicate forProperty(String name) { // Unicode character property aliases, defined in // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt switch (name) { case "Cn": return category(1<<Character.UNASSIGNED); case "Lu": return category(1<<Character.UPPERCASE_LETTER); case "Ll": return category(1<<Character.LOWERCASE_LETTER); case "Lt": return category(1<<Character.TITLECASE_LETTER); case "Lm": return category(1<<Character.MODIFIER_LETTER); case "Lo": return category(1<<Character.OTHER_LETTER); case "Mn": return category(1<<Character.NON_SPACING_MARK); case "Me": return category(1<<Character.ENCLOSING_MARK); case "Mc": return category(1<<Character.COMBINING_SPACING_MARK); case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER); case "Nl": return category(1<<Character.LETTER_NUMBER); case "No": return category(1<<Character.OTHER_NUMBER); case "Zs": return category(1<<Character.SPACE_SEPARATOR); case "Zl": return category(1<<Character.LINE_SEPARATOR); case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR); case "Cc": return category(1<<Character.CONTROL); case "Cf": return category(1<<Character.FORMAT); case "Co": return category(1<<Character.PRIVATE_USE); case "Cs": return category(1<<Character.SURROGATE); case "Pd": return category(1<<Character.DASH_PUNCTUATION); case "Ps": return category(1<<Character.START_PUNCTUATION); case "Pe": return category(1<<Character.END_PUNCTUATION); case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION); case "Po": return category(1<<Character.OTHER_PUNCTUATION); case "Sm": return category(1<<Character.MATH_SYMBOL); case "Sc": return category(1<<Character.CURRENCY_SYMBOL); case "Sk": return category(1<<Character.MODIFIER_SYMBOL); case "So": return category(1<<Character.OTHER_SYMBOL); case "Pi": return category(1<<Character.INITIAL_QUOTE_PUNCTUATION); case "Pf": return category(1<<Character.FINAL_QUOTE_PUNCTUATION); case "L": return category(((1<<Character.UPPERCASE_LETTER) | (1<<Character.LOWERCASE_LETTER) | (1<<Character.TITLECASE_LETTER) | (1<<Character.MODIFIER_LETTER) | (1<<Character.OTHER_LETTER))); case "M": return category(((1<<Character.NON_SPACING_MARK) | (1<<Character.ENCLOSING_MARK) | (1<<Character.COMBINING_SPACING_MARK))); case "N": return category(((1<<Character.DECIMAL_DIGIT_NUMBER) | (1<<Character.LETTER_NUMBER) | (1<<Character.OTHER_NUMBER))); case "Z": return category(((1<<Character.SPACE_SEPARATOR) | (1<<Character.LINE_SEPARATOR) | (1<<Character.PARAGRAPH_SEPARATOR))); case "C": return category(((1<<Character.CONTROL) | (1<<Character.FORMAT) | (1<<Character.PRIVATE_USE) | (1<<Character.SURROGATE) | (1<<Character.UNASSIGNED))); // Other case "P": return category(((1<<Character.DASH_PUNCTUATION) | (1<<Character.START_PUNCTUATION) | (1<<Character.END_PUNCTUATION) | (1<<Character.CONNECTOR_PUNCTUATION) | (1<<Character.OTHER_PUNCTUATION) | (1<<Character.INITIAL_QUOTE_PUNCTUATION) | (1<<Character.FINAL_QUOTE_PUNCTUATION))); case "S": return category(((1<<Character.MATH_SYMBOL) | (1<<Character.CURRENCY_SYMBOL) | (1<<Character.MODIFIER_SYMBOL) | (1<<Character.OTHER_SYMBOL))); case "LC": return category(((1<<Character.UPPERCASE_LETTER) | (1<<Character.LOWERCASE_LETTER) | (1<<Character.TITLECASE_LETTER))); case "LD": return category(((1<<Character.UPPERCASE_LETTER) | (1<<Character.LOWERCASE_LETTER) | (1<<Character.TITLECASE_LETTER) | (1<<Character.MODIFIER_LETTER) | (1<<Character.OTHER_LETTER) | (1<<Character.DECIMAL_DIGIT_NUMBER))); case "L1": return range(0x00, 0xFF); // Latin-1 case "all": return Pattern.ALL(); // Posix regular expression character classes, defined in // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html case "ASCII": return range(0x00, 0x7F); // ASCII case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters case "Blank": return ctype(ASCII.BLANK); // Space and tab characters case "Cntrl": return ctype(ASCII.CNTRL); // Control characters case "Digit": return range('0', '9'); // Numeric characters case "Graph": return ctype(ASCII.GRAPH); // printable and visible case "Lower": return range('a', 'z'); // Lower-case alphabetic case "Print": return range(0x20, 0x7E); // Printable characters case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters case "Space": return ctype(ASCII.SPACE); // Space characters case "Upper": return range('A', 'Z'); // Upper-case alphabetic case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits // Java character properties, defined by methods in Character.java case "javaLowerCase": return java.lang.Character::isLowerCase; case "javaUpperCase": return Character::isUpperCase; case "javaAlphabetic": return java.lang.Character::isAlphabetic; case "javaIdeographic": return java.lang.Character::isIdeographic; case "javaTitleCase": return java.lang.Character::isTitleCase; case "javaDigit": return java.lang.Character::isDigit; case "javaDefined": return java.lang.Character::isDefined; case "javaLetter": return java.lang.Character::isLetter; case "javaLetterOrDigit": return java.lang.Character::isLetterOrDigit; case "javaJavaIdentifierStart": return java.lang.Character::isJavaIdentifierStart; case "javaJavaIdentifierPart": return java.lang.Character::isJavaIdentifierPart; case "javaUnicodeIdentifierStart": return java.lang.Character::isUnicodeIdentifierStart; case "javaUnicodeIdentifierPart": return java.lang.Character::isUnicodeIdentifierPart; case "javaIdentifierIgnorable": return java.lang.Character::isIdentifierIgnorable; case "javaSpaceChar": return java.lang.Character::isSpaceChar; case "javaWhitespace": return java.lang.Character::isWhitespace; case "javaISOControl": return java.lang.Character::isISOControl; case "javaMirrored": return java.lang.Character::isMirrored; default: return null; } } private static CharPredicate category(final int typeMask) { return ch -> (typeMask & (1 << Character.getType(ch))) != 0; } private static CharPredicate range(final int lower, final int upper) { return (BmpCharPredicate)ch -> lower <= ch && ch <= upper; } private static CharPredicate ctype(final int ctype) { return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype); } /////////////////////////////////////////////////////////////////////////////
Posix ASCII variants, not in the lookup map
/** * Posix ASCII variants, not in the lookup map */
static final BmpCharPredicate ASCII_DIGIT() { return ch -> ch < 128 && ASCII.isDigit(ch); } static final BmpCharPredicate ASCII_WORD() { return ch -> ch < 128 && ASCII.isWord(ch); } static final BmpCharPredicate ASCII_SPACE() { return ch -> ch < 128 && ASCII.isSpace(ch); } }