/*
* Copyright (C) 2008 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.common.base;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkPositionIndex;
import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.annotations.VisibleForTesting;
import java.util.Arrays;
import java.util.BitSet;
Determines a true or false value for any Java char
value, just as Predicate
does for any Object
. Also offers basic text processing methods based on this function. Implementations are strongly encouraged to be side-effect-free and immutable. Throughout the documentation of this class, the phrase "matching character" is used to mean "any char
value c
for which this.matches(c)
returns true
".
Warning: This class deals only with char
values, that is, BMP characters. It does not understand
supplementary Unicode code
points in the range 0x10000
to 0x10FFFF
which includes the majority of assigned characters, including important CJK characters and emoji.
Supplementary characters are encoded into a String
using surrogate pairs, and a CharMatcher
treats these just as two separate characters. countIn
counts each supplementary character as 2 char
s.
For up-to-date Unicode character properties (digit, letter, etc.) and support for
supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For
basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner.
Example usages:
String trimmed = whitespace()
.trimFrom
(userInput); if (ascii()
.matchesAllOf
(s)) { ... }
See the Guava User Guide article on CharMatcher
.
Author: Kevin Bourrillion Since: 1.0
/**
* Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
* for any {@link Object}. Also offers basic text processing methods based on this function.
* Implementations are strongly encouraged to be side-effect-free and immutable.
*
* <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
* "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}".
*
* <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a
* href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand
* <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code
* points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of
* assigned characters, including important CJK characters and emoji.
*
* <p>Supplementary characters are <a
* href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded
* into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as
* two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s.
*
* <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for
* supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For
* basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner.
*
* <p>Example usages:
*
* <pre>
* String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
* if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
*
* <p>See the Guava User Guide article on <a
* href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher}
* </a>.
*
* @author Kevin Bourrillion
* @since 1.0
*/
@GwtCompatible(emulated = true)
public abstract class CharMatcher implements Predicate<Character> {
/*
* N777777777NO
* N7777777777777N
* M777777777777777N
* $N877777777D77777M
* N M77777777ONND777M
* MN777777777NN D777
* N7ZN777777777NN ~M7778
* N777777777777MMNN88777N
* N777777777777MNZZZ7777O
* DZN7777O77777777777777
* N7OONND7777777D77777N
* 8$M++++?N???$77777$
* M7++++N+M77777777N
* N77O777777777777$ M
* DNNM$$$$777777N D
* N$N:=N$777N7777M NZ
* 77Z::::N777777777 ODZZZ
* 77N::::::N77777777M NNZZZ$
* $777:::::::77777777MN ZM8ZZZZZ
* 777M::::::Z7777777Z77 N++ZZZZNN
* 7777M:::::M7777777$777M $++IZZZZM
* M777$:::::N777777$M7777M +++++ZZZDN
* NN$::::::7777$$M777777N N+++ZZZZNZ
* N::::::N:7$O:77777777 N++++ZZZZN
* M::::::::::::N77777777+ +?+++++ZZZM
* 8::::::::::::D77777777M O+++++ZZ
* ::::::::::::M777777777N O+?D
* M:::::::::::M77777777778 77=
* D=::::::::::N7777777777N 777
* INN===::::::=77777777777N I777N
* ?777N========N7777777777787M N7777
* 77777$D======N77777777777N777N? N777777
* I77777$$$N7===M$$77777777$77777777$MMZ77777777N
* $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON
* M$$$$$$$$M M$$$$$$$$N=N$$$$7777777$$$ND
* O77Z$$$$$$$ M$$$$$$$$MNI==$DNNNNM=~N
* 7 :N MNN$$$$M$ $$$777$8 8D8I
* NMM.:7O 777777778
* 7777777MN
* M NO .7:
* M : M
* 8
*/
// Constant matcher factory methods
Matches any character.
Since: 19.0 (since 1.0 as constant ANY
)
/**
* Matches any character.
*
* @return the matcher held in {@code Any.INSTANCE}
* @since 19.0 (since 1.0 as constant {@code ANY})
*/
public static CharMatcher any() {
return Any.INSTANCE;
}
Matches no characters.
Since: 19.0 (since 1.0 as constant NONE
)
/**
* Matches no characters.
*
* @return the matcher held in {@code None.INSTANCE}
* @since 19.0 (since 1.0 as constant {@code NONE})
*/
public static CharMatcher none() {
return None.INSTANCE;
}
Determines whether a character is whitespace according to the latest Unicode standard, as
illustrated here.
This is not the same definition used by other Java APIs. (See a comparison of several definitions of "whitespace".)
All Unicode White_Space characters are on the BMP and thus supported by this API.
Note: as the Unicode definition evolves, we will modify this matcher to keep it up to
date.
Since: 19.0 (since 1.0 as constant WHITESPACE
)
/**
* Determines whether a character is whitespace according to the latest Unicode standard, as
* illustrated <a
* href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
* This is not the same definition used by other Java APIs. (See a <a
* href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.)
*
* <p>All Unicode White_Space characters are on the BMP and thus supported by this API.
*
* <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to
* date.
*
* @return the matcher held in {@code Whitespace.INSTANCE}
* @since 19.0 (since 1.0 as constant {@code WHITESPACE})
*/
public static CharMatcher whitespace() {
return Whitespace.INSTANCE;
}
Determines whether a character is a breaking whitespace (that is, a whitespace which can be interpreted as a break between words for formatting purposes). See whitespace()
for a discussion of that term. Since: 19.0 (since 2.0 as constant BREAKING_WHITESPACE
)
/**
* Determines whether a character is a breaking whitespace (that is, a whitespace which can be
* interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a
* discussion of that term.
*
* @return the matcher held in {@code BreakingWhitespace.INSTANCE}
* @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE})
*/
public static CharMatcher breakingWhitespace() {
return BreakingWhitespace.INSTANCE;
}
Determines whether a character is ASCII, meaning that its code point is less than 128.
Since: 19.0 (since 1.0 as constant ASCII
)
/**
* Determines whether a character is ASCII, meaning that its code point is less than 128.
*
* @return the matcher held in {@code Ascii.INSTANCE}
* @since 19.0 (since 1.0 as constant {@code ASCII})
*/
public static CharMatcher ascii() {
return Ascii.INSTANCE;
}
Determines whether a character is a BMP digit according to Unicode. If you only care to match ASCII digits, you can use inRange('0', '9')
. Deprecated: Many digits are supplementary characters; see the class documentation. Since: 19.0 (since 1.0 as constant DIGIT
)
/**
* Determines whether a character is a BMP digit according to <a
* href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If
* you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
*
* @return the matcher held in {@code Digit.INSTANCE}
* @deprecated Many digits are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code DIGIT})
*/
@Deprecated
public static CharMatcher digit() {
return Digit.INSTANCE;
}
Determines whether a character is a BMP digit according to
Java's definition. If you only care to match ASCII digits, you can use inRange('0',
'9')
. Deprecated: Many digits are supplementary characters; see the class documentation. Since: 19.0 (since 1.0 as constant JAVA_DIGIT
)
/**
* Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char)
* Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0',
* '9')}.
*
* @return the matcher held in {@code JavaDigit.INSTANCE}
* @deprecated Many digits are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT})
*/
@Deprecated
public static CharMatcher javaDigit() {
return JavaDigit.INSTANCE;
}
Determines whether a character is a BMP letter according to Java's definition. If you only care to match letters of the Latin alphabet, you can use inRange('a', 'z').or(inRange('A', 'Z'))
. Deprecated: Most letters are supplementary characters; see the class documentation. Since: 19.0 (since 1.0 as constant JAVA_LETTER
)
/**
* Determines whether a character is a BMP letter according to {@linkplain
* Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin
* alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
*
* @return the matcher held in {@code JavaLetter.INSTANCE}
* @deprecated Most letters are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_LETTER})
*/
@Deprecated
public static CharMatcher javaLetter() {
return JavaLetter.INSTANCE;
}
Determines whether a character is a BMP letter or digit according to Java's definition. Deprecated: Most letters and digits are supplementary characters; see the class documentation. Since: 19.0 (since 1.0 as constant JAVA_LETTER_OR_DIGIT
)
/**
* Determines whether a character is a BMP letter or digit according to {@linkplain
* Character#isLetterOrDigit(char) Java's definition}.
*
* @return the matcher held in {@code JavaLetterOrDigit.INSTANCE}
* @deprecated Most letters and digits are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT})
*/
@Deprecated
public static CharMatcher javaLetterOrDigit() {
return JavaLetterOrDigit.INSTANCE;
}
Determines whether a BMP character is upper case according to Java's definition. Deprecated: Some uppercase characters are supplementary characters; see the class
documentation. Since: 19.0 (since 1.0 as constant JAVA_UPPER_CASE
)
/**
* Determines whether a BMP character is upper case according to {@linkplain
* Character#isUpperCase(char) Java's definition}.
*
* @return the matcher held in {@code JavaUpperCase.INSTANCE}
* @deprecated Some uppercase characters are supplementary characters; see the class
* documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE})
*/
@Deprecated
public static CharMatcher javaUpperCase() {
return JavaUpperCase.INSTANCE;
}
Determines whether a BMP character is lower case according to Java's definition. Deprecated: Some lowercase characters are supplementary characters; see the class
documentation. Since: 19.0 (since 1.0 as constant JAVA_LOWER_CASE
)
/**
* Determines whether a BMP character is lower case according to {@linkplain
* Character#isLowerCase(char) Java's definition}.
*
* @return the matcher held in {@code JavaLowerCase.INSTANCE}
* @deprecated Some lowercase characters are supplementary characters; see the class
* documentation.
* @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE})
*/
@Deprecated
public static CharMatcher javaLowerCase() {
return JavaLowerCase.INSTANCE;
}
Determines whether a character is an ISO control character as specified by Character.isISOControl(char)
. All ISO control codes are on the BMP and thus supported by this API.
Since: 19.0 (since 1.0 as constant JAVA_ISO_CONTROL
)
/**
* Determines whether a character is an ISO control character as specified by {@link
* Character#isISOControl(char)}.
*
* <p>All ISO control codes are on the BMP and thus supported by this API.
*
* @return the matcher held in {@code JavaIsoControl.INSTANCE}
* @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL})
*/
public static CharMatcher javaIsoControl() {
return JavaIsoControl.INSTANCE;
}
Determines whether a character is invisible; that is, if its Unicode category is any of
SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
PRIVATE_USE according to ICU4J.
See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
Deprecated: Most invisible characters are supplementary characters; see the class
documentation. Since: 19.0 (since 1.0 as constant INVISIBLE
)
/**
* Determines whether a character is invisible; that is, if its Unicode category is any of
* SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
* PRIVATE_USE according to ICU4J.
*
* <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
*
* @return the matcher held in {@code Invisible.INSTANCE}
* @deprecated Most invisible characters are supplementary characters; see the class
* documentation.
* @since 19.0 (since 1.0 as constant {@code INVISIBLE})
*/
@Deprecated
public static CharMatcher invisible() {
return Invisible.INSTANCE;
}
Determines whether a character is single-width (not double-width). When in doubt, this matcher errs on the side of returning false
(that is, it tends to assume a character is double-width). Note: as the reference file evolves, we will modify this matcher to keep it up to
date.
See also UAX #11 East Asian Width.
Deprecated: Many such characters are supplementary characters; see the class documentation. Since: 19.0 (since 1.0 as constant SINGLE_WIDTH
)
/**
* Determines whether a character is single-width (not double-width). When in doubt, this matcher
* errs on the side of returning {@code false} (that is, it tends to assume a character is
* double-width).
*
* <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to
* date.
*
* <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>.
*
* @return the matcher held in {@code SingleWidth.INSTANCE}
* @deprecated Many such characters are supplementary characters; see the class documentation.
* @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH})
*/
@Deprecated
public static CharMatcher singleWidth() {
return SingleWidth.INSTANCE;
}
// Legacy constants
Determines whether a character is whitespace according to the latest Unicode
standard, as illustrated
here.
This is not the same definition used by other Java APIs. (See a
comparison of several definitions of
"whitespace".)
Note: as the Unicode definition evolves, we will modify this constant
to keep it up to date.
Deprecated: Use whitespace()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is whitespace according to the latest Unicode
* standard, as illustrated <a
* href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
* This is not the same definition used by other Java APIs. (See a
* <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
* "whitespace"</a>.)
*
* <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant
* to keep it up to date.
*
* @deprecated Use {@link #whitespace()} instead. This constant is scheduled to be
* removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
public static final CharMatcher WHITESPACE = whitespace();
Determines whether a character is a breaking whitespace (that is, a whitespace which can be interpreted as a break between words for formatting purposes). See whitespace
for a discussion of that term. Since: 2.0 Deprecated: Use breakingWhitespace()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is a breaking whitespace (that is, a whitespace
* which can be interpreted as a break between words for formatting purposes). See
* {@link #whitespace} for a discussion of that term.
*
* @since 2.0
* @deprecated Use {@link #breakingWhitespace()} instead. This constant is scheduled
* to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever breakingWhitespace() returns.
public static final CharMatcher BREAKING_WHITESPACE = breakingWhitespace();
Determines whether a character is ASCII, meaning that its code point is less than
128.
Deprecated: Use ascii()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is ASCII, meaning that its code point is less than
* 128.
*
* @deprecated Use {@link #ascii()} instead. This constant is scheduled to be
* removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever ascii() returns.
public static final CharMatcher ASCII = ascii();
Determines whether a character is a digit according to
Unicode. If you only care to match ASCII digits, you can use inRange('0', '9')
. Deprecated: Many digits are supplementary characters; see the class documentation. If you need to use this, use digit()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is a digit according to
* <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">
* Unicode</a>. If you only care to match ASCII digits, you can use
* {@code inRange('0', '9')}.
*
* @deprecated Many digits are supplementary characters; see the class
* documentation. If you need to use this, use {@link #digit()} instead. This
* constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
public static final CharMatcher DIGIT = digit();
Determines whether a character is a digit according to Java's definition. If you only care to match ASCII digits, you can use inRange('0', '9')
. Deprecated: Many digits are supplementary characters; see the class documentation. If you need to use this, use javaDigit()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is a digit according to
* {@linkplain Character#isDigit(char) Java's definition}. If you only care to match
* ASCII digits, you can use {@code inRange('0', '9')}.
*
* @deprecated Many digits are supplementary characters; see the class
* documentation. If you need to use this, use {@link #javaDigit()} instead.
* This constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever javaDigit() returns.
public static final CharMatcher JAVA_DIGIT = javaDigit();
Determines whether a character is a letter according to Java's definition. If you only care to match letters of the Latin alphabet, you can use inRange('a', 'z').or(inRange('A', 'Z'))
. Deprecated: Most letters are supplementary characters; see the class documentation. If you need to use this, use javaLetter()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is a letter according to
* {@linkplain Character#isLetter(char) Java's definition}. If you only care to
* match letters of the Latin alphabet, you can use
* {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
*
* @deprecated Most letters are supplementary characters; see the class
* documentation. If you need to use this, use {@link #javaLetter()} instead.
* This constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever javaLetter() returns.
public static final CharMatcher JAVA_LETTER = javaLetter();
Determines whether a character is a letter or digit according to Java's definition. Deprecated: Most letters and digits are supplementary characters; see the class documentation. If you need to use this, use javaLetterOrDigit()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is a letter or digit according to
* {@linkplain Character#isLetterOrDigit(char) Java's definition}.
*
* @deprecated Most letters and digits are supplementary characters; see the class
* documentation. If you need to use this, use {@link #javaLetterOrDigit()}
* instead. This constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever javaLetterOrDigit() returns.
public static final CharMatcher JAVA_LETTER_OR_DIGIT = javaLetterOrDigit();
Determines whether a character is upper case according to Java's definition. Deprecated: Some uppercase letters are supplementary characters; see the class documentation. If you need to use this, use javaUpperCase()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is upper case according to
* {@linkplain Character#isUpperCase(char) Java's definition}.
*
* @deprecated Some uppercase letters are supplementary characters; see the class
* documentation. If you need to use this, use {@link #javaUpperCase()} instead.
* This constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever javaUpperCase() returns.
public static final CharMatcher JAVA_UPPER_CASE = javaUpperCase();
Determines whether a character is lower case according to Java's definition. Deprecated: Some lowercase letters are supplementary characters; see the class documentation. If you need to use this, use javaLowerCase()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is lower case according to
* {@linkplain Character#isLowerCase(char) Java's definition}.
*
* @deprecated Some lowercase letters are supplementary characters; see the class
* documentation. If you need to use this, use {@link #javaLowerCase()} instead.
* This constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever javaLowerCase() returns.
public static final CharMatcher JAVA_LOWER_CASE = javaLowerCase();
Determines whether a character is an ISO control character as specified by Character.isISOControl(char)
. Deprecated: Use javaIsoControl()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is an ISO control character as specified by
* {@link Character#isISOControl(char)}.
*
* @deprecated Use {@link #javaIsoControl()} instead. This constant is scheduled to
* be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever javaIsoControl() returns.
public static final CharMatcher JAVA_ISO_CONTROL = javaIsoControl();
Determines whether a character is invisible; that is, if its Unicode category is
any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT,
SURROGATE, and PRIVATE_USE according to ICU4J.
Deprecated: Most invisible characters are supplementary characters; see the class documentation. If you need to use this, use invisible()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is invisible; that is, if its Unicode category is
* any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT,
* SURROGATE, and PRIVATE_USE according to ICU4J.
*
* @deprecated Most invisible characters are supplementary characters; see the class
* documentation. If you need to use this, use {@link #invisible()} instead.
* This constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever invisible() returns.
public static final CharMatcher INVISIBLE = invisible();
Determines whether a character is single-width (not double-width). When in doubt, this matcher errs on the side of returning false
(that is, it tends to assume a character is double-width). Note: as the reference file evolves, we will modify this constant to
keep it up to date.
Deprecated: Many such characters are supplementary characters; see the class documentation. If you need to use this, use singleWidth()
instead. This constant is scheduled to be removed in June 2018.
/**
* Determines whether a character is single-width (not double-width). When in doubt,
* this matcher errs on the side of returning {@code false} (that is, it tends to
* assume a character is double-width).
*
* <p><b>Note:</b> as the reference file evolves, we will modify this constant to
* keep it up to date.
*
* @deprecated Many such characters are supplementary characters; see the class
* documentation. If you need to use this, use {@link #singleWidth()} instead.
* This constant is scheduled to be removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever singleWidth() returns.
public static final CharMatcher SINGLE_WIDTH = singleWidth();
Matches any character.
Deprecated: Use any()
instead. This constant is scheduled to be removed in June 2018.
/**
* Matches any character.
*
* @deprecated Use {@link #any()} instead. This constant is scheduled to be
* removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever any() returns.
public static final CharMatcher ANY = any();
Matches no characters.
Deprecated: Use none()
instead. This constant is scheduled to be removed in June 2018.
/**
* Matches no characters.
*
* @deprecated Use {@link #none()} instead. This constant is scheduled to be
* removed in June 2018.
*/
@com.google.common.annotations.Beta
@Deprecated
// Assigned once during class initialization; holds whatever none() returns.
public static final CharMatcher NONE = none();
// Static factories
Returns a char
matcher that matches only one specified BMP character. /** Returns a {@code char} matcher that matches only one specified BMP character. */
public static CharMatcher is(final char match) {
// Allocates a fresh Is for the given char on every call; nothing is cached here.
return new Is(match);
}
Returns a char
matcher that matches any character except the BMP character specified. To negate another CharMatcher
, use negate()
.
/**
* Returns a {@code char} matcher that matches any character except the BMP character specified.
*
* <p>To negate another {@code CharMatcher}, use {@link #negate()}.
*
* @param match the single character the returned matcher must not match
* @return a newly created {@code IsNot} for {@code match}
*/
public static CharMatcher isNot(final char match) {
return new IsNot(match);
}
Returns a char
matcher that matches any BMP character present in the given character sequence. Returns a bogus matcher if the sequence contains supplementary characters. /**
* Returns a {@code char} matcher that matches any BMP character present in the given character
* sequence. Returns a bogus matcher if the sequence contains supplementary characters.
*/
public static CharMatcher anyOf(final CharSequence sequence) {
  // Sequences of up to two chars are routed to the dedicated factories; anything else gets
  // the general AnyOf matcher.
  final int length = sequence.length();
  if (length == 0) {
    return none();
  }
  if (length == 1) {
    return is(sequence.charAt(0));
  }
  if (length == 2) {
    return isEither(sequence.charAt(0), sequence.charAt(1));
  }
  // TODO(lowasser): is it potentially worth just going ahead and building a precomputed
  // matcher?
  return new AnyOf(sequence);
}
Returns a char
matcher that matches any BMP character not present in the given character sequence. Returns a bogus matcher if the sequence contains supplementary characters. /**
* Returns a {@code char} matcher that matches any BMP character not present in the given
* character sequence. Returns a bogus matcher if the sequence contains supplementary characters.
*/
public static CharMatcher noneOf(CharSequence sequence) {
// Defined as the negation of anyOf(sequence), so anyOf's length-based dispatch applies here too.
return anyOf(sequence).negate();
}
Returns a char
matcher that matches any character in a given BMP range (both endpoints are inclusive). For example, to match any lowercase letter of the English alphabet, use
CharMatcher.inRange('a', 'z')
. Throws: - IllegalArgumentException – if
endInclusive < startInclusive
/**
* Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints
* are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
* CharMatcher.inRange('a', 'z')}.
*
* @param startInclusive the first character of the range
* @param endInclusive the last character of the range
* @return a newly created {@code InRange} for the two endpoints
* @throws IllegalArgumentException if {@code endInclusive < startInclusive}
*/
public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
// NOTE(review): no check is made here; the documented IllegalArgumentException presumably
// comes from the InRange constructor — confirm.
return new InRange(startInclusive, endInclusive);
}
Returns a matcher with identical behavior to the given Character
-based predicate, but which operates on primitive char
instances instead. /**
* Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
* which operates on primitive {@code char} instances instead.
*/
public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
  // A predicate that already is a CharMatcher is handed back as-is instead of being wrapped.
  if (predicate instanceof CharMatcher) {
    return (CharMatcher) predicate;
  }
  return new ForPredicate(predicate);
}
// Constructors
Constructor for use by subclasses. When subclassing, you may want to override
toString()
to provide a useful description. /**
* Constructor for use by subclasses. When subclassing, you may want to override {@code
* toString()} to provide a useful description.
*/
// Empty body: nothing is initialized here. Protected, so it is callable from subclasses.
protected CharMatcher() {}
// Abstract methods
Determines a true or false value for the given character. /** Determines a true or false value for the given character. */
// Abstract: each concrete matcher supplies its own per-character decision.
public abstract boolean matches(char c);
// Non-static factories
Returns a matcher that matches any character not matched by this matcher. /** Returns a matcher that matches any character not matched by this matcher. */
// @Override under Java 8 but not under Java 7
public CharMatcher negate() {
// Allocates a new Negated wrapper around this matcher on every call.
return new Negated(this);
}
Returns a matcher that matches any character matched by both this matcher and other
. /**
* Returns a matcher that matches any character matched by both this matcher and {@code other}.
*/
public CharMatcher and(CharMatcher other) {
// Allocates a new And with this matcher as the first operand and other as the second.
return new And(this, other);
}
Returns a matcher that matches any character matched by either this matcher or other
. /**
* Returns a matcher that matches any character matched by either this matcher or {@code other}.
*/
public CharMatcher or(CharMatcher other) {
// Allocates a new Or with this matcher as the first operand and other as the second.
return new Or(this, other);
}
Returns a char
matcher functionally equivalent to this one, but which may be faster to query than the original; your mileage may vary. Precomputation takes time and is likely to be worthwhile only if the precomputed matcher is queried many thousands of times. This method has no effect (returns this
) when called in GWT: it's unclear whether a precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a worthwhile tradeoff in a browser.
/**
* Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
* query than the original; your mileage may vary. Precomputation takes time and is likely to be
* worthwhile only if the precomputed matcher is queried many thousands of times.
*
* <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
* precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
* worthwhile tradeoff in a browser.
*
* @return whatever {@code Platform.precomputeCharMatcher(this)} produces for this matcher
*/
public CharMatcher precomputed() {
// Routed through Platform so the behavior can differ per platform (see the GWT note above).
return Platform.precomputeCharMatcher(this);
}
// Count of distinct char values: every value from Character.MIN_VALUE through
// Character.MAX_VALUE, inclusive (hence the + 1).
private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
This is the actual implementation of precomputed
, but we bounce calls through a method on Platform
so that we can have different behavior in GWT. This implementation tries to be smart in a number of ways. It recognizes cases where the
negation is cheaper to precompute than the matcher itself; it tries to build small hash tables
for matchers that only match a few characters, and so on. In the worst-case scenario, it
constructs an eight-kilobyte bit array and queries that. In many situations this produces a
matcher which is faster to query than the original.
/**
 * The real implementation behind {@link #precomputed}. Calls are bounced through a method on
 * {@link Platform} so that GWT can behave differently.
 *
 * <p>This implementation tries to be smart in a number of ways. Every matching character is
 * first recorded in a {@link BitSet}. If at most half of all {@code char} values match, that
 * table is precomputed directly; otherwise the negation is cheaper, so the table is flipped, the
 * complement is precomputed, and the result is wrapped in a {@code NegatedFastMatcher} that
 * still reports this matcher's own {@code toString()}. Matchers that only match a few characters
 * may end up as small hash tables; in the worst-case scenario an eight-kilobyte bit array is
 * constructed and queried. In many situations this produces a matcher which is faster to query
 * than the original.
 */
@GwtIncompatible // SmallCharMatcher
CharMatcher precomputedInternal() {
  final BitSet matched = new BitSet();
  setBits(matched);
  final int matchCount = matched.cardinality();
  if (matchCount * 2 <= DISTINCT_CHARS) {
    return precomputedPositive(matchCount, matched, toString());
  }

  // More than half of all chars match: precompute the complement and negate it.
  // TODO(lowasser): is it worth it to worry about the last character of large matchers?
  matched.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
  final int complementCount = DISTINCT_CHARS - matchCount;
  final String negationSuffix = ".negate()";
  final String description = toString();

  // The complement's description toggles a trailing ".negate()" on this matcher's description.
  final String complementDescription;
  if (description.endsWith(negationSuffix)) {
    complementDescription =
        description.substring(0, description.length() - negationSuffix.length());
  } else {
    complementDescription = description + negationSuffix;
  }

  final CharMatcher complement =
      precomputedPositive(complementCount, matched, complementDescription);
  return new NegatedFastMatcher(complement) {
    @Override
    public String toString() {
      return description;
    }
  };
}
Helper method for precomputedInternal
that doesn't test if the negation is cheaper. /**
* Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
*/
@GwtIncompatible // SmallCharMatcher
private static CharMatcher precomputedPositive(
    int totalCharacters, BitSet table, String description) {
  // Zero, one and two matching characters each have a dedicated matcher.
  if (totalCharacters == 0) {
    return none();
  }
  if (totalCharacters == 1) {
    return is((char) table.nextSetBit(0));
  }
  if (totalCharacters == 2) {
    char first = (char) table.nextSetBit(0);
    char second = (char) table.nextSetBit(first + 1);
    return isEither(first, second);
  }
  // Anything larger is backed either by a SmallCharMatcher or by the bit set itself.
  if (isSmall(totalCharacters, table.length())) {
    return SmallCharMatcher.from(table, description);
  }
  return new BitSetMatcher(table, description);
}
/**
 * Decides whether a matcher with {@code totalCharacters} matching characters, whose bit table has
 * logical length {@code tableLength}, should be backed by a {@code SmallCharMatcher} rather than a
 * {@code BitSetMatcher}.
 */
@GwtIncompatible // SmallCharMatcher
private static boolean isSmall(int totalCharacters, int tableLength) {
  if (totalCharacters > SmallCharMatcher.MAX_SIZE) {
    return false;
  }
  // err on the side of BitSetMatcher
  int threshold = totalCharacters * 4 * Character.SIZE;
  return tableLength > threshold;
}
Sets bits in table
matched by this matcher. /** Sets bits in {@code table} matched by this matcher. */
@GwtIncompatible // used only from other GwtIncompatible code
void setBits(BitSet table) {
  // Probe every char value, from the highest down to the lowest, and record the matches.
  int code = Character.MAX_VALUE;
  while (code >= Character.MIN_VALUE) {
    if (matches((char) code)) {
      table.set(code);
    }
    code--;
  }
}
// Text processing routines
Returns true if a character sequence contains at least one matching BMP character. Equivalent to !matchesNoneOf(sequence). The default implementation iterates over the sequence, invoking matches for each character, until this returns true or the end is reached.
Params: sequence – the character sequence to examine, possibly empty
Returns: true if this matcher matches at least one character in the sequence
Since: 8.0
/**
 * Returns {@code true} if a character sequence contains at least one matching BMP character.
 * Equivalent to {@code !matchesNoneOf(sequence)}.
 *
 * <p>The default implementation is exactly that negation: it delegates to {@link #matchesNoneOf}
 * and inverts the result.
 *
 * @param sequence the character sequence to examine, possibly empty
 * @return {@code true} if this matcher matches at least one character in the sequence
 * @since 8.0
 */
public boolean matchesAnyOf(CharSequence sequence) {
return !matchesNoneOf(sequence);
}
Returns true if a character sequence contains only matching BMP characters. The default implementation iterates over the sequence, invoking matches for each character, until this returns false or the end is reached.
Params: sequence – the character sequence to examine, possibly empty
Returns: true if this matcher matches every character in the sequence, including when the sequence is empty
/**
 * Returns {@code true} if a character sequence contains only matching BMP characters.
 *
 * <p>The default implementation walks the sequence from its last character to its first, invoking
 * {@link #matches} on each one, and stops at the first character that does not match.
 *
 * @param sequence the character sequence to examine, possibly empty
 * @return {@code true} if this matcher matches every character in the sequence, including when
 *     the sequence is empty
 */
public boolean matchesAllOf(CharSequence sequence) {
  int index = sequence.length() - 1;
  while (index >= 0) {
    if (!matches(sequence.charAt(index))) {
      return false;
    }
    index--;
  }
  return true;
}
Returns true if a character sequence contains no matching BMP characters. Equivalent to !matchesAnyOf(sequence). The default implementation iterates over the sequence, invoking matches for each character, until this returns true or the end is reached.
Params: sequence – the character sequence to examine, possibly empty
Returns: true if this matcher matches no characters in the sequence, including when the sequence is empty
/**
 * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to
 * {@code !matchesAnyOf(sequence)}.
 *
 * <p>The default implementation delegates to {@link #indexIn(CharSequence)} and reports whether
 * that search came back with {@code -1}.
 *
 * @param sequence the character sequence to examine, possibly empty
 * @return {@code true} if this matcher matches no characters in the sequence, including when the
 *     sequence is empty
 */
public boolean matchesNoneOf(CharSequence sequence) {
return indexIn(sequence) == -1;
}
Returns the index of the first matching BMP character in a character sequence, or -1 if no matching character is present. The default implementation iterates over the sequence in forward order calling matches for each character.
Params: sequence – the character sequence to examine from the beginning
Returns: an index, or -1 if no character matches
/**
 * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if
 * no matching character is present.
 *
 * <p>The default implementation delegates to {@link #indexIn(CharSequence, int)} with a start
 * index of {@code 0}.
 *
 * @param sequence the character sequence to examine from the beginning
 * @return an index, or {@code -1} if no character matches
 */
public int indexIn(CharSequence sequence) {
return indexIn(sequence, 0);
}
Returns the index of the first matching BMP character in a character sequence, starting from a given position, or -1 if no character matches after that position. The default implementation iterates over the sequence in forward order, beginning at start, calling matches for each character.
Params: sequence – the character sequence to examine
start – the first index to examine; must be nonnegative and no greater than sequence.length()
Throws: IndexOutOfBoundsException – if start is negative or greater than sequence.length()
Returns: the index of the first matching character, guaranteed to be no less than start, or -1 if no character matches
/**
 * Returns the index of the first matching BMP character in a character sequence, starting from a
 * given position, or {@code -1} if no character matches after that position.
 *
 * <p>The default implementation validates {@code start} and then scans forward from it, calling
 * {@link #matches} on each character until one matches or the end of the sequence is reached.
 *
 * @param sequence the character sequence to examine
 * @param start the first index to examine; must be nonnegative and no greater than {@code
 *     sequence.length()}
 * @return the index of the first matching character, guaranteed to be no less than {@code start},
 *     or {@code -1} if no character matches
 * @throws IndexOutOfBoundsException if start is negative or greater than {@code
 *     sequence.length()}
 */
public int indexIn(CharSequence sequence, int start) {
  int end = sequence.length();
  checkPositionIndex(start, end);
  int index = start;
  while (index < end) {
    if (matches(sequence.charAt(index))) {
      return index;
    }
    index++;
  }
  return -1;
}
Returns the index of the last matching BMP character in a character sequence, or -1 if no matching character is present. The default implementation iterates over the sequence in reverse order calling matches for each character.
Params: sequence – the character sequence to examine from the end
Returns: an index, or -1 if no character matches
/**
 * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if
 * no matching character is present.
 *
 * <p>The default implementation scans backwards from the final character, calling {@link
 * #matches} on each one, and returns as soon as a character matches.
 *
 * @param sequence the character sequence to examine from the end
 * @return an index, or {@code -1} if no character matches
 */
public int lastIndexIn(CharSequence sequence) {
  int index = sequence.length() - 1;
  while (index >= 0) {
    if (matches(sequence.charAt(index))) {
      return index;
    }
    index--;
  }
  return -1;
}
Returns the number of matching chars found in a character sequence. Counts 2 per supplementary character, such as for whitespace().negate().
/**
 * Returns the number of matching {@code char}s found in a character sequence.
 *
 * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}().
 *
 * @param sequence the character sequence to examine
 * @return how many {@code char} positions of {@code sequence} hold a matching character
 */
public int countIn(CharSequence sequence) {
  int matched = 0;
  int index = 0;
  // The length is re-read on every pass, so the bound is always the sequence's current length.
  while (index < sequence.length()) {
    if (matches(sequence.charAt(index))) {
      matched++;
    }
    index++;
  }
  return matched;
}
Returns a string containing all non-matching characters of a character sequence, in order. For
example:
CharMatcher.is('a').removeFrom("bazaar")
... returns "bzr"
. /**
* Returns a string containing all non-matching characters of a character sequence, in order. For
* example:
*
* <pre>{@code
* CharMatcher.is('a').removeFrom("bazaar")
* }</pre>
*
* ... returns {@code "bzr"}.
*/
public String removeFrom(CharSequence sequence) {
String string = sequence.toString();
int pos = indexIn(string);
if (pos == -1) {
// Nothing matches, so the string form of the input is returned as-is.
return string;
}
char[] chars = string.toCharArray();
// Number of matching characters skipped so far, which is also the distance each retained
// character is shifted to the left. Starts at 1 for the match already found at pos.
int spread = 1;
// This unusual loop comes from extensive benchmarking
OUT:
while (true) {
// Step past the matching character at pos.
pos++;
while (true) {
if (pos == chars.length) {
break OUT;
}
if (matches(chars[pos])) {
// Another character to drop: leave the copy loop so that spread gets incremented.
break;
}
// Retained character: compact it leftwards over the removed ones.
chars[pos - spread] = chars[pos];
pos++;
}
spread++;
}
// At this point pos == chars.length, so pos - spread is the number of retained characters.
return new String(chars, 0, pos - spread);
}
Returns a string containing all matching BMP characters of a character sequence, in order. For
example:
CharMatcher.is('a').retainFrom("bazaar")
... returns "aaa"
. /**
* Returns a string containing all matching BMP characters of a character sequence, in order. For
* example:
*
* <pre>{@code
* CharMatcher.is('a').retainFrom("bazaar")
* }</pre>
*
* ... returns {@code "aaa"}.
*/
public String retainFrom(CharSequence sequence) {
// Retaining the matching characters is the same as removing everything the negation matches.
return negate().removeFrom(sequence);
}
Returns a string copy of the input character sequence, with each matching BMP character
replaced by a given replacement character. For example:
CharMatcher.is('a').replaceFrom("radar", 'o')
... returns "rodor"
. The default implementation uses indexIn(CharSequence)
to find the first matching character, then iterates the remainder of the sequence calling matches(char)
for each character.
Params: - sequence – the character sequence to replace matching characters in
- replacement – the character to append to the result string in place of each matching character in
sequence
Returns: the new string
/**
 * Returns a string copy of the input character sequence, with each matching BMP character
 * replaced by a given replacement character. For example:
 *
 * <pre>{@code
 * CharMatcher.is('a').replaceFrom("radar", 'o')
 * }</pre>
 *
 * ... returns {@code "rodor"}.
 *
 * <p>The default implementation locates the first match with {@link #indexIn(CharSequence)}; if
 * there is none, the string form of the input is returned directly. Otherwise it overwrites that
 * match and then tests every later character with {@link #matches(char)}.
 *
 * @param sequence the character sequence to replace matching characters in
 * @param replacement the character to append to the result string in place of each matching
 *     character in {@code sequence}
 * @return the new string
 */
public String replaceFrom(CharSequence sequence, char replacement) {
  String original = sequence.toString();
  int firstMatch = indexIn(original);
  if (firstMatch == -1) {
    return original;
  }
  char[] result = original.toCharArray();
  result[firstMatch] = replacement;
  int index = firstMatch + 1;
  while (index < result.length) {
    if (matches(result[index])) {
      result[index] = replacement;
    }
    index++;
  }
  return new String(result);
}
Returns a string copy of the input character sequence, with each matching BMP character
replaced by a given replacement sequence. For example:
CharMatcher.is('a').replaceFrom("yaha", "oo")
... returns "yoohoo"
. Note: If the replacement is a fixed string with only one character, you are better off calling replaceFrom(CharSequence, char)
directly.
Params: - sequence – the character sequence to replace matching characters in
- replacement – the characters to append to the result string in place of each matching character in
sequence
Returns: the new string
/**
 * Returns a string copy of the input character sequence, with each matching BMP character
 * replaced by a given replacement sequence. For example:
 *
 * <pre>{@code
 * CharMatcher.is('a').replaceFrom("yaha", "oo")
 * }</pre>
 *
 * ... returns {@code "yoohoo"}.
 *
 * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
 * off calling {@link #replaceFrom(CharSequence, char)} directly.
 *
 * <p>An empty replacement is handled by {@link #removeFrom}, and a one-character replacement by
 * {@link #replaceFrom(CharSequence, char)}.
 *
 * @param sequence the character sequence to replace matching characters in
 * @param replacement the characters to append to the result string in place of each matching
 *     character in {@code sequence}
 * @return the new string
 */
public String replaceFrom(CharSequence sequence, CharSequence replacement) {
  int replacementLength = replacement.length();
  if (replacementLength == 0) {
    return removeFrom(sequence);
  }
  if (replacementLength == 1) {
    return replaceFrom(sequence, replacement.charAt(0));
  }

  String original = sequence.toString();
  int matchIndex = indexIn(original);
  if (matchIndex == -1) {
    return original;
  }

  int length = original.length();
  StringBuilder result = new StringBuilder((length * 3 / 2) + 16);
  int copyFrom = 0;
  // matchIndex is known to be a real index on entry, so the body runs at least once.
  while (matchIndex != -1) {
    // Copy the unmatched run that precedes the match, then the replacement for the match itself.
    result.append(original, copyFrom, matchIndex).append(replacement);
    copyFrom = matchIndex + 1;
    matchIndex = indexIn(original, copyFrom);
  }
  result.append(original, copyFrom, length);
  return result.toString();
}
Returns a substring of the input character sequence that omits all matching BMP characters from
the beginning and from the end of the string. For example:
CharMatcher.anyOf("ab").trimFrom("abacatbab")
... returns "cat"
. Note that:
CharMatcher.inRange('\0', ' ').trimFrom(str)
... is equivalent to String.trim()
. /**
* Returns a substring of the input character sequence that omits all matching BMP characters from
* the beginning and from the end of the string. For example:
*
* <pre>{@code
* CharMatcher.anyOf("ab").trimFrom("abacatbab")
* }</pre>
*
* ... returns {@code "cat"}.
*
* <p>Note that:
*
* <pre>{@code
* CharMatcher.inRange('\0', ' ').trimFrom(str)
* }</pre>
*
* ... is equivalent to {@link String#trim()}.
*/
public String trimFrom(CharSequence sequence) {
  int length = sequence.length();

  // Advance past the leading run of matching characters.
  int start = 0;
  while (start < length && matches(sequence.charAt(start))) {
    start++;
  }

  // Retreat over the trailing run of matching characters, never moving past start.
  int lastKept = length - 1;
  while (lastKept > start && matches(sequence.charAt(lastKept))) {
    lastKept--;
  }

  return sequence.subSequence(start, lastKept + 1).toString();
}
Returns a substring of the input character sequence that omits all matching BMP characters from
the beginning of the string. For example:
CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")
... returns "catbab"
. /**
* Returns a substring of the input character sequence that omits all matching BMP characters from
* the beginning of the string. For example:
*
* <pre>{@code
* CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")
* }</pre>
*
* ... returns {@code "catbab"}.
*/
public String trimLeadingFrom(CharSequence sequence) {
  int length = sequence.length();
  int start = 0;
  // Advance past the leading run of matching characters.
  while (start < length && matches(sequence.charAt(start))) {
    start++;
  }
  if (start == length) {
    // Every character matched (or the sequence was empty).
    return "";
  }
  return sequence.subSequence(start, length).toString();
}
Returns a substring of the input character sequence that omits all matching BMP characters from
the end of the string. For example:
CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")
... returns "abacat"
. /**
* Returns a substring of the input character sequence that omits all matching BMP characters from
* the end of the string. For example:
*
* <pre>{@code
* CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")
* }</pre>
*
* ... returns {@code "abacat"}.
*/
public String trimTrailingFrom(CharSequence sequence) {
  int lastKept = sequence.length() - 1;
  // Retreat over the trailing run of matching characters.
  while (lastKept >= 0 && matches(sequence.charAt(lastKept))) {
    lastKept--;
  }
  if (lastKept < 0) {
    // Every character matched (or the sequence was empty).
    return "";
  }
  return sequence.subSequence(0, lastKept + 1).toString();
}
Returns a string copy of the input character sequence, with each group of consecutive matching
BMP characters replaced by a single replacement character. For example:
CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')
... returns "b-p-r"
. The default implementation uses indexIn(CharSequence)
to find the first matching character, then iterates the remainder of the sequence calling matches(char)
for each character.
Params: - sequence – the character sequence to replace matching groups of characters in
- replacement – the character to append to the result string in place of each group of matching characters in
sequence
Returns: the new string
/**
 * Returns a string copy of the input character sequence, with each group of consecutive matching
 * BMP characters replaced by a single replacement character. For example:
 *
 * <pre>{@code
 * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')
 * }</pre>
 *
 * ... returns {@code "b-p-r"}.
 *
 * <p>The default implementation scans forward calling {@link #matches(char)} and allocates a
 * builder only once it reaches a matching group whose collapse would actually change the text;
 * the rest of the sequence is then handled by {@code finishCollapseFrom}. If no such group
 * exists, the string form of the input is returned.
 *
 * @param sequence the character sequence to replace matching groups of characters in
 * @param replacement the character to append to the result string in place of each group of
 *     matching characters in {@code sequence}
 * @return the new string
 */
public String collapseFrom(CharSequence sequence, char replacement) {
// This implementation avoids unnecessary allocation.
int len = sequence.length();
for (int i = 0; i < len; i++) {
char c = sequence.charAt(i);
if (matches(c)) {
// A matching group of exactly one character that already equals the replacement would be
// replaced by itself.
if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
// a no-op replacement
// The following character (if any) was just seen not to match, so skip over it as well.
i++;
} else {
// First real change: copy the untouched prefix, emit one replacement for this group, and
// let finishCollapseFrom continue from the next index, already inside a matching group.
StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement);
return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
}
}
}
// no replacement needed
return sequence.toString();
}
Collapses groups of matching characters exactly as collapseFrom
does, except that groups of matching BMP characters at the start or end of the sequence are removed without replacement. /**
* Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
* groups of matching BMP characters at the start or end of the sequence are removed without
* replacement.
*/
public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
  // This implementation avoids unnecessary allocation.
  int length = sequence.length();

  // Find the first character that is not part of the leading matching run.
  int start;
  for (start = 0; start < length; start++) {
    if (!matches(sequence.charAt(start))) {
      break;
    }
  }

  // Find the last character that is not part of the trailing matching run.
  int lastKept;
  for (lastKept = length - 1; lastKept > start; lastKept--) {
    if (!matches(sequence.charAt(lastKept))) {
      break;
    }
  }

  boolean nothingTrimmed = start == 0 && lastKept == length - 1;
  if (nothingTrimmed) {
    return collapseFrom(sequence, replacement);
  }
  int end = lastKept + 1;
  StringBuilder builder = new StringBuilder(end - start);
  return finishCollapseFrom(sequence, start, end, replacement, builder, false);
}
/**
 * Appends the collapsed form of {@code sequence[start, end)} to {@code builder} and returns the
 * builder's contents. Non-matching characters are copied through; each run of matching characters
 * contributes a single {@code replacement}.
 *
 * @param inMatchingGroup whether the character just before {@code start} is to be treated as part
 *     of a matching group whose replacement has already been emitted
 */
private String finishCollapseFrom(
    CharSequence sequence,
    int start,
    int end,
    char replacement,
    StringBuilder builder,
    boolean inMatchingGroup) {
  boolean insideGroup = inMatchingGroup;
  for (int index = start; index < end; index++) {
    char current = sequence.charAt(index);
    if (!matches(current)) {
      builder.append(current);
      insideGroup = false;
      continue;
    }
    // Only the first character of a matching run emits the replacement.
    if (!insideGroup) {
      builder.append(replacement);
      insideGroup = true;
    }
  }
  return builder.toString();
}
Deprecated: Provided only to satisfy the Predicate
interface; use matches
instead.
/**
 * Delegates to {@link #matches}, unboxing {@code character} in the process.
 *
 * @param character the boxed character to test
 * @return the result of {@link #matches} for the unboxed value
 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
 *     instead.
 */
@Deprecated
@Override
public boolean apply(Character character) {
return matches(character);
}
Returns a string representation of this CharMatcher
, such as
CharMatcher.or(WHITESPACE, JAVA_DIGIT)
. /**
* Returns a string representation of this {@code CharMatcher}, such as {@code
* CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
*/
@Override
public String toString() {
// The base implementation simply defers to the superclass.
return super.toString();
}
Returns the Java Unicode escape sequence for the given char
, in the form "\u12AB" where "12AB" is the four hexadecimal digits representing the 16-bit code unit. /**
* Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where
* "12AB" is the four hexadecimal digits representing the 16-bit code unit.
*/
private static String showCharacter(char c) {
  final String hexDigits = "0123456789ABCDEF";
  // Backslash and 'u', followed by four placeholder slots for the hexadecimal digits.
  char[] escape = {'\\', 'u', '\0', '\0', '\0', '\0'};
  int remaining = c;
  // Fill the digit slots from right to left, taking the low four bits each time.
  for (int slot = 5; slot > 1; slot--) {
    escape[slot] = hexDigits.charAt(remaining & 0xF);
    remaining >>= 4;
  }
  return String.copyValueOf(escape);
}
// Fast matchers
A matcher for which precomputation will not yield any significant benefit. /** A matcher for which precomputation will not yield any significant benefit. */
abstract static class FastMatcher extends CharMatcher {
/** Returns {@code this}: a fast matcher is used as its own precomputed form. */
@Override
public final CharMatcher precomputed() {
return this;
}
/** Returns a {@link NegatedFastMatcher} wrapping this matcher. */
@Override
public CharMatcher negate() {
return new NegatedFastMatcher(this);
}
}
FastMatcher
which overrides toString()
with a custom name. /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */
abstract static class NamedFastMatcher extends FastMatcher {
// The text returned by toString(); never null.
private final String description;
/**
 * @param description the string that {@link #toString()} will return
 * @throws NullPointerException if {@code description} is null
 */
NamedFastMatcher(String description) {
this.description = checkNotNull(description);
}
/** Returns the description supplied at construction time. */
@Override
public final String toString() {
return description;
}
}
Negation of a FastMatcher
. /** Negation of a {@link FastMatcher}. */
static class NegatedFastMatcher extends Negated {
/** @param original the matcher being negated; handed straight to {@code Negated} */
NegatedFastMatcher(CharMatcher original) {
super(original);
}
/** Returns {@code this}: no further precomputation is attempted for this matcher. */
@Override
public final CharMatcher precomputed() {
return this;
}
}
Fast matcher using a BitSet
table of matching characters. /** Fast matcher using a {@link BitSet} table of matching characters. */
@GwtIncompatible // used only from other GwtIncompatible code
private static final class BitSetMatcher extends NamedFastMatcher {
  /** Bit {@code c} is set exactly when character {@code c} matches. */
  private final BitSet table;

  /**
   * @param table the set of matching characters; stored directly unless it has noticeable slack
   * @param description the string returned by {@code toString()}
   */
  private BitSetMatcher(BitSet table, String description) {
    super(description);
    // When the backing storage is more than one long larger than the bits in use, keep a clone
    // rather than the original.
    // If only we could actually call BitSet.trimToSize() ourselves...
    boolean hasSlack = table.length() + Long.SIZE < table.size();
    this.table = hasSlack ? (BitSet) table.clone() : table;
  }

  @Override
  public boolean matches(char c) {
    return table.get(c);
  }

  /** Merges every matching character into {@code bitSet} with a single bulk OR. */
  @Override
  void setBits(BitSet bitSet) {
    bitSet.or(table);
  }
}
// Static constant implementation classes
Implementation of CharMatcher.any()
. /** Implementation of {@link #any()}. */
private static final class Any extends NamedFastMatcher {
  /** Shared instance of the matcher that accepts every character. */
  static final Any INSTANCE = new Any();

  private Any() {
    super("CharMatcher.any()");
  }

  /** Always {@code true}. */
  @Override
  public boolean matches(char c) {
    return true;
  }

  /** The first character matches, unless there are no characters at all. */
  @Override
  public int indexIn(CharSequence sequence) {
    if (sequence.length() == 0) {
      return -1;
    }
    return 0;
  }

  /** The character at {@code start} matches, unless {@code start} is the end of the sequence. */
  @Override
  public int indexIn(CharSequence sequence, int start) {
    int end = sequence.length();
    checkPositionIndex(start, end);
    if (start == end) {
      return -1;
    }
    return start;
  }

  /** The last character matches; for an empty sequence this evaluates to {@code -1}. */
  @Override
  public int lastIndexIn(CharSequence sequence) {
    return sequence.length() - 1;
  }

  /** Always {@code true}; the argument is only null-checked. */
  @Override
  public boolean matchesAllOf(CharSequence sequence) {
    checkNotNull(sequence);
    return true;
  }

  /** Only an empty sequence contains no matching characters. */
  @Override
  public boolean matchesNoneOf(CharSequence sequence) {
    return sequence.length() == 0;
  }

  /** Everything is removed, leaving the empty string; the argument is only null-checked. */
  @Override
  public String removeFrom(CharSequence sequence) {
    checkNotNull(sequence);
    return "";
  }

  /** Produces {@code sequence.length()} copies of {@code replacement}. */
  @Override
  public String replaceFrom(CharSequence sequence, char replacement) {
    char[] filled = new char[sequence.length()];
    Arrays.fill(filled, replacement);
    return new String(filled);
  }

  /** Appends {@code replacement} once per character of {@code sequence}. */
  @Override
  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
    StringBuilder repeated = new StringBuilder(sequence.length() * replacement.length());
    int index = 0;
    while (index < sequence.length()) {
      repeated.append(replacement);
      index++;
    }
    return repeated.toString();
  }

  /** A non-empty sequence is one single matching group, so it collapses to one replacement. */
  @Override
  public String collapseFrom(CharSequence sequence, char replacement) {
    if (sequence.length() == 0) {
      return "";
    }
    return String.valueOf(replacement);
  }

  /** Everything is trimmed away, leaving the empty string; the argument is only null-checked. */
  @Override
  public String trimFrom(CharSequence sequence) {
    checkNotNull(sequence);
    return "";
  }

  /** Every character counts. */
  @Override
  public int countIn(CharSequence sequence) {
    return sequence.length();
  }

  /** Returns {@code other} itself, after a null check. */
  @Override
  public CharMatcher and(CharMatcher other) {
    return checkNotNull(other);
  }

  /** Returns this matcher, after null-checking {@code other}. */
  @Override
  public CharMatcher or(CharMatcher other) {
    checkNotNull(other);
    return this;
  }

  /** Returns {@code none()}. */
  @Override
  public CharMatcher negate() {
    return none();
  }
}
Implementation of CharMatcher.none()
. /** Implementation of {@link #none()}. */
private static final class None extends NamedFastMatcher {
// Shared instance of the matcher that rejects every character.
static final None INSTANCE = new None();
private None() {
super("CharMatcher.none()");
}
/** Always {@code false}. */
@Override
public boolean matches(char c) {
return false;
}
/** Always {@code -1}; the argument is only null-checked. */
@Override
public int indexIn(CharSequence sequence) {
checkNotNull(sequence);
return -1;
}
/** Always {@code -1}, once {@code start} has been validated against the sequence length. */
@Override
public int indexIn(CharSequence sequence, int start) {
int length = sequence.length();
checkPositionIndex(start, length);
return -1;
}
/** Always {@code -1}; the argument is only null-checked. */
@Override
public int lastIndexIn(CharSequence sequence) {
checkNotNull(sequence);
return -1;
}
/** Only an empty sequence consists entirely of matching characters. */
@Override
public boolean matchesAllOf(CharSequence sequence) {
return sequence.length() == 0;
}
/** Always {@code true}; the argument is only null-checked. */
@Override
public boolean matchesNoneOf(CharSequence sequence) {
checkNotNull(sequence);
return true;
}
/** Nothing is removed: returns the string form of {@code sequence}. */
@Override
public String removeFrom(CharSequence sequence) {
return sequence.toString();
}
/** Nothing is replaced: returns the string form of {@code sequence}. */
@Override
public String replaceFrom(CharSequence sequence, char replacement) {
return sequence.toString();
}
/** Nothing is replaced: null-checks {@code replacement}, then returns the string form. */
@Override
public String replaceFrom(CharSequence sequence, CharSequence replacement) {
checkNotNull(replacement);
return sequence.toString();
}
/** Nothing is collapsed: returns the string form of {@code sequence}. */
@Override
public String collapseFrom(CharSequence sequence, char replacement) {
return sequence.toString();
}
/** Nothing is trimmed: returns the string form of {@code sequence}. */
@Override
public String trimFrom(CharSequence sequence) {
return sequence.toString();
}
/** Nothing is trimmed: returns the string form of {@code sequence}. */
@Override
public String trimLeadingFrom(CharSequence sequence) {
return sequence.toString();
}
/** Nothing is trimmed: returns the string form of {@code sequence}. */
@Override
public String trimTrailingFrom(CharSequence sequence) {
return sequence.toString();
}
/** Always {@code 0}; the argument is only null-checked. */
@Override
public int countIn(CharSequence sequence) {
checkNotNull(sequence);
return 0;
}
/** Returns this matcher, after null-checking {@code other}. */
@Override
public CharMatcher and(CharMatcher other) {
checkNotNull(other);
return this;
}
/** Returns {@code other} itself, after a null check. */
@Override
public CharMatcher or(CharMatcher other) {
return checkNotNull(other);
}
/** Returns {@code any()}. */
@Override
public CharMatcher negate() {
return any();
}
}
Implementation of CharMatcher.whitespace()
. /** Implementation of {@link #whitespace()}. */
@VisibleForTesting
static final class Whitespace extends NamedFastMatcher {
// Lookup table probed by matches(char): a character matches exactly when the slot selected for
// it holds that same character.
// NOTE(review): the repeated ideographic-space entries look like filler for slots that no other
// whitespace character hashes to. Confirm against the table generator.
static final String TABLE =
"\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
+ "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
+ "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
+ "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";
// Multiplier applied to the character before shifting, to pick its slot in TABLE.
// NOTE(review): presumably chosen so that every whitespace character lands in its own slot.
// Confirm against the generator.
static final int MULTIPLIER = 1682554634;
// Unsigned right-shift that keeps only the high-order bits of the 32-bit product; it is derived
// from the table length (assumes that length is a power of two, TODO confirm).
static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1);
// Shared instance.
static final Whitespace INSTANCE = new Whitespace();
Whitespace() {
super("CharMatcher.whitespace()");
}
/** Probes the one {@code TABLE} slot selected for {@code c} and compares it with {@code c}. */
@Override
public boolean matches(char c) {
// int multiplication wraps; the unsigned shift turns the top bits into the table index.
return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c;
}
/** Sets the bit for every entry of {@code TABLE} instead of testing every possible char. */
@GwtIncompatible // used only from other GwtIncompatible code
@Override
void setBits(BitSet table) {
for (int i = 0; i < TABLE.length(); i++) {
table.set(TABLE.charAt(i));
}
}
}
Implementation of CharMatcher.breakingWhitespace()
. /** Implementation of {@link #breakingWhitespace()}. */
private static final class BreakingWhitespace extends CharMatcher {
  /** Shared instance. */
  static final CharMatcher INSTANCE = new BreakingWhitespace();

  /**
   * Matches the characters U+0009 through U+000D, the space character, U+0085, U+1680, U+2000
   * through U+200A except U+2007, U+2028, U+2029, U+205F and U+3000.
   */
  @Override
  public boolean matches(char c) {
    // U+2007 lies inside the U+2000..U+200A block but is explicitly excluded.
    if (c == '\u2007') {
      return false;
    }
    if (c >= '\u2000' && c <= '\u200a') {
      return true;
    }
    // Tab, line feed, vertical tab, form feed and carriage return are contiguous code units.
    if (c >= '\t' && c <= '\r') {
      return true;
    }
    switch (c) {
      case ' ':
      case '\u0085':
      case '\u1680':
      case '\u2028':
      case '\u2029':
      case '\u205f':
      case '\u3000':
        return true;
      default:
        return false;
    }
  }

  @Override
  public String toString() {
    return "CharMatcher.breakingWhitespace()";
  }
}
Implementation of CharMatcher.ascii()
. /** Implementation of {@link #ascii()}. */
private static final class Ascii extends NamedFastMatcher {
// Shared instance.
static final Ascii INSTANCE = new Ascii();
Ascii() {
super("CharMatcher.ascii()");
}
/** Matches every character whose value is at most U+007F. */
@Override
public boolean matches(char c) {
return c <= '\u007f';
}
}
Implementation that matches characters that fall within multiple ranges. /** Implementation that matches characters that fall within multiple ranges. */
private static class RangesMatcher extends CharMatcher {
  private final String description;
  /** First character of each range, in strictly ascending order. */
  private final char[] rangeStarts;
  /** Last character (inclusive) of each range, parallel to {@code rangeStarts}. */
  private final char[] rangeEnds;

  /**
   * @param description the string returned by {@code toString()}
   * @param rangeStarts the first character of each range
   * @param rangeEnds the inclusive last character of each range, parallel to {@code rangeStarts}
   * @throws IllegalArgumentException if the arrays differ in length, a range ends before it
   *     starts, or a range does not end strictly before the next one starts
   */
  RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
    this.description = description;
    this.rangeStarts = rangeStarts;
    this.rangeEnds = rangeEnds;
    checkArgument(rangeStarts.length == rangeEnds.length);
    int rangeCount = rangeStarts.length;
    for (int range = 0; range < rangeCount; range++) {
      checkArgument(rangeStarts[range] <= rangeEnds[range]);
      int next = range + 1;
      if (next < rangeCount) {
        checkArgument(rangeEnds[range] < rangeStarts[next]);
      }
    }
  }

  @Override
  public boolean matches(char c) {
    int found = Arrays.binarySearch(rangeStarts, c);
    if (found >= 0) {
      // c is exactly the start of some range.
      return true;
    }
    // ~found is the insertion point, so the candidate range is the one just before it.
    int candidate = ~found - 1;
    return candidate >= 0 && c <= rangeEnds[candidate];
  }

  @Override
  public String toString() {
    return description;
  }
}
Implementation of CharMatcher.digit()
. /** Implementation of {@link #digit()}. */
private static final class Digit extends RangesMatcher {
  // Plug the following UnicodeSet pattern into
  // https://unicode.org/cldr/utility/list-unicodeset.jsp
  // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]]
  // and get the zeroes from there.
  // Must be in ascending order.
  private static final String ZEROES =
      "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6"
          + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0"
          + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10";

  /** Range starts: the zero digit of each supported digit block. */
  private static char[] zeroes() {
    return ZEROES.toCharArray();
  }

  /** Range ends: each zero digit shifted up by nine. */
  private static char[] nines() {
    char[] nines = ZEROES.toCharArray();
    for (int i = 0; i < nines.length; i++) {
      nines[i] += 9;
    }
    return nines;
  }

  /** Shared instance. */
  static final Digit INSTANCE = new Digit();

  private Digit() {
    super("CharMatcher.digit()", zeroes(), nines());
  }
}
Implementation of CharMatcher.javaDigit()
. /** Implementation of {@link #javaDigit()}. */
private static final class JavaDigit extends CharMatcher {
// Shared instance.
static final JavaDigit INSTANCE = new JavaDigit();
/** Returns the result of {@link Character#isDigit(char)} for {@code c}. */
@Override
public boolean matches(char c) {
return Character.isDigit(c);
}
/** Returns the factory-call form of this matcher's name. */
@Override
public String toString() {
return "CharMatcher.javaDigit()";
}
}
Implementation of CharMatcher.javaLetter()
. /** Implementation of {@link #javaLetter()}. */
private static final class JavaLetter extends CharMatcher {
// Shared instance.
static final JavaLetter INSTANCE = new JavaLetter();
/** Returns the result of {@link Character#isLetter(char)} for {@code c}. */
@Override
public boolean matches(char c) {
return Character.isLetter(c);
}
/** Returns the factory-call form of this matcher's name. */
@Override
public String toString() {
return "CharMatcher.javaLetter()";
}
}
Implementation of CharMatcher.javaLetterOrDigit()
. /** Implementation of {@link #javaLetterOrDigit()}. */
private static final class JavaLetterOrDigit extends CharMatcher {
// Shared instance.
static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit();
/** Returns the result of {@link Character#isLetterOrDigit(char)} for {@code c}. */
@Override
public boolean matches(char c) {
return Character.isLetterOrDigit(c);
}
/** Returns the factory-call form of this matcher's name. */
@Override
public String toString() {
return "CharMatcher.javaLetterOrDigit()";
}
}
Implementation of CharMatcher.javaUpperCase()
. /** Implementation of {@link #javaUpperCase()}. */
private static final class JavaUpperCase extends CharMatcher {
// Shared instance.
static final JavaUpperCase INSTANCE = new JavaUpperCase();
/** Returns the result of {@link Character#isUpperCase(char)} for {@code c}. */
@Override
public boolean matches(char c) {
return Character.isUpperCase(c);
}
/** Returns the factory-call form of this matcher's name. */
@Override
public String toString() {
return "CharMatcher.javaUpperCase()";
}
}
Implementation of CharMatcher.javaLowerCase()
. /** Implementation of {@link #javaLowerCase()}. */
private static final class JavaLowerCase extends CharMatcher {
// Shared instance.
static final JavaLowerCase INSTANCE = new JavaLowerCase();
/** Returns the result of {@link Character#isLowerCase(char)} for {@code c}. */
@Override
public boolean matches(char c) {
return Character.isLowerCase(c);
}
/** Returns the factory-call form of this matcher's name. */
@Override
public String toString() {
return "CharMatcher.javaLowerCase()";
}
}
Implementation of CharMatcher.javaIsoControl()
. /** Implementation of {@link #javaIsoControl()}. */
private static final class JavaIsoControl extends NamedFastMatcher {
  /** Shared instance; this class declares no instance fields of its own. */
  static final JavaIsoControl INSTANCE = new JavaIsoControl();

  private JavaIsoControl() {
    // The string is passed to the NamedFastMatcher constructor (declared outside this block);
    // presumably it becomes the toString() text -- confirm against that class.
    super("CharMatcher.javaIsoControl()");
  }

  /**
   * Returns {@code true} when {@code c} lies in U+0000..U+001F or in U+007F..U+009F (both ranges
   * inclusive), and {@code false} otherwise.
   */
  @Override
  public boolean matches(char c) {
    // Lower block: everything up to and including U+001F.
    if (c <= '\u001f') {
      return true;
    }
    // Upper block: U+007F through U+009F.
    return '\u007f' <= c && c <= '\u009f';
  }
}
/** Implementation of {@link #invisible()}. */
private static final class Invisible extends RangesMatcher {
// How to regenerate the two tables below:
// Plug the following UnicodeSet pattern into
// https://unicode.org/cldr/utility/list-unicodeset.jsp
// [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]]
// with the "Abbreviate" option, and get the ranges from there.
//
// RANGE_STARTS and RANGE_ENDS each hold 18 chars. Presumably the char at position i of
// RANGE_STARTS and the char at position i of RANGE_ENDS delimit one range; the pairing is
// interpreted by RangesMatcher, which is declared outside this block -- confirm there.
// When editing, keep both strings the same length.
private static final String RANGE_STARTS =
"\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066"
+ "\u3000\ud800\ufeff\ufff9";
private static final String RANGE_ENDS = // inclusive ends
"\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f"
+ "\u3000\uf8ff\ufeff\ufffb";
// Declared after the two tables it is built from.
static final Invisible INSTANCE = new Invisible();
// Hands the description string and both tables (as char arrays) to the RangesMatcher constructor.
private Invisible() {
super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray());
}
}
/** Implementation of {@link #singleWidth()}. */
private static final class SingleWidth extends RangesMatcher {
  // Two parallel 12-char tables handed to the RangesMatcher constructor (declared outside this
  // block). Presumably position i of each string delimits one range -- confirm against
  // RangesMatcher. Keep both strings the same length when editing.
  private static final String RANGE_STARTS =
      "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61";
  private static final String RANGE_ENDS =
      "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc";

  /** Shared instance; this class declares no instance fields of its own. */
  static final SingleWidth INSTANCE = new SingleWidth();

  private SingleWidth() {
    super("CharMatcher.singleWidth()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray());
  }
}
// Non-static factory implementation classes
/** Implementation of {@link #negate()}. */
private static class Negated extends CharMatcher {
  /** The matcher whose verdict is inverted; never null (checked in the constructor). */
  final CharMatcher original;

  /**
   * Wraps {@code original}.
   *
   * @throws NullPointerException if {@code original} is null (via {@code checkNotNull})
   */
  Negated(CharMatcher original) {
    checkNotNull(original);
    this.original = original;
  }

  /** Returns the logical inverse of {@code original.matches(c)}. */
  @Override
  public boolean matches(char c) {
    boolean matchedByOriginal = original.matches(c);
    return !matchedByOriginal;
  }

  /** Delegates to {@code original.matchesNoneOf(sequence)}. */
  @Override
  public boolean matchesAllOf(CharSequence sequence) {
    boolean originalMatchesNone = original.matchesNoneOf(sequence);
    return originalMatchesNone;
  }

  /** Delegates to {@code original.matchesAllOf(sequence)}. */
  @Override
  public boolean matchesNoneOf(CharSequence sequence) {
    boolean originalMatchesAll = original.matchesAllOf(sequence);
    return originalMatchesAll;
  }

  /** Returns the sequence length minus {@code original.countIn(sequence)}. */
  @Override
  public int countIn(CharSequence sequence) {
    int total = sequence.length();
    int matchedByOriginal = original.countIn(sequence);
    return total - matchedByOriginal;
  }

  /**
   * Collects the original's bits into a scratch set, flips every bit in the range
   * {@code [Character.MIN_VALUE, Character.MAX_VALUE + 1)}, and ORs the result into
   * {@code table}. Bits already set in {@code table} are left set.
   */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    BitSet complement = new BitSet();
    original.setBits(complement);
    complement.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
    table.or(complement);
  }

  /** Returns the wrapped matcher itself, so negating twice adds no extra wrapper. */
  @Override
  public CharMatcher negate() {
    return original;
  }

  /** Returns the original's string form followed by {@code ".negate()"}. */
  @Override
  public String toString() {
    return String.valueOf(original) + ".negate()";
  }
}
/** Implementation of {@link #and(CharMatcher)}. */
private static final class And extends CharMatcher {
  /** Left operand; consulted first by {@link #matches(char)}. */
  final CharMatcher first;

  /** Right operand; consulted only when {@link #first} matches. */
  final CharMatcher second;

  /**
   * Combines two matchers.
   *
   * @throws NullPointerException if either argument is null; {@code a} is checked before
   *     {@code b}
   */
  And(CharMatcher a, CharMatcher b) {
    this.first = checkNotNull(a);
    this.second = checkNotNull(b);
  }

  /** Returns {@code true} only when both operands match {@code c}; short-circuits on first. */
  @Override
  public boolean matches(char c) {
    if (!first.matches(c)) {
      return false;
    }
    return second.matches(c);
  }

  /**
   * Gathers each operand's bits into its own scratch set, intersects the two, and ORs the
   * intersection into {@code table}.
   */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    BitSet firstBits = new BitSet();
    first.setBits(firstBits);
    BitSet secondBits = new BitSet();
    second.setBits(secondBits);
    firstBits.and(secondBits);
    table.or(firstBits);
  }

  /** Returns {@code "CharMatcher.and(<first>, <second>)"} using the operands' string forms. */
  @Override
  public String toString() {
    return "CharMatcher.and("
        + first
        + ", "
        + second
        + ")";
  }
}
/** Implementation of {@link #or(CharMatcher)}. */
private static final class Or extends CharMatcher {
  /** Left operand; consulted first by {@link #matches(char)}. */
  final CharMatcher first;

  /** Right operand; consulted only when {@link #first} does not match. */
  final CharMatcher second;

  /**
   * Combines two matchers.
   *
   * @throws NullPointerException if either argument is null; {@code a} is checked before
   *     {@code b}
   */
  Or(CharMatcher a, CharMatcher b) {
    this.first = checkNotNull(a);
    this.second = checkNotNull(b);
  }

  /** Returns {@code true} when either operand matches {@code c}; short-circuits on first. */
  @Override
  public boolean matches(char c) {
    if (first.matches(c)) {
      return true;
    }
    return second.matches(c);
  }

  /** Lets each operand, first then second, set its bits directly in {@code table}. */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    first.setBits(table);
    second.setBits(table);
  }

  /** Returns {@code "CharMatcher.or(<first>, <second>)"} using the operands' string forms. */
  @Override
  public String toString() {
    return "CharMatcher.or("
        + first
        + ", "
        + second
        + ")";
  }
}
// Static factory implementations
/** Implementation of {@link #is(char)}. */
private static final class Is extends FastMatcher {
  /** The single char this matcher accepts. */
  private final char match;

  Is(char match) {
    this.match = match;
  }

  /** Returns {@code true} only when {@code c} equals the target char. */
  @Override
  public boolean matches(char c) {
    return match == c;
  }

  /** Converts {@code sequence} to a string and delegates to {@link String#replace(char, char)}. */
  @Override
  public String replaceFrom(CharSequence sequence, char replacement) {
    String text = sequence.toString();
    return text.replace(match, replacement);
  }

  /**
   * Returns this matcher when {@code other} also accepts the target char; otherwise returns the
   * result of {@code none()}.
   */
  @Override
  public CharMatcher and(CharMatcher other) {
    if (other.matches(match)) {
      return this;
    }
    return none();
  }

  /**
   * Returns {@code other} unchanged when it already accepts the target char; otherwise defers to
   * the superclass {@code or}.
   */
  @Override
  public CharMatcher or(CharMatcher other) {
    if (other.matches(match)) {
      return other;
    }
    return super.or(other);
  }

  /** Returns {@code isNot(match)}. */
  @Override
  public CharMatcher negate() {
    return isNot(match);
  }

  /** Sets the single bit whose index is the target char. */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    table.set(match);
  }

  /** Returns {@code "CharMatcher.is('<c>')"} where the char is rendered by showCharacter. */
  @Override
  public String toString() {
    return "CharMatcher.is('"
        + showCharacter(match)
        + "')";
  }
}
/** Implementation of {@link #isNot(char)}. */
private static final class IsNot extends FastMatcher {
  /** The single char this matcher rejects. */
  private final char match;

  IsNot(char match) {
    this.match = match;
  }

  /** Returns {@code true} for every char except the excluded one. */
  @Override
  public boolean matches(char c) {
    return match != c;
  }

  /**
   * Returns {@code other} unchanged when it already rejects the excluded char; otherwise defers
   * to the superclass {@code and}.
   */
  @Override
  public CharMatcher and(CharMatcher other) {
    if (other.matches(match)) {
      return super.and(other);
    }
    return other;
  }

  /**
   * Returns the result of {@code any()} when {@code other} accepts the excluded char; otherwise
   * returns this matcher.
   */
  @Override
  public CharMatcher or(CharMatcher other) {
    if (other.matches(match)) {
      return any();
    }
    return this;
  }

  /** Returns {@code is(match)}. */
  @Override
  public CharMatcher negate() {
    return is(match);
  }

  /**
   * Sets every bit in {@code [0, match)} and in {@code [match + 1, Character.MAX_VALUE + 1)},
   * leaving the bit for the excluded char untouched.
   */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    int afterMatch = match + 1;
    int endExclusive = Character.MAX_VALUE + 1;
    table.set(0, match);
    table.set(afterMatch, endExclusive);
  }

  /** Returns {@code "CharMatcher.isNot('<c>')"} where the char is rendered by showCharacter. */
  @Override
  public String toString() {
    return "CharMatcher.isNot('"
        + showCharacter(match)
        + "')";
  }
}
/** Creates a new {@code IsEither} matcher for the two given chars. */
private static CharMatcher.IsEither isEither(char c1, char c2) {
  CharMatcher.IsEither matcher = new CharMatcher.IsEither(c1, c2);
  return matcher;
}
/** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */
private static final class IsEither extends FastMatcher {
  /** First accepted char. */
  private final char match1;

  /** Second accepted char. */
  private final char match2;

  IsEither(char match1, char match2) {
    this.match1 = match1;
    this.match2 = match2;
  }

  /** Returns {@code true} when {@code c} equals either stored char. */
  @Override
  public boolean matches(char c) {
    if (c == match1) {
      return true;
    }
    return c == match2;
  }

  /** Sets the bit for each of the two stored chars. */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    table.set(match1);
    table.set(match2);
  }

  /** Returns {@code CharMatcher.anyOf("<c1><c2>")} with each char rendered by showCharacter. */
  @Override
  public String toString() {
    return "CharMatcher.anyOf(\""
        + showCharacter(match1)
        + showCharacter(match2)
        + "\")";
  }
}
/** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */
private static final class AnyOf extends CharMatcher {
  /** Accepted chars in ascending order; sorted once in the constructor for binary search. */
  private final char[] chars;

  /** Copies the chars of {@code chars} into a private array and sorts that array. */
  public AnyOf(CharSequence chars) {
    char[] sorted = chars.toString().toCharArray();
    Arrays.sort(sorted);
    this.chars = sorted;
  }

  /** Returns {@code true} when a binary search of the sorted array finds {@code c}. */
  @Override
  public boolean matches(char c) {
    int position = Arrays.binarySearch(chars, c);
    return position >= 0;
  }

  /** Sets the bit for every stored char. */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    for (int i = 0; i < chars.length; i++) {
      table.set(chars[i]);
    }
  }

  /**
   * Returns {@code CharMatcher.anyOf("...")} where the quoted part is every stored char, in
   * sorted order, rendered by showCharacter.
   */
  @Override
  public String toString() {
    StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
    for (int i = 0; i < chars.length; i++) {
      description.append(showCharacter(chars[i]));
    }
    return description.append("\")").toString();
  }
}
/** Implementation of {@link #inRange(char, char)}. */
private static final class InRange extends FastMatcher {
  /** Lowest accepted char. */
  private final char startInclusive;

  /** Highest accepted char. */
  private final char endInclusive;

  /**
   * Creates a matcher for the closed range {@code [startInclusive, endInclusive]}.
   *
   * <p>The bounds are validated with {@code checkArgument}, which is expected to reject a start
   * that is greater than the end.
   */
  InRange(char startInclusive, char endInclusive) {
    checkArgument(startInclusive <= endInclusive);
    this.startInclusive = startInclusive;
    this.endInclusive = endInclusive;
  }

  /** Returns {@code true} when {@code c} lies between the two bounds, both inclusive. */
  @Override
  public boolean matches(char c) {
    if (c < startInclusive) {
      return false;
    }
    return c <= endInclusive;
  }

  /** Sets every bit from {@code startInclusive} through {@code endInclusive}. */
  @Override
  @GwtIncompatible // used only from other GwtIncompatible code
  void setBits(BitSet table) {
    // BitSet.set(from, to) treats the upper index as exclusive, hence the + 1.
    int endExclusive = endInclusive + 1;
    table.set(startInclusive, endExclusive);
  }

  /** Returns {@code "CharMatcher.inRange('<start>', '<end>')"} rendered by showCharacter. */
  @Override
  public String toString() {
    return "CharMatcher.inRange('" + showCharacter(startInclusive)
        + "', '" + showCharacter(endInclusive) + "')";
  }
}
/** Implementation of {@link #forPredicate(Predicate)}. */
private static final class ForPredicate extends CharMatcher {
  /** The wrapped predicate; never null (checked in the constructor). */
  private final Predicate<? super Character> predicate;

  /**
   * Wraps {@code predicate}.
   *
   * @throws NullPointerException if {@code predicate} is null (via {@code checkNotNull})
   */
  ForPredicate(Predicate<? super Character> predicate) {
    checkNotNull(predicate);
    this.predicate = predicate;
  }

  /** Boxes {@code c} and returns the wrapped predicate's verdict on it. */
  @Override
  public boolean matches(char c) {
    return predicate.apply(Character.valueOf(c));
  }

  /**
   * Null-checks {@code character} and then asks the wrapped predicate directly, without
   * unboxing.
   *
   * @throws NullPointerException if {@code character} is null (via {@code checkNotNull})
   */
  @Override
  @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
  public boolean apply(Character character) {
    Character nonNull = checkNotNull(character);
    return predicate.apply(nonNull);
  }

  /** Returns {@code "CharMatcher.forPredicate(<predicate>)"} using the predicate's string form. */
  @Override
  public String toString() {
    return "CharMatcher.forPredicate("
        + predicate
        + ")";
  }
}
}