/*
 * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 *******************************************************************************
 *   Copyright (C) 2009-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 */
package sun.text.normalizer;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.Normalizer;

// Original filename in ICU4J: Normalizer2Impl.java
public final class NormalizerImpl {
    public static final class Hangul {
        /* Korean Hangul and Jamo constants */
        public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
        public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
        public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */

        public static final int HANGUL_BASE=0xac00;
        public static final int HANGUL_END=0xd7a3;

        public static final int JAMO_L_COUNT=19;
        public static final int JAMO_V_COUNT=21;
        public static final int JAMO_T_COUNT=28;

        public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
        public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;

        public static boolean isHangul(int c) {
            return HANGUL_BASE<=c && c<HANGUL_LIMIT;
        }
        public static boolean isHangulLV(int c) {
            c-=HANGUL_BASE;
            return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
        }

                
        /**
         * Decomposes c, which must be a Hangul syllable, into buffer
         * and returns the length of the decomposition (2 or 3).
         */
        public static int decompose(int c, Appendable buffer) {
            try {
                c-=HANGUL_BASE;
                int c2=c%JAMO_T_COUNT;
                c/=JAMO_T_COUNT;
                buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
                buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
                if(c2==0) {
                    return 2;
                } else {
                    buffer.append((char)(JAMO_T_BASE+c2));
                    return 3;
                }
            } catch(IOException e) {
                throw new InternalError(e);
            }
        }
    }
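    // Added worked example for Hangul.decompose() above (illustrative, not part of
    // the original ICU code):
    //   StringBuilder sb = new StringBuilder();
    //   int len = Hangul.decompose(0xAC01, sb);
    //   // len == 3 and sb holds U+1100 U+1161 U+11A8, because 0xAC01-HANGUL_BASE=1
    //   // gives trail index 1 and lead/vowel indices 0; an LV-only syllable such as
    //   // U+AC00 yields U+1100 U+1161 and returns 2.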
    /**
     * Writable buffer that takes care of canonical ordering.
     * Its Appendable methods behave like the C++ implementation's
     * appendZeroCC() methods.
     * <p>
     * If dest is a StringBuilder, then the buffer writes directly to it.
     * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
     * until no further changes are necessary and whole segments are appended.
     * append() methods that take combining-class values always write to the StringBuilder.
     * Other append() methods flush and append to the Appendable.
     */
public static final class ReorderingBuffer implements Appendable { public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { impl=ni; app=dest; if (app instanceof StringBuilder) { appIsStringBuilder=true; str=(StringBuilder)dest; // In Java, the constructor subsumes public void init(int destCapacity) str.ensureCapacity(destCapacity); reorderStart=0; if(str.length()==0) { lastCC=0; } else { setIterator(); lastCC=previousCC(); // Set reorderStart after the last code point with cc<=1 if there is one. if(lastCC>1) { while(previousCC()>1) {} } reorderStart=codePointLimit; } } else { appIsStringBuilder=false; str=new StringBuilder(); reorderStart=0; lastCC=0; } } public boolean isEmpty() { return str.length()==0; } public int length() { return str.length(); } public int getLastCC() { return lastCC; } public StringBuilder getStringBuilder() { return str; } public boolean equals(CharSequence s, int start, int limit) { return UTF16Plus.equal(str, 0, str.length(), s, start, limit); } public void append(int c, int cc) { if(lastCC<=cc || cc==0) { str.appendCodePoint(c); lastCC=cc; if(cc<=1) { reorderStart=str.length(); } } else { insert(c, cc); } } // s must be in NFD, otherwise change the implementation. public void append(CharSequence s, int start, int limit, int leadCC, int trailCC) { if(start==limit) { return; } if(lastCC<=leadCC || leadCC==0) { if(trailCC<=1) { reorderStart=str.length()+(limit-start); } else if(leadCC<=1) { reorderStart=str.length()+1; // Ok if not a code point boundary. } str.append(s, start, limit); lastCC=trailCC; } else { int c=Character.codePointAt(s, start); start+=Character.charCount(c); insert(c, leadCC); // insert first code point while(start<limit) { c=Character.codePointAt(s, start); start+=Character.charCount(c); if(start<limit) { // s must be in NFD, otherwise we need to use getCC(). leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); } else { leadCC=trailCC; } append(c, leadCC); } } } // The following append() methods work like C++ appendZeroCC(). // They assume that the cc or trailCC of their input is 0. // Most of them implement Appendable interface methods. @Override public ReorderingBuffer append(char c) { str.append(c); lastCC=0; reorderStart=str.length(); return this; } public void appendZeroCC(int c) { str.appendCodePoint(c); lastCC=0; reorderStart=str.length(); } @Override public ReorderingBuffer append(CharSequence s) { if(s.length()!=0) { str.append(s); lastCC=0; reorderStart=str.length(); } return this; } @Override public ReorderingBuffer append(CharSequence s, int start, int limit) { if(start!=limit) { str.append(s, start, limit); lastCC=0; reorderStart=str.length(); } return this; }
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Used after recomposition.
         * Must be called at the end when writing to a non-StringBuilder Appendable.
         */
        public void flush() {
            if(appIsStringBuilder) {
                reorderStart=str.length();
            } else {
                try {
                    app.append(str);
                    str.setLength(0);
                    reorderStart=0;
                } catch(IOException e) {
                    throw new InternalError(e);  // Avoid declaring "throws IOException".
                }
            }
            lastCC=0;
        }
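        // Added usage sketch (illustrative; "impl" and "writer" are hypothetical):
        //   ReorderingBuffer buf = new ReorderingBuffer(impl, writer, 16);
        //   buf.append(0x0064, 0);    // 'd', ccc=0
        //   buf.append(0x0323, 220);  // combining dot below
        //   buf.append(0x0301, 230);  // combining acute, appended in canonical order
        //   buf.flush();              // push the buffered text to the Appendable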
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Then appends the new text to the Appendable or StringBuilder.
         * Normally used after quick check loops find a non-empty sequence.
         */
public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { if(appIsStringBuilder) { str.append(s, start, limit); reorderStart=str.length(); } else { try { app.append(str).append(s, start, limit); str.setLength(0); reorderStart=0; } catch(IOException e) { throw new InternalError(e); // Avoid declaring "throws IOException". } } lastCC=0; return this; } public void remove() { str.setLength(0); lastCC=0; reorderStart=0; } public void removeSuffix(int suffixLength) { int oldLength=str.length(); str.delete(oldLength-suffixLength, oldLength); lastCC=0; reorderStart=str.length(); } // Inserts c somewhere before the last character. // Requires 0<cc<lastCC which implies reorderStart<limit. private void insert(int c, int cc) { for(setIterator(), skipPrevious(); previousCC()>cc;) {} // insert c at codePointLimit, after the character with prevCC<=cc if(c<=0xffff) { str.insert(codePointLimit, (char)c); if(cc<=1) { reorderStart=codePointLimit+1; } } else { str.insert(codePointLimit, Character.toChars(c)); if(cc<=1) { reorderStart=codePointLimit+2; } } } private final NormalizerImpl impl; private final Appendable app; private final StringBuilder str; private final boolean appIsStringBuilder; private int reorderStart; private int lastCC; // private backward iterator private void setIterator() { codePointStart=str.length(); } private void skipPrevious() { // Requires 0<codePointStart. codePointLimit=codePointStart; codePointStart=str.offsetByCodePoints(codePointStart, -1); } private int previousCC() { // Returns 0 if there is no previous character. codePointLimit=codePointStart; if(reorderStart>=codePointStart) { return 0; } int c=str.codePointBefore(codePointStart); codePointStart-=Character.charCount(c); return impl.getCCFromYesOrMaybeCP(c); } private int codePointStart, codePointLimit; } // TODO: Propose as public API on the UTF16 class. // TODO: Propose widening UTF16 methods that take char to take int. // TODO: Propose widening UTF16 methods that take String to take CharSequence. public static final class UTF16Plus {
        /**
         * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
         * is it a lead surrogate?
         * @param c code unit or code point
         * @return true or false
         */
        public static boolean isSurrogateLead(int c) {
            return (c&0x400)==0;
        }
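        // Added note (illustrative): lead surrogates are U+D800..U+DBFF (bit 0x400
        // clear) and trail surrogates are U+DC00..U+DFFF (bit 0x400 set), so this
        // single bit test suffices, e.g.
        //   isSurrogateLead(0xD801) == true;  isSurrogateLead(0xDC01) == false;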
        /**
         * Compares two CharSequence subsequences for binary equality.
         * @param s1 first sequence
         * @param start1 start offset in first sequence
         * @param limit1 limit offset in first sequence
         * @param s2 second sequence
         * @param start2 start offset in second sequence
         * @param limit2 limit offset in second sequence
         * @return true if s1.subSequence(start1, limit1) contains the same text
         *              as s2.subSequence(start2, limit2)
         */
public static boolean equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2) { if((limit1-start1)!=(limit2-start2)) { return false; } if(s1==s2 && start1==start2) { return true; } while(start1<limit1) { if(s1.charAt(start1++)!=s2.charAt(start2++)) { return false; } } return true; } } public NormalizerImpl() {} private static final class IsAcceptable implements ICUBinary.Authenticate { public boolean isDataVersionAcceptable(byte version[]) { return version[0]==3; } } private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" public NormalizerImpl load(ByteBuffer bytes) { try { dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 if(indexesLength<=IX_MIN_LCCC_CP) { throw new InternalError("Normalizer2 data: not enough indexes"); } int[] inIndexes=new int[indexesLength]; inIndexes[0]=indexesLength*4; for(int i=1; i<indexesLength; ++i) { inIndexes[i]=bytes.getInt(); } minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; minLcccCP=inIndexes[IX_MIN_LCCC_CP]; minYesNo=inIndexes[IX_MIN_YES_NO]; minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; minNoNo=inIndexes[IX_MIN_NO_NO]; minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; limitNoNo=inIndexes[IX_LIMIT_NO_NO]; minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; // Read the normTrie. int offset=inIndexes[IX_NORM_TRIE_OFFSET]; int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; normTrie=Trie2_16.createFromSerialized(bytes); int trieLength=normTrie.getSerializedLength(); if(trieLength>(nextOffset-offset)) { throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); } ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes // Read the composition and mapping data. offset=nextOffset; nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; int numChars=(nextOffset-offset)/2; char[] chars; if(numChars!=0) { chars=new char[numChars]; for(int i=0; i<numChars; ++i) { chars[i]=bytes.getChar(); } maybeYesCompositions=new String(chars); extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); } // smallFCD: new in formatVersion 2 offset=nextOffset; smallFCD=new byte[0x100]; bytes.get(smallFCD); return this; } catch(IOException e) { throw new InternalError(e); } } public NormalizerImpl load(String name) { return load(ICUBinary.getRequiredData(name)); } public int getNorm16(int c) { return normTrie.get(c); } public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } public int getCC(int norm16) { if(norm16>=MIN_NORMAL_MAYBE_YES) { return getCCFromNormalYesOrMaybe(norm16); } if(norm16<minNoNo || limitNoNo<=norm16) { return 0; } return getCCFromNoNo(norm16); } public static int getCCFromNormalYesOrMaybe(int norm16) { return (norm16 >> OFFSET_SHIFT) & 0xff; } public static int getCCFromYesOrMaybe(int norm16) { return norm16>=MIN_NORMAL_MAYBE_YES ? 
            getCCFromNormalYesOrMaybe(norm16) : 0;
    }
    public int getCCFromYesOrMaybeCP(int c) {
        if (c < minCompNoMaybeCP) {
            return 0;
        }
        return getCCFromYesOrMaybe(getNorm16(c));
    }
    /**
     * Returns the FCD data for code point c.
     * @param c A Unicode code point.
     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
     */
    public int getFCD16(int c) {
        if(c<minDecompNoCP) {
            return 0;
        } else if(c<=0xffff) {
            if(!singleLeadMightHaveNonZeroFCD16(c)) {
                return 0;
            }
        }
        return getFCD16FromNormData(c);
    }
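    // Added example (illustrative): for a non-decomposing combining mark such as
    // U+0301 COMBINING ACUTE ACCENT (ccc=230), lccc==tccc==ccc, so
    // getFCD16(0x0301) == (230<<8)|230 == 0xE6E6, while an FCD-inert character
    // such as 'A' (U+0041) yields 0.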
    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
    public boolean singleLeadMightHaveNonZeroFCD16(int lead) {  // 0<=lead<=0xffff
        byte bits=smallFCD[lead>>8];
        if(bits==0) {
            return false;
        }
        return ((bits>>((lead>>5)&7))&1)!=0;
    }
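    // Added note (illustrative): smallFCD is a 256-byte bitmap over the BMP.
    // Byte lead>>8 covers a block of 0x100 code units and each of its 8 bits covers
    // 32 code units, so a clear bit lets getFCD16() skip the trie lookup for that
    // entire 32-code-unit range.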
    /** Gets the FCD value from the regular normalization data. */
    public int getFCD16FromNormData(int c) {
        int norm16=getNorm16(c);
        if (norm16 >= limitNoNo) {
            if(norm16>=MIN_NORMAL_MAYBE_YES) {
                // combining mark
                norm16=getCCFromNormalYesOrMaybe(norm16);
                return norm16|(norm16<<8);
            } else if(norm16>=minMaybeYes) {
                return 0;
            } else {  // isDecompNoAlgorithmic(norm16)
                int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
                if (deltaTrailCC <= DELTA_TCCC_1) {
                    return deltaTrailCC >> OFFSET_SHIFT;
                }
                // Maps to an isCompYesAndZeroCC.
                c=mapAlgorithmic(c, norm16);
                norm16=getNorm16(c);
            }
        }
        if(norm16<=minYesNo || isHangulLVT(norm16)) {
            // no decomposition or Hangul syllable, all zeros
            return 0;
        }
        // c decomposes, get everything from the variable-length extra data
        int mapping=norm16>>OFFSET_SHIFT;
        int firstUnit=extraData.charAt(mapping);
        int fcd16=firstUnit>>8;  // tccc
        if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
            fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
        }
        return fcd16;
    }
    /**
     * Gets the decomposition for one code point.
     * @param c code point
     * @return c's decomposition, if it has one; returns null if it does not have a decomposition
     */
public String getDecomposition(int c) { int norm16; if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { // c does not decompose return null; } int decomp = -1; if(isDecompNoAlgorithmic(norm16)) { // Maps to an isCompYesAndZeroCC. decomp=c=mapAlgorithmic(c, norm16); // The mapping might decompose further. norm16 = getNorm16(c); } if (norm16 < minYesNo) { if(decomp<0) { return null; } else { return UTF16.valueOf(decomp); } } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { // Hangul syllable: decompose algorithmically StringBuilder buffer=new StringBuilder(); Hangul.decompose(c, buffer); return buffer.toString(); } // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; return extraData.substring(mapping, mapping+length); } // Fixed norm16 values. public static final int MIN_YES_YES_WITH_CC=0xfe02; public static final int JAMO_VT=0xfe00; public static final int MIN_NORMAL_MAYBE_YES=0xfc00; public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE // norm16 bit 0 is comp-boundary-after. public static final int HAS_COMP_BOUNDARY_AFTER=1; public static final int OFFSET_SHIFT=1; // For algorithmic one-way mappings, norm16 bits 2..1 indicate the // tccc (0, 1, >1) for quick FCC boundary-after tests. public static final int DELTA_TCCC_0=0; public static final int DELTA_TCCC_1=2; public static final int DELTA_TCCC_GT_1=4; public static final int DELTA_TCCC_MASK=6; public static final int DELTA_SHIFT=3; public static final int MAX_DELTA=0x40; // Byte offsets from the start of the data, after the generic header. public static final int IX_NORM_TRIE_OFFSET=0; public static final int IX_EXTRA_DATA_OFFSET=1; public static final int IX_SMALL_FCD_OFFSET=2; public static final int IX_RESERVED3_OFFSET=3; public static final int IX_TOTAL_SIZE=7; public static final int MIN_CCC_LCCC_CP=0x300; // Code point thresholds for quick check codes. public static final int IX_MIN_DECOMP_NO_CP=8; public static final int IX_MIN_COMP_NO_MAYBE_CP=9; // Norm16 value thresholds for quick check combinations and types of extra data.
/** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
public static final int IX_MIN_YES_NO=10;
    /** Mappings are comp-normalized. */
    public static final int IX_MIN_NO_NO=11;
    public static final int IX_LIMIT_NO_NO=12;
    public static final int IX_MIN_MAYBE_YES=13;
/** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
/** Mappings are not comp-normalized but have a comp boundary before. */
public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
/** Mappings do not have a comp boundary before. */
public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
    /** Mappings to the empty string. */
    public static final int IX_MIN_NO_NO_EMPTY=17;

    public static final int IX_MIN_LCCC_CP=18;
    public static final int IX_COUNT=20;

    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
    // unused bit 0x20;
    public static final int MAPPING_LENGTH_MASK=0x1f;

    public static final int COMP_1_LAST_TUPLE=0x8000;
    public static final int COMP_1_TRIPLE=1;
    public static final int COMP_1_TRAIL_LIMIT=0x3400;
    public static final int COMP_1_TRAIL_MASK=0x7ffe;
    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
    public static final int COMP_2_TRAIL_SHIFT=6;
    public static final int COMP_2_TRAIL_MASK=0xffc0;

    // higher-level functionality ------------------------------------------ ***
    /**
     * Decomposes s[src, limit[ and writes the result to dest.
     * limit can be NULL if src is NUL-terminated.
     * destLengthEstimate is the initial dest buffer capacity and can be -1.
     */
public void decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate) { if(destLengthEstimate<0) { destLengthEstimate=limit-src; } dest.setLength(0); ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); decompose(s, src, limit, buffer); } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes public int decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer) { int minNoCP=minDecompNoCP; int prevSrc; int c=0; int norm16=0; // only for quick check int prevBoundary=src; int prevCC=0; for(;;) { // count code units below the minimum or with irrelevant data for the quick check for(prevSrc=src; src!=limit;) { if( (c=s.charAt(src))<minNoCP || isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { --src; c=Character.toCodePoint(c2, (char)c); } } if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { src+=Character.charCount(c); } else { break; } } } // copy these code units all at once if(src!=prevSrc) { if(buffer!=null) { buffer.flushAndAppendZeroCC(s, prevSrc, src); } else { prevCC=0; prevBoundary=src; } } if(src==limit) { break; } // Check one above-minimum, relevant code point. src+=Character.charCount(c); if(buffer!=null) { decompose(c, norm16, buffer); } else { if(isDecompYes(norm16)) { int cc=getCCFromYesOrMaybe(norm16); if(prevCC<=cc || cc==0) { prevCC=cc; if(cc<=1) { prevBoundary=src; } continue; } } return prevBoundary; // "no" or cc out of order } } return src; } public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { int limit=s.length(); if(limit==0) { return; } if(doDecompose) { decompose(s, 0, limit, buffer); return; } // Just merge the strings at the boundary. int c=Character.codePointAt(s, 0); int src=0; int firstCC, prevCC, cc; firstCC=prevCC=cc=getCC(getNorm16(c)); while(cc!=0) { prevCC=cc; src+=Character.charCount(c); if(src>=limit) { break; } c=Character.codePointAt(s, src); cc=getCC(getNorm16(c)); }; buffer.append(s, 0, src, firstCC, prevCC); buffer.append(s, src, limit); } // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. // doCompose: normalize // !doCompose: isNormalized (buffer must be empty and initialized) public boolean compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer) { int prevBoundary=src; int minNoMaybeCP=minCompNoMaybeCP; for (;;) { // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, // or with (compYes && ccc==0) properties. 
int prevSrc; int c = 0; int norm16 = 0; for (;;) { if (src == limit) { if (prevBoundary != limit && doCompose) { buffer.append(s, prevBoundary, limit); } return true; } if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else { prevSrc = src++; if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { ++src; c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { --prevSrc; c=Character.toCodePoint(c2, (char)c); } } if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { break; } } } } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. // Medium-fast path: Handle cases that do not require full decomposition and recomposition. if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes if (!doCompose) { return false; } // Fast path for mapping a character that is immediately surrounded by boundaries. // In this case, we need not decompose around the current character. if (isDecompNoAlgorithmic(norm16)) { // Maps to a single isCompYesAndZeroCC character // which also implies hasCompBoundaryBefore. if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || hasCompBoundaryBefore(s, src, limit)) { if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } buffer.append(mapAlgorithmic(c, norm16), 0); prevBoundary = src; continue; } } else if (norm16 < minNoNoCompBoundaryBefore) { // The mapping is comp-normalized which also implies hasCompBoundaryBefore. if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || hasCompBoundaryBefore(s, src, limit)) { if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } int mapping = norm16 >> OFFSET_SHIFT; int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; buffer.append(extraData, mapping, mapping + length); prevBoundary = src; continue; } } else if (norm16 >= minNoNoEmpty) { // The current character maps to nothing. // Simply omit it from the output if there is a boundary before _or_ after it. // The character itself implies no boundaries. if (hasCompBoundaryBefore(s, src, limit) || hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } prevBoundary = src; continue; } } // Other "noNo" type, or need to examine more text around this character: // Fall through to the slow path. } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { char prev=s.charAt(prevSrc-1); if(c<Hangul.JAMO_T_BASE) { // The current character is a Jamo Vowel, // compose with previous Jamo L and following Jamo T. char l = (char)(prev-Hangul.JAMO_L_BASE); if(l<Hangul.JAMO_L_COUNT) { if (!doCompose) { return false; } int t; if (src != limit && 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && t < Hangul.JAMO_T_COUNT) { // The next character is a Jamo T. ++src; } else if (hasCompBoundaryBefore(s, src, limit)) { // No Jamo T follows, not even via decomposition. t = 0; } else { t = -1; } if (t >= 0) { int syllable = Hangul.HANGUL_BASE + (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * Hangul.JAMO_T_COUNT + t; --prevSrc; // Replace the Jamo L as well. 
if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } buffer.append((char)syllable); prevBoundary = src; continue; } // If we see L+V+x where x!=T then we drop to the slow path, // decompose and recompose. // This is to deal with NFKC finding normal L and V but a // compatibility variant of a T. // We need to either fully compose that combination here // (which would complicate the code and may not work with strange custom data) // or use the slow path. } } else if (Hangul.isHangulLV(prev)) { // The current character is a Jamo Trailing consonant, // compose with previous Hangul LV that does not contain a Jamo T. if (!doCompose) { return false; } int syllable = prev + c - Hangul.JAMO_T_BASE; --prevSrc; // Replace the Hangul LV as well. if (prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } buffer.append((char)syllable); prevBoundary = src; continue; } // No matching context, or may need to decompose surrounding text first: // Fall through to the slow path. } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC // One or more combining marks that do not combine-back: // Check for canonical order, copy unchanged if ok and // if followed by a character with a boundary-before. int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { // Fails FCD test, need to decompose and contiguously recompose. if (!doCompose) { return false; } } else { // If !onlyContiguous (not FCC), then we ignore the tccc of // the previous character which passed the quick check "yes && ccc==0" test. int n16; for (;;) { if (src == limit) { if (doCompose) { buffer.append(s, prevBoundary, limit); } return true; } int prevCC = cc; c = Character.codePointAt(s, src); n16 = normTrie.get(c); if (n16 >= MIN_YES_YES_WITH_CC) { cc = getCCFromNormalYesOrMaybe(n16); if (prevCC > cc) { if (!doCompose) { return false; } break; } } else { break; } src += Character.charCount(c); } // p is after the last in-order combining mark. // If there is a boundary here, then we continue with no change. if (norm16HasCompBoundaryBefore(n16)) { if (isCompYesAndZeroCC(n16)) { src += Character.charCount(c); } continue; } // Use the slow path. There is no boundary in [prevSrc, src[. } } // Slow path: Find the nearest boundaries around the current character, // decompose and recompose. if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { c = Character.codePointBefore(s, prevSrc); norm16 = normTrie.get(c); if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { prevSrc -= Character.charCount(c); } } if (doCompose && prevBoundary != prevSrc) { buffer.append(s, prevBoundary, prevSrc); } int recomposeStartIndex=buffer.length(); // We know there is not a boundary here. decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, buffer); // Decompose until the next boundary. src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, buffer); recompose(buffer, recomposeStartIndex, onlyContiguous); if(!doCompose) { if(!buffer.equals(s, prevSrc, src)) { return false; } buffer.remove(); } prevBoundary=src; } }
    /**
     * Very similar to compose(): Make the same changes in both places if relevant.
     * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
     * !doSpan: quickCheck
     * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
     *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
     *         then the quick check result is "no"
     */
public int composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan) { int qcResult=0; int prevBoundary=src; int minNoMaybeCP=minCompNoMaybeCP; for(;;) { // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, // or with (compYes && ccc==0) properties. int prevSrc; int c = 0; int norm16 = 0; for (;;) { if(src==limit) { return (src<<1)|qcResult; // "yes" or "maybe" } if( (c=s.charAt(src))<minNoMaybeCP || isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { ++src; } else { prevSrc = src++; if(!UTF16.isSurrogate((char)c)) { break; } else { char c2; if(UTF16Plus.isSurrogateLead(c)) { if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { ++src; c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { --prevSrc; c=Character.toCodePoint(c2, (char)c); } } if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { break; } } } } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) // or a "maybeYes" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. int prevNorm16 = INERT; if (prevBoundary != prevSrc) { prevBoundary = prevSrc; if (!norm16HasCompBoundaryBefore(norm16)) { c = Character.codePointBefore(s, prevSrc); int n16 = getNorm16(c); if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { prevBoundary -= Character.charCount(c); prevNorm16 = n16; } } } if(isMaybeOrNonZeroCC(norm16)) { int cc=getCCFromYesOrMaybe(norm16); if (onlyContiguous /* FCC */ && cc != 0 && getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { // The [prevBoundary..prevSrc[ character // passed the quick check "yes && ccc==0" test // but is out of canonical order with the current combining mark. } else { // If !onlyContiguous (not FCC), then we ignore the tccc of // the previous character which passed the quick check "yes && ccc==0" test. for (;;) { if (norm16 < MIN_YES_YES_WITH_CC) { if (!doSpan) { qcResult = 1; } else { return prevBoundary << 1; // spanYes does not care to know it's "maybe" } } if (src == limit) { return (src<<1) | qcResult; // "yes" or "maybe" } int prevCC = cc; c = Character.codePointAt(s, src); norm16 = getNorm16(c); if (isMaybeOrNonZeroCC(norm16)) { cc = getCCFromYesOrMaybe(norm16); if (!(prevCC <= cc || cc == 0)) { break; } } else { break; } src += Character.charCount(c); } // src is after the last in-order combining mark. 
if (isCompYesAndZeroCC(norm16)) { prevBoundary = src; src += Character.charCount(c); continue; } } } return prevBoundary<<1; // "no" } } public void composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer) { int src=0, limit=s.length(); if(!buffer.isEmpty()) { int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); if(0!=firstStarterInSrc) { int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), buffer.length(), onlyContiguous); StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ firstStarterInSrc+16); middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); buffer.removeSuffix(buffer.length()-lastStarterInDest); middle.append(s, 0, firstStarterInSrc); compose(middle, 0, middle.length(), onlyContiguous, true, buffer); src=firstStarterInSrc; } } if(doCompose) { compose(s, src, limit, onlyContiguous, true, buffer); } else { buffer.append(s, src, limit); } } // Dual functionality: // buffer!=NULL: normalize // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { // Note: In this function we use buffer->appendZeroCC() because we track // the lead and trail combining classes here, rather than leaving it to // the ReorderingBuffer. // The exception is the call to decomposeShort() which uses the buffer // in the normal way. // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. // Similar to the prevBoundary in the compose() implementation. int prevBoundary=src; int prevSrc; int c=0; int prevFCD16=0; int fcd16=0; for(;;) { // count code units with lccc==0 for(prevSrc=src; src!=limit;) { if((c=s.charAt(src))<minLcccCP) { prevFCD16=~c; ++src; } else if(!singleLeadMightHaveNonZeroFCD16(c)) { prevFCD16=0; ++src; } else { if(UTF16.isSurrogate((char)c)) { char c2; if(UTF16Plus.isSurrogateLead(c)) { if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { c=Character.toCodePoint((char)c, c2); } } else /* trail surrogate */ { if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { --src; c=Character.toCodePoint(c2, (char)c); } } } if((fcd16=getFCD16FromNormData(c))<=0xff) { prevFCD16=fcd16; src+=Character.charCount(c); } else { break; } } } // copy these code units all at once if(src!=prevSrc) { if(src==limit) { if(buffer!=null) { buffer.flushAndAppendZeroCC(s, prevSrc, src); } break; } prevBoundary=src; // We know that the previous character's lccc==0. if(prevFCD16<0) { // Fetching the fcd16 value was deferred for this below-minLcccCP code point. int prev=~prevFCD16; if(prev<minDecompNoCP) { prevFCD16=0; } else { prevFCD16=getFCD16FromNormData(prev); if(prevFCD16>1) { --prevBoundary; } } } else { int p=src-1; if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && Character.isHighSurrogate(s.charAt(p-1)) ) { --p; // Need to fetch the previous character's FCD value because // prevFCD16 was just for the trail surrogate code point. prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); // Still known to have lccc==0 because its lead surrogate unit had lccc==0. } if(prevFCD16>1) { prevBoundary=p; } } if(buffer!=null) { // The last lccc==0 character is excluded from the // flush-and-append call in case it needs to be modified. buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); buffer.append(s, prevBoundary, src); } // The start of the current character (c). 
prevSrc=src; } else if(src==limit) { break; } src+=Character.charCount(c); // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. // Check for proper order, and decompose locally if necessary. if((prevFCD16&0xff)<=(fcd16>>8)) { // proper order: prev tccc <= current lccc if((fcd16&0xff)<=1) { prevBoundary=src; } if(buffer!=null) { buffer.appendZeroCC(c); } prevFCD16=fcd16; continue; } else if(buffer==null) { return prevBoundary; // quick check "no" } else { /* * Back out the part of the source that we copied or appended * already but is now going to be decomposed. * prevSrc is set to after what was copied/appended. */ buffer.removeSuffix(prevSrc-prevBoundary); /* * Find the part of the source that needs to be decomposed, * up to the next safe boundary. */ src=findNextFCDBoundary(s, src, limit); /* * The source text does not fulfill the conditions for FCD. * Decompose and reorder a limited piece of the text. */ decomposeShort(s, prevBoundary, src, false, false, buffer); prevBoundary=src; prevFCD16=0; } } return src; } public boolean hasDecompBoundaryBefore(int c) { return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || norm16HasDecompBoundaryBefore(getNorm16(c)); } public boolean norm16HasDecompBoundaryBefore(int norm16) { if (norm16 < minNoNoCompNoMaybeCC) { return true; } if (norm16 >= limitNoNo) { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; } // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; } public boolean hasDecompBoundaryAfter(int c) { if (c < minDecompNoCP) { return true; } if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { return true; } return norm16HasDecompBoundaryAfter(getNorm16(c)); } public boolean norm16HasDecompBoundaryAfter(int norm16) { if(norm16 <= minYesNo || isHangulLVT(norm16)) { return true; } if (norm16 >= limitNoNo) { if (isMaybeOrNonZeroCC(norm16)) { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; } // Maps to an isCompYesAndZeroCC. 
return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; } // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 if(firstUnit>0x1ff) { return false; // trailCC>1 } if(firstUnit<=0xff) { return true; // trailCC==0 } // if(trailCC==1) test leadCC==0, same as checking for before-boundary // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; } public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } public boolean hasCompBoundaryBefore(int c) { return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); } public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); } private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } private static boolean isInert(int norm16) { return norm16==INERT; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } private boolean isHangulLV(int norm16) { return norm16==minYesNo; } private boolean isHangulLVT(int norm16) { return norm16==hangulLVT(); } private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } // UBool isCompYes(uint16_t norm16) const { // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; // } // UBool isCompYesOrMaybe(uint16_t norm16) const { // return norm16<minNoNo || minMaybeYes<=norm16; // } // private boolean hasZeroCCFromDecompYes(int norm16) { // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; // } private boolean isDecompYesAndZeroCC(int norm16) { return norm16<minYesNo || norm16==JAMO_VT || (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); }
    /**
     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
     * the MaybeYes which combine-forward and have ccc=0.
     * (Standard Unicode 10 normalization does not have such characters.)
     */
    private boolean isMostDecompYesAndZeroCC(int norm16) {
        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    }
    private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }

    // For use with isCompYes().
    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
    // static uint8_t getCCFromYes(uint16_t norm16) {
    //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
    // }
    private int getCCFromNoNo(int norm16) {
        int mapping=norm16>>OFFSET_SHIFT;
        if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
            return extraData.charAt(mapping-1)&0xff;
        } else {
            return 0;
        }
    }
    int getTrailCCFromCompYesAndZeroCC(int norm16) {
        if(norm16<=minYesNo) {
            return 0;  // yesYes and Hangul LV have ccc=tccc=0
        } else {
            // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
            return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
        }
    }

    // Requires algorithmic-NoNo.
    private int mapAlgorithmic(int c, int norm16) {
        return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
    }

    // Requires minYesNo<norm16<limitNoNo.
    // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
    /**
     * @return index into maybeYesCompositions, or -1
     */
    private int getCompositionsListForDecompYes(int norm16) {
        if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
            return -1;
        } else {
            if((norm16-=minMaybeYes)<0) {
                // norm16<minMaybeYes: index into extraData which is a substring at
                //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
                // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
                norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
            }
            return norm16>>OFFSET_SHIFT;
        }
    }
    /**
     * @return index into maybeYesCompositions
     */
private int getCompositionsListForComposite(int norm16) { // A composite has both mapping & compositions list. int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; int firstUnit=maybeYesCompositions.charAt(list); return list+ // mapping in maybeYesCompositions 1+ // +1 to skip the first unit with the mapping length (firstUnit&MAPPING_LENGTH_MASK); // + mapping length } // Decompose a short piece of text which is likely to contain characters that // fail the quick check loop and/or where the quick check loop's overhead // is unlikely to be amortized. // Called by the compose() and makeFCD() implementations. // Public in Java for collation implementation code. private int decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer) { while(src<limit) { int c=Character.codePointAt(s, src); if (stopAtCompBoundary && c < minCompNoMaybeCP) { return src; } int norm16 = getNorm16(c); if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { return src; } src+=Character.charCount(c); decompose(c, norm16, buffer); if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { return src; } } return src; } private void decompose(int c, int norm16, ReorderingBuffer buffer) { // get the decomposition and the lead and trail cc's if (norm16 >= limitNoNo) { if (isMaybeOrNonZeroCC(norm16)) { buffer.append(c, getCCFromYesOrMaybe(norm16)); return; } // Maps to an isCompYesAndZeroCC. c=mapAlgorithmic(c, norm16); norm16=getNorm16(c); } if (norm16 < minYesNo) { // c does not decompose buffer.append(c, 0); } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { // Hangul syllable: decompose algorithmically Hangul.decompose(c, buffer); } else { // c decomposes, get everything from the variable-length extra data int mapping=norm16>>OFFSET_SHIFT; int firstUnit=extraData.charAt(mapping); int length=firstUnit&MAPPING_LENGTH_MASK; int leadCC, trailCC; trailCC=firstUnit>>8; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { leadCC=extraData.charAt(mapping-1)>>8; } else { leadCC=0; } ++mapping; // skip over the firstUnit buffer.append(extraData, mapping, mapping+length, leadCC, trailCC); } }
    /**
     * Finds the recomposition result for
     * a forward-combining "lead" character,
     * specified with a pointer to its compositions list,
     * and a backward-combining "trail" character.
     *
     * <p>If the lead and trail characters combine, then this function returns
     * the following "compositeAndFwd" value:
     * <pre>
     * Bits 21..1  composite character
     * Bit      0  set if the composite is a forward-combining starter
     * </pre>
     * otherwise it returns -1.
     *
     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
     * encoded as either pairs or triples of 16-bit units.
     * The last entry has the high bit of its first unit set.
     *
     * <p>The list is sorted by ascending trail characters (there are no duplicates).
     * A linear search is used.
     *
     * <p>See normalizer2impl.h for a more detailed description
     * of the compositions list format.
     */
private static int combine(String compositions, int list, int trail) { int key1, firstUnit; if(trail<COMP_1_TRAIL_LIMIT) { // trail character is 0..33FF // result entry may have 2 or 3 units key1=(trail<<1); while(key1>(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if((firstUnit&COMP_1_TRIPLE)!=0) { return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); } else { return compositions.charAt(list+1); } } } else { // trail character is 3400..10FFFF // result entry has 3 units key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; int secondUnit; for(;;) { if(key1>(firstUnit=compositions.charAt(list))) { list+=2+(firstUnit&COMP_1_TRIPLE); } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(key2>(secondUnit=compositions.charAt(list+1))) { if((firstUnit&COMP_1_LAST_TUPLE)!=0) { break; } else { list+=3; } } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); } else { break; } } else { break; } } } return -1; } /* * Recomposes the buffer text starting at recomposeStartIndex * (which is in NFD - decomposed and canonically ordered), * and truncates the buffer contents. * * Note that recomposition never lengthens the text: * Any character consists of either one or two code units; * a composition may contain at most one more code unit than the original starter, * while the combining mark that is removed has at least one code unit. */ private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous) { StringBuilder sb=buffer.getStringBuilder(); int p=recomposeStartIndex; if(p==sb.length()) { return; } int starter, pRemove; int compositionsList; int c, compositeAndFwd; int norm16; int cc, prevCC; boolean starterIsSupplementary; // Some of the following variables are not used until we have a forward-combining starter // and are only initialized now to avoid compiler warnings. compositionsList=-1; // used as indicator for whether we have a forward-combining starter starter=-1; starterIsSupplementary=false; prevCC=0; for(;;) { c=sb.codePointAt(p); p+=Character.charCount(c); norm16=getNorm16(c); cc=getCCFromYesOrMaybe(norm16); if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and compositionsList>=0 && // the backward-combining character is not blocked (prevCC<cc || prevCC==0) ) { if(isJamoVT(norm16)) { // c is a Jamo V/T, see if we can compose it with the previous character. if(c<Hangul.JAMO_T_BASE) { // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); if(prev<Hangul.JAMO_L_COUNT) { pRemove=p-1; char syllable=(char) (Hangul.HANGUL_BASE+ (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* Hangul.JAMO_T_COUNT); char t; if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { ++p; syllable+=t; // The next character was a Jamo T. } sb.setCharAt(starter, syllable); // remove the Jamo V/T sb.delete(pRemove, p); p=pRemove; } } /* * No "else" for Jamo T: * Since the input is in NFD, there are no Hangul LV syllables that * a Jamo T could combine with. * All Jamo Ts are combined above when handling Jamo Vs. 
*/ if(p==sb.length()) { break; } compositionsList=-1; continue; } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { // The starter and the combining mark (c) do combine. int composite=compositeAndFwd>>1; // Remove the combining mark. pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark sb.delete(pRemove, p); p=pRemove; // Replace the starter with the composite. if(starterIsSupplementary) { if(composite>0xffff) { // both are supplementary sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); } else { sb.setCharAt(starter, (char)c); sb.deleteCharAt(starter+1); // The composite is shorter than the starter, // move the intermediate characters forward one. starterIsSupplementary=false; --p; } } else if(composite>0xffff) { // The composite is longer than the starter, // move the intermediate characters back one. starterIsSupplementary=true; sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); ++p; } else { // both are on the BMP sb.setCharAt(starter, (char)composite); } // Keep prevCC because we removed the combining mark. if(p==sb.length()) { break; } // Is the composite a starter that combines forward? if((compositeAndFwd&1)!=0) { compositionsList= getCompositionsListForComposite(getNorm16(composite)); } else { compositionsList=-1; } // We combined; continue with looking for compositions. continue; } } // no combination this time prevCC=cc; if(p==sb.length()) { break; } // If c did not combine, then check if it is a starter. if(cc==0) { // Found a new starter. if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { // It may combine with something, prepare for it. if(c<=0xffff) { starterIsSupplementary=false; starter=p-1; } else { starterIsSupplementary=true; starter=p-2; } } } else if(onlyContiguous) { // FCC: no discontiguous compositions; any intervening character blocks. compositionsList=-1; } } buffer.flush(); }
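    // Added note (illustrative, not part of the original source): a compositeAndFwd
    // value returned by combine() above is decoded as
    //   int composite = compositeAndFwd >> 1;                  // bits 21..1
    //   boolean combinesForward = (compositeAndFwd & 1) != 0;  // bit 0
    // For example, looking up the trail U+0300 in the compositions list of U+0041 'A'
    // yields U+00C0 (A with grave) as the composite in bits 21..1.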
    /**
     * Does c have a composition boundary before it?
     * True if its decomposition begins with a character that has
     * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
     * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
     * (isCompYesAndZeroCC()) so we need not decompose.
     */
    private boolean hasCompBoundaryBefore(int c, int norm16) {
        return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
    }
    private boolean norm16HasCompBoundaryBefore(int norm16) {
        return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
    }
    private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
        return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
    }
    private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
        return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
            (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
    }
    private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
        return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
    }
/** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); } private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { while(p>0) { int c=Character.codePointBefore(s, p); int norm16 = getNorm16(c); if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { break; } p-=Character.charCount(c); if(hasCompBoundaryBefore(c, norm16)) { break; } } return p; } private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { while(p<limit) { int c=Character.codePointAt(s, p); int norm16=normTrie.get(c); if(hasCompBoundaryBefore(c, norm16)) { break; } p+=Character.charCount(c); if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { break; } } return p; } private int findNextFCDBoundary(CharSequence s, int p, int limit) { while(p<limit) { int c=Character.codePointAt(s, p); int norm16; if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { break; } p+=Character.charCount(c); if (norm16HasDecompBoundaryAfter(norm16)) { break; } } return p; }
    /**
     * Get the canonical decomposition
     * sherman for ComposedCharIter
     */
    public static int getDecompose(int chars[], String decomps[]) {
        Normalizer2 impl = Normalizer2.getNFDInstance();
        int length = 0;
        int norm16 = 0;
        int ch = -1;
        int i = 0;

        while (++ch < 0x2fa1e) {   // no canonical decomposition above 0x3ffff
            // TBD !!!! the hack code here saves us about 50ms of startup;
            // need a better solution/lookup
            if (ch == 0x30ff)
                ch = 0xf900;
            else if (ch == 0x115bc)
                ch = 0x1d15e;
            else if (ch == 0x1d1c1)
                ch = 0x2f800;

            String s = impl.getDecomposition(ch);

            if (s != null && i < chars.length) {
                chars[i] = ch;
                decomps[i++] = s;
            }
        }
        return i;
    }

    //------------------------------------------------------
    // special method for Collation (RBTableBuilder.build())
    //------------------------------------------------------
    private static boolean needSingleQuotation(char c) {
        return (c >= 0x0009 && c <= 0x000D) ||
               (c >= 0x0020 && c <= 0x002F) ||
               (c >= 0x003A && c <= 0x0040) ||
               (c >= 0x005B && c <= 0x0060) ||
               (c >= 0x007B && c <= 0x007E);
    }

    public static String canonicalDecomposeWithSingleQuotation(String string) {
        Normalizer2 impl = Normalizer2.getNFDInstance();
        char[] src = string.toCharArray();
        int    srcIndex = 0;
        int    srcLimit = src.length;
        char[] dest = new char[src.length * 3];  // MAX_BUF_SIZE_DECOMPOSE = 3
        int    destIndex = 0;
        int    destLimit = dest.length;

        int prevSrc;
        String norm;
        int reorderStartIndex, length;
        char c1, c2;
        int cp;
        int minNoMaybe = 0x00c0;
        int cc, prevCC, trailCC;
        char[] p;
        int pStart;

        // initialize
        reorderStartIndex = 0;
        prevCC = 0;
        norm = null;
        cp = 0;
        pStart = 0;

        cc = trailCC = -1; // initialize to bogus value
        c1 = 0;
        for (;;) {
            prevSrc = srcIndex;
            // quick check: (1) less than minNoMaybe (2) no decomposition (3) Hangul
            while (srcIndex != srcLimit &&
                   ((c1 = src[srcIndex]) < minNoMaybe ||
                    (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
                    (c1 >= '\uac00' && c1 <= '\ud7a3'))) {  // Hangul Syllables
                prevCC = 0;
                srcIndex += (cp < 0x10000) ? 1 : 2;
            }

            // copy these code units all at once
            if (srcIndex != prevSrc) {
                length = srcIndex - prevSrc;
                if ((destIndex + length) <= destLimit) {
                    System.arraycopy(src, prevSrc, dest, destIndex, length);
                }
                destIndex += length;
                reorderStartIndex = destIndex;
            }

            // end of source reached?
            if (srcIndex == srcLimit) {
                break;
            }

            // cp already contains *src and norm is set for it, increment src
            srcIndex += (cp < 0x10000) ? 1 : 2;

            if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                c2 = 0;
                length = 1;
                if (Character.isHighSurrogate(c1) || Character.isLowSurrogate(c1)) {
                    norm = null;
                }
            } else {
                length = 2;
                c2 = src[srcIndex - 1];
            }

            // get the decomposition and the lead and trail cc's
            if (norm == null) {
                // cp does not decompose
                cc = trailCC = UCharacter.getCombiningClass(cp);
                p = null;
                pStart = -1;
            } else {
                pStart = 0;
                p = norm.toCharArray();
                length = p.length;
                int cpNum = norm.codePointCount(0, length);
                cc = UCharacter.getCombiningClass(norm.codePointAt(0));
                trailCC = UCharacter.getCombiningClass(norm.codePointAt(cpNum - 1));
                if (length == 1) {
                    // fastpath: a single code unit from decomposition
                    c1 = p[pStart];
                    c2 = 0;
                    p = null;
                    pStart = -1;
                }
            }

            if ((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
                // buffer overflow
                char[] tmpBuf = new char[destLimit * 2];
                System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
                dest = tmpBuf;
                destLimit = dest.length;
            }

            // append the decomposition to the destination buffer, assume length>0
            {
                int reorderSplit = destIndex;
                if (p == null) {
                    // fastpath: single code point
                    if (needSingleQuotation(c1)) {
                        // if we need single quotation, no need to consider "prevCC"
                        // and it must NOT be a supplementary pair
                        dest[destIndex++] = '\'';
                        dest[destIndex++] = c1;
                        dest[destIndex++] = '\'';
                        trailCC = 0;
                    } else if (cc != 0 && cc < prevCC) {
                        // (c1, c2) is out of order with respect to the preceding
                        // text
                        destIndex += length;
                        trailCC = insertOrdered(dest, reorderStartIndex, reorderSplit,
                                                destIndex, c1, c2, cc);
                    } else {
                        // just append (c1, c2)
                        dest[destIndex++] = c1;
                        if (c2 != 0) {
                            dest[destIndex++] = c2;
                        }
                    }
                } else {
                    // general: multiple code points (ordered by themselves)
                    // from decomposition
                    if (needSingleQuotation(p[pStart])) {
                        dest[destIndex++] = '\'';
                        dest[destIndex++] = p[pStart++];
                        dest[destIndex++] = '\'';
                        length--;
                        do {
                            dest[destIndex++] = p[pStart++];
                        } while (--length > 0);
                    } else if (cc != 0 && cc < prevCC) {
                        destIndex += length;
                        trailCC = mergeOrdered(dest, reorderStartIndex, reorderSplit,
                                               p, pStart, pStart + length);
                    } else {
                        // just append the decomposition
                        do {
                            dest[destIndex++] = p[pStart++];
                        } while (--length > 0);
                    }
                }
            }
            prevCC = trailCC;
            if (prevCC == 0) {
                reorderStartIndex = destIndex;
            }
        }

        return new String(dest, 0, destIndex);
    }
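    /*
     * Illustrative sketch of how canonicalDecomposeWithSingleQuotation()
     * behaves on hypothetical input (sun.text.normalizer is a JDK-internal
     * package, so this is only an illustration, not a supported usage pattern):
     *
     *     // U+00C0 (LATIN CAPITAL LETTER A WITH GRAVE) canonically
     *     // decomposes to 'A' + U+0300:
     *     String s = NormalizerImpl.canonicalDecomposeWithSingleQuotation("\u00C0");
     *     // s is "A" + U+0300; no quotation is inserted because 'A' is not
     *     // in any of the needSingleQuotation() ranges.  A decomposition
     *     // whose first code unit falls in those collation-syntax ranges
     *     // would have that unit wrapped in ASCII apostrophes instead.
     */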
    /**
     * simpler, single-character version of mergeOrdered() -
     * bubble-insert one single code point into the preceding string
     * which is already canonically ordered
     * (c, c2) may or may not yet have been inserted at src[current]..src[p]
     *
     * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
     *
     * before: src[start]..src[current] is already ordered, and
     *         src[current]..src[p]     may or may not hold (c, c2) but
     *                                  must be exactly the same length as (c, c2)
     * after:  src[start]..src[p] is ordered
     *
     * @return the trailing combining class
     */
    private static int/*unsigned byte*/ insertOrdered(char[] source, int start, int current,
                                                      int p, char c1, char c2,
                                                      int/*unsigned byte*/ cc) {
        int back, preBack;
        int r;
        int prevCC, trailCC = cc;

        if (start < current && cc != 0) {
            // search for the insertion point where cc>=prevCC
            preBack = back = current;
            PrevArgs prevArgs = new PrevArgs();
            prevArgs.current = current;
            prevArgs.start   = start;
            prevArgs.src     = source;
            prevArgs.c1      = c1;
            prevArgs.c2      = c2;

            // get the prevCC
            prevCC = getPrevCC(prevArgs);
            preBack = prevArgs.current;

            if (cc < prevCC) {
                // this will be the last code point, so keep its cc
                trailCC = prevCC;
                back = preBack;
                while (start < preBack) {
                    prevCC = getPrevCC(prevArgs);
                    preBack = prevArgs.current;
                    if (cc >= prevCC) {
                        break;
                    }
                    back = preBack;
                }

                // this is where we are right now with all these indices:
                // [start]..[pPreBack]  0..? code points that we can ignore
                // [pPreBack]..[pBack]  0..1 code points with prevCC<=cc
                // [pBack]..[current]   0..n code points with >cc, move up to insert (c, c2)
                // [current]..[p]       1 code point (c, c2) with cc

                // move the code units in between up
                r = p;
                do {
                    source[--r] = source[--current];
                } while (back != current);
            }
        }

        // insert (c1, c2)
        source[current] = c1;
        if (c2 != 0) {
            source[(current + 1)] = c2;
        }

        // we know the cc of the last code point
        return trailCC;
    }
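    /*
     * Worked example (hypothetical data) of the bubble-insert above:
     *
     *     char[] buf = { 'a', '\u0301', '\u0316' };  // 'a' (ccc 0), ACUTE (ccc 230), GRAVE BELOW (ccc 220)
     *     // buf[0..2) is already ordered; insert the code point sitting at
     *     // buf[2] (U+0316, ccc 220) into it, so current=2 and p=3:
     *     int trailCC = insertOrdered(buf, 0, 2, 3, '\u0316', (char)0, 220);
     *     // Since 220 < 230, U+0316 is bubbled in front of U+0301:
     *     // buf is now { 'a', '\u0316', '\u0301' } and trailCC == 230,
     *     // the combining class of the code point that ends up last.
     */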
    /**
     * merge two UTF-16 string parts together
     * to canonically order (order by combining classes) their concatenation
     *
     * the two strings may already be adjacent, so that the merging is done
     * in-place; if the two strings are not adjacent, then the buffer holding
     * the first one must be large enough
     * the second string may or may not be ordered in itself
     *
     * before: [start]..[current] is already ordered, and
     *         [next]..[limit]    may be ordered in itself, but
     *                            is not in relation to [start..current[
     * after:  [start..current+(limit-next)[ is ordered
     *
     * the algorithm is a simple bubble-sort that takes the characters from
     * src[next++] and inserts them in correct combining class order into the
     * preceding part of the string
     *
     * since this function is called much less often than the single-code point
     * insertOrdered(), it just uses that for easier maintenance
     *
     * @return the trailing combining class
     */
    private static int /*unsigned byte*/ mergeOrdered(char[] source, int start,
                                                      int current, char[] data,
                                                      int next, int limit) {
        int r;
        int /*unsigned byte*/ cc, trailCC = 0;
        boolean adjacent;

        adjacent = current == next;
        NextCCArgs ncArgs = new NextCCArgs();
        ncArgs.source = data;
        ncArgs.next   = next;
        ncArgs.limit  = limit;

        if (start != current) {
            while (ncArgs.next < ncArgs.limit) {
                cc = getNextCC(ncArgs);
                if (cc == 0) {
                    // does not bubble back
                    trailCC = 0;
                    if (adjacent) {
                        current = ncArgs.next;
                    } else {
                        data[current++] = ncArgs.c1;
                        if (ncArgs.c2 != 0) {
                            data[current++] = ncArgs.c2;
                        }
                    }
                    break;
                } else {
                    r = current + (ncArgs.c2 == 0 ? 1 : 2);
                    trailCC = insertOrdered(source, start, current, r,
                                            ncArgs.c1, ncArgs.c2, cc);
                    current = r;
                }
            }
        }

        if (ncArgs.next == ncArgs.limit) {
            // we know the cc of the last code point
            return trailCC;
        } else {
            if (!adjacent) {
                // copy the second string part
                do {
                    source[current++] = data[ncArgs.next++];
                } while (ncArgs.next != ncArgs.limit);
                ncArgs.limit = current;
            }
            PrevArgs prevArgs = new PrevArgs();
            prevArgs.src     = data;
            prevArgs.start   = start;
            prevArgs.current = ncArgs.limit;
            return getPrevCC(prevArgs);
        }
    }

    private static final class PrevArgs {
        char[] src;
        int start;
        int current;
        char c1;
        char c2;
    }

    private static final class NextCCArgs {
        char[] source;
        int next;
        int limit;
        char c1;
        char c2;
    }

    private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
        args.c1 = args.source[args.next++];
        args.c2 = 0;

        if (UTF16.isTrailSurrogate(args.c1)) {
            /* unpaired second surrogate */
            return 0;
        } else if (!UTF16.isLeadSurrogate(args.c1)) {
            return UCharacter.getCombiningClass(args.c1);
        } else if (args.next != args.limit &&
                UTF16.isTrailSurrogate(args.c2 = args.source[args.next])) {
            ++args.next;
            return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
        } else {
            /* unpaired first surrogate */
            args.c2 = 0;
            return 0;
        }
    }

    private static int /*unsigned*/ getPrevCC(PrevArgs args) {
        args.c1 = args.src[--args.current];
        args.c2 = 0;

        if (args.c1 < MIN_CCC_LCCC_CP) {
            return 0;
        } else if (UTF16.isLeadSurrogate(args.c1)) {
            /* unpaired first surrogate */
            return 0;
        } else if (!UTF16.isTrailSurrogate(args.c1)) {
            return UCharacter.getCombiningClass(args.c1);
        } else if (args.current != args.start &&
                UTF16.isLeadSurrogate(args.c2 = args.src[args.current - 1])) {
            --args.current;
            return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
        } else {
            /* unpaired second surrogate */
            args.c2 = 0;
            return 0;
        }
    }

    private int getPreviousTrailCC(CharSequence s, int start, int p) {
        if (start == p) {
            return 0;
        }
        return getFCD16(Character.codePointBefore(s, p));
    }

    private VersionInfo dataVersion;

    // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
    private int minDecompNoCP;
    private int minCompNoMaybeCP;
    private int minLcccCP;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    private int minYesNo;
    private int minYesNoMappingsOnly;
    private int minNoNo;
    private int minNoNoCompBoundaryBefore;
    private int minNoNoCompNoMaybeCC;
    private int minNoNoEmpty;
    private int limitNoNo;
    private int centerNoNoDelta;
    private int minMaybeYes;

    private Trie2_16 normTrie;
    private String maybeYesCompositions;
    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    private byte[] smallFCD;   // [0x100] one bit per 32 BMP code points, set if any FCD!=0
}
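/*
 * Worked example (hypothetical data) for mergeOrdered() above: merging a
 * non-adjacent second string into a buffer that already holds the first,
 * canonically ordered part and is large enough to receive the result.
 *
 *     char[] buf  = { 'a', '\u0301', 0, 0 };  // "a" + U+0301 (ccc 230), ordered, with room to grow
 *     char[] part = { '\u0316' };             // U+0316 (ccc 220)
 *     int trailCC = mergeOrdered(buf, 0, 2, part, 0, 1);
 *     // U+0316 is bubble-inserted (via insertOrdered()) before U+0301:
 *     // buf is now { 'a', '\u0316', '\u0301', 0 } and trailCC == 230.
 */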