org.apache.lucene/lucene-core/8.2.0 : org/apache/lucene/util/automaton/UTF32ToUTF8.java

UTF32ToUTF8
http://lucene.apache.org/lucene-parent/lucene-core: Apache Lucene Java Core (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.automaton;


import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;

// TODO
//   - do we really need the .bits...?  if not we can make util in UnicodeUtil to convert 1 char into a BytesRef

Converts UTF-32 automata to the equivalent UTF-8 representation.
@lucene.internal /** 
 * Converts UTF-32 automata to the equivalent UTF-8 representation.
 * @lucene.internal 
 */
public final class UTF32ToUTF8 {

  // Unicode boundaries for UTF8 bytes 1,2,3,4
  private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
  private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};

  static int[] MASKS = new int[32];
  static {
    int v = 2;
    for(int i=0;i<32;i++) {
      MASKS[i] = v-1;
      v *= 2;
    }
  }

  // Represents one of the N utf8 bytes that (in sequence)
  // define a code point.  value is the byte value; bits is
  // how many bits are "used" by utf8 at that byte
  private static class UTF8Byte {
    int value;                                    // TODO: change to byte
    byte bits;
  }

  // Holds a single code point, as a sequence of 1-4 utf8 bytes:
  // TODO: maybe move to UnicodeUtil?
  private static class UTF8Sequence {
    private final UTF8Byte[] bytes;
    private int len;

    public UTF8Sequence() {
      bytes = new UTF8Byte[4];
      for(int i=0;i<4;i++) {
        bytes[i] = new UTF8Byte();
      }
    }

    public int byteAt(int idx) {
      return bytes[idx].value;
    }

    public int numBits(int idx) {
      return bytes[idx].bits;
    }

    private void set(int code) {
      if (code < 128) {
        // 0xxxxxxx
        bytes[0].value = code;
        bytes[0].bits = 7;
        len = 1;
      } else if (code < 2048) {
        // 110yyyxx 10xxxxxx
        bytes[0].value = (6 << 5) | (code >> 6);
        bytes[0].bits = 5;
        setRest(code, 1);
        len = 2;
      } else if (code < 65536) {
        // 1110yyyy 10yyyyxx 10xxxxxx
        bytes[0].value = (14 << 4) | (code >> 12);
        bytes[0].bits = 4;
        setRest(code, 2);
        len = 3;
      } else {
        // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
        bytes[0].value = (30 << 3) | (code >> 18);
        bytes[0].bits = 3;
        setRest(code, 3);
        len = 4;
      }
    }

    private void setRest(int code, int numBytes) {
      for(int i=0;i<numBytes;i++) {
        bytes[numBytes-i].value = 128 | (code & MASKS[5]);
        bytes[numBytes-i].bits = 6;
        code = code >> 6;
      }
    }

    @Override
    public String toString() {
      StringBuilder b = new StringBuilder();
      for(int i=0;i<len;i++) {
        if (i > 0) {
          b.append(' ');
        }
        b.append(Integer.toBinaryString(bytes[i].value));
      }
      return b.toString();
    }
  }

  Sole constructor. /** Sole constructor. */
  public UTF32ToUTF8() {
  }

  private final UTF8Sequence startUTF8 = new UTF8Sequence();
  private final UTF8Sequence endUTF8 = new UTF8Sequence();

  private final UTF8Sequence tmpUTF8a = new UTF8Sequence();
  private final UTF8Sequence tmpUTF8b = new UTF8Sequence();

  // Builds necessary utf8 edges between start & end
  void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
    startUTF8.set(startCodePoint);
    endUTF8.set(endCodePoint);
    build(start, end, startUTF8, endUTF8, 0);
  }

  private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) {

    // Break into start, middle, end:
    if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) {
      // Degen case: lead with the same byte:
      if (upto == startUTF8.len-1 && upto == endUTF8.len-1) {
        // Super degen: just single edge, one UTF8 byte:
        utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
        return;
      } else {
        assert startUTF8.len > upto+1;
        assert endUTF8.len > upto+1;
        int n = utf8.createState();

        // Single value leading edge
        utf8.addTransition(start, n, startUTF8.byteAt(upto));
        //start.addTransition(new Transition(startUTF8.byteAt(upto), n));  // type=single

        // Recurse for the rest
        build(n, end, startUTF8, endUTF8, 1+upto);
      }
    } else if (startUTF8.len == endUTF8.len) {
      if (upto == startUTF8.len-1) {
        //start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end));        // type=startend
        utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
      } else {
        start(start, end, startUTF8, upto, false);
        if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
          // There is a middle
          all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1);
        }
        end(start, end, endUTF8, upto, false);
      }
    } else {

      // start
      start(start, end, startUTF8, upto, true);

      // possibly middle, spanning multiple num bytes
      int byteCount = 1+startUTF8.len-upto;
      final int limit = endUTF8.len-upto;
      while (byteCount < limit) {
        // wasteful: we only need first byte, and, we should
        // statically encode this first byte:
        tmpUTF8a.set(startCodes[byteCount-1]);
        tmpUTF8b.set(endCodes[byteCount-1]);
        all(start, end,
            tmpUTF8a.byteAt(0),
            tmpUTF8b.byteAt(0),
            tmpUTF8a.len - 1);
        byteCount++;
      }

      // end
      end(start, end, endUTF8, upto, true);
    }
  }

  private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
    if (upto == startUTF8.len-1) {
      // Done recursing
      utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start
      //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end));  // type=start
    } else {
      int n = utf8.createState();
      utf8.addTransition(start, n, startUTF8.byteAt(upto));
      //start.addTransition(new Transition(startUTF8.byteAt(upto), n));  // type=start
      start(n, end, startUTF8, 1+upto, true);
      int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1];
      if (doAll && startUTF8.byteAt(upto) != endCode) {
        all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1);
      }
    }
  }

  private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
    if (upto == endUTF8.len-1) {
      // Done recursing
      //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end));   // type=end
      utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto));
    } else {
      final int startCode;
      if (endUTF8.numBits(upto) == 5) {
        // special case -- avoid created unused edges (endUTF8
        // doesn't accept certain byte sequences) -- there
        // are other cases we could optimize too:
        startCode = 194;
      } else {
        startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]);
      }
      if (doAll && endUTF8.byteAt(upto) != startCode) {
        all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1);
      }
      int n = utf8.createState();
      //start.addTransition(new Transition(endUTF8.byteAt(upto), n));  // type=end
      utf8.addTransition(start, n, endUTF8.byteAt(upto));
      end(n, end, endUTF8, 1+upto, true);
    }
  }

  private void all(int start, int end, int startCode, int endCode, int left) {
    if (left == 0) {
      //start.addTransition(new Transition(startCode, endCode, end));  // type=all
      utf8.addTransition(start, end, startCode, endCode);
    } else {
      int lastN = utf8.createState();
      //start.addTransition(new Transition(startCode, endCode, lastN));  // type=all
      utf8.addTransition(start, lastN, startCode, endCode);
      while (left > 1) {
        int n = utf8.createState();
        //lastN.addTransition(new Transition(128, 191, n));  // type=all*
        utf8.addTransition(lastN, n, 128, 191); // type=all*
        left--;
        lastN = n;
      }
      //lastN.addTransition(new Transition(128, 191, end)); // type = all*
      utf8.addTransition(lastN, end, 128, 191); // type = all*
    }
  }

  Automaton.Builder utf8;

  Converts an incoming utf32 automaton to an equivalent
 utf8 one.  The incoming automaton need not be
 deterministic.  Note that the returned automaton will
 not in general be deterministic, so you must
 determinize it if that's needed. /** Converts an incoming utf32 automaton to an equivalent
   *  utf8 one.  The incoming automaton need not be
   *  deterministic.  Note that the returned automaton will
   *  not in general be deterministic, so you must
   *  determinize it if that's needed. */
  public Automaton convert(Automaton utf32) {
    if (utf32.getNumStates() == 0) {
      return utf32;
    }

    int[] map = new int[utf32.getNumStates()];
    Arrays.fill(map, -1);

    List<Integer> pending = new ArrayList<>();
    int utf32State = 0;
    pending.add(utf32State);
    utf8 = new Automaton.Builder();
       
    int utf8State = utf8.createState();

    utf8.setAccept(utf8State, utf32.isAccept(utf32State));

    map[utf32State] = utf8State;
    
    Transition scratch = new Transition();
    
    while (pending.size() != 0) {
      utf32State = pending.remove(pending.size()-1);
      utf8State = map[utf32State];
      assert utf8State != -1;

      int numTransitions = utf32.getNumTransitions(utf32State);
      utf32.initTransition(utf32State, scratch);
      for(int i=0;i<numTransitions;i++) {
        utf32.getNextTransition(scratch);
        int destUTF32 = scratch.dest;
        int destUTF8 = map[destUTF32];
        if (destUTF8 == -1) {
          destUTF8 = utf8.createState();
          utf8.setAccept(destUTF8, utf32.isAccept(destUTF32));
          map[destUTF32] = destUTF8;
          pending.add(destUTF32);
        }

        // Writes new transitions into pendingTransitions:
        convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max);
      }
    }

    return utf8.finish();
  }
}
/

org.apache.lucene/ lucene-core/ 8.2.0/ org/apache/lucene/util/automaton/UTF32ToUTF8.java