package org.apache.lucene.util.automaton;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
public final class UTF32ToUTF8 {
private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};
static int[] MASKS = new int[32];
static {
int v = 2;
for(int i=0;i<32;i++) {
MASKS[i] = v-1;
v *= 2;
}
}
private static class UTF8Byte {
int value;
byte bits;
}
private static class UTF8Sequence {
private final UTF8Byte[] bytes;
private int len;
public UTF8Sequence() {
bytes = new UTF8Byte[4];
for(int i=0;i<4;i++) {
bytes[i] = new UTF8Byte();
}
}
public int byteAt(int idx) {
return bytes[idx].value;
}
public int numBits(int idx) {
return bytes[idx].bits;
}
private void set(int code) {
if (code < 128) {
bytes[0].value = code;
bytes[0].bits = 7;
len = 1;
} else if (code < 2048) {
bytes[0].value = (6 << 5) | (code >> 6);
bytes[0].bits = 5;
setRest(code, 1);
len = 2;
} else if (code < 65536) {
bytes[0].value = (14 << 4) | (code >> 12);
bytes[0].bits = 4;
setRest(code, 2);
len = 3;
} else {
bytes[0].value = (30 << 3) | (code >> 18);
bytes[0].bits = 3;
setRest(code, 3);
len = 4;
}
}
private void setRest(int code, int numBytes) {
for(int i=0;i<numBytes;i++) {
bytes[numBytes-i].value = 128 | (code & MASKS[5]);
bytes[numBytes-i].bits = 6;
code = code >> 6;
}
}
@Override
public String toString() {
StringBuilder b = new StringBuilder();
for(int i=0;i<len;i++) {
if (i > 0) {
b.append(' ');
}
b.append(Integer.toBinaryString(bytes[i].value));
}
return b.toString();
}
}
public UTF32ToUTF8() {
}
private final UTF8Sequence startUTF8 = new UTF8Sequence();
private final UTF8Sequence endUTF8 = new UTF8Sequence();
private final UTF8Sequence tmpUTF8a = new UTF8Sequence();
private final UTF8Sequence tmpUTF8b = new UTF8Sequence();
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
startUTF8.set(startCodePoint);
endUTF8.set(endCodePoint);
build(start, end, startUTF8, endUTF8, 0);
}
private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) {
if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) {
if (upto == startUTF8.len-1 && upto == endUTF8.len-1) {
utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
return;
} else {
assert startUTF8.len > upto+1;
assert endUTF8.len > upto+1;
int n = utf8.createState();
utf8.addTransition(start, n, startUTF8.byteAt(upto));
build(n, end, startUTF8, endUTF8, 1+upto);
}
} else if (startUTF8.len == endUTF8.len) {
if (upto == startUTF8.len-1) {
utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
} else {
start(start, end, startUTF8, upto, false);
if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1);
}
end(start, end, endUTF8, upto, false);
}
} else {
start(start, end, startUTF8, upto, true);
int byteCount = 1+startUTF8.len-upto;
final int limit = endUTF8.len-upto;
while (byteCount < limit) {
tmpUTF8a.set(startCodes[byteCount-1]);
tmpUTF8b.set(endCodes[byteCount-1]);
all(start, end,
tmpUTF8a.byteAt(0),
tmpUTF8b.byteAt(0),
tmpUTF8a.len - 1);
byteCount++;
}
end(start, end, endUTF8, upto, true);
}
}
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
if (upto == startUTF8.len-1) {
utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]);
} else {
int n = utf8.createState();
utf8.addTransition(start, n, startUTF8.byteAt(upto));
start(n, end, startUTF8, 1+upto, true);
int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1];
if (doAll && startUTF8.byteAt(upto) != endCode) {
all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1);
}
}
}
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
if (upto == endUTF8.len-1) {
utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto));
} else {
final int startCode;
if (endUTF8.numBits(upto) == 5) {
startCode = 194;
} else {
startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]);
}
if (doAll && endUTF8.byteAt(upto) != startCode) {
all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1);
}
int n = utf8.createState();
utf8.addTransition(start, n, endUTF8.byteAt(upto));
end(n, end, endUTF8, 1+upto, true);
}
}
private void all(int start, int end, int startCode, int endCode, int left) {
if (left == 0) {
utf8.addTransition(start, end, startCode, endCode);
} else {
int lastN = utf8.createState();
utf8.addTransition(start, lastN, startCode, endCode);
while (left > 1) {
int n = utf8.createState();
utf8.addTransition(lastN, n, 128, 191);
left--;
lastN = n;
}
utf8.addTransition(lastN, end, 128, 191);
}
}
Automaton.Builder utf8;
public Automaton convert(Automaton utf32) {
if (utf32.getNumStates() == 0) {
return utf32;
}
int[] map = new int[utf32.getNumStates()];
Arrays.fill(map, -1);
List<Integer> pending = new ArrayList<>();
int utf32State = 0;
pending.add(utf32State);
utf8 = new Automaton.Builder();
int utf8State = utf8.createState();
utf8.setAccept(utf8State, utf32.isAccept(utf32State));
map[utf32State] = utf8State;
Transition scratch = new Transition();
while (pending.size() != 0) {
utf32State = pending.remove(pending.size()-1);
utf8State = map[utf32State];
assert utf8State != -1;
int numTransitions = utf32.getNumTransitions(utf32State);
utf32.initTransition(utf32State, scratch);
for(int i=0;i<numTransitions;i++) {
utf32.getNextTransition(scratch);
int destUTF32 = scratch.dest;
int destUTF8 = map[destUTF32];
if (destUTF8 == -1) {
destUTF8 = utf8.createState();
utf8.setAccept(destUTF8, utf32.isAccept(destUTF32));
map[destUTF32] = destUTF8;
pending.add(destUTF32);
}
convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max);
}
}
return utf8.finish();
}
}