/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.serializers;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
/**
 * Text serializer bound to the UTF-8 charset.
 *
 * The charset is handed to the superclass constructor; this class contributes
 * {@link #validate(ByteBuffer)}, which checks the raw bytes with the nested
 * {@link UTF8Validator} state machine.
 */
public class UTF8Serializer extends AbstractTextSerializer
{
    /** Shared instance; the constructor is private, so this is the only one created here. */
    public static final UTF8Serializer instance = new UTF8Serializer();

    private UTF8Serializer()
    {
        super(StandardCharsets.UTF_8);
    }

    /**
     * Verifies that {@code bytes} holds a well-formed UTF-8 byte sequence.
     *
     * @param bytes the encoded text to check; its position and limit are not modified
     * @throws MarshalException if {@link UTF8Validator#validate(ByteBuffer)} rejects the
     *                          buffer (this includes a {@code null} buffer)
     */
    public void validate(ByteBuffer bytes) throws MarshalException
    {
        if (!UTF8Validator.validate(bytes))
            throw new MarshalException("String didn't validate.");
    }

    /**
     * Byte-at-a-time UTF-8 well-formedness checker.
     *
     * Each {@link State} names what the next byte must look like. Two deliberate
     * leniencies are retained: the two-byte form {@code 0xc0 0x80} (modified UTF-8 NUL)
     * is accepted, and three-byte sequences that encode surrogate code points
     * ({@code 0xed 0xa0-0xbf ...}) are not rejected.
     */
    static class UTF8Validator
    {
        enum State
        {
            /** At a character boundary; next byte is ASCII or a lead byte. */
            START,
            /** After 0xc2-0xdf; expecting one continuation byte 0x80-0xbf. */
            TWO,
            /** After 0xc0; expecting exactly 0x80 (modified UTF-8 NUL). */
            TWO_80,
            /** After 0xe0; expecting 0xa0-0xbf, then one continuation byte. */
            THREE_a0bf,
            /** Expecting one final continuation byte 0x80-0xbf. */
            THREE_80bf_1,
            /** Expecting two continuation bytes 0x80-0xbf. */
            THREE_80bf_2,
            /** After 0xf0; expecting 0x90-0xbf, then two continuation bytes. */
            FOUR_90bf,
            /** After 0xf1-0xf3; expecting three continuation bytes 0x80-0xbf. */
            FOUR_80bf_3,
            /** After 0xf4; expecting 0x80-0x8f (cap at U+10FFFF), then two continuation bytes. */
            FOUR_808f,
        }

        /**
         * Runs the state machine over the remaining bytes of {@code buf}.
         *
         * Since nothing is converted to Java strings here, no surrogate handling is needed.
         * The scan operates on a slice, so the caller's buffer position is left untouched.
         *
         * @param buf the bytes to check, may be {@code null}
         * @return {@code true} if every byte was consumed and the input ended on a character
         *         boundary; {@code false} for {@code null}, malformed or truncated input
         */
        static boolean validate(ByteBuffer buf)
        {
            if (buf == null)
                return false;

            buf = buf.slice();
            State state = State.START;
            while (buf.hasRemaining())
            {
                // get() yields a signed byte: bytes 0x80-0xff arrive as negative ints. The masks
                // below only look at the low eight bits, and the arithmetic shifts in START rely
                // on the sign extension.
                int b = buf.get();
                switch (state)
                {
                    case START:
                        if (b >= 0)
                        {
                            // ascii, state stays start.
                        }
                        else if ((b >> 5) == -2)
                        {
                            // first byte of a 2-byte char: 0xc0-0xdf reach this branch.
                            if (b == (byte) 0xc0)
                                // special case: modified utf8 null is 0xc080.
                                state = State.TWO_80;
                            else if ((b & 0x1e) == 0)
                                // 0xc1 can only start an overlong encoding.
                                return false;
                            else
                                state = State.TWO;
                        }
                        else if ((b >> 4) == -2)
                        {
                            // 3 bytes: 0xe0,0xa0-0xbf,0x80-0xbf or 0xe1-0xef,0x80-0xbf,0x80-0xbf.
                            state = (b == (byte) 0xe0) ? State.THREE_a0bf : State.THREE_80bf_2;
                        }
                        else if ((b >> 3) == -2)
                        {
                            // 4 bytes: lead byte is in 0xf0-0xf7 here, but only 0xf0-0xf4 are legal.
                            if (b == (byte) 0xf0)
                                // 0xf0, 0x90-0xbf, 0x80-0xbf, 0x80-0xbf
                                state = State.FOUR_90bf;
                            else if (b == (byte) 0xf4)
                                // 0xf4, 0x80-0x8f, 0x80-0xbf, 0x80-0xbf
                                state = State.FOUR_808f;
                            else if ((b & 0x07) > 4)
                                // 0xf5-0xf7 would encode values above U+10FFFF.
                                return false;
                            else
                                // 0xf1-0xf3, 0x80-0xbf, 0x80-0xbf, 0x80-0xbf
                                state = State.FOUR_80bf_3;
                        }
                        else
                        {
                            return false; // stray continuation byte or 0xf8-0xff.
                        }
                        break;
                    case TWO:
                        // second byte of 2-byte char, 0x80-0xbf
                        if ((b & 0xc0) != 0x80)
                            return false;
                        state = State.START;
                        break;
                    case TWO_80:
                        if (b != (byte) 0x80)
                            return false;
                        state = State.START;
                        break;
                    case THREE_a0bf:
                        // must be exactly 0xa0-0xbf: 0x80-0x9f would be overlong, and anything
                        // else is not a continuation byte at all.
                        if ((b & 0xe0) != 0xa0)
                            return false;
                        state = State.THREE_80bf_1;
                        break;
                    case THREE_80bf_1:
                        // expecting 0x80-0xbf
                        if ((b & 0xc0) != 0x80)
                            return false;
                        state = State.START;
                        break;
                    case THREE_80bf_2:
                        // expecting 0x80-0xbf and then another of the same.
                        if ((b & 0xc0) != 0x80)
                            return false;
                        state = State.THREE_80bf_1;
                        break;
                    case FOUR_90bf:
                        // expecting 0x90-0xbf: a continuation byte whose bits 4-5 are not both
                        // zero (0x80-0x8f would be overlong). Then 80-bf,80-bf like a 3-byte tail.
                        if ((b & 0xc0) != 0x80 || (b & 0x30) == 0)
                            return false;
                        state = State.THREE_80bf_2;
                        break;
                    case FOUR_808f:
                        // expecting 0x80-0x8f so the code point does not exceed U+10FFFF.
                        if ((b & 0xf0) != 0x80)
                            return false;
                        state = State.THREE_80bf_2;
                        break;
                    case FOUR_80bf_3:
                        // expecting 0x80-0xbf 3 times. degenerates to THREE_80bf_2.
                        if ((b & 0xc0) != 0x80)
                            return false;
                        state = State.THREE_80bf_2;
                        break;
                    default:
                        return false; // invalid state.
                }
            }
            // if state != start, we've got underflow. that's an error.
            return state == State.START;
        }
    }
}