/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro;

import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

Collection of static methods for generating the canonical form of schemas (see toParsingForm) -- and fingerprints of canonical forms (fingerprint).
/** * Collection of static methods for generating the canonical form of schemas * (see {@link #toParsingForm}) -- and fingerprints of canonical forms * ({@link #fingerprint}). */
public class SchemaNormalization { private SchemaNormalization() { }
Returns "Parsing Canonical Form" of a schema as defined by Avro spec.
/** * Returns "Parsing Canonical Form" of a schema as defined by Avro spec. */
public static String toParsingForm(Schema s) { try { Map<String, String> env = new HashMap<>(); return build(env, s, new StringBuilder()).toString(); } catch (IOException e) { // Shouldn't happen, b/c StringBuilder can't throw IOException throw new RuntimeException(e); } }
Returns a fingerprint of a string of bytes. This string is presumed to contain a canonical form of a schema. The algorithm used to compute the fingerprint is selected by the argument fpName. If fpName equals the string "CRC-64-AVRO", then the result of fingerprint64 is returned in little-endian format. Otherwise, fpName is used as an algorithm name for MessageDigest.getInstance(String), which will throw NoSuchAlgorithmException if it doesn't recognize the name.

Recommended Avro practice dictates that "CRC-64-AVRO" is used for 64-bit fingerprints, "MD5" is used for 128-bit fingerprints, and "SHA-256" is used for 256-bit fingerprints.

/** * Returns a fingerprint of a string of bytes. This string is presumed to * contain a canonical form of a schema. The algorithm used to compute the * fingerprint is selected by the argument <i>fpName</i>. If <i>fpName</i> * equals the string <code>"CRC-64-AVRO"</code>, then the result of * {@link #fingerprint64} is returned in little-endian format. Otherwise, * <i>fpName</i> is used as an algorithm name for * {@link MessageDigest#getInstance(String)}, which will throw * <code>NoSuchAlgorithmException</code> if it doesn't recognize the name. * <p> * Recommended Avro practice dictates that <code>"CRC-64-AVRO"</code> is used * for 64-bit fingerprints, <code>"MD5"</code> is used for 128-bit fingerprints, * and <code>"SHA-256"</code> is used for 256-bit fingerprints. */
public static byte[] fingerprint(String fpName, byte[] data) throws NoSuchAlgorithmException { if (fpName.equals("CRC-64-AVRO")) { long fp = fingerprint64(data); byte[] result = new byte[8]; for (int i = 0; i < 8; i++) { result[i] = (byte) fp; fp >>= 8; } return result; } MessageDigest md = MessageDigest.getInstance(fpName); return md.digest(data); }
Returns the 64-bit Rabin Fingerprint (as recommended in the Avro spec) of a byte string.
/** * Returns the 64-bit Rabin Fingerprint (as recommended in the Avro spec) of a * byte string. */
public static long fingerprint64(byte[] data) { long result = EMPTY64; for (byte b : data) result = (result >>> 8) ^ FP64.FP_TABLE[(int) (result ^ b) & 0xff]; return result; }
Returns fingerprint applied to the parsing canonical form of the supplied schema.
/** * Returns {@link #fingerprint} applied to the parsing canonical form of the * supplied schema. */
public static byte[] parsingFingerprint(String fpName, Schema s) throws NoSuchAlgorithmException { return fingerprint(fpName, toParsingForm(s).getBytes(StandardCharsets.UTF_8)); }
Returns fingerprint64 applied to the parsing canonical form of the supplied schema.
/** * Returns {@link #fingerprint64} applied to the parsing canonical form of the * supplied schema. */
public static long parsingFingerprint64(Schema s) { return fingerprint64(toParsingForm(s).getBytes(StandardCharsets.UTF_8)); } private static Appendable build(Map<String, String> env, Schema s, Appendable o) throws IOException { boolean firstTime = true; Schema.Type st = s.getType(); switch (st) { default: // boolean, bytes, double, float, int, long, null, string return o.append('"').append(st.getName()).append('"'); case UNION: o.append('['); for (Schema b : s.getTypes()) { if (!firstTime) o.append(','); else firstTime = false; build(env, b, o); } return o.append(']'); case ARRAY: case MAP: o.append("{\"type\":\"").append(st.getName()).append("\""); if (st == Schema.Type.ARRAY) build(env, s.getElementType(), o.append(",\"items\":")); else build(env, s.getValueType(), o.append(",\"values\":")); return o.append("}"); case ENUM: case FIXED: case RECORD: String name = s.getFullName(); if (env.get(name) != null) return o.append(env.get(name)); String qname = "\"" + name + "\""; env.put(name, qname); o.append("{\"name\":").append(qname); o.append(",\"type\":\"").append(st.getName()).append("\""); if (st == Schema.Type.ENUM) { o.append(",\"symbols\":["); for (String enumSymbol : s.getEnumSymbols()) { if (!firstTime) o.append(','); else firstTime = false; o.append('"').append(enumSymbol).append('"'); } o.append("]"); } else if (st == Schema.Type.FIXED) { o.append(",\"size\":").append(Integer.toString(s.getFixedSize())); } else { // st == Schema.Type.RECORD o.append(",\"fields\":["); for (Schema.Field f : s.getFields()) { if (!firstTime) o.append(','); else firstTime = false; o.append("{\"name\":\"").append(f.name()).append("\""); build(env, f.schema(), o.append(",\"type\":")).append("}"); } o.append("]"); } return o.append("}"); } } final static long EMPTY64 = 0xc15d213aa4d7a795L; /* An inner class ensures that FP_TABLE initialized only when needed. */ private static class FP64 { private static final long[] FP_TABLE = new long[256]; static { for (int i = 0; i < 256; i++) { long fp = i; for (int j = 0; j < 8; j++) { long mask = -(fp & 1L); fp = (fp >>> 1) ^ (EMPTY64 & mask); } FP_TABLE[i] = fp; } } } }