/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.java.util.jar.pack;

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;

Histogram derived from an integer array of events (int[]).
Author:John Rose
/** * Histogram derived from an integer array of events (int[]). * @author John Rose */
final class Histogram { // Compact histogram representation: 4 bytes per distinct value, // plus 5 words per distinct count. protected final int[][] matrix; // multi-row matrix {{counti,valueij...}} protected final int totalWeight; // sum of all counts // These are created eagerly also, since that saves work. // They cost another 8 bytes per distinct value. protected final int[] values; // unique values, sorted by value protected final int[] counts; // counts, same order as values private static final long LOW32 = (long)-1 >>> 32;
Build a histogram given a sequence of values. To save work, the input should be sorted, but need not be.
/** Build a histogram given a sequence of values. * To save work, the input should be sorted, but need not be. */
public Histogram(int[] valueSequence) { long[] hist2col = computeHistogram2Col(maybeSort(valueSequence)); int[][] table = makeTable(hist2col); values = table[0]; counts = table[1]; this.matrix = makeMatrix(hist2col); this.totalWeight = valueSequence.length; assert(assertWellFormed(valueSequence)); } public Histogram(int[] valueSequence, int start, int end) { this(sortedSlice(valueSequence, start, end)); }
Build a histogram given a compact matrix of counts and values.
/** Build a histogram given a compact matrix of counts and values. */
public Histogram(int[][] matrix) { // sort the rows matrix = normalizeMatrix(matrix); // clone and sort this.matrix = matrix; int length = 0; int weight = 0; for (int i = 0; i < matrix.length; i++) { int rowLength = matrix[i].length-1; length += rowLength; weight += matrix[i][0] * rowLength; } this.totalWeight = weight; long[] hist2col = new long[length]; int fillp = 0; for (int i = 0; i < matrix.length; i++) { for (int j = 1; j < matrix[i].length; j++) { // sort key is value, so put it in the high 32! hist2col[fillp++] = ((long) matrix[i][j] << 32) | (LOW32 & matrix[i][0]); } } assert(fillp == hist2col.length); Arrays.sort(hist2col); int[][] table = makeTable(hist2col); values = table[1]; //backwards counts = table[0]; //backwards assert(assertWellFormed(null)); }
Histogram of int values, reported compactly as a ragged matrix, indexed by descending frequency rank.

Format of matrix: Each row in the matrix begins with an occurrence count, and continues with all int values that occur at that frequency.

 int[][] matrix = {
   { count1, value11, value12, value13, ...  },
   { count2, value21, value22, ... },
   ...
 }
 
The first column of the matrix { count1, count2, ... } is sorted in descending order, and contains no duplicates. Each row of the matrix (apart from its first element) is sorted in ascending order, and contains no duplicates. That is, each sequence { valuei1, valuei2, ... } is sorted.
/** Histogram of int values, reported compactly as a ragged matrix, * indexed by descending frequency rank. * <p> * Format of matrix: * Each row in the matrix begins with an occurrence count, * and continues with all int values that occur at that frequency. * <pre> * int[][] matrix = { * { count1, value11, value12, value13, ... }, * { count2, value21, value22, ... }, * ... * } * </pre> * The first column of the matrix { count1, count2, ... } * is sorted in descending order, and contains no duplicates. * Each row of the matrix (apart from its first element) * is sorted in ascending order, and contains no duplicates. * That is, each sequence { valuei1, valuei2, ... } is sorted. */
public int[][] getMatrix() { return matrix; } public int getRowCount() { return matrix.length; } public int getRowFrequency(int rn) { return matrix[rn][0]; } public int getRowLength(int rn) { return matrix[rn].length-1; } public int getRowValue(int rn, int vn) { return matrix[rn][vn+1]; } public int getRowWeight(int rn) { return getRowFrequency(rn) * getRowLength(rn); } public int getTotalWeight() { return totalWeight; } public int getTotalLength() { return values.length; }
Returns an array of all values, sorted.
/** Returns an array of all values, sorted. */
public int[] getAllValues() { return values; }
Returns an array parallel with getValues, with a frequency for each value.
/** Returns an array parallel with {@link #getValues}, * with a frequency for each value. */
public int[] getAllFrequencies() { return counts; } private static double log2 = Math.log(2); public int getFrequency(int value) { int pos = Arrays.binarySearch(values, value); if (pos < 0) return 0; assert(values[pos] == value); return counts[pos]; } public double getBitLength(int value) { double prob = (double) getFrequency(value) / getTotalWeight(); return - Math.log(prob) / log2; } public double getRowBitLength(int rn) { double prob = (double) getRowFrequency(rn) / getTotalWeight(); return - Math.log(prob) / log2; } public interface BitMetric { public double getBitLength(int value); } private final BitMetric bitMetric = new BitMetric() { public double getBitLength(int value) { return Histogram.this.getBitLength(value); } }; public BitMetric getBitMetric() { return bitMetric; }
bit-length is negative entropy: -H(matrix).
/** bit-length is negative entropy: -H(matrix). */
public double getBitLength() { double sum = 0; for (int i = 0; i < matrix.length; i++) { sum += getRowBitLength(i) * getRowWeight(i); } assert(0.1 > Math.abs(sum - getBitLength(bitMetric))); return sum; }
bit-length in to another coding (cross-entropy)
/** bit-length in to another coding (cross-entropy) */
public double getBitLength(BitMetric len) { double sum = 0; for (int i = 0; i < matrix.length; i++) { for (int j = 1; j < matrix[i].length; j++) { sum += matrix[i][0] * len.getBitLength(matrix[i][j]); } } return sum; } private static double round(double x, double scale) { return Math.round(x * scale) / scale; }
Sort rows and columns. Merge adjacent rows with the same key element [0]. Make a fresh copy of all of it.
/** Sort rows and columns. * Merge adjacent rows with the same key element [0]. * Make a fresh copy of all of it. */
public int[][] normalizeMatrix(int[][] matrix) { long[] rowMap = new long[matrix.length]; for (int i = 0; i < matrix.length; i++) { if (matrix[i].length <= 1) continue; int count = matrix[i][0]; if (count <= 0) continue; rowMap[i] = (long) count << 32 | i; } Arrays.sort(rowMap); int[][] newMatrix = new int[matrix.length][]; int prevCount = -1; int fillp1 = 0; int fillp2 = 0; for (int i = 0; ; i++) { int[] row; if (i < matrix.length) { long rowMapEntry = rowMap[rowMap.length-i-1]; if (rowMapEntry == 0) continue; row = matrix[(int)rowMapEntry]; assert(rowMapEntry>>>32 == row[0]); } else { row = new int[]{ -1 }; // close it off } if (row[0] != prevCount && fillp2 > fillp1) { // Close off previous run. int length = 0; for (int p = fillp1; p < fillp2; p++) { int[] row0 = newMatrix[p]; // previously visited row assert(row0[0] == prevCount); length += row0.length-1; } int[] row1 = new int[1+length]; // cloned & consolidated row row1[0] = prevCount; int rfillp = 1; for (int p = fillp1; p < fillp2; p++) { int[] row0 = newMatrix[p]; // previously visited row assert(row0[0] == prevCount); System.arraycopy(row0, 1, row1, rfillp, row0.length-1); rfillp += row0.length-1; } if (!isSorted(row1, 1, true)) { Arrays.sort(row1, 1, row1.length); int jfillp = 2; // Detect and squeeze out duplicates. for (int j = 2; j < row1.length; j++) { if (row1[j] != row1[j-1]) row1[jfillp++] = row1[j]; } if (jfillp < row1.length) { // Reallocate because of lost duplicates. int[] newRow1 = new int[jfillp]; System.arraycopy(row1, 0, newRow1, 0, jfillp); row1 = newRow1; } } newMatrix[fillp1++] = row1; fillp2 = fillp1; } if (i == matrix.length) break; prevCount = row[0]; newMatrix[fillp2++] = row; } assert(fillp1 == fillp2); // no unfinished business // Now drop missing rows. matrix = newMatrix; if (fillp1 < matrix.length) { newMatrix = new int[fillp1][]; System.arraycopy(matrix, 0, newMatrix, 0, fillp1); matrix = newMatrix; } return matrix; } public String[] getRowTitles(String name) { int totalUnique = getTotalLength(); int ltotalWeight = getTotalWeight(); String[] histTitles = new String[matrix.length]; int cumWeight = 0; int cumUnique = 0; for (int i = 0; i < matrix.length; i++) { int count = getRowFrequency(i); int unique = getRowLength(i); int weight = getRowWeight(i); cumWeight += weight; cumUnique += unique; long wpct = ((long)cumWeight * 100 + ltotalWeight/2) / ltotalWeight; long upct = ((long)cumUnique * 100 + totalUnique/2) / totalUnique; double len = getRowBitLength(i); assert(0.1 > Math.abs(len - getBitLength(matrix[i][1]))); histTitles[i] = name+"["+i+"]" +" len="+round(len,10) +" ("+count+"*["+unique+"])" +" ("+cumWeight+":"+wpct+"%)" +" ["+cumUnique+":"+upct+"%]"; } return histTitles; }
Print a report of this histogram.
/** Print a report of this histogram. */
public void print(PrintStream out) { print("hist", out); }
Print a report of this histogram.
/** Print a report of this histogram. */
public void print(String name, PrintStream out) { print(name, getRowTitles(name), out); }
Print a report of this histogram.
/** Print a report of this histogram. */
public void print(String name, String[] histTitles, PrintStream out) { int totalUnique = getTotalLength(); int ltotalWeight = getTotalWeight(); double tlen = getBitLength(); double avgLen = tlen / ltotalWeight; double avg = (double) ltotalWeight / totalUnique; String title = (name +" len="+round(tlen,10) +" avgLen="+round(avgLen,10) +" weight("+ltotalWeight+")" +" unique["+totalUnique+"]" +" avgWeight("+round(avg,100)+")"); if (histTitles == null) { out.println(title); } else { out.println(title+" {"); StringBuffer buf = new StringBuffer(); for (int i = 0; i < matrix.length; i++) { buf.setLength(0); buf.append(" ").append(histTitles[i]).append(" {"); for (int j = 1; j < matrix[i].length; j++) { buf.append(" ").append(matrix[i][j]); } buf.append(" }"); out.println(buf); } out.println("}"); } } /* public static int[][] makeHistogramMatrix(int[] values) { // Make sure they are sorted. values = maybeSort(values); long[] hist2col = computeHistogram2Col(values); int[][] matrix = makeMatrix(hist2col); return matrix; } */ private static int[][] makeMatrix(long[] hist2col) { // Sort by increasing count, then by increasing value. Arrays.sort(hist2col); int[] counts = new int[hist2col.length]; for (int i = 0; i < counts.length; i++) { counts[i] = (int)( hist2col[i] >>> 32 ); } long[] countHist = computeHistogram2Col(counts); int[][] matrix = new int[countHist.length][]; int histp = 0; // cursor into hist2col (increasing count, value) int countp = 0; // cursor into countHist (increasing count) // Do a join between hist2col (resorted) and countHist. for (int i = matrix.length; --i >= 0; ) { long countAndRep = countHist[countp++]; int count = (int) (countAndRep); // what is the value count? int repeat = (int) (countAndRep >>> 32); // # times repeated? int[] row = new int[1+repeat]; row[0] = count; for (int j = 0; j < repeat; j++) { long countAndValue = hist2col[histp++]; assert(countAndValue >>> 32 == count); row[1+j] = (int) countAndValue; } matrix[i] = row; } assert(histp == hist2col.length); return matrix; } private static int[][] makeTable(long[] hist2col) { int[][] table = new int[2][hist2col.length]; // Break apart the entries in hist2col. // table[0] gets values, table[1] gets entries. for (int i = 0; i < hist2col.length; i++) { table[0][i] = (int)( hist2col[i] ); table[1][i] = (int)( hist2col[i] >>> 32 ); } return table; }
Simple two-column histogram. Contains repeated counts. Assumes input is sorted. Does not sort output columns.

Format of result:

 long[] hist = {
   (count1 << 32) | (value1),
   (count2 << 32) | (value2),
   ...
 }
 
In addition, the sequence {valuei...} is guaranteed to be sorted. Note that resorting this using Arrays.sort() will reorder the entries by increasing count.
/** Simple two-column histogram. Contains repeated counts. * Assumes input is sorted. Does not sort output columns. * <p> * Format of result: * <pre> * long[] hist = { * (count1 << 32) | (value1), * (count2 << 32) | (value2), * ... * } * </pre> * In addition, the sequence {valuei...} is guaranteed to be sorted. * Note that resorting this using Arrays.sort() will reorder the * entries by increasing count. */
private static long[] computeHistogram2Col(int[] sortedValues) { switch (sortedValues.length) { case 0: return new long[]{ }; case 1: return new long[]{ ((long)1 << 32) | (LOW32 & sortedValues[0]) }; } long[] hist = null; for (boolean sizeOnly = true; ; sizeOnly = false) { int prevIndex = -1; int prevValue = sortedValues[0] ^ -1; // force a difference int prevCount = 0; for (int i = 0; i <= sortedValues.length; i++) { int thisValue; if (i < sortedValues.length) thisValue = sortedValues[i]; else thisValue = prevValue ^ -1; // force a difference at end if (thisValue == prevValue) { prevCount += 1; } else { // Found a new value. if (!sizeOnly && prevCount != 0) { // Save away previous value. hist[prevIndex] = ((long)prevCount << 32) | (LOW32 & prevValue); } prevValue = thisValue; prevCount = 1; prevIndex += 1; } } if (sizeOnly) { // Finished the sizing pass. Allocate the histogram. hist = new long[prevIndex]; } else { break; // done } } return hist; }
Regroup the histogram, so that it becomes an approximate histogram whose rows are of the given lengths. If matrix rows must be split, the latter parts (larger values) are placed earlier in the new matrix. If matrix rows are joined, they are resorted into ascending order. In the new histogram, the counts are averaged over row entries.
/** Regroup the histogram, so that it becomes an approximate histogram * whose rows are of the given lengths. * If matrix rows must be split, the latter parts (larger values) * are placed earlier in the new matrix. * If matrix rows are joined, they are resorted into ascending order. * In the new histogram, the counts are averaged over row entries. */
private static int[][] regroupHistogram(int[][] matrix, int[] groups) { long oldEntries = 0; for (int i = 0; i < matrix.length; i++) { oldEntries += matrix[i].length-1; } long newEntries = 0; for (int ni = 0; ni < groups.length; ni++) { newEntries += groups[ni]; } if (newEntries > oldEntries) { int newlen = groups.length; long ok = oldEntries; for (int ni = 0; ni < groups.length; ni++) { if (ok < groups[ni]) { int[] newGroups = new int[ni+1]; System.arraycopy(groups, 0, newGroups, 0, ni+1); groups = newGroups; groups[ni] = (int) ok; ok = 0; break; } ok -= groups[ni]; } } else { long excess = oldEntries - newEntries; int[] newGroups = new int[groups.length+1]; System.arraycopy(groups, 0, newGroups, 0, groups.length); newGroups[groups.length] = (int) excess; groups = newGroups; } int[][] newMatrix = new int[groups.length][]; // Fill pointers. int i = 0; // into matrix int jMin = 1; int jMax = matrix[i].length; for (int ni = 0; ni < groups.length; ni++) { int groupLength = groups[ni]; int[] group = new int[1+groupLength]; long groupWeight = 0; // count of all in new group newMatrix[ni] = group; int njFill = 1; while (njFill < group.length) { int len = group.length - njFill; while (jMin == jMax) { jMin = 1; jMax = matrix[++i].length; } if (len > jMax - jMin) len = jMax - jMin; groupWeight += (long) matrix[i][0] * len; System.arraycopy(matrix[i], jMax - len, group, njFill, len); jMax -= len; njFill += len; } Arrays.sort(group, 1, group.length); // compute average count of new group: group[0] = (int) ((groupWeight + groupLength/2) / groupLength); } assert(jMin == jMax); assert(i == matrix.length-1); return newMatrix; } public static Histogram makeByteHistogram(InputStream bytes) throws IOException { byte[] buf = new byte[1<<12]; int[] tally = new int[1<<8]; for (int nr; (nr = bytes.read(buf)) > 0; ) { for (int i = 0; i < nr; i++) { tally[buf[i] & 0xFF] += 1; } } // Build a matrix. int[][] matrix = new int[1<<8][2]; for (int i = 0; i < tally.length; i++) { matrix[i][0] = tally[i]; matrix[i][1] = i; } return new Histogram(matrix); }
Slice and sort the given input array.
/** Slice and sort the given input array. */
private static int[] sortedSlice(int[] valueSequence, int start, int end) { if (start == 0 && end == valueSequence.length && isSorted(valueSequence, 0, false)) { return valueSequence; } else { int[] slice = new int[end-start]; System.arraycopy(valueSequence, start, slice, 0, slice.length); Arrays.sort(slice); return slice; } }
Tell if an array is sorted.
/** Tell if an array is sorted. */
private static boolean isSorted(int[] values, int from, boolean strict) { for (int i = from+1; i < values.length; i++) { if (strict ? !(values[i-1] < values[i]) : !(values[i-1] <= values[i])) { return false; // found witness to disorder } } return true; // no witness => sorted }
Clone and sort the array, if not already sorted.
/** Clone and sort the array, if not already sorted. */
private static int[] maybeSort(int[] values) { if (!isSorted(values, 0, false)) { values = values.clone(); Arrays.sort(values); } return values; } /// Debug stuff follows. private boolean assertWellFormed(int[] valueSequence) { /* // Sanity check. int weight = 0; int vlength = 0; for (int i = 0; i < matrix.length; i++) { int vlengthi = (matrix[i].length-1); int count = matrix[i][0]; assert(vlengthi > 0); // no empty rows assert(count > 0); // no impossible rows vlength += vlengthi; weight += count * vlengthi; } assert(isSorted(values, 0, true)); // make sure the counts all add up assert(totalWeight == weight); assert(vlength == values.length); assert(vlength == counts.length); int weight2 = 0; for (int i = 0; i < counts.length; i++) { weight2 += counts[i]; } assert(weight2 == weight); int[] revcol1 = new int[matrix.length]; //1st matrix colunm for (int i = 0; i < matrix.length; i++) { // spot checking: try a random query on each matrix row assert(matrix[i].length > 1); revcol1[matrix.length-i-1] = matrix[i][0]; assert(isSorted(matrix[i], 1, true)); int rand = (matrix[i].length+1) / 2; int val = matrix[i][rand]; int count = matrix[i][0]; int pos = Arrays.binarySearch(values, val); assert(values[pos] == val); assert(counts[pos] == matrix[i][0]); if (valueSequence != null) { int count2 = 0; for (int j = 0; j < valueSequence.length; j++) { if (valueSequence[j] == val) count2++; } assert(count2 == count); } } assert(isSorted(revcol1, 0, true)); //*/ return true; } /* public static int[] readValuesFrom(InputStream instr) { return readValuesFrom(new InputStreamReader(instr)); } public static int[] readValuesFrom(Reader inrdr) { inrdr = new BufferedReader(inrdr); final StreamTokenizer in = new StreamTokenizer(inrdr); final int TT_NOTHING = -99; in.commentChar('#'); return readValuesFrom(new Iterator() { int token = TT_NOTHING; private int getToken() { if (token == TT_NOTHING) { try { token = in.nextToken(); assert(token != TT_NOTHING); } catch (IOException ee) { throw new RuntimeException(ee); } } return token; } public boolean hasNext() { return getToken() != StreamTokenizer.TT_EOF; } public Object next() { int ntok = getToken(); token = TT_NOTHING; switch (ntok) { case StreamTokenizer.TT_EOF: throw new NoSuchElementException(); case StreamTokenizer.TT_NUMBER: return new Integer((int) in.nval); default: assert(false); return null; } } public void remove() { throw new UnsupportedOperationException(); } }); } public static int[] readValuesFrom(Iterator iter) { return readValuesFrom(iter, 0); } public static int[] readValuesFrom(Iterator iter, int initSize) { int[] na = new int[Math.max(10, initSize)]; int np = 0; while (iter.hasNext()) { Integer val = (Integer) iter.next(); if (np == na.length) { int[] na2 = new int[np*2]; System.arraycopy(na, 0, na2, 0, np); na = na2; } na[np++] = val.intValue(); } if (np != na.length) { int[] na2 = new int[np]; System.arraycopy(na, 0, na2, 0, np); na = na2; } return na; } public static Histogram makeByteHistogram(byte[] bytes) { try { return makeByteHistogram(new ByteArrayInputStream(bytes)); } catch (IOException ee) { throw new RuntimeException(ee); } } public static void main(String[] av) throws IOException { if (av.length > 0 && av[0].equals("-r")) { int[] values = new int[Integer.parseInt(av[1])]; int limit = values.length; if (av.length >= 3) { limit = (int)( limit * Double.parseDouble(av[2]) ); } Random rnd = new Random(); for (int i = 0; i < values.length; i++) { values[i] = rnd.nextInt(limit);; } Histogram rh = new Histogram(values); rh.print("random", System.out); return; } if (av.length > 0 && av[0].equals("-s")) { int[] values = readValuesFrom(System.in); Random rnd = new Random(); for (int i = values.length; --i > 0; ) { int j = rnd.nextInt(i+1); if (j < i) { int tem = values[i]; values[i] = values[j]; values[j] = tem; } } for (int i = 0; i < values.length; i++) System.out.println(values[i]); return; } if (av.length > 0 && av[0].equals("-e")) { // edge cases new Histogram(new int[][] { {1, 11, 111}, {0, 123, 456}, {1, 111, 1111}, {0, 456, 123}, {3}, {}, {3}, {2, 22}, {4} }).print(System.out); return; } if (av.length > 0 && av[0].equals("-b")) { // edge cases Histogram bh = makeByteHistogram(System.in); bh.print("bytes", System.out); return; } boolean regroup = false; if (av.length > 0 && av[0].equals("-g")) { regroup = true; } int[] values = readValuesFrom(System.in); Histogram h = new Histogram(values); if (!regroup) h.print(System.out); if (regroup) { int[] groups = new int[12]; for (int i = 0; i < groups.length; i++) { groups[i] = 1<<i; } int[][] gm = regroupHistogram(h.getMatrix(), groups); Histogram g = new Histogram(gm); System.out.println("h.getBitLength(g) = "+ h.getBitLength(g.getBitMetric())); System.out.println("g.getBitLength(h) = "+ g.getBitLength(h.getBitMetric())); g.print("regrouped", System.out); } } //*/ }