Minimal perfect hash tool

f986f6ae · Thomas Mueller · 2d649ea2 · f986f6ae · f986f6ae · f986f6ae
--- a/h2/src/test/org/h2/test/unit/TestIntPerfectHash.java
+++ b/h2/src/test/org/h2/test/unit/TestIntPerfectHash.java
+/*
+ * Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
+ * and the EPL 1.0 (http://h2database.com/html/license.html).
+ * Initial Developer: H2 Group
+ */
+package org.h2.test.unit;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+import org.h2.dev.hash.IntPerfectHash;
+import org.h2.dev.hash.IntPerfectHash.BitArray;
+import org.h2.test.TestBase;
+/**
+ * Tests the perfect hash tool.
+ */
+public class TestIntPerfectHash extends TestBase {
+    /**
+     * Entry point to run only this test class. The measurement is executed
+     * once before and once after the functional test.
+     *
+     * @param a ignored
+     */
+    public static void main(String... a) throws Exception {
+        TestIntPerfectHash self = (TestIntPerfectHash) TestBase.createCaller().init();
+        self.measure();
+        self.test();
+        self.measure();
+    }
+    /**
+     * Print the space usage (in bits per key) and the elapsed time for
+     * generating and verifying a hash function over 10000 keys. A run with a
+     * tenth of the keys is done first and is not timed.
+     */
+    public void measure() {
+        final int size = 10000;
+        // untimed run with fewer keys
+        test(size / 10);
+        long start = System.currentTimeMillis();
+        int bits = test(size);
+        long elapsed = System.currentTimeMillis() - start;
+        double bitsPerKey = (double) bits / size;
+        System.out.println(bitsPerKey + " bits/key in " +
+                elapsed + " ms");
+    }
+    /**
+     * Run the bit array test, then the hash test for every size from 0 to 99,
+     * and finally for the sizes 100, 1000, and 10000.
+     */
+    @Override
+    public void test() {
+        testBitArray();
+        int size = 0;
+        while (size < 100) {
+            test(size);
+            size++;
+        }
+        // here size == 100; continue with the powers of ten
+        while (size <= 10000) {
+            test(size);
+            size *= 10;
+        }
+    }
+    /**
+     * Compare BitArray against java.util.BitSet: both are filled with the
+     * bits 100 down to 0, then randomly modified and read in lockstep, and
+     * finally the number of set bits is compared.
+     */
+    private void testBitArray() {
+        BitSet expected = new BitSet();
+        byte[] bits = new byte[0];
+        // start at the highest index, so the first call has to grow the array
+        for (int index = 100; index >= 0; index--) {
+            bits = BitArray.setBit(bits, index, true);
+            expected.set(index);
+        }
+        Random random = new Random(1);
+        for (int round = 0; round < 10000; round++) {
+            int writePos = random.nextInt(100);
+            boolean value = random.nextBoolean();
+            bits = BitArray.setBit(bits, writePos, value);
+            expected.set(writePos, value);
+            int readPos = random.nextInt(100);
+            assertTrue(BitArray.getBit(bits, readPos) == expected.get(readPos));
+        }
+        assertTrue(BitArray.countBits(bits) == expected.cardinality());
+    }
+    /**
+     * Generate a hash function for the given number of distinct random keys
+     * (the random generator is seeded with the size) and verify that the
+     * largest hash value is size - 1.
+     *
+     * @param size the number of keys
+     * @return the size of the hash function description, in bits
+     */
+    private int test(int size) {
+        Random random = new Random(size);
+        HashSet<Integer> keys = new HashSet<Integer>();
+        while (keys.size() < size) {
+            keys.add(random.nextInt());
+        }
+        // pass a copy of the keys to generate
+        ArrayList<Integer> list = new ArrayList<Integer>(keys);
+        byte[] desc = IntPerfectHash.generate(list);
+        assertEquals(size - 1, test(desc, keys));
+        return desc.length * 8;
+    }
+    /**
+     * Verify that the described hash function maps every key of the set to a
+     * distinct value between 0 and three times the set size.
+     *
+     * @param desc the hash function description
+     * @param set the keys
+     * @return the largest hash value, or -1 if the set is empty
+     */
+    private int test(byte[] desc, Set<Integer> set) {
+        IntPerfectHash hash = new IntPerfectHash(desc);
+        HashSet<Integer> seen = new HashSet<Integer>();
+        int largest = -1;
+        for (int key : set) {
+            int h = hash.get(key);
+            assertTrue(h >= 0);
+            assertTrue(h <= set.size() * 3);
+            if (h > largest) {
+                largest = h;
+            }
+            // each hash value may only occur once
+            assertFalse(seen.contains(h));
+            seen.add(h);
+        }
+        return largest;
+    }
+}
--- a/h2/src/tools/org/h2/dev/hash/IntPerfectHash.java
+++ b/h2/src/tools/org/h2/dev/hash/IntPerfectHash.java
+/*
+ * Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
+ * and the EPL 1.0 (http://h2database.com/html/license.html).
+ * Initial Developer: H2 Group
+ */
+package org.h2.dev.hash;
+import java.util.ArrayList;
+import java.util.Arrays;
+/**
+ * A minimum perfect hash function tool. It needs about 2.2 bits per key.
+ */
+public class IntPerfectHash {
+    /**
+     * Large buckets are typically divided into buckets of this size.
+     */
+    private static final int DIVIDE = 6;
+    /**
+     * The maximum size of a small bucket (one that is not further split if
+     * possible).
+     */
+    private static final int MAX_SIZE = 12;
+    /**
+     * The maximum offset for hash functions of small buckets, indexed by the
+     * bucket size. At most that many hash functions are tried for the given
+     * size.
+     */
+    private static final int[] MAX_OFFSETS = { 0, 0, 8, 18, 47, 123, 319, 831, 2162,
+            5622, 14617, 38006, 98815 };
+    /**
+     * The output value to split the bucket into many (more than 2) smaller
+     * buckets.
+     */
+    private static final int SPLIT_MANY = 3;
+    /**
+     * The minimum output value for a small bucket of a given size. The entry
+     * for a size is SPLIT_MANY + 1 plus the sum of the MAX_OFFSETS entries of
+     * all smaller sizes; the additional last entry is the first value above
+     * all values used for small buckets.
+     */
+    private static final int[] SIZE_OFFSETS = new int[MAX_OFFSETS.length + 1];
+    static {
+        int next = SPLIT_MANY + 1;
+        int index = 0;
+        for (int count : MAX_OFFSETS) {
+            SIZE_OFFSETS[index++] = next;
+            next += count;
+        }
+        // here index == MAX_OFFSETS.length, the last slot of SIZE_OFFSETS
+        SIZE_OFFSETS[index] = next;
+    }
+    /**
+     * The description of the hash function. It is read whenever the hash of a
+     * key is calculated.
+     */
+    private final byte[] data;
+    /**
+     * Create a hash object to convert keys to hashes. The passed array is
+     * kept by reference; it is not copied.
+     *
+     * @param data the data returned by the generate method
+     */
+    public IntPerfectHash(byte[] data) {
+        this.data = data;
+    }
+    /**
+     * Get the hash function description. The internal array itself is
+     * returned, not a copy.
+     *
+     * @return the data
+     */
+    public byte[] getData() {
+        return this.data;
+    }
+    /**
+     * Calculate the hash value for the given key. The lookup starts at the
+     * beginning of the description, at level 0.
+     *
+     * @param x the key
+     * @return the hash value
+     */
+    public int get(int x) {
+        final int rootPos = 0;
+        final int rootLevel = 0;
+        return get(rootPos, x, rootLevel);
+    }
+    /**
+     * Get the hash value for the given key, starting at a certain position and
+     * level.
+     *
+     * @param pos the start position
+     * @param x the key
+     * @param level the level
+     * @return the hash value
+     */
+    private int get(int pos, int x, int level) {
+        int n = readVarInt(data, pos);
+        if (n < 2) {
+            // bucket with zero or one key
+            return 0;
+        } else if (n > SPLIT_MANY) {
+            // small bucket: n encodes both the size and the offset
+            int size = getSize(n);
+            int offset = getOffset(n, size);
+            return hash(x, level, offset, size);
+        }
+        // skip the marker (2 or SPLIT_MANY), the same way getNextPos does
+        pos += getVarIntLength(data, pos);
+        int split;
+        if (n == SPLIT_MANY) {
+            // the number of sub-buckets is stored after the marker
+            split = readVarInt(data, pos);
+            pos += getVarIntLength(data, pos);
+        } else {
+            split = n;
+        }
+        int h = hash(x, level, 0, split);
+        int start = pos;
+        // skip the sub-buckets in front of the one that contains the key
+        for (int i = 0; i < h; i++) {
+            pos = getNextPos(pos);
+        }
+        // the sizes of the skipped sub-buckets are added to the result
+        int s = getSizeSum(start, pos);
+        return s + get(pos, x, level + 1);
+    }
+    /**
+     * Get the position of the next sibling, that is, the position just after
+     * this branch and everything nested below it.
+     *
+     * @param pos the position of this branch
+     * @return the position of the next sibling
+     */
+    private int getNextPos(int pos) {
+        int marker = readVarInt(data, pos);
+        int next = pos + getVarIntLength(data, pos);
+        boolean isSplit = marker >= 2 && marker <= SPLIT_MANY;
+        if (!isSplit) {
+            // a bucket that is not split only uses this one value
+            return next;
+        }
+        int children = marker;
+        if (marker == SPLIT_MANY) {
+            // the number of sub-buckets is stored after the marker
+            children = readVarInt(data, next);
+            next += getVarIntLength(data, next);
+        }
+        while (children > 0) {
+            next = getNextPos(next);
+            children--;
+        }
+        return next;
+    }
+    /**
+     * The sum of the sizes between the start and end position. Only buckets
+     * that are not split contribute; the entries of split buckets are walked
+     * through one by one.
+     *
+     * @param start the start position
+     * @param end the end position (excluding)
+     * @return the sizes
+     */
+    private int getSizeSum(int start, int end) {
+        int sum = 0;
+        int pos = start;
+        while (pos < end) {
+            int n = readVarInt(data, pos);
+            pos += getVarIntLength(data, pos);
+            if (n == SPLIT_MANY) {
+                // skip the number of sub-buckets; the sub-buckets follow
+                pos += getVarIntLength(data, pos);
+            } else if (n > SPLIT_MANY) {
+                sum += getSize(n);
+            } else if (n < 2) {
+                sum += n;
+            }
+            // n == 2: nothing to add, the two sub-buckets follow
+        }
+        return sum;
+    }
+    /**
+     * Write the combined value for a small bucket: the minimum output value
+     * for the given size, plus the offset.
+     *
+     * @param out the target stream
+     * @param size the bucket size
+     * @param offset the offset of the hash function
+     */
+    private static void writeSizeOffset(ByteStream out, int size,
+            int offset) {
+        int combined = SIZE_OFFSETS[size] + offset;
+        writeVarInt(out, combined);
+    }
+    /**
+     * Extract the offset from a combined size / offset value.
+     *
+     * @param n the combined value
+     * @param size the bucket size encoded in the value
+     * @return the offset
+     */
+    private static int getOffset(int n, int size) {
+        int base = SIZE_OFFSETS[size];
+        return n - base;
+    }
+    /**
+     * Extract the bucket size from a combined size / offset value: the index
+     * in front of the first entry of SIZE_OFFSETS that is larger than the
+     * value.
+     *
+     * @param n the combined value
+     * @return the bucket size, or 0 if no entry is larger than the value
+     */
+    private static int getSize(int n) {
+        int index = 0;
+        while (index < SIZE_OFFSETS.length) {
+            if (SIZE_OFFSETS[index] > n) {
+                return index - 1;
+            }
+            index++;
+        }
+        return 0;
+    }
+    /**
+     * Generate the minimal perfect hash function data from the given list.
+     * <p>
+     * Whenever the keys need to be split into sub-buckets, the passed list is
+     * cleared to save memory. Callers that still need the keys afterwards
+     * should pass a copy.
+     *
+     * @param list the data (may be emptied by this method)
+     * @return the hash function description
+     * @throws IllegalStateException if the keys can not be separated within
+     *             the maximum recursion depth
+     */
+    public static byte[] generate(ArrayList<Integer> list) {
+        ByteStream out = new ByteStream();
+        generate(list, 0, out);
+        return out.toByteArray();
+    }
+    /**
+     * Write the description for the given keys, recursively splitting them
+     * into sub-buckets where needed.
+     * <p>
+     * A list with zero or one key is written as its size. For a list of at
+     * most MAX_SIZE keys, up to MAX_OFFSETS[size] hash functions are tried
+     * until one maps the keys without collision. Otherwise the keys are split
+     * into sub-buckets, the passed list is cleared, and each sub-bucket is
+     * processed at the next level.
+     *
+     * @param list the keys (cleared if the keys are split)
+     * @param level the recursion level
+     * @param out the target stream
+     * @throws IllegalStateException if the level is larger than 32
+     */
+    private static void generate(ArrayList<Integer> list, int level, ByteStream out) {
+        int size = list.size();
+        if (size <= 1) {
+            out.write((byte) size);
+            return;
+        }
+        if (level > 32) {
+            throw new IllegalStateException("Too many recursions; " +
+                    " incorrect universal hash function?");
+        }
+        if (size <= MAX_SIZE) {
+            int maxOffset = MAX_OFFSETS[size];
+            nextOffset:
+            for (int offset = 0; offset < maxOffset; offset++) {
+                // one bit per hash value that is already in use
+                int bits = 0;
+                for (int i = 0; i < size; i++) {
+                    int x = list.get(i);
+                    int mask = 1 << hash(x, level, offset, size);
+                    if ((bits & mask) != 0) {
+                        continue nextOffset;
+                    }
+                    bits |= mask;
+                }
+                writeSizeOffset(out, size, offset);
+                return;
+            }
+            // no collision-free offset found: split this bucket as well
+        }
+        int split;
+        if (size > 57 * DIVIDE) {
+            split = size / (36 * DIVIDE);
+        } else {
+            split = (size - 47) / DIVIDE;
+        }
+        split = Math.max(2, split);
+        ArrayList<ArrayList<Integer>> lists =
+                new ArrayList<ArrayList<Integer>>(split);
+        for (int i = 0; i < split; i++) {
+            lists.add(new ArrayList<Integer>(size / split));
+        }
+        for (int x : list) {
+            lists.get(hash(x, level, 0, split)).add(x);
+        }
+        // a split in two is written as the value 2; otherwise the marker is
+        // followed by the number of sub-buckets
+        if (split >= SPLIT_MANY) {
+            out.write((byte) SPLIT_MANY);
+        }
+        writeVarInt(out, split);
+        // the keys are now in the sub-lists; release the memory
+        list.clear();
+        list.trimToSize();
+        for (ArrayList<Integer> s2 : lists) {
+            generate(s2, level + 1, out);
+        }
+    }
+    /**
+     * Calculate the hash of a key for a bucket of the given size. The level
+     * and the offset are added to the key before its bits are mixed, so each
+     * combination selects a different hash function.
+     *
+     * @param x the key
+     * @param level the level
+     * @param offset the offset of the hash function
+     * @param size the size of the bucket (the number of possible results)
+     * @return the hash value (0 or larger, and smaller than the size)
+     */
+    private static int hash(int x, int level, int offset, int size) {
+        int h = x + level + offset * 32;
+        // two multiplicative mixing rounds, then a final xor-shift
+        for (int round = 0; round < 2; round++) {
+            h = ((h >>> 16) ^ h) * 0x45d9f3b;
+        }
+        h ^= h >>> 16;
+        // the remainder may be negative, but never Integer.MIN_VALUE
+        return Math.abs(h % size);
+    }
+    /**
+     * Write a variable size int: 7 bits per byte, lowest bits first; the
+     * highest bit of a byte is set if more bytes follow.
+     *
+     * @param out the target stream
+     * @param x the value
+     * @return the number of bytes written
+     */
+    private static int writeVarInt(ByteStream out, int x) {
+        int written = 1;
+        for (; (x & ~0x7f) != 0; x >>>= 7) {
+            out.write((byte) (0x80 | (x & 0x7f)));
+            written++;
+        }
+        // the last byte has the continuation bit cleared
+        out.write((byte) x);
+        return written;
+    }
+    /**
+     * Read a variable size int: 7 bits per byte, lowest bits first; a byte
+     * with the highest bit set means more bytes follow. At most 5 bytes are
+     * read, which is the longest encoding of a 32-bit value.
+     *
+     * @param d the data
+     * @param pos the position of the first byte
+     * @return the value
+     */
+    private static int readVarInt(byte[] d, int pos) {
+        int x = d[pos++];
+        if (x >= 0) {
+            // single byte value
+            return x;
+        }
+        x &= 0x7f;
+        // stop below 32: an int shift only uses the lowest 5 bits of the
+        // shift distance, so larger distances would wrap around and
+        // overwrite the bits already read
+        for (int s = 7; s < 32; s += 7) {
+            int b = d[pos++];
+            x |= (b & 0x7f) << s;
+            if (b >= 0) {
+                break;
+            }
+        }
+        return x;
+    }
+    /**
+     * Get the number of bytes used by the variable size int at the given
+     * position. The result is between 1 and 5, the longest encoding of a
+     * 32-bit value.
+     *
+     * @param d the data
+     * @param pos the position of the first byte
+     * @return the length in bytes
+     */
+    private static int getVarIntLength(byte[] d, int pos) {
+        int len = 1;
+        // a negative byte has the continuation bit set: one more byte follows
+        while (len < 5 && d[pos + len - 1] < 0) {
+            len++;
+        }
+        return len;
+    }
+    /**
+     * A stream of bytes. Reading and writing share the same position.
+     */
+    static class ByteStream {
+        /**
+         * The capacity of a new stream, and the minimum capacity after
+         * growing.
+         */
+        private static final int MIN_CAPACITY = 16;
+        private byte[] data;
+        private int pos;
+        /**
+         * Create an empty stream.
+         */
+        ByteStream() {
+            this.data = new byte[MIN_CAPACITY];
+        }
+        /**
+         * Create a stream on top of the given array, positioned at the start.
+         *
+         * @param data the array (used directly, not copied)
+         */
+        ByteStream(byte[] data) {
+            this.data = data;
+        }
+        /**
+         * Read a byte.
+         *
+         * @return the byte (0 to 255), or -1 at the end of the array
+         */
+        int read() {
+            return pos < data.length ? (data[pos++] & 255) : -1;
+        }
+        /**
+         * Write a byte. The array is replaced by a larger copy if it is full.
+         *
+         * @param value the byte
+         */
+        void write(byte value) {
+            if (pos >= data.length) {
+                // doubling alone can not grow a zero-length array
+                int capacity = Math.max(MIN_CAPACITY, data.length * 2);
+                data = Arrays.copyOf(data, capacity);
+            }
+            data[pos++] = value;
+        }
+        /**
+         * Get the byte array.
+         *
+         * @return a copy of the bytes in front of the current position
+         */
+        byte[] toByteArray() {
+            return Arrays.copyOf(data, pos);
+        }
+    }
+    /**
+     * A helper class for bit arrays. Bit x is stored in byte x / 8, at bit
+     * position x & 7. Bits beyond the end of the array are unset.
+     */
+    public static class BitArray {
+        /**
+         * Set a bit in the array. The array is enlarged if the bit index is
+         * beyond its end.
+         *
+         * @param data the array
+         * @param x the bit index
+         * @param value the new value
+         * @return the bit array (if the passed one was too small)
+         */
+        public static byte[] setBit(byte[] data, int x, boolean value) {
+            int pos = x / 8;
+            if (pos >= data.length) {
+                data = Arrays.copyOf(data, pos + 1);
+            }
+            int mask = 1 << (x & 7);
+            if (value) {
+                data[pos] |= mask;
+            } else {
+                data[pos] &= ~mask;
+            }
+            return data;
+        }
+        /**
+         * Get a bit in a bit array. A bit beyond the end of the array is
+         * reported as not set, matching setBit, which only enlarges the array
+         * on demand.
+         *
+         * @param data the array
+         * @param x the bit index
+         * @return the value
+         */
+        public static boolean getBit(byte[] data, int x) {
+            int pos = x / 8;
+            if (pos >= data.length) {
+                return false;
+            }
+            return (data[pos] & (1 << (x & 7))) != 0;
+        }
+        /**
+         * Count the number of set bits.
+         *
+         * @param data the array
+         * @return the number of set bits
+         */
+        public static int countBits(byte[] data) {
+            int count = 0;
+            for (byte x : data) {
+                // mask, so that the sign extension of the byte is not counted
+                count += Integer.bitCount(x & 255);
+            }
+            return count;
+        }
+    }
+}
--- a/h2/src/tools/org/h2/dev/hash/MinimalPerfectHash.java
+++ b/h2/src/tools/org/h2/dev/hash/MinimalPerfectHash.java
@@ -76,6 +76,12 @@ public class MinimalPerfectHash<K> {
     */
    private static final int DIVIDE = 6;
+    /**
+     * For sets larger than this, instead of trying to map them uniquely to a
+     * set of the same size, the size of the set is incremented by one. This
+     * reduces the time to find a mapping, but the index of the hole also needs
+     * to be stored, which increases the space usage.
+     */
    private static final int SPEEDUP = 11;
    /**