A perfect hash function / minimal perfect hash function tool

b7a0f583 · Thomas Mueller · df34dee4 · b7a0f583 · b7a0f583 · b7a0f583
--- a/h2/src/test/org/h2/test/TestAll.java
+++ b/h2/src/test/org/h2/test/TestAll.java
@@ -176,6 +176,7 @@ import org.h2.test.unit.TestOverflow;
 import org.h2.test.unit.TestPageStore;
 import org.h2.test.unit.TestPageStoreCoverage;
 import org.h2.test.unit.TestPattern;
+import org.h2.test.unit.TestPerfectHash;
 import org.h2.test.unit.TestPgServer;
 import org.h2.test.unit.TestReader;
 import org.h2.test.unit.TestRecovery;
@@ -788,6 +789,7 @@ kill -9 `jps -l | grep "org.h2.test." | cut -d " " -f 1`
        new TestPageStore().runTest(this);
        new TestPageStoreCoverage().runTest(this);
        new TestPattern().runTest(this);
+        new TestPerfectHash().runTest(this);
        new TestPgServer().runTest(this);
        new TestReader().runTest(this);
        new TestRecovery().runTest(this);

--- a/h2/src/test/org/h2/test/unit/TestPerfectHash.java
+++ b/h2/src/test/org/h2/test/unit/TestPerfectHash.java
+/*
+ * Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
+ * and the EPL 1.0 (http://h2database.com/html/license.html).
+ * Initial Developer: H2 Group
+ */
+package org.h2.test.unit;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+import org.h2.dev.hash.PerfectHash;
+import org.h2.test.TestBase;
+/**
+ * Unit test for the perfect hash / minimal perfect hash generator.
+ */
+public class TestPerfectHash extends TestBase {
+    /**
+     * Entry point to run only this test.
+     *
+     * @param a ignored
+     */
+    public static void main(String... a) throws Exception {
+        TestBase.createCaller().init().test();
+    }
+    @Override
+    public void test() {
+        // each size is checked first as minimal, then as non-minimal
+        boolean[] modes = { true, false };
+        // all small sizes, including the empty set
+        for (int size = 0; size < 1000; size++) {
+            for (boolean minimal : modes) {
+                test(size, minimal);
+            }
+        }
+        // a few larger sizes: 1000, 10000, 100000
+        for (int size = 1000; size <= 100000; size *= 10) {
+            for (boolean minimal : modes) {
+                test(size, minimal);
+            }
+        }
+    }
+    /**
+     * Generate a hash function for a reproducible random key set of the
+     * given size and verify the range of the resulting hash values.
+     *
+     * @param size the number of distinct keys
+     * @param minimal whether to generate a minimal perfect hash function
+     */
+    void test(int size, boolean minimal) {
+        // the size doubles as the seed, so each run uses the same keys
+        Random random = new Random(size);
+        HashSet<Integer> keys = new HashSet<Integer>();
+        while (keys.size() < size) {
+            keys.add(random.nextInt());
+        }
+        int highest = test(PerfectHash.generate(keys, minimal), keys);
+        if (minimal) {
+            // minimal: the largest hash is exactly size - 1
+            assertEquals(size - 1, highest);
+            return;
+        }
+        if (size > 10) {
+            // non-minimal: the table may have gaps, but not too many
+            assertTrue(highest < 1.5 * size);
+        }
+    }
+    /**
+     * Verify that the hash function described by the given data maps every
+     * key of the set to a distinct, non-negative value.
+     *
+     * @param desc the hash function description
+     * @param set the keys
+     * @return the largest hash value seen, or -1 if the set is empty
+     */
+    int test(byte[] desc, Set<Integer> set) {
+        PerfectHash hash = new PerfectHash(desc);
+        HashSet<Integer> used = new HashSet<Integer>();
+        int highest = -1;
+        for (Integer key : set) {
+            int h = hash.get(key);
+            assertTrue(h >= 0);
+            assertTrue(h <= set.size() * 3);
+            if (h > highest) {
+                highest = h;
+            }
+            // no two keys may share a hash value
+            assertFalse(used.contains(h));
+            used.add(h);
+        }
+        return highest;
+    }
+}
--- a/h2/src/tools/org/h2/dev/hash/PerfectHash.java
+++ b/h2/src/tools/org/h2/dev/hash/PerfectHash.java
+/*
+ * Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
+ * and the EPL 1.0 (http://h2database.com/html/license.html).
+ * Initial Developer: H2 Group
+ */
+package org.h2.dev.hash;
+import java.io.ByteArrayOutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.zip.Deflater;
+import java.util.zip.Inflater;
+/**
+ * A perfect hash function tool. It needs about 1.4 bits per key, and the
+ * resulting hash table is about 79% full. The minimal perfect hash function
+ * needs about 2.3 bits per key.
+ * <p>
+ * Generating the hash function takes about 1 second per million keys (linear)
+ * for both perfect hash and minimal perfect hash.
+ * <p>
+ * The algorithm is recursive: sets that contain no or only one entry are not
+ * processed as no conflicts are possible. For sets that contain between 2 and
+ * 15 entries, up to 16 hash functions are tested to check if they can store
+ * the data without conflict. If no function was found, the same is tested on
+ * a larger bucket (except for the minimal perfect hash). If no hash function
+ * was found, and for larger sets, the set is split into a number of smaller
+ * buckets (up to 31).
+ * <p>
+ * At the end of the generation process, the data is compressed using a general
+ * purpose compression tool (Deflate / Huffman coding). The uncompressed data is
+ * around 1.52 bits per key (perfect hash) and 3.72 (minimal perfect hash).
+ * <p>
+ * The uncompressed description is a pre-order list of nodes, one unsigned
+ * byte each: 0 or 1 is a leaf holding that many keys; a value from 2 to
+ * MAX_SPLIT - 1 is a split node followed by that many child nodes; a value
+ * of MAX_SPLIT or more is a leaf encoded as
+ * <code>size * OFFSETS + offset</code>.
+ */
+public class PerfectHash {
+    /**
+     * Sets with fewer entries than this are stored in a single leaf, if a
+     * conflict-free hash function can be found.
+     */
+    private static final int MAX_SIZE = 16;
+    /**
+     * The number of hash functions (offsets) tried for each leaf size.
+     */
+    private static final int OFFSETS = 16;
+    /**
+     * Node bytes below this value (and above 1) are split nodes; bytes at or
+     * above it are leaves.
+     */
+    private static final int MAX_SPLIT = 32;
+    /**
+     * The (uncompressed) description of the hash function.
+     */
+    private final byte[] data;
+    /**
+     * For each node position, the number of hash slots used by all nodes
+     * stored before it. This is the value added to the local hash of a leaf.
+     */
+    private final int[] plus;
+    /**
+     * For each split node position, the position just after its subtree, or
+     * 0 if that was not calculated yet (filled lazily by the read method).
+     */
+    private final int[] next;
+    /**
+     * Create a hash object to convert keys to hashes.
+     *
+     * @param data the data returned by the generate method
+     * @throws IllegalArgumentException if the data can not be decompressed
+     */
+    public PerfectHash(byte[] data) {
+        byte[] expanded = expand(data);
+        this.data = expanded;
+        plus = new int[expanded.length];
+        next = new int[expanded.length];
+        for (int i = 0, p = 0; i < expanded.length; i++) {
+            plus[i] = p;
+            int n = expanded[i] & 255;
+            // leaves of size 0 or 1 use n slots, hashed leaves use
+            // (n / OFFSETS) slots, and split nodes use none themselves
+            p += n < 2 ? n : n >= MAX_SPLIT ? (n / OFFSETS) : 0;
+        }
+    }
+    /**
+     * Calculate the hash from the key.
+     *
+     * @param x the key
+     * @return the hash
+     */
+    public int get(int x) {
+        return get(0, x, 0);
+    }
+    /**
+     * Calculate the hash of the key within the subtree at the given position.
+     *
+     * @param pos the position of the node in the data
+     * @param x the key
+     * @param level the depth of the node (the root is at level 0)
+     * @return the hash
+     */
+    private int get(int pos, int x, int level) {
+        int n = data[pos] & 255;
+        if (n < 2) {
+            // leaf with at most one entry: only one possible slot
+            return plus[pos];
+        } else if (n >= MAX_SPLIT) {
+            // hashed leaf: offset is n % OFFSETS, size is n / OFFSETS
+            return plus[pos] + hash(x, level, n % OFFSETS, n / OFFSETS);
+        }
+        // split node with n children: skip the children before the one
+        // this key belongs to, then descend into it
+        pos++;
+        int h = hash(x, level, 0, n);
+        for (int i = 0; i < h; i++) {
+            pos = read(pos);
+        }
+        return get(pos, x, level + 1);
+    }
+    /**
+     * Get the position just after the node (including its subtree) at the
+     * given position. The result for split nodes is cached.
+     *
+     * @param pos the position of the node
+     * @return the position of the following node
+     */
+    private int read(int pos) {
+        int p = next[pos];
+        if (p == 0) {
+            int n = data[pos] & 255;
+            if (n < 2 || n >= MAX_SPLIT) {
+                // a leaf is exactly one byte
+                return pos + 1;
+            }
+            int start = pos++;
+            for (int i = 0; i < n; i++) {
+                pos = read(pos);
+            }
+            next[start] = p = pos;
+        }
+        return p;
+    }
+    /**
+     * Generate the perfect hash function data from the given set of integers.
+     *
+     * @param list the set
+     * @param minimal whether the perfect hash function needs to be minimal
+     * @return the data
+     */
+    public static byte[] generate(Set<Integer> list, boolean minimal) {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        generate(list, 0, minimal, out);
+        return compress(out.toByteArray());
+    }
+    /**
+     * Write the node for the given keys, and recursively the nodes of its
+     * children, to the output.
+     *
+     * @param set the keys
+     * @param level the depth of the node (the root is at level 0)
+     * @param minimal whether the perfect hash function needs to be minimal
+     * @param out the target
+     */
+    private static void generate(Collection<Integer> set, int level,
+            boolean minimal, ByteArrayOutputStream out) {
+        int size = set.size();
+        if (size <= 1) {
+            out.write(size);
+            return;
+        }
+        if (size < MAX_SIZE) {
+            // try to find a leaf size and offset without conflicts; the
+            // minimal variant may not use more slots than there are keys
+            int max = minimal ? size : Math.min(MAX_SIZE - 1, size * 2);
+            for (int s = size; s <= max; s++) {
+                nextOffset:
+                for (int offset = 0; offset < OFFSETS; offset++) {
+                    // bit set of the slots that are already occupied
+                    int bits = 0;
+                    for (int x : set) {
+                        int h = hash(x, level, offset, s);
+                        if ((bits & (1 << h)) != 0) {
+                            continue nextOffset;
+                        }
+                        bits |= 1 << h;
+                    }
+                    out.write(s * OFFSETS + offset);
+                    return;
+                }
+            }
+        }
+        // no conflict-free leaf: split into between 2 and MAX_SPLIT - 1
+        // buckets and process each bucket recursively
+        int split;
+        if (minimal) {
+            split = size > 150 ? size / 83 : (size + 3) / 4;
+        } else {
+            split = size > 265 ? size / 142 : (size + 5) / 7;
+        }
+        split = Math.min(MAX_SPLIT - 1, Math.max(2, split));
+        out.write(split);
+        List<List<Integer>> lists = new ArrayList<List<Integer>>(split);
+        for (int i = 0; i < split; i++) {
+            lists.add(new ArrayList<Integer>(size / split));
+        }
+        for (int x : set) {
+            lists.get(hash(x, level, 0, split)).add(x);
+        }
+        for (List<Integer> s2 : lists) {
+            generate(s2, level + 1, minimal, out);
+        }
+    }
+    /**
+     * Calculate the hash of a key for the given level, offset, and size.
+     *
+     * @param x the key
+     * @param level the depth of the node
+     * @param offset the index of the hash function (0 to OFFSETS - 1)
+     * @param size the number of slots
+     * @return the hash, from 0 to size - 1
+     */
+    private static int hash(int x, int level, int offset, int size) {
+        x += level * OFFSETS + offset;
+        x = ((x >>> 16) ^ x) * 0x45d9f3b;
+        x = ((x >>> 16) ^ x) * 0x45d9f3b;
+        x = (x >>> 16) ^ x;
+        // the remainder is within (-size, size), so abs can not overflow
+        return Math.abs(x % size);
+    }
+    /**
+     * Compress the data using Deflate with Huffman coding only.
+     *
+     * @param d the uncompressed data
+     * @return the compressed data
+     */
+    private static byte[] compress(byte[] d) {
+        Deflater deflater = new Deflater();
+        try {
+            deflater.setStrategy(Deflater.HUFFMAN_ONLY);
+            deflater.setInput(d);
+            deflater.finish();
+            ByteArrayOutputStream out2 = new ByteArrayOutputStream(d.length);
+            byte[] buffer = new byte[1024];
+            while (!deflater.finished()) {
+                int count = deflater.deflate(buffer);
+                out2.write(buffer, 0, count);
+            }
+            return out2.toByteArray();
+        } finally {
+            // always release the native resources
+            deflater.end();
+        }
+    }
+    /**
+     * Expand data that was compressed using the compress method.
+     *
+     * @param d the compressed data
+     * @return the uncompressed data
+     * @throws IllegalArgumentException if the data is empty, truncated, or
+     *             otherwise not valid
+     */
+    private static byte[] expand(byte[] d) {
+        Inflater inflater = new Inflater();
+        try {
+            inflater.setInput(d);
+            ByteArrayOutputStream out = new ByteArrayOutputStream(d.length);
+            byte[] buffer = new byte[1024];
+            while (!inflater.finished()) {
+                int count = inflater.inflate(buffer);
+                if (count == 0 && (inflater.needsInput()
+                        || inflater.needsDictionary())) {
+                    // no progress is possible: without this check, empty
+                    // or truncated input would loop forever
+                    throw new IllegalArgumentException(
+                            "Truncated or invalid hash function data");
+                }
+                out.write(buffer, 0, count);
+            }
+            return out.toByteArray();
+        } catch (java.util.zip.DataFormatException e) {
+            throw new IllegalArgumentException(e);
+        } finally {
+            // always release the native resources
+            inflater.end();
+        }
+    }
+}
--- a/h2/src/tools/org/h2/dev/hash/package.html
+++ b/h2/src/tools/org/h2/dev/hash/package.html
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<!--
+Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
+and the EPL 1.0 (http://h2database.com/html/license.html).
+Initial Developer: H2 Group
+-->
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
+<head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title>
+Javadoc package documentation
+</title></head><body style="font: 9pt/130% Tahoma, Arial, Helvetica, sans-serif; font-weight: normal;"><p>
+A perfect hash function tool.
+</p></body></html>
\ No newline at end of file