提交 c364baf4 authored 作者: Thomas Mueller's avatar Thomas Mueller

A minimal perfect hash function tool

上级 4300469b
...@@ -9,6 +9,7 @@ import java.util.HashSet; ...@@ -9,6 +9,7 @@ import java.util.HashSet;
import java.util.Random; import java.util.Random;
import java.util.Set; import java.util.Set;
import org.h2.dev.hash.MinimalPerfectHash;
import org.h2.dev.hash.PerfectHash; import org.h2.dev.hash.PerfectHash;
import org.h2.test.TestBase; import org.h2.test.TestBase;
...@@ -23,22 +24,45 @@ public class TestPerfectHash extends TestBase { ...@@ -23,22 +24,45 @@ public class TestPerfectHash extends TestBase {
* @param a ignored * @param a ignored
*/ */
public static void main(String... a) throws Exception { public static void main(String... a) throws Exception {
TestBase.createCaller().init().test(); TestPerfectHash test = (TestPerfectHash) TestBase.createCaller().init();
test.test();
test.measure();
} }
/**
 * Build each hash function variant for one million keys and print the size
 * of the generated description in bits per key.
 */
public void measure() {
    final int keyCount = 1000000;

    // the dedicated minimal perfect hash implementation
    int bits = testMinimal(keyCount);
    double bitsPerKey = (double) bits / keyCount;
    System.out.println(bitsPerKey + " bits/key (minimal)");

    // the older implementation, in minimal mode
    bits = test(keyCount, true);
    bitsPerKey = (double) bits / keyCount;
    System.out.println(bitsPerKey + " bits/key (minimal old)");

    // the older implementation, in non-minimal mode
    bits = test(keyCount, false);
    bitsPerKey = (double) bits / keyCount;
    System.out.println(bitsPerKey + " bits/key (not minimal)");
}
@Override @Override
public void test() { public void test() {
for (int i = 0; i < 1000; i++) { for (int i = 0; i < 100; i++) {
testMinimal(i);
}
for (int i = 100; i <= 100000; i *= 10) {
testMinimal(i);
}
for (int i = 0; i < 100; i++) {
test(i, true); test(i, true);
test(i, false); test(i, false);
} }
for (int i = 1000; i <= 100000; i *= 10) { for (int i = 100; i <= 100000; i *= 10) {
test(i, true); test(i, true);
test(i, false); test(i, false);
} }
} }
void test(int size, boolean minimal) { private int test(int size, boolean minimal) {
Random r = new Random(size); Random r = new Random(size);
HashSet<Integer> set = new HashSet<Integer>(); HashSet<Integer> set = new HashSet<Integer>();
while (set.size() < size) { while (set.size() < size) {
...@@ -53,9 +77,10 @@ public class TestPerfectHash extends TestBase { ...@@ -53,9 +77,10 @@ public class TestPerfectHash extends TestBase {
assertTrue(max < 1.5 * size); assertTrue(max < 1.5 * size);
} }
} }
return desc.length * 8;
} }
int test(byte[] desc, Set<Integer> set) { private int test(byte[] desc, Set<Integer> set) {
int max = -1; int max = -1;
HashSet<Integer> test = new HashSet<Integer>(); HashSet<Integer> test = new HashSet<Integer>();
PerfectHash hash = new PerfectHash(desc); PerfectHash hash = new PerfectHash(desc);
...@@ -69,4 +94,32 @@ public class TestPerfectHash extends TestBase { ...@@ -69,4 +94,32 @@ public class TestPerfectHash extends TestBase {
} }
return max; return max;
} }
/**
 * Generate a minimal perfect hash for a reproducible pseudo-random key set
 * and verify that the largest hash value is {@code size - 1}.
 *
 * @param size the number of distinct keys to generate (also used as the
 *            seed of the random number generator)
 * @return the size of the generated hash description, in bits
 */
private int testMinimal(int size) {
    Random random = new Random(size);
    HashSet<Integer> keys = new HashSet<Integer>();
    // duplicates are dropped by the set, so keep drawing until it is full
    for (int count = keys.size(); count < size; count = keys.size()) {
        keys.add(random.nextInt());
    }
    byte[] description = MinimalPerfectHash.generate(keys);
    int largest = testMinimal(description, keys);
    assertEquals(size - 1, largest);
    return description.length * Byte.SIZE;
}
/**
 * Verify that the hash function described by {@code desc} maps every key of
 * {@code set} to a distinct value in the range {@code 0 .. set.size() - 1}.
 *
 * @param desc the hash description passed to the MinimalPerfectHash
 *            constructor
 * @param set the keys to check
 * @return the largest hash value seen, or -1 if the set is empty
 */
private int testMinimal(byte[] desc, Set<Integer> set) {
    int size = set.size();
    int max = -1;
    HashSet<Integer> used = new HashSet<Integer>();
    MinimalPerfectHash hash = new MinimalPerfectHash(desc);
    for (int x : set) {
        int h = hash.get(x);
        assertTrue(h >= 0);
        // a minimal perfect hash has exactly one slot per key, so the
        // value must be below the number of keys
        assertTrue(h < size);
        max = Math.max(max, h);
        // add returns false if another key already produced this value
        assertTrue(used.add(h));
    }
    return max;
}
} }
...@@ -18,12 +18,12 @@ import java.util.zip.Inflater; ...@@ -18,12 +18,12 @@ import java.util.zip.Inflater;
* resulting hash table is about 79% full. The minimal perfect hash function * resulting hash table is about 79% full. The minimal perfect hash function
* needs about 2.3 bits per key. * needs about 2.3 bits per key.
* <p> * <p>
* Generating the hash function takes about 1 second per million keys (linear) * Generating the hash function takes about 1 second per million keys
* for both perfect hash and minimal perfect hash. * for both perfect hash and minimal perfect hash.
* <p> * <p>
* The algorithm is recursive: sets that contain no or only one entry are not * The algorithm is recursive: sets that contain no or only one entry are not
* processed as no conflicts are possible. Sets that contain between 2 and 16 * processed as no conflicts are possible. Sets that contain between 2 and 16
* buckets, up to 16 hash functions are tested to check if they can store the * entries, up to 16 hash functions are tested to check if they can store the
* data without conflict. If no function was found, the same is tested on a * data without conflict. If no function was found, the same is tested on a
* larger bucket (except for the minimal perfect hash). If no hash function was * larger bucket (except for the minimal perfect hash). If no hash function was
* found, and for larger buckets, the bucket is split into a number of smaller * found, and for larger buckets, the bucket is split into a number of smaller
...@@ -32,7 +32,8 @@ import java.util.zip.Inflater; ...@@ -32,7 +32,8 @@ import java.util.zip.Inflater;
* At the end of the generation process, the data is compressed using a general * At the end of the generation process, the data is compressed using a general
* purpose compression tool (Deflate / Huffman coding). The uncompressed data is * purpose compression tool (Deflate / Huffman coding). The uncompressed data is
* around 1.52 bits per key (perfect hash) and 3.72 (minimal perfect hash). * around 1.52 bits per key (perfect hash) and 3.72 (minimal perfect hash).
* * <p>
* Please also note the MinimalPerfectHash class, which uses less space per key.
*/ */
public class PerfectHash { public class PerfectHash {
...@@ -40,12 +41,12 @@ public class PerfectHash { ...@@ -40,12 +41,12 @@ public class PerfectHash {
* The maximum size of a bucket. * The maximum size of a bucket.
*/ */
private static final int MAX_SIZE = 16; private static final int MAX_SIZE = 16;
/** /**
* The maximum number of hash functions to test. * The maximum number of hash functions to test.
*/ */
private static final int OFFSETS = 16; private static final int OFFSETS = 16;
/** /**
* The maximum number of buckets to split the set into. * The maximum number of buckets to split the set into.
*/ */
...@@ -56,13 +57,13 @@ public class PerfectHash { ...@@ -56,13 +57,13 @@ public class PerfectHash {
* key. * key.
*/ */
private final byte[] data; private final byte[] data;
/** /**
* The offset of the result of the hash function at the given offset within * The offset of the result of the hash function at the given offset within
* the data array. Used for calculating the hash of a key. * the data array. Used for calculating the hash of a key.
*/ */
private final int[] plus; private final int[] plus;
/** /**
* The position of the next bucket in the data array (in case this bucket * The position of the next bucket in the data array (in case this bucket
* needs to be skipped). Used for calculating the hash of a key. * needs to be skipped). Used for calculating the hash of a key.
...@@ -71,7 +72,7 @@ public class PerfectHash { ...@@ -71,7 +72,7 @@ public class PerfectHash {
/** /**
* Create a hash object to convert keys to hashes. * Create a hash object to convert keys to hashes.
* *
* @param data the data returned by the generate method * @param data the data returned by the generate method
*/ */
public PerfectHash(byte[] data) { public PerfectHash(byte[] data) {
...@@ -87,7 +88,7 @@ public class PerfectHash { ...@@ -87,7 +88,7 @@ public class PerfectHash {
/** /**
* Calculate the hash from the key. * Calculate the hash from the key.
* *
* @param x the key * @param x the key
* @return the hash * @return the hash
*/ */
...@@ -128,7 +129,7 @@ public class PerfectHash { ...@@ -128,7 +129,7 @@ public class PerfectHash {
/** /**
* Generate the perfect hash function data from the given set of integers. * Generate the perfect hash function data from the given set of integers.
* *
* @param list the set * @param list the set
* @param minimal whether the perfect hash function needs to be minimal * @param minimal whether the perfect hash function needs to be minimal
* @return the data * @return the data
...@@ -139,7 +140,7 @@ public class PerfectHash { ...@@ -139,7 +140,7 @@ public class PerfectHash {
return compress(out.toByteArray()); return compress(out.toByteArray());
} }
private static void generate(Collection<Integer> set, int level, private static void generate(Collection<Integer> set, int level,
boolean minimal, ByteArrayOutputStream out) { boolean minimal, ByteArrayOutputStream out) {
int size = set.size(); int size = set.size();
if (size <= 1) { if (size <= 1) {
...@@ -195,7 +196,7 @@ public class PerfectHash { ...@@ -195,7 +196,7 @@ public class PerfectHash {
/** /**
* Calculate the hash of a key. The result depends on the key, the recursion * Calculate the hash of a key. The result depends on the key, the recursion
* level, and the offset. * level, and the offset.
* *
* @param x the key * @param x the key
* @param level the recursion level * @param level the recursion level
* @param offset the index of the hash function * @param offset the index of the hash function
...@@ -209,10 +210,10 @@ public class PerfectHash { ...@@ -209,10 +210,10 @@ public class PerfectHash {
x = (x >>> 16) ^ x; x = (x >>> 16) ^ x;
return Math.abs(x % size); return Math.abs(x % size);
} }
/** /**
* Compress the hash description using a Huffman coding. * Compress the hash description using a Huffman coding.
* *
* @param d the data * @param d the data
* @return the compressed data * @return the compressed data
*/ */
...@@ -230,28 +231,28 @@ public class PerfectHash { ...@@ -230,28 +231,28 @@ public class PerfectHash {
deflater.end(); deflater.end();
return out2.toByteArray(); return out2.toByteArray();
} }
/** /**
* Decompress the hash description using a Huffman coding. * Decompress the hash description using a Huffman coding.
* *
* @param d the data * @param d the data
* @return the decompressed data * @return the decompressed data
*/ */
private static byte[] expand(byte[] d) { private static byte[] expand(byte[] d) {
Inflater inflater = new Inflater(); Inflater inflater = new Inflater();
inflater.setInput(d); inflater.setInput(d);
ByteArrayOutputStream out = new ByteArrayOutputStream(d.length); ByteArrayOutputStream out = new ByteArrayOutputStream(d.length);
byte[] buffer = new byte[1024]; byte[] buffer = new byte[1024];
try { try {
while (!inflater.finished()) { while (!inflater.finished()) {
int count = inflater.inflate(buffer); int count = inflater.inflate(buffer);
out.write(buffer, 0, count); out.write(buffer, 0, count);
} }
inflater.end(); inflater.end();
} catch (Exception e) { } catch (Exception e) {
throw new IllegalArgumentException(e); throw new IllegalArgumentException(e);
} }
return out.toByteArray(); return out.toByteArray();
} }
} }
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论