提交 f986f6ae authored 作者: Thomas Mueller's avatar Thomas Mueller

Minimal perfect hash tool

上级 2d649ea2
/*
* Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
* and the EPL 1.0 (http://h2database.com/html/license.html).
* Initial Developer: H2 Group
*/
package org.h2.test.unit;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import org.h2.dev.hash.IntPerfectHash;
import org.h2.dev.hash.IntPerfectHash.BitArray;
import org.h2.test.TestBase;
/**
* Tests the perfect hash tool.
*/
public class TestIntPerfectHash extends TestBase {
/**
* Run just this test.
*
* @param a ignored
*/
public static void main(String... a) throws Exception {
TestIntPerfectHash test = (TestIntPerfectHash) TestBase.createCaller().init();
test.measure();
test.test();
test.measure();
}
/**
* Measure the hash functions.
*/
public void measure() {
int size = 10000;
test(size / 10);
int s;
long time = System.currentTimeMillis();
s = test(size);
time = System.currentTimeMillis() - time;
System.out.println((double) s / size + " bits/key in " +
time + " ms");
}
@Override
public void test() {
testBitArray();
for (int i = 0; i < 100; i++) {
test(i);
}
for (int i = 100; i <= 10000; i *= 10) {
test(i);
}
}
private void testBitArray() {
byte[] data = new byte[0];
BitSet set = new BitSet();
for (int i = 100; i >= 0; i--) {
data = BitArray.setBit(data, i, true);
set.set(i);
}
Random r = new Random(1);
for (int i = 0; i < 10000; i++) {
int pos = r.nextInt(100);
boolean s = r.nextBoolean();
data = BitArray.setBit(data, pos, s);
set.set(pos, s);
pos = r.nextInt(100);
assertTrue(BitArray.getBit(data, pos) == set.get(pos));
}
assertTrue(BitArray.countBits(data) == set.cardinality());
}
private int test(int size) {
Random r = new Random(size);
HashSet<Integer> set = new HashSet<Integer>();
while (set.size() < size) {
set.add(r.nextInt());
}
ArrayList<Integer> list = new ArrayList<Integer>();
list.addAll(set);
byte[] desc = IntPerfectHash.generate(list);
int max = test(desc, set);
assertEquals(size - 1, max);
return desc.length * 8;
}
private int test(byte[] desc, Set<Integer> set) {
int max = -1;
HashSet<Integer> test = new HashSet<Integer>();
IntPerfectHash hash = new IntPerfectHash(desc);
for (int x : set) {
int h = hash.get(x);
assertTrue(h >= 0);
assertTrue(h <= set.size() * 3);
max = Math.max(max, h);
assertFalse(test.contains(h));
test.add(h);
}
return max;
}
}
/*
* Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
* and the EPL 1.0 (http://h2database.com/html/license.html).
* Initial Developer: H2 Group
*/
package org.h2.dev.hash;
import java.util.ArrayList;
import java.util.Arrays;
/**
* A minimum perfect hash function tool. It needs about 2.2 bits per key.
*/
public class IntPerfectHash {
/**
* Large buckets are typically divided into buckets of this size.
*/
private static final int DIVIDE = 6;
/**
* The maximum size of a small bucket (one that is not further split if
* possible).
*/
private static final int MAX_SIZE = 12;
/**
* The maximum offset for hash functions of small buckets. At most that many
* hash functions are tried for the given size.
*/
private static final int[] MAX_OFFSETS = { 0, 0, 8, 18, 47, 123, 319, 831, 2162,
5622, 14617, 38006, 98815 };
/**
* The output value to split the bucket into many (more than 2) smaller
* buckets.
*/
private static final int SPLIT_MANY = 3;
/**
* The minimum output value for a small bucket of a given size.
*/
private static final int[] SIZE_OFFSETS = new int[MAX_OFFSETS.length + 1];
static {
int last = SPLIT_MANY + 1;
for (int i = 0; i < MAX_OFFSETS.length; i++) {
SIZE_OFFSETS[i] = last;
last += MAX_OFFSETS[i];
}
SIZE_OFFSETS[SIZE_OFFSETS.length - 1] = last;
}
/**
* The description of the hash function. Used for calculating the hash of a
* key.
*/
private final byte[] data;
/**
* Create a hash object to convert keys to hashes.
*
* @param data the data returned by the generate method
*/
public IntPerfectHash(byte[] data) {
this.data = data;
}
/**
* Get the hash function description.
*
* @return the data
*/
public byte[] getData() {
return data;
}
/**
* Calculate the hash value for the given key.
*
* @param x the key
* @return the hash value
*/
public int get(int x) {
return get(0, x, 0);
}
/**
* Get the hash value for the given key, starting at a certain position and
* level.
*
* @param pos the start position
* @param x the key
* @param isRoot whether this is the root of the tree
* @param level the level
* @return the hash value
*/
private int get(int pos, int x, int level) {
int n = readVarInt(data, pos);
if (n < 2) {
return 0;
} else if (n > SPLIT_MANY) {
int size = getSize(n);
int offset = getOffset(n, size);
return hash(x, level, offset, size);
}
pos++;
int split;
if (n == SPLIT_MANY) {
split = readVarInt(data, pos);
pos += getVarIntLength(data, pos);
} else {
split = n;
}
int h = hash(x, level, 0, split);
int s;
int start = pos;
for (int i = 0; i < h; i++) {
pos = getNextPos(pos);
}
s = getSizeSum(start, pos);
return s + get(pos, x, level + 1);
}
/**
* Get the position of the next sibling.
*
* @param pos the position of this branch
* @return the position of the next sibling
*/
private int getNextPos(int pos) {
int n = readVarInt(data, pos);
pos += getVarIntLength(data, pos);
if (n < 2 || n > SPLIT_MANY) {
return pos;
}
int split;
if (n == SPLIT_MANY) {
split = readVarInt(data, pos);
pos += getVarIntLength(data, pos);
} else {
split = n;
}
for (int i = 0; i < split; i++) {
pos = getNextPos(pos);
}
return pos;
}
/**
* The sum of the sizes between the start and end position.
*
* @param start the start position
* @param end the end position (excluding)
* @return the sizes
*/
private int getSizeSum(int start, int end) {
int s = 0;
for (int pos = start; pos < end;) {
int n = readVarInt(data, pos);
pos += getVarIntLength(data, pos);
if (n < 2) {
s += n;
} else if (n > SPLIT_MANY) {
s += getSize(n);
} else if (n == SPLIT_MANY) {
pos += getVarIntLength(data, pos);
}
}
return s;
}
private static void writeSizeOffset(ByteStream out, int size,
int offset) {
writeVarInt(out, SIZE_OFFSETS[size] + offset);
}
private static int getOffset(int n, int size) {
return n - SIZE_OFFSETS[size];
}
private static int getSize(int n) {
for (int i = 0; i < SIZE_OFFSETS.length; i++) {
if (n < SIZE_OFFSETS[i]) {
return i - 1;
}
}
return 0;
}
/**
* Generate the minimal perfect hash function data from the given list.
*
* @param list the data
* @return the hash function description
*/
public static <K> byte[] generate(ArrayList<Integer> list) {
ByteStream out = new ByteStream();
generate(list, 0, out);
return out.toByteArray();
}
private static <K> void generate(ArrayList<Integer> list, int level, ByteStream out) {
int size = list.size();
if (size <= 1) {
out.write((byte) size);
return;
}
if (level > 32) {
throw new IllegalStateException("Too many recursions; " +
" incorrect universal hash function?");
}
if (size <= MAX_SIZE) {
int maxOffset = MAX_OFFSETS[size];
int testSize = size;
nextOffset:
for (int offset = 0; offset < maxOffset; offset++) {
int bits = 0;
for (int i = 0; i < size; i++) {
int x = list.get(i);
int h = hash(x, level, offset, testSize);
if ((bits & (1 << h)) != 0) {
continue nextOffset;
}
bits |= 1 << h;
}
writeSizeOffset(out, size, offset);
return;
}
}
int split;
if (size > 57 * DIVIDE) {
split = size / (36 * DIVIDE);
} else {
split = (size - 47) / DIVIDE;
}
split = Math.max(2, split);
ArrayList<ArrayList<Integer>> lists;
do {
lists = new ArrayList<ArrayList<Integer>>(split);
for (int i = 0; i < split; i++) {
lists.add(new ArrayList<Integer>(size / split));
}
for (int x : list) {
ArrayList<Integer> l = lists.get(hash(x, level, 0, split));
l.add(x);
}
} while (lists == null);
if (split >= SPLIT_MANY) {
out.write((byte) SPLIT_MANY);
}
writeVarInt(out, split);
list.clear();
list.trimToSize();
for (ArrayList<Integer> s2 : lists) {
generate(s2, level + 1, out);
}
}
private static int hash(int x, int level, int offset, int size) {
x += level + offset * 32;
x = ((x >>> 16) ^ x) * 0x45d9f3b;
x = ((x >>> 16) ^ x) * 0x45d9f3b;
x = (x >>> 16) ^ x;
return Math.abs(x % size);
}
private static int writeVarInt(ByteStream out, int x) {
int len = 0;
while ((x & ~0x7f) != 0) {
out.write((byte) (0x80 | (x & 0x7f)));
x >>>= 7;
len++;
}
out.write((byte) x);
return ++len;
}
private static int readVarInt(byte[] d, int pos) {
int x = d[pos++];
if (x >= 0) {
return x;
}
x &= 0x7f;
for (int s = 7; s < 64; s += 7) {
int b = d[pos++];
x |= (b & 0x7f) << s;
if (b >= 0) {
break;
}
}
return x;
}
private static int getVarIntLength(byte[] d, int pos) {
int x = d[pos++];
if (x >= 0) {
return 1;
}
int len = 2;
for (int s = 7; s < 64; s += 7) {
int b = d[pos++];
if (b >= 0) {
break;
}
len++;
}
return len;
}
/**
* A stream of bytes.
*/
static class ByteStream {
private byte[] data;
private int pos;
ByteStream() {
this.data = new byte[16];
}
ByteStream(byte[] data) {
this.data = data;
}
/**
* Read a byte.
*
* @return the byte, or -1.
*/
int read() {
return pos < data.length ? (data[pos++] & 255) : -1;
}
/**
* Write a byte.
*
* @param value the byte
*/
void write(byte value) {
if (pos >= data.length) {
data = Arrays.copyOf(data, data.length * 2);
}
data[pos++] = value;
}
/**
* Get the byte array.
*
* @return the byte array
*/
byte[] toByteArray() {
return Arrays.copyOf(data, pos);
}
}
/**
* A helper class for bit arrays.
*/
public static class BitArray {
/**
* Set a bit in the array.
*
* @param data the array
* @param x the bit index
* @param value the new value
* @return the bit array (if the passed one was too small)
*/
public static byte[] setBit(byte[] data, int x, boolean value) {
int pos = x / 8;
if (pos >= data.length) {
data = Arrays.copyOf(data, pos + 1);
}
if (value) {
data[pos] |= 1 << (x & 7);
} else {
data[pos] &= 255 - (1 << (x & 7));
}
return data;
}
/**
* Get a bit in a bit array.
*
* @param data the array
* @param x the bit index
* @return the value
*/
public static boolean getBit(byte[] data, int x) {
return (data[x / 8] & (1 << (x & 7))) != 0;
}
/**
* Count the number of set bits.
*
* @param data the array
* @return the number of set bits
*/
public static int countBits(byte[] data) {
int count = 0;
for (byte x : data) {
count += Integer.bitCount(x & 255);
}
return count;
}
}
}
...@@ -76,6 +76,12 @@ public class MinimalPerfectHash<K> { ...@@ -76,6 +76,12 @@ public class MinimalPerfectHash<K> {
*/ */
private static final int DIVIDE = 6; private static final int DIVIDE = 6;
/**
* For sets larger than this, instead of trying to map then uniquely to a
* set of the same size, the size of the set is incremented by one. This
* reduces the time to find a mapping, but the index of the hole also needs
* to be stored, which increases the space usage.
*/
private static final int SPEEDUP = 11; private static final int SPEEDUP = 11;
/** /**
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论