提交 7dcfa24a authored 作者: Thomas Mueller's avatar Thomas Mueller

A minimal perfect hash function tool: test with real-world data, improved error…

A minimal perfect hash function tool: test with real-world data, improved error detection and handling
上级 c1010fba
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
*/ */
package org.h2.test.unit; package org.h2.test.unit;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.BitSet; import java.util.BitSet;
import java.util.HashSet; import java.util.HashSet;
import java.util.Random; import java.util.Random;
...@@ -29,9 +31,43 @@ public class TestPerfectHash extends TestBase { ...@@ -29,9 +31,43 @@ public class TestPerfectHash extends TestBase {
*/ */
public static void main(String... a) throws Exception { public static void main(String... a) throws Exception {
TestPerfectHash test = (TestPerfectHash) TestBase.createCaller().init(); TestPerfectHash test = (TestPerfectHash) TestBase.createCaller().init();
largeFile();
test.test(); test.test();
test.measure(); test.measure();
} }
/**
 * Generate a minimal perfect hash function for a large, real-world data
 * set and print the size of the resulting description in bits per key.
 * The input file is read fully into memory; each line (terminated by a
 * newline character) is one key.
 *
 * @throws IOException if the file can not be read, or if it is too large
 *             to fit into a single byte array
 */
private static void largeFile() throws IOException {
    String fileName = System.getProperty("user.home") + "/temp/enwiki-20140811-all-titles.txt";
    byte[] data;
    RandomAccessFile f = new RandomAccessFile(fileName, "r");
    try {
        long length = f.length();
        // a Java array can not hold more than Integer.MAX_VALUE entries;
        // fail explicitly instead of silently truncating the length
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File too large: " + fileName +
                    " (" + length + " bytes)");
        }
        data = new byte[(int) length];
        f.readFully(data);
    } finally {
        // always release the file handle, also if reading failed
        f.close();
    }
    UniversalHash<Text> hf = new UniversalHash<Text>() {
        @Override
        public int hashCode(Text o, int index, int seed) {
            return o.hashCode(index, seed);
        }
    };
    HashSet<Text> set = new HashSet<Text>();
    Text t = new Text(data, 0);
    while (true) {
        set.add(t);
        int end = t.getEnd();
        // stop once the terminator of the current line is at
        // (or beyond) the last byte of the data
        if (end >= data.length - 1) {
            break;
        }
        // the next line starts just after the terminator
        t = new Text(data, end + 1);
        if (set.size() % 1000000 == 0) {
            System.out.println("size: " + set.size());
        }
    }
    System.out.println("size: " + set.size());
    byte[] desc = MinimalPerfectHash.generate(set, hf);
    System.out.println("len: " + desc.length);
    int bits = desc.length * 8;
    System.out.println(((double) bits / set.size()) + " bits/key");
}
/** /**
* Measure the hash functions. * Measure the hash functions.
...@@ -182,5 +218,75 @@ public class TestPerfectHash extends TestBase { ...@@ -182,5 +218,75 @@ public class TestPerfectHash extends TestBase {
} }
return max; return max;
} }
/**
 * A text: one line within a shared byte array. The line starts at the
 * given position and ends just before the next newline character, or at
 * the end of the array if there is no further newline. The bytes are not
 * copied, so many instances can share the same array.
 */
static class Text {

    /**
     * The byte array that contains this line (shared, not copied).
     */
    final byte[] data;

    /**
     * The index of the first byte of this line within the array.
     */
    final int start;

    Text(byte[] data, int start) {
        this.data = data;
        this.start = start;
    }

    /**
     * Calculate the universal hash value of this line.
     * <p>
     * For the first four indexes, a simple multiplicative hash over the
     * bytes of the line is used. For higher indexes, the SipHash
     * implementation of StringHash is called, with the index and the seed
     * passed as the two key arguments.
     *
     * @param index the hash function index
     * @param seed the seed
     * @return the hash value
     */
    public int hashCode(int index, int seed) {
        if (index < 4) {
            int result = 0;
            int x = seed + index;
            // the bounds check ensures a last line without trailing
            // newline does not result in an ArrayIndexOutOfBoundsException
            for (int i = start; i < data.length && data[i] != '\n'; i++) {
                x = 31 + x * 0x9f3b;
                result += x * (1 + (data[i] & 255));
            }
            return result;
        }
        int end = getEnd();
        return StringHash.getSipHash24(data, start, end, index, seed);
    }

    /**
     * Get the end position of this line.
     *
     * @return the index of the newline character that terminates this
     *         line, or the length of the array if there is none
     */
    int getEnd() {
        int end = start;
        while (end < data.length && data[end] != '\n') {
            end++;
        }
        return end;
    }

    @Override
    public int hashCode() {
        return hashCode(0, 0);
    }

    /**
     * Two texts are equal if their lines have the same length and contain
     * the same bytes; they do not need to share the same array or start
     * position.
     */
    @Override
    public boolean equals(Object other) {
        if (other == this) {
            return true;
        } else if (!(other instanceof Text)) {
            return false;
        }
        Text o = (Text) other;
        int end = getEnd();
        int s2 = o.start;
        int e2 = o.getEnd();
        if (e2 - s2 != end - start) {
            return false;
        }
        for (int s1 = start; s1 < end; s1++, s2++) {
            if (data[s1] != o.data[s2]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Convert the bytes of this line (without the terminator) to a string,
     * using the default charset of the platform.
     */
    @Override
    public String toString() {
        return new String(data, start, getEnd() - start);
    }

}
} }
...@@ -11,11 +11,13 @@ import java.nio.charset.Charset; ...@@ -11,11 +11,13 @@ import java.nio.charset.Charset;
import java.security.SecureRandom; import java.security.SecureRandom;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.zip.Deflater; import java.util.zip.Deflater;
import java.util.zip.Inflater; import java.util.zip.Inflater;
/** /**
* A minimal perfect hash function tool. It needs about 2.0 bits per key. * A minimal perfect hash function tool. It needs about 1.98 bits per key.
* <p> * <p>
* The algorithm is recursive: sets that contain no or only one entry are not * The algorithm is recursive: sets that contain no or only one entry are not
* processed as no conflicts are possible. For sets that contain between 2 and * processed as no conflicts are possible. For sets that contain between 2 and
...@@ -57,7 +59,8 @@ import java.util.zip.Inflater; ...@@ -57,7 +59,8 @@ import java.util.zip.Inflater;
* hash functions are called. It is fine to use the regular hashCode method as * hash functions are called. It is fine to use the regular hashCode method as
* the level 0 hash function. However, just relying on the regular hashCode * the level 0 hash function. However, just relying on the regular hashCode
* method does not work if the key has more than 32 bits, because the risk of * method does not work if the key has more than 32 bits, because the risk of
* collisions is too high. * collisions is too high. Incorrect universal hash functions are detected (an
* exception is thrown if there are more than 32 recursion levels).
* <p> * <p>
* In-place updating of the hash table is not implemented but possible in * In-place updating of the hash table is not implemented but possible in
* theory, by patching the hash function description. With a small change, * theory, by patching the hash function description. With a small change,
...@@ -330,6 +333,10 @@ public class MinimalPerfectHash<K> { ...@@ -330,6 +333,10 @@ public class MinimalPerfectHash<K> {
out.write(size); out.write(size);
return; return;
} }
if (level > 32) {
throw new IllegalStateException("Too many recursions; " +
" incorrect universal hash function?");
}
if (size <= MAX_SIZE) { if (size <= MAX_SIZE) {
int maxOffset = MAX_OFFSETS[size]; int maxOffset = MAX_OFFSETS[size];
int[] hashes = new int[size]; int[] hashes = new int[size];
...@@ -407,23 +414,31 @@ public class MinimalPerfectHash<K> { ...@@ -407,23 +414,31 @@ public class MinimalPerfectHash<K> {
new ArrayList<ByteArrayOutputStream>(); new ArrayList<ByteArrayOutputStream>();
int processors = Runtime.getRuntime().availableProcessors(); int processors = Runtime.getRuntime().availableProcessors();
Thread[] threads = new Thread[processors]; Thread[] threads = new Thread[processors];
final AtomicInteger success = new AtomicInteger();
final AtomicReference<Exception> failure = new AtomicReference<Exception>();
for (int i = 0; i < processors; i++) { for (int i = 0; i < processors; i++) {
threads[i] = new Thread() { threads[i] = new Thread() {
@Override @Override
public void run() { public void run() {
while (true) { try {
ArrayList<K> list; while (true) {
ByteArrayOutputStream temp = ArrayList<K> list;
new ByteArrayOutputStream(); ByteArrayOutputStream temp =
synchronized (lists) { new ByteArrayOutputStream();
if (lists.isEmpty()) { synchronized (lists) {
break; if (lists.isEmpty()) {
break;
}
list = lists.remove(0);
outList.add(temp);
} }
list = lists.remove(0); generate(list, hash, level + 1, seed, temp);
outList.add(temp);
} }
generate(list, hash, level + 1, seed, temp); } catch (Exception e) {
failure.set(e);
return;
} }
success.incrementAndGet();
} }
}; };
} }
...@@ -434,6 +449,13 @@ public class MinimalPerfectHash<K> { ...@@ -434,6 +449,13 @@ public class MinimalPerfectHash<K> {
for (Thread t : threads) { for (Thread t : threads) {
t.join(); t.join();
} }
if (success.get() != threads.length) {
Exception e = failure.get();
if (e != null) {
throw new RuntimeException(e);
}
throw new RuntimeException("Unknown failure in one thread");
}
for (ByteArrayOutputStream temp : outList) { for (ByteArrayOutputStream temp : outList) {
out.write(temp.toByteArray()); out.write(temp.toByteArray());
} }
...@@ -657,18 +679,33 @@ public class MinimalPerfectHash<K> { ...@@ -657,18 +679,33 @@ public class MinimalPerfectHash<K> {
* @return the hash value * @return the hash value
*/ */
public static int getSipHash24(String o, long k0, long k1) { public static int getSipHash24(String o, long k0, long k1) {
byte[] b = o.getBytes(UTF8);
return getSipHash24(b, 0, b.length, k0, k1);
}
/**
* A cryptographically relatively secure hash function. It is supposed
* to protect against hash-flooding denial-of-service attacks.
*
* @param b the data
* @param start the start position
* @param end the end position plus one
* @param k0 key 0
* @param k1 key 1
* @return the hash value
*/
public static int getSipHash24(byte[] b, int start, int end, long k0, long k1) {
long v0 = k0 ^ 0x736f6d6570736575L; long v0 = k0 ^ 0x736f6d6570736575L;
long v1 = k1 ^ 0x646f72616e646f6dL; long v1 = k1 ^ 0x646f72616e646f6dL;
long v2 = k0 ^ 0x6c7967656e657261L; long v2 = k0 ^ 0x6c7967656e657261L;
long v3 = k1 ^ 0x7465646279746573L; long v3 = k1 ^ 0x7465646279746573L;
byte[] b = o.getBytes(UTF8); int repeat;
int len = b.length, repeat; for (int off = start; off <= end + 8; off += 8) {
for (int off = 0; off <= len + 8; off += 8) {
long m; long m;
if (off <= len) { if (off <= end) {
m = 0; m = 0;
int i = 0; int i = 0;
for (; i < 8 && off + i < len; i++) { for (; i < 8 && off + i < end; i++) {
m |= ((long) b[off + i] & 255) << (8 * i); m |= ((long) b[off + i] & 255) << (8 * i);
} }
if (i < 8) { if (i < 8) {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论