提交 56b943f8 作者: Thomas Mueller

An archive tool that uses chunk sorting and compression

上级 299c26f5
...@@ -23,7 +23,6 @@ import java.util.ArrayList; ...@@ -23,7 +23,6 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Random;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.zip.Deflater; import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream; import java.util.zip.DeflaterOutputStream;
...@@ -89,6 +88,7 @@ public class ArchiveTool { ...@@ -89,6 +88,7 @@ public class ArchiveTool {
new File(toFile).length() / MB + " MB in " + new File(toFile).length() / MB + " MB in " +
(System.currentTimeMillis() - start) / 1000 + (System.currentTimeMillis() - start) / 1000 +
" seconds"); " seconds");
System.out.println();
} }
private static void extract(String fromFile, String toDir) throws IOException { private static void extract(String fromFile, String toDir) throws IOException {
...@@ -133,12 +133,6 @@ public class ArchiveTool { ...@@ -133,12 +133,6 @@ public class ArchiveTool {
throw new IllegalArgumentException("Not an existing directory: " + dir); throw new IllegalArgumentException("Not an existing directory: " + dir);
} }
// int: metadata length
// byte: 0: directory, 1: file
// long: created
// long lastModified
// (file only) long: file length
// utf-8: file name
return new InputStream() { return new InputStream() {
private final String baseDir; private final String baseDir;
...@@ -172,6 +166,13 @@ public class ArchiveTool { ...@@ -172,6 +166,13 @@ public class ArchiveTool {
} }
} }
// int: metadata length
// byte: 0: directory, 1: file
// byte: 0: read-write, 1: read-only
// varLong: lastModified
// (file only) varLong: file length
// utf-8: file name
@Override @Override
public int read() throws IOException { public int read() throws IOException {
if (meta != null) { if (meta != null) {
...@@ -210,12 +211,13 @@ public class ArchiveTool { ...@@ -210,12 +211,13 @@ public class ArchiveTool {
boolean isFile = f.isFile(); boolean isFile = f.isFile();
out.writeInt(0); out.writeInt(0);
out.write(isFile ? 1 : 0); out.write(isFile ? 1 : 0);
writeVarLong(out, f.lastModified());
out.write(!f.canWrite() ? 1 : 0); out.write(!f.canWrite() ? 1 : 0);
writeVarLong(out, f.lastModified());
if (isFile) { if (isFile) {
remaining = f.length(); remaining = f.length();
writeVarLong(out, remaining); writeVarLong(out, remaining);
fileIn = new DataInputStream(new BufferedInputStream(new FileInputStream(current))); fileIn = new DataInputStream(new BufferedInputStream(
new FileInputStream(current)));
} }
if (!current.startsWith(baseDir)) { if (!current.startsWith(baseDir)) {
throw new IOException("File " + current + " does not start with " + baseDir); throw new IOException("File " + current + " does not start with " + baseDir);
...@@ -259,12 +261,6 @@ public class ArchiveTool { ...@@ -259,12 +261,6 @@ public class ArchiveTool {
private long modified; private long modified;
private boolean readOnly; private boolean readOnly;
// byte: 0: directory, 1: file
// long lastModified
// byte: 0: read-write, 2: read-only
// (file only) long: file length
// utf-8: file name
@Override @Override
public void write(byte[] buff, int offset, int length) throws IOException { public void write(byte[] buff, int offset, int length) throws IOException {
while (length > 0) { while (length > 0) {
...@@ -303,7 +299,8 @@ public class ArchiveTool { ...@@ -303,7 +299,8 @@ public class ArchiveTool {
if (--remaining > 0) { if (--remaining > 0) {
return; return;
} }
DataInputStream in = new DataInputStream(new ByteArrayInputStream(meta.toByteArray())); DataInputStream in = new DataInputStream(
new ByteArrayInputStream(meta.toByteArray()));
if (meta.size() == 4) { if (meta.size() == 4) {
// metadata is next // metadata is next
remaining = in.readInt() - 4; remaining = in.readInt() - 4;
...@@ -315,8 +312,8 @@ public class ArchiveTool { ...@@ -315,8 +312,8 @@ public class ArchiveTool {
// read and ignore the length // read and ignore the length
in.readInt(); in.readInt();
boolean isFile = in.read() == 1; boolean isFile = in.read() == 1;
modified = readVarLong(in);
readOnly = in.read() == 1; readOnly = in.read() == 1;
modified = readVarLong(in);
if (isFile) { if (isFile) {
remaining = readVarLong(in); remaining = readVarLong(in);
} else { } else {
...@@ -343,32 +340,22 @@ public class ArchiveTool { ...@@ -343,32 +340,22 @@ public class ArchiveTool {
}; };
} }
private static void sort(InputStream in, OutputStream out, String tempFileName, long size) throws IOException { private static void sort(InputStream in, OutputStream out,
String tempFileName, long size) throws IOException {
long lastTime = System.currentTimeMillis();
int bufferSize = 16 * 1024 * 1024; int bufferSize = 16 * 1024 * 1024;
int[] random = new int[256];
Random r = new Random(1);
for (int i = 0; i < random.length; i++) {
random[i] = r.nextInt();
}
DataOutputStream tempOut = new DataOutputStream(new BufferedOutputStream( DataOutputStream tempOut = new DataOutputStream(new BufferedOutputStream(
new FileOutputStream(tempFileName))); new FileOutputStream(tempFileName)));
// TODO document
// temp
// segment1: pos [, pos..., 0], hash, chunk1,..., 0
// segment2: pos [, pos..., 0], hash, chunk1,..., 0
// (compare by hash, value.length, value data,
// so hash conflicts are not a problem anywhere)
// out
// pos [,pos..., 0] chunk1, pos [,pos..., 0] chunk2,..., 0
byte[] bytes = new byte[bufferSize]; byte[] bytes = new byte[bufferSize];
ArrayList<Long> segmentStart = new ArrayList<Long>(); ArrayList<Long> segmentStart = new ArrayList<Long>();
long inPos = 0; long inPos = 0;
long outPos = 0; long outPos = 0;
long id = 1; long id = 1;
long lastTime = System.currentTimeMillis();
// Temp file: segment* 0
// Segment: chunk* 0
// Chunk: pos* 0 sortKey data
while (true) { while (true) {
int len = readFully(in, bytes, bytes.length); int len = readFully(in, bytes, bytes.length);
if (len == 0) { if (len == 0) {
...@@ -378,7 +365,7 @@ public class ArchiveTool { ...@@ -378,7 +365,7 @@ public class ArchiveTool {
lastTime = printProgress(lastTime, 0, 50, inPos, size); lastTime = printProgress(lastTime, 0, 50, inPos, size);
TreeMap<Chunk, Chunk> map = new TreeMap<Chunk, Chunk>(); TreeMap<Chunk, Chunk> map = new TreeMap<Chunk, Chunk>();
for (int pos = 0; pos < len;) { for (int pos = 0; pos < len;) {
int[] key = getKey(random, bytes, pos, len); int[] key = getKey(bytes, pos, len);
int l = key[3]; int l = key[3];
byte[] buff = new byte[l]; byte[] buff = new byte[l];
System.arraycopy(bytes, pos, buff, 0, l); System.arraycopy(bytes, pos, buff, 0, l);
...@@ -417,10 +404,15 @@ public class ArchiveTool { ...@@ -417,10 +404,15 @@ public class ArchiveTool {
segmentIn.add(s); segmentIn.add(s);
} }
} }
DataOutputStream dataOut = new DataOutputStream(out); DataOutputStream dataOut = new DataOutputStream(out);
dataOut.write(HEADER); dataOut.write(HEADER);
writeVarLong(dataOut, size); writeVarLong(dataOut, size);
Chunk last = null; Chunk last = null;
// File: header length chunk* 0
// chunk: pos* 0 data
while (segmentIn.size() > 0) { while (segmentIn.size() > 0) {
Collections.sort(segmentIn); Collections.sort(segmentIn);
ChunkStream s = segmentIn.get(0); ChunkStream s = segmentIn.get(0);
...@@ -449,7 +441,16 @@ public class ArchiveTool { ...@@ -449,7 +441,16 @@ public class ArchiveTool {
dataOut.flush(); dataOut.flush();
} }
public static int readFully(InputStream in, byte[] buffer, int max) /**
* Read a number of bytes. This method repeats reading until
* either the bytes have been read, or EOF.
*
* @param in the input stream
* @param buffer the target buffer
* @param max the number of bytes to read
* @return the number of bytes read (max unless EOF has been reached)
*/
private static int readFully(InputStream in, byte[] buffer, int max)
throws IOException { throws IOException {
int result = 0, len = Math.min(max, buffer.length); int result = 0, len = Math.min(max, buffer.length);
while (len > 0) { while (len > 0) {
...@@ -466,21 +467,16 @@ public class ArchiveTool { ...@@ -466,21 +467,16 @@ public class ArchiveTool {
/** /**
* Get the sort key and length of a chunk. * Get the sort key and length of a chunk.
*/ */
private static int[] getKey(int[] random, byte[] data, int start, int maxPos) { private static int[] getKey(byte[] data, int start, int maxPos) {
int minLen = 4 * 1024; int minLen = 4 * 1024;
int mask = 4 * 1024 - 1; int mask = 4 * 1024 - 1;
int factor = 31;
int hash = 0, mul = 1, offset = 8;
int min = Integer.MAX_VALUE; int min = Integer.MAX_VALUE;
int max = Integer.MIN_VALUE; int max = Integer.MIN_VALUE;
int pos = start; int pos = start;
long bytes = 0;
for (int j = 0; pos < maxPos; pos++, j++) { for (int j = 0; pos < maxPos; pos++, j++) {
hash = hash * factor + random[data[pos] & 255]; bytes = (bytes << 8) | (data[pos] & 255);
if (j >= offset) { int hash = getHash(bytes);
hash -= mul * random[data[pos - offset] & 255];
} else {
mul *= factor;
}
if (hash < min) { if (hash < min) {
min = hash; min = hash;
} }
...@@ -488,10 +484,13 @@ public class ArchiveTool { ...@@ -488,10 +484,13 @@ public class ArchiveTool {
max = hash; max = hash;
} }
if (j > minLen) { if (j > minLen) {
if (j > minLen * 4) { if ((hash & mask) == 1) {
break; break;
} }
if ((hash & mask) == 1) { if (j > minLen * 4 && (hash & (mask >> 1)) == 1) {
break;
}
if (j > minLen * 16) {
break; break;
} }
} }
...@@ -517,12 +516,26 @@ public class ArchiveTool { ...@@ -517,12 +516,26 @@ public class ArchiveTool {
return key; return key;
} }
private static void combine(InputStream in, OutputStream out, String tempFileName) throws IOException { private static int getHash(long key) {
// Mixes a 64-bit value down to a 32-bit hash.
// In this file, getKey passes the rolling value built from the most
// recently read data bytes (each byte is shifted in with "<< 8").
//
// @param key the 64-bit value to hash
// @return the 32-bit hash

// fold the upper 32 bits into the lower 32 bits, then truncate to int
int hash = (int) ((key >>> 32) ^ key);
// two rounds of: xor the upper half into the lower half, then multiply
// by an odd constant (int overflow wraps around, which is intended here)
hash = ((hash >>> 16) ^ hash) * 0x45d9f3b;
hash = ((hash >>> 16) ^ hash) * 0x45d9f3b;
// final xor-shift, so the low bits also depend on the high bits of the
// last product (getKey tests the low bits via "hash & mask")
hash = (hash >>> 16) ^ hash;
return hash;
}
private static void combine(InputStream in, OutputStream out,
String tempFileName) throws IOException {
long lastTime = System.currentTimeMillis();
int bufferSize = 16 * 1024 * 1024; int bufferSize = 16 * 1024 * 1024;
DataOutputStream tempOut = DataOutputStream tempOut =
new DataOutputStream( new DataOutputStream(
new BufferedOutputStream( new BufferedOutputStream(
new FileOutputStream(tempFileName))); new FileOutputStream(tempFileName)));
// File: header length chunk* 0
// chunk: pos* 0 data
DataInputStream dataIn = new DataInputStream(in); DataInputStream dataIn = new DataInputStream(in);
byte[] header = new byte[4]; byte[] header = new byte[4];
dataIn.readFully(header); dataIn.readFully(header);
...@@ -530,18 +543,15 @@ public class ArchiveTool { ...@@ -530,18 +543,15 @@ public class ArchiveTool {
throw new IOException("Invalid header"); throw new IOException("Invalid header");
} }
long size = readVarLong(dataIn); long size = readVarLong(dataIn);
// out
// pos [,pos..., 0] chunk1, pos [,pos..., 0] chunk2,..., 0
// temp-exp
// segment1: pos1, chunk, pos3, chunk, pos5, chunk, 0
// segment2: pos2, chunk, pos4, chunk, 0
long outPos = 0; long outPos = 0;
long inPos = 0; long inPos = 0;
ArrayList<Long> segmentStart = new ArrayList<Long>(); ArrayList<Long> segmentStart = new ArrayList<Long>();
long lastTime = System.currentTimeMillis();
boolean end = false; boolean end = false;
// Temp file: segment* 0
// Segment: chunk* 0
// Chunk: pos* 0 data
while (!end) { while (!end) {
int segmentSize = 0; int segmentSize = 0;
TreeMap<Long, byte[]> map = new TreeMap<Long, byte[]>(); TreeMap<Long, byte[]> map = new TreeMap<Long, byte[]>();
...@@ -611,6 +621,11 @@ public class ArchiveTool { ...@@ -611,6 +621,11 @@ public class ArchiveTool {
DataInputStream in; DataInputStream in;
boolean readKey; boolean readKey;
/**
* Read the next chunk.
*
* @return the number of bytes read
*/
int readNext() throws IOException { int readNext() throws IOException {
current = Chunk.read(in, readKey); current = Chunk.read(in, readKey);
if (current == null) { if (current == null) {
...@@ -630,8 +645,8 @@ public class ArchiveTool { ...@@ -630,8 +645,8 @@ public class ArchiveTool {
*/ */
static class Chunk implements Comparable<Chunk> { static class Chunk implements Comparable<Chunk> {
ArrayList<Long> idList; ArrayList<Long> idList;
int[] sortKey; final byte[] value;
byte[] value; private int[] sortKey;
Chunk(ArrayList<Long> idList, int[] sortKey, byte[] value) { Chunk(ArrayList<Long> idList, int[] sortKey, byte[] value) {
this.idList = idList; this.idList = idList;
...@@ -639,6 +654,13 @@ public class ArchiveTool { ...@@ -639,6 +654,13 @@ public class ArchiveTool {
this.value = value; this.value = value;
} }
/**
* Read a chunk.
*
* @param in the input stream
* @param readKey whether to read the sort key
* @return the chunk, or null if 0 has been read
*/
public static Chunk read(DataInputStream in, boolean readKey) throws IOException { public static Chunk read(DataInputStream in, boolean readKey) throws IOException {
ArrayList<Long> idList = new ArrayList<Long>(); ArrayList<Long> idList = new ArrayList<Long>();
while (true) { while (true) {
...@@ -665,6 +687,13 @@ public class ArchiveTool { ...@@ -665,6 +687,13 @@ public class ArchiveTool {
return new Chunk(idList, key, value); return new Chunk(idList, key, value);
} }
/**
* Write a chunk.
*
* @param out the output stream
* @param writeKey whether to write the sort key
* @return the number of bytes written
*/
int write(DataOutputStream out, boolean writeKey) throws IOException { int write(DataOutputStream out, boolean writeKey) throws IOException {
int len = 0; int len = 0;
for (long x : idList) { for (long x : idList) {
...@@ -720,7 +749,14 @@ public class ArchiveTool { ...@@ -720,7 +749,14 @@ public class ArchiveTool {
} }
} }
public static int writeVarLong(OutputStream out, long x) /**
* Write a variable size long value.
*
* @param out the output stream
* @param x the value
* @return the number of bytes written
*/
static int writeVarLong(OutputStream out, long x)
throws IOException { throws IOException {
int len = 0; int len = 0;
while ((x & ~0x7f) != 0) { while ((x & ~0x7f) != 0) {
...@@ -732,7 +768,13 @@ public class ArchiveTool { ...@@ -732,7 +768,13 @@ public class ArchiveTool {
return ++len; return ++len;
} }
public static long readVarLong(InputStream in) throws IOException { /**
* Read a variable size long value.
*
* @param in the input stream
* @return the value
*/
static long readVarLong(InputStream in) throws IOException {
long x = in.read(); long x = in.read();
if (x < 0) { if (x < 0) {
throw new EOFException(); throw new EOFException();
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论