提交 4307ec5b authored 作者: Thomas Mueller's avatar Thomas Mueller

An archive tool that uses compression and de-duplication (compresses better than…

An archive tool that uses compression and de-duplication (compresses better than 7zip / pkzip for large directories in many cases)
上级 5832f043
...@@ -346,7 +346,7 @@ public class FileStore { ...@@ -346,7 +346,7 @@ public class FileStore {
return freeSpace.getFillRate(); return freeSpace.getFillRate();
} }
public long getFirstFree() { long getFirstFree() {
return freeSpace.getFirstFree(); return freeSpace.getFirstFree();
} }
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
package org.h2.table; package org.h2.table;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.sql.DatabaseMetaData; import java.sql.DatabaseMetaData;
...@@ -958,6 +959,18 @@ public class MetaTable extends Table { ...@@ -958,6 +959,18 @@ public class MetaTable extends Table {
fs.getWriteCount()); fs.getWriteCount());
add(rows, "info.FILE_READ", "" + add(rows, "info.FILE_READ", "" +
fs.getReadCount()); fs.getReadCount());
long size;
try {
size = fs.getFile().size();
} catch (IOException e) {
throw DbException.convertIOException(e, "Can not get size");
}
int pageSize = 4 * 1024;
long pageCount = size / pageSize;
add(rows, "info.PAGE_COUNT", "" +
pageCount);
add(rows, "info.PAGE_SIZE", "" +
pageSize);
add(rows, "info.CACHE_MAX_SIZE", "" + add(rows, "info.CACHE_MAX_SIZE", "" +
mvStore.getStore().getCacheSize()); mvStore.getStore().getCacheSize());
add(rows, "info.CACHE_SIZE", "" + add(rows, "info.CACHE_SIZE", "" +
......
...@@ -17,7 +17,6 @@ import java.util.Collections; ...@@ -17,7 +17,6 @@ import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Random; import java.util.Random;
import java.util.TreeMap;
import org.h2.mvstore.Cursor; import org.h2.mvstore.Cursor;
import org.h2.mvstore.DataUtils; import org.h2.mvstore.DataUtils;
...@@ -27,12 +26,12 @@ import org.h2.store.fs.FileUtils; ...@@ -27,12 +26,12 @@ import org.h2.store.fs.FileUtils;
import org.h2.util.New; import org.h2.util.New;
/** /**
* An archive tool that de-duplicates and compresses files. * An archiver tool that can compress directories.
*/ */
public class Dedup { public class Archiver {
private static final int[] RANDOM = new int[256]; private static final int[] RANDOM = new int[256];
private static final int MB = 1024 * 1024; private static final int MB = 1000 * 1000;
private long lastTime; private long lastTime;
private long start; private long start;
private int bucket; private int bucket;
...@@ -51,7 +50,7 @@ public class Dedup { ...@@ -51,7 +50,7 @@ public class Dedup {
* @param args the command line arguments * @param args the command line arguments
*/ */
public static void main(String... args) throws Exception { public static void main(String... args) throws Exception {
Dedup app = new Dedup(); Archiver app = new Archiver();
String arg = args.length != 3 ? null : args[0]; String arg = args.length != 3 ? null : args[0];
if ("-compress".equals(arg)) { if ("-compress".equals(arg)) {
app.fileName = args[1]; app.fileName = args[1];
...@@ -65,7 +64,7 @@ public class Dedup { ...@@ -65,7 +64,7 @@ public class Dedup {
System.out.println("-extract <file> <targetDir>"); System.out.println("-extract <file> <targetDir>");
} }
} }
private void compress(String sourceDir) throws Exception { private void compress(String sourceDir) throws Exception {
start(); start();
long tempSize = 8 * 1024 * 1024; long tempSize = 8 * 1024 * 1024;
...@@ -80,16 +79,15 @@ public class Dedup { ...@@ -80,16 +79,15 @@ public class Dedup {
FileUtils.delete(fileName); FileUtils.delete(fileName);
MVStore storeTemp = new MVStore.Builder(). MVStore storeTemp = new MVStore.Builder().
fileName(tempFileName). fileName(tempFileName).
compress().
autoCommitDisabled(). autoCommitDisabled().
open(); open();
MVStore store = new MVStore.Builder(). final MVStore store = new MVStore.Builder().
fileName(fileName). fileName(fileName).
pageSplitSize(2 * 1024 * 1024). pageSplitSize(2 * 1024 * 1024).
compressHigh(). compressHigh().
autoCommitDisabled(). autoCommitDisabled().
open(); open();
MVMap<String, int[]> files = store.openMap("files"); MVMap<String, int[]> filesTemp = storeTemp.openMap("files");
long currentSize = 0; long currentSize = 0;
int segmentId = 1; int segmentId = 1;
int segmentLength = 0; int segmentLength = 0;
...@@ -98,7 +96,7 @@ public class Dedup { ...@@ -98,7 +96,7 @@ public class Dedup {
String name = s.substring(sourceDir.length() + 1); String name = s.substring(sourceDir.length() + 1);
if (FileUtils.isDirectory(s)) { if (FileUtils.isDirectory(s)) {
// directory // directory
files.put(name, new int[1]); filesTemp.put(name, new int[1]);
continue; continue;
} }
buff.clear(); buff.clear();
...@@ -154,7 +152,7 @@ public class Dedup { ...@@ -154,7 +152,7 @@ public class Dedup {
segmentId++; segmentId++;
segmentLength = 0; segmentLength = 0;
} }
printProgress("Stage 1/2", currentSize, totalSize); printProgress(0, 50, currentSize, totalSize);
} }
} finally { } finally {
fc.close(); fc.close();
...@@ -163,10 +161,9 @@ public class Dedup { ...@@ -163,10 +161,9 @@ public class Dedup {
for (int i = 0; i < posList.size(); i++) { for (int i = 0; i < posList.size(); i++) {
posArray[i] = posList.get(i); posArray[i] = posList.get(i);
} }
files.put(name, posArray); filesTemp.put(name, posArray);
} }
storeTemp.commit(); storeTemp.commit();
store.commit();
ArrayList<Cursor<int[], byte[]>> list = New.arrayList(); ArrayList<Cursor<int[], byte[]>> list = New.arrayList();
totalSize = 0; totalSize = 0;
for (int i = 1; i <= segmentId; i++) { for (int i = 1; i <= segmentId; i++) {
...@@ -181,8 +178,8 @@ public class Dedup { ...@@ -181,8 +178,8 @@ public class Dedup {
segmentId = 1; segmentId = 1;
segmentLength = 0; segmentLength = 0;
currentSize = 0; currentSize = 0;
TreeMap<Integer, int[]> ranges = new TreeMap<Integer, int[]>();
MVMap<int[], byte[]> data = store.openMap("data" + segmentId); MVMap<int[], byte[]> data = store.openMap("data" + segmentId);
MVMap<int[], Boolean> keepSegment = storeTemp.openMap("keep");
while (list.size() > 0) { while (list.size() > 0) {
Collections.sort(list, new Comparator<Cursor<int[], byte[]>>() { Collections.sort(list, new Comparator<Cursor<int[], byte[]>>() {
...@@ -212,10 +209,10 @@ public class Dedup { ...@@ -212,10 +209,10 @@ public class Dedup {
byte[] bytes = top.getValue(); byte[] bytes = top.getValue();
int[] k2 = Arrays.copyOf(key, key.length); int[] k2 = Arrays.copyOf(key, key.length);
k2[key.length - 1] = 0; k2[key.length - 1] = 0;
// TODO this lookup can be avoided
// if we remember the last entry with k[..] = 0
byte[] old = data.get(k2); byte[] old = data.get(k2);
if (old == null) { if (old == null) {
key = k2;
// new entry
if (segmentLength > tempSize) { if (segmentLength > tempSize) {
// switch only for new entries // switch only for new entries
// where segmentId is 0, // where segmentId is 0,
...@@ -225,15 +222,18 @@ public class Dedup { ...@@ -225,15 +222,18 @@ public class Dedup {
store.commit(); store.commit();
segmentLength = 0; segmentLength = 0;
segmentId++; segmentId++;
ranges.put(segmentId, key);
data = store.openMap("data" + segmentId); data = store.openMap("data" + segmentId);
} }
key = k2;
// new entry
data.put(key, bytes); data.put(key, bytes);
segmentLength += bytes.length; segmentLength += bytes.length;
} else if (Arrays.equals(old, bytes)) { } else if (Arrays.equals(old, bytes)) {
// duplicate // duplicate
} else { } else {
// almost a duplicate:
// keep segment id // keep segment id
keepSegment.put(key, Boolean.TRUE);
data.put(key, bytes); data.put(key, bytes);
segmentLength += bytes.length; segmentLength += bytes.length;
} }
...@@ -243,30 +243,49 @@ public class Dedup { ...@@ -243,30 +243,49 @@ public class Dedup {
top.next(); top.next();
} }
currentSize++; currentSize++;
printProgress("Stage 2/2", currentSize, totalSize); printProgress(50, 100, currentSize, totalSize);
} }
MVMap<int[], Integer> rangeMap = store.openMap("ranges"); MVMap<String, int[]> files = store.openMap("files");
for (Entry<Integer, int[]> range : ranges.entrySet()) { for (Entry<String, int[]> e : filesTemp.entrySet()) {
rangeMap.put(range.getValue(), range.getKey()); String k = e.getKey();
int[] ids = e.getValue();
if (ids.length == 1) {
// directory
files.put(k, ids);
continue;
}
int[] newIds = Arrays.copyOf(ids, ids.length);
for (int i = 0; i < ids.length; i += 4) {
int[] id = new int[4];
id[0] = ids[i];
id[1] = ids[i + 1];
id[2] = ids[i + 2];
id[3] = ids[i + 3];
if (!keepSegment.containsKey(id)) {
newIds[i + 3] = 0;
}
}
files.put(k, newIds);
} }
store.commit();
storeTemp.close(); storeTemp.close();
FileUtils.delete(tempFileName); FileUtils.delete(tempFileName);
store.close(); store.close();
System.out.println();
System.out.println("Compressed to " + System.out.println("Compressed to " +
FileUtils.size(fileName) / MB + " MB"); FileUtils.size(fileName) / MB + " MB");
printDone(); printDone();
} }
private void start() { private void start() {
this.start = System.currentTimeMillis(); this.start = System.currentTimeMillis();
this.lastTime = start; this.lastTime = start;
} }
private void printProgress(String message, long current, long total) { private void printProgress(int low, int high, long current, long total) {
long now = System.currentTimeMillis(); long now = System.currentTimeMillis();
if (now - lastTime > 5000) { if (now - lastTime > 5000) {
System.out.println(message + ": " + System.out.print((low + (high - low) * current / total) + "% ");
100 * current / total + "%");
lastTime = now; lastTime = now;
} }
} }
...@@ -292,96 +311,172 @@ public class Dedup { ...@@ -292,96 +311,172 @@ public class Dedup {
private void expand(String targetDir) throws Exception { private void expand(String targetDir) throws Exception {
start(); start();
long tempSize = 8 * 1024 * 1024;
String tempFileName = fileName + ".temp"; String tempFileName = fileName + ".temp";
long chunkSize = 8 * 1024 * 1024;
FileUtils.createDirectories(targetDir); FileUtils.createDirectories(targetDir);
MVStore store = new MVStore.Builder(). MVStore store = new MVStore.Builder().
fileName(fileName).open(); fileName(fileName).open();
MVMap<String, int[]> files = store.openMap("files"); MVMap<String, int[]> files = store.openMap("files");
System.out.println("Extracting " + files.size() + " files"); System.out.println("Extracting " + files.size() + " files");
MVMap<int[], Integer> ranges = store.openMap("ranges");
int lastSegment;
if (ranges.size() == 0) {
lastSegment = 1;
} else {
lastSegment = ranges.get(ranges.lastKey());
}
MVStore storeTemp = null; MVStore storeTemp = null;
FileUtils.delete(tempFileName); FileUtils.delete(tempFileName);
long totalSize = 0;
int lastSegment = 0;
for (int i = 1;; i++) {
if (!store.hasMap("data" + i)) {
lastSegment = i - 1;
break;
}
}
storeTemp = new MVStore.Builder(). storeTemp = new MVStore.Builder().
fileName(tempFileName). fileName(tempFileName).
cacheSize(64).
autoCommitDisabled(). autoCommitDisabled().
open(); open();
MVMap<int[], byte[]> data = storeTemp.openMap("data");
long chunkLength = 0; MVMap<Integer, String> fileNames = storeTemp.openMap("fileNames");
long totalSize = 0;
for (int i = 1; i <= lastSegment; i++) { MVMap<String, int[]> filesTemp = storeTemp.openMap("files");
MVMap<int[], byte[]> segmentData = store.openMap("data" + i); int fileId = 0;
totalSize += segmentData.sizeAsLong(); for (Entry<String, int[]> e : files.entrySet()) {
fileNames.put(fileId++, e.getKey());
filesTemp.put(e.getKey(), e.getValue());
totalSize += e.getValue().length / 4;
} }
storeTemp.commit();
files = filesTemp;
long currentSize = 0; long currentSize = 0;
for (int i = 1; i <= lastSegment; i++) { int chunkSize = 0;
MVMap<int[], byte[]> segmentData = store.openMap("data" + i); for (int s = 1; s <= lastSegment; s++) {
for (Entry<int[], byte[]> e : segmentData.entrySet()) { MVMap<int[], byte[]> segmentData = store.openMap("data" + s);
int[] key = e.getKey(); // key: fileId, blockId; value: data
byte[] bytes = e.getValue(); MVMap<int[], byte[]> fileData = storeTemp.openMap("fileData" + s);
chunkLength += bytes.length; fileId = 0;
data.put(key, bytes); for (Entry<String, int[]> e : files.entrySet()) {
if (chunkLength > chunkSize) { int[] keys = e.getValue();
storeTemp.commit(); if (keys.length == 1) {
chunkLength = 0; fileId++;
continue;
} }
currentSize++; for (int i = 0; i < keys.length; i += 4) {
printProgress("Stage 1/2", currentSize, totalSize); int[] dk = new int[4];
} dk[0] = keys[i];
dk[1] = keys[i + 1];
dk[2] = keys[i + 2];
dk[3] = keys[i + 3];
byte[] bytes = segmentData.get(dk);
if (bytes != null) {
int[] k = new int[] { fileId, i / 4 };
fileData.put(k, bytes);
chunkSize += bytes.length;
if (chunkSize > tempSize) {
storeTemp.commit();
chunkSize = 0;
}
currentSize++;
printProgress(0, 50, currentSize, totalSize);
}
}
fileId++;
}
storeTemp.commit();
} }
ArrayList<Cursor<int[], byte[]>> list = New.arrayList();
totalSize = 0; totalSize = 0;
for (Entry<String, int[]> e : files.entrySet()) {
int[] keys = e.getValue();
totalSize += keys.length / 4;
}
currentSize = 0; currentSize = 0;
for (int i = 1; i <= lastSegment; i++) {
MVMap<int[], byte[]> fileData = storeTemp.openMap("fileData" + i);
totalSize += fileData.sizeAsLong();
Cursor<int[], byte[]> c = fileData.cursor(null);
if (c.hasNext()) {
c.next();
list.add(c);
}
}
String lastFileName = null;
OutputStream file = null;
int[] lastKey = null;
while (list.size() > 0) {
Collections.sort(list, new Comparator<Cursor<int[], byte[]>>() {
@Override
public int compare(Cursor<int[], byte[]> o1,
Cursor<int[], byte[]> o2) {
int[] k1 = o1.getKey();
int[] k2 = o2.getKey();
int comp = 0;
for (int i = 0; i < k1.length; i++) {
long x1 = k1[i];
long x2 = k2[i];
if (x1 > x2) {
comp = 1;
break;
} else if (x1 < x2) {
comp = -1;
break;
}
}
return comp;
}
});
Cursor<int[], byte[]> top = list.get(0);
int[] key = top.getKey();
byte[] bytes = top.getValue();
String f = targetDir + "/" + fileNames.get(key[0]);
if (!f.equals(lastFileName)) {
if (file != null) {
file.close();
}
String p = FileUtils.getParent(f);
if (p != null) {
FileUtils.createDirectories(p);
}
file = new BufferedOutputStream(new FileOutputStream(f));
lastFileName = f;
} else {
if (key[0] != lastKey[0] || key[1] != lastKey[1] + 1) {
System.out.println("missing entry after " + Arrays.toString(lastKey));
}
}
lastKey = key;
file.write(bytes);
if (!top.hasNext()) {
list.remove(0);
} else {
top.next();
}
currentSize++;
printProgress(50, 100, currentSize, totalSize);
}
for (Entry<String, int[]> e : files.entrySet()) { for (Entry<String, int[]> e : files.entrySet()) {
String f = e.getKey(); String f = targetDir + "/" + e.getKey();
int[] keys = e.getValue(); int[] keys = e.getValue();
String p = FileUtils.getParent(f);
if (p != null) {
FileUtils.createDirectories(targetDir + "/" + p);
}
String fn = targetDir + "/" + f;
if (keys.length == 1) { if (keys.length == 1) {
// directory FileUtils.createDirectories(f);
FileUtils.createDirectory(fn); } else if (keys.length == 0) {
continue; // empty file
} String p = FileUtils.getParent(f);
OutputStream file = new BufferedOutputStream(new FileOutputStream(fn)); if (p != null) {
for (int i = 0; i < keys.length; i += 4) { FileUtils.createDirectories(p);
int[] dk = new int[4];
dk[0] = keys[i];
dk[1] = keys[i + 1];
dk[2] = keys[i + 2];
dk[3] = keys[i + 3];
byte[] bytes;
bytes = data.get(dk);
if (bytes == null) {
dk[3] = 0;
bytes = data.get(dk);
} }
file.write(bytes); new FileOutputStream(f).close();
currentSize++;
printProgress("Stage 2/2", currentSize, totalSize);
} }
}
if (file != null) {
file.close(); file.close();
} }
store.close(); store.close();
if (storeTemp != null) {
storeTemp.close(); storeTemp.close();
FileUtils.delete(tempFileName); FileUtils.delete(tempFileName);
}
System.out.println();
printDone(); printDone();
} }
private int getChunkLength(byte[] data, int start, int maxPos) { private int getChunkLength(byte[] data, int start, int maxPos) {
int minLen = 4 * 1024; int minLen = 4 * 1024;
int mask = 4 * 1024 - 1; int mask = 4 * 1024 - 1;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论