提交 e64b0d36 authored 作者: Thomas Mueller's avatar Thomas Mueller

Off-heap storage

上级 2a280505
......@@ -32,6 +32,7 @@ MVStore
<a href="#transactions">- Transactions</a><br />
<a href="#inMemory">- In-Memory Performance and Usage</a><br />
<a href="#dataTypes">- Pluggable Data Types</a><br />
<a href="#offHeap">- Off-Heap and Pluggable Storage</a><br />
<a href="#blob">- BLOB Support</a><br />
<a href="#pluggableMap">- R-Tree and Pluggable Map Implementations</a><br />
<a href="#caching">- Concurrent Operations and Caching</a><br />
......@@ -60,8 +61,11 @@ But it can be also directly within an application, without using JDBC or SQL.
</li><li>It is intended to be fast, simple to use, and small.
</li><li>Old versions of the data can be read concurrently with all other operations.
</li><li>Transactions are supported (including concurrent transactions and 2-phase commit).
</li><li>The tool is very modular. It supports pluggable data types / serialization,
pluggable map implementations (B-tree, R-tree, concurrent B-tree currently), BLOB storage,
</li><li>The tool is very modular.
It supports pluggable data types / serialization,
pluggable storage (to a file, to off-heap memory),
pluggable map implementations (B-tree, R-tree, concurrent B-tree currently),
BLOB storage,
and a file system abstraction to support encrypted files and zip files.
</li></ul>
......@@ -102,6 +106,7 @@ MVStore s = new MVStore.Builder().
compressData().
encryptionKey("007".toCharArray()).
fileName(fileName).
fileStore(new FileStore()).
pageSplitSize(6 * 1024).
readOnly().
writeBufferSize(8).
......@@ -114,6 +119,7 @@ MVStore s = new MVStore.Builder().
</li><li>compressData: compress the data when storing.
</li><li>encryptionKey: the encryption key for file encryption.
</li><li>fileName: the name of the file, for file based stores.
</li><li>fileStore: the storage implementation to use.
</li><li>pageSplitSize: the point where pages are split.
</li><li>readOnly: open the file in read-only mode.
</li><li>writeBufferSize: the size of the write buffer in MB.
......@@ -277,6 +283,19 @@ Also, there is no inherent limit to the number of maps and chunks.
Due to using a log structured storage, there is no special case handling for large keys or pages.
</p>
<h3 id="offHeap">Off-Heap and Pluggable Storage</h3>
<p>
Storage is pluggable. The default storage is to a single file (unless pure in-memory operation is used).
</p>
<p>
An off-heap storage implementation is available. This storage keeps the data in the off-heap memory,
meaning outside of the regular garbage-collected heap. This allows using very large in-memory
stores without having to increase the JVM heap (which would increase Java garbage collection
cost a lot). Memory is allocated using <code>ByteBuffer.allocateDirect</code>.
One chunk is allocated at a time (each chunk is usually a few MB large), so that
allocation cost is low.
</p>
<h3 id="blob">BLOB Support</h3>
<p>
There is a mechanism that stores large binary objects by splitting them into smaller blocks.
......
......@@ -18,17 +18,28 @@ import org.h2.store.fs.FilePathCrypt;
import org.h2.store.fs.FilePathNio;
/**
* The storage mechanism of the MVStore.
* The default storage mechanism of the MVStore. This implementation persists
 * data to a file. The file store is responsible for persisting data and for free
* space management.
*/
public class FileStore {
private String fileName;
private boolean readOnly;
private FileChannel file;
private FileLock fileLock;
private long fileSize;
private long readCount;
private long writeCount;
protected long readCount;
protected long writeCount;
/**
* The free spaces between the chunks. The first block to use is block 2
* (the first two blocks are the store header).
*/
protected final FreeSpaceBitSet freeSpace = new FreeSpaceBitSet(2, MVStore.BLOCK_SIZE);
protected String fileName;
protected boolean readOnly;
protected long fileSize;
protected FileChannel file;
protected FileLock fileLock;
@Override
public String toString() {
......@@ -46,16 +57,6 @@ public class FileStore {
DataUtils.writeFully(file, pos, src);
}
/**
* Mark the space within the file as unused.
*
* @param pos
* @param length
*/
public void free(long pos, int length) {
}
public void open(String fileName, boolean readOnly, char[] encryptionKey) {
if (fileName != null && fileName.indexOf(':') < 0) {
// NIO is used, unless a different file system is specified
......@@ -109,6 +110,7 @@ public class FileStore {
fileLock = null;
}
file.close();
freeSpace.clear();
} catch (Exception e) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_WRITING_FAILED,
......@@ -134,6 +136,7 @@ public class FileStore {
public void truncate(long size) {
try {
writeCount++;
file.truncate(size);
fileSize = Math.min(fileSize, size);
} catch (IOException e) {
......@@ -178,4 +181,38 @@ public class FileStore {
return readOnly;
}
/**
 * Get the default retention time for this store, in milliseconds: how long
 * old persisted chunks are retained before they may be overwritten. For the
 * file based store it is assumed that the file system and hard disk flush
 * all write buffers within this time.
 *
 * @return the default retention time in milliseconds (45 seconds)
 */
public int getDefaultRetentionTime() {
return 45000;
}
/**
 * Mark the given region of the store as in use (for example when
 * rebuilding the free space list from the chunk metadata on open).
 *
 * @param start the start position in bytes
 * @param len the number of bytes
 */
public void markUsed(long start, int len) {
freeSpace.markUsed(start, len);
}
/**
 * Allocate free space of the given length and mark it as in use.
 *
 * @param length the number of bytes to allocate
 * @return the start position in bytes
 */
public long allocate(int length) {
return freeSpace.allocate(length);
}
/**
 * Mark the space as free / unused, so a later allocation may reuse it.
 * Delegates to the free space bitmap.
 *
 * @param pos the position in bytes
 * @param length the number of bytes
 */
public void free(long pos, int length) {
freeSpace.free(pos, length);
}
/**
 * Get the fill rate of the store, in percent (100 means completely full,
 * i.e. there is no free space to compact).
 *
 * @return the fill rate
 */
public int getFillRate() {
return freeSpace.getFillRate();
}
/**
 * Get the position of the first free space.
 *
 * @return the position in bytes
 */
public long getFirstFree() {
return freeSpace.getFirstFree();
}
/**
 * Reset the free space management to its initial state. Called when the
 * store is closed or when its content is discarded (rollback to version 0).
 */
public void clear() {
freeSpace.clear();
}
}
......@@ -41,16 +41,18 @@ Documentation
- rolling docs review: at "Transactions"
- better document that writes are in background thread
- better document how to do non-unique indexes
- document pluggable store and OffHeapStore
TestMVStoreDataLoss
MVTableEngine:
- use StreamStore
TransactionStore:
MVStore:
- automated 'kill process' and 'power failure' test
- update checkstyle
- maybe split database into multiple files, to speed up compact
and allow using trim (by truncating / deleting empty files)
- auto-compact from time to time and on close
- test and possibly improve compact operation (for large dbs)
- possibly split chunk metadata into immutable and mutable
......@@ -62,7 +64,6 @@ MVStore:
split directly (specially for leaves with one large entry)
- maybe let a chunk point to a list of potential next chunks
(so no fixed location header is needed), similar to a skip list
- support stores that span multiple files (chunks stored in other files)
- triggers (can be implemented with a custom map);
maybe implement database indexing with triggers
- store number of write operations per page (maybe defragment
......@@ -78,13 +79,13 @@ MVStore:
- StreamStore optimization: avoid copying bytes in memory
- Feature shrink a store (create, copy, rename, delete)
and for MVStore on Windows, auto-detect renamed file
- ensure data is overwritten eventually if the system doesn't have a timer
- ensure data is overwritten eventually if the system doesn't have a
real-time clock (Raspberry Pi) and if there are few writes per startup
- SSD-friendly write (always in blocks of 4 MB / 1 second?)
- close the file on out of memory or disk write error (out of disk space or so)
- implement a sharded map (in one store, multiple stores)
to support concurrent updates and writes, and very large maps
- implement an off-heap file system
- add support for writing to branches
- maybe support for writing to branches
- maybe add an optional finalizer and exit hook
to store committed changes
- to save space when persisting very small transactions,
......@@ -107,15 +108,15 @@ MVStore:
- rename setStoreVersion to setDataVersion or similar
- to save space for small chunks, combine the last partial
block with the header
- off-heap storage (with lower default retention time)
- temporary file storage
- simple rollback method (rollback to last committed version)
- MVMap to implement SortedMap, then NavigableMap
- add abstraction ChunkStore,
with free space handling, retention time / flush,
possibly one file per chunk to support SSD trim on file system level,
and free up memory for off-heap storage)
- Test with OSGi
- avoid copying data from ByteBuffer to another ByteBuffer if possible,
specially for the OffHeapStore
- storage that splits database into multiple files,
to speed up compact and allow using trim
(by truncating / deleting empty files)
*/
......@@ -143,12 +144,6 @@ public class MVStore {
*/
volatile Thread backgroundThread;
/**
* The free spaces between the chunks. The first block to use is block 2
* (the first two blocks are the store header).
*/
private final FreeSpaceBitSet freeSpace = new FreeSpaceBitSet(2, BLOCK_SIZE);
private volatile boolean reuseSpace = true;
private boolean closed;
......@@ -160,8 +155,8 @@ public class MVStore {
private long rootChunkStart;
/**
* The cache. The default size is 16 MB, and the average size is 2 KB. It is
* split in 16 segments. The stack move distance is 2% of the expected
* The page cache. The default size is 16 MB, and the average size is 2 KB.
* It is split in 16 segments. The stack move distance is 2% of the expected
* number of entries.
*/
private final CacheLongKeyLIRS<Page> cache;
......@@ -175,12 +170,15 @@ public class MVStore {
new ConcurrentHashMap<Integer, Chunk>();
/**
* The map of temporarily removed pages. The key is the unsaved version, the
* value is the map of chunks. The maps of chunks contains the number of
* freed entries per chunk. Access is synchronized.
* The map of temporarily freed storage space caused by freed pages. The key
* is the unsaved version, the value is the map of chunks. The maps contains
* the number of freed entries per chunk. Access is synchronized.
*/
private final HashMap<Long, HashMap<Integer, Chunk>> freedPages = New.hashMap();
private final HashMap<Long, HashMap<Integer, Chunk>> freedPageSpace = New.hashMap();
/**
* The metadata map.
*/
private MVMapConcurrent<String, String> meta;
private final ConcurrentHashMap<Integer, MVMap<?, ?>> maps =
......@@ -218,7 +216,7 @@ public class MVStore {
* The time the store was created, in milliseconds since 1970.
*/
private long creationTime;
private int retentionTime = 45000;
private int retentionTime;
private long lastStoreTime;
......@@ -262,12 +260,16 @@ public class MVStore {
c.put("id", "0");
c.put("createVersion", Long.toString(currentVersion));
meta.init(this, c);
String f = (String) config.get("fileName");
if (f == null) {
fileStore = (FileStore) config.get("fileStore");
String fileName = (String) config.get("fileName");
if (fileName == null && fileStore == null) {
cache = null;
return;
}
if (fileStore == null) {
fileStore = new FileStore();
}
retentionTime = fileStore.getDefaultRetentionTime();
boolean readOnly = config.containsKey("readOnly");
o = config.get("cacheSize");
int mb = o == null ? 16 : (Integer) o;
......@@ -284,7 +286,7 @@ public class MVStore {
unsavedPageCountMax = writeBufferSize / (div == 0 ? 1 : div);
char[] encryptionKey = (char[]) config.get("encryptionKey");
try {
fileStore.open(f, readOnly, encryptionKey);
fileStore.open(fileName, readOnly, encryptionKey);
if (fileStore.size() == 0) {
creationTime = 0;
creationTime = getTime();
......@@ -537,14 +539,13 @@ public class MVStore {
registerFreePage(currentVersion, c.id, 0, 0);
}
}
// rebuild the free space list
freeSpace.clear();
// build the free space list
for (Chunk c : chunks.values()) {
if (c.start == Long.MAX_VALUE) {
continue;
}
int len = MathUtils.roundUpInt(c.length, BLOCK_SIZE) + BLOCK_SIZE;
freeSpace.markUsed(c.start, len);
fileStore.markUsed(c.start, len);
}
}
......@@ -687,7 +688,6 @@ public class MVStore {
}
meta = null;
chunks.clear();
freeSpace.clear();
cache.clear();
maps.clear();
try {
......@@ -849,7 +849,7 @@ public class MVStore {
meta.put("root." + m.getId(), String.valueOf(Integer.MAX_VALUE));
}
}
Set<Chunk> removedChunks = applyFreedPages(storeVersion, time);
Set<Chunk> removedChunks = applyFreedSpace(storeVersion, time);
ByteBuffer buff = getWriteBuffer();
// need to patch the header later
c.writeHeader(buff);
......@@ -884,17 +884,17 @@ public class MVStore {
long end = getEndPosition();
long filePos;
if (reuseSpace) {
filePos = freeSpace.allocate(length);
filePos = fileStore.allocate(length);
} else {
filePos = end;
freeSpace.markUsed(end, length);
fileStore.markUsed(end, length);
}
boolean storeAtEndOfFile = filePos + length >= end;
// free up the space of unused chunks now
for (Chunk x : removedChunks) {
int len = MathUtils.roundUpInt(x.length, BLOCK_SIZE) + BLOCK_SIZE;
freeSpace.free(x.start, len);
fileStore.free(x.start, len);
}
c.start = filePos;
......@@ -985,23 +985,23 @@ public class MVStore {
}
/**
* Apply the freed pages to the chunk metadata. The metadata is updated, but
* Apply the freed space to the chunk metadata. The metadata is updated, but
* freed chunks are not removed yet.
*
* @param storeVersion apply up to the given version
* @return the set of freed chunks (might be empty)
* @return the set of completely freed chunks (might be empty)
*/
private Set<Chunk> applyFreedPages(long storeVersion, long time) {
private Set<Chunk> applyFreedSpace(long storeVersion, long time) {
Set<Chunk> removedChunks = New.hashSet();
synchronized (freedPages) {
synchronized (freedPageSpace) {
while (true) {
ArrayList<Chunk> modified = New.arrayList();
for (Iterator<Long> it = freedPages.keySet().iterator(); it.hasNext();) {
for (Iterator<Long> it = freedPageSpace.keySet().iterator(); it.hasNext();) {
long v = it.next();
if (v > storeVersion) {
continue;
}
Map<Integer, Chunk> freed = freedPages.get(v);
Map<Integer, Chunk> freed = freedPageSpace.get(v);
for (Chunk f : freed.values()) {
Chunk c = chunks.get(f.id);
if (c == null) {
......@@ -1147,12 +1147,12 @@ public class MVStore {
chunks.remove(c.id);
meta.remove("chunk." + c.id);
int length = MathUtils.roundUpInt(c.length, BLOCK_SIZE) + BLOCK_SIZE;
freeSpace.free(c.start, length);
fileStore.free(c.start, length);
}
if (freeSpace.getFillRate() == 100) {
if (fileStore.getFillRate() == 100) {
return false;
}
long firstFree = freeSpace.getFirstFree();
long firstFree = fileStore.getFirstFree();
ArrayList<Chunk> move = New.arrayList();
for (Chunk c : chunks.values()) {
if (c.start > firstFree) {
......@@ -1166,8 +1166,8 @@ public class MVStore {
buff.limit(length);
fileStore.readFully(c.start, buff);
long end = getEndPosition();
freeSpace.markUsed(end, length);
freeSpace.free(c.start, length);
fileStore.markUsed(end, length);
fileStore.free(c.start, length);
c.start = end;
buff.position(0);
c.writeHeader(buff);
......@@ -1197,8 +1197,8 @@ public class MVStore {
buff = DataUtils.ensureCapacity(buff, length);
buff.limit(length);
fileStore.readFully(c.start, buff);
long pos = freeSpace.allocate(length);
freeSpace.free(c.start, length);
long pos = fileStore.allocate(length);
fileStore.free(c.start, length);
buff.position(0);
c.start = pos;
c.writeHeader(buff);
......@@ -1433,11 +1433,11 @@ public class MVStore {
}
private void registerFreePage(long version, int chunkId, long maxLengthLive, int pageCount) {
synchronized (freedPages) {
HashMap<Integer, Chunk>freed = freedPages.get(version);
synchronized (freedPageSpace) {
HashMap<Integer, Chunk>freed = freedPageSpace.get(version);
if (freed == null) {
freed = New.hashMap();
freedPages.put(version, freed);
freedPageSpace.put(version, freed);
}
Chunk f = freed.get(chunkId);
if (f == null) {
......@@ -1488,11 +1488,13 @@ public class MVStore {
/**
* How long to retain old, persisted chunks, in milliseconds. Chunks that
* are older may be overwritten once they contain no live data. The default
* is 45000 (45 seconds). It is assumed that a file system and hard disk
* will flush all write buffers within this time. Using a lower value might
* be dangerous, unless the file system and hard disk flush the buffers
* earlier. To manually flush the buffers, use
* are older may be overwritten once they contain no live data.
* <p>
* The default value is 45000 (45 seconds) when using the default file
* store. It is assumed that a file system and hard disk will flush all
* write buffers within this time. Using a lower value might be dangerous,
* unless the file system and hard disk flush the buffers earlier. To
* manually flush the buffers, use
* <code>MVStore.getFile().force(true)</code>, however please note that
* according to various tests this does not always work as expected
* depending on the operating system and hardware.
......@@ -1652,10 +1654,12 @@ public class MVStore {
}
meta.clear();
chunks.clear();
freeSpace.clear();
if (fileStore != null) {
fileStore.clear();
}
maps.clear();
synchronized (freedPages) {
freedPages.clear();
synchronized (freedPageSpace) {
freedPageSpace.clear();
}
currentVersion = version;
setWriteVersion(version);
......@@ -1669,12 +1673,12 @@ public class MVStore {
for (MVMap<?, ?> m : maps.values()) {
m.rollbackTo(version);
}
synchronized (freedPages) {
synchronized (freedPageSpace) {
for (long v = currentVersion; v >= version; v--) {
if (freedPages.size() == 0) {
if (freedPageSpace.size() == 0) {
break;
}
freedPages.remove(v);
freedPageSpace.remove(v);
}
}
meta.rollbackTo(version);
......@@ -1704,7 +1708,7 @@ public class MVStore {
}
chunks.remove(lastChunkId);
int len = MathUtils.roundUpInt(last.length, BLOCK_SIZE) + BLOCK_SIZE;
freeSpace.free(last.start, len);
fileStore.free(last.start, len);
lastChunkId--;
}
rootChunkStart = last.start;
......@@ -1739,8 +1743,8 @@ public class MVStore {
}
private void revertTemp(long storeVersion) {
synchronized (freedPages) {
for (Iterator<Long> it = freedPages.keySet().iterator(); it.hasNext();) {
synchronized (freedPageSpace) {
for (Iterator<Long> it = freedPageSpace.keySet().iterator(); it.hasNext();) {
long v = it.next();
if (v > storeVersion) {
continue;
......@@ -2083,6 +2087,16 @@ public class MVStore {
return set("backgroundExceptionHandler", exceptionHandler);
}
/**
* Use the provided file store instead of the default one.
*
* @param store the file store
* @return this
*/
public Builder fileStore(FileStore store) {
return set("fileStore", store);
}
/**
* Open the store.
*
......
/*
* Copyright 2004-2013 H2 Group. Multiple-Licensed under the H2 License,
* Version 1.0, and under the Eclipse Public License, Version 1.0
* (http://h2database.com/html/license.html).
* Initial Developer: H2 Group
*/
package org.h2.mvstore;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.TreeMap;
/**
* A storage mechanism that "persists" data in the off-heap area of the main
* memory.
*/
/**
 * A storage mechanism that "persists" data in the off-heap area of the main
 * memory, meaning outside of the regular garbage collected heap. Buffers are
 * allocated with {@code ByteBuffer.allocateDirect}, one per written region, so
 * very large stores do not increase garbage collection cost.
 * <p>
 * Data is kept as non-overlapping regions indexed by their start position;
 * partial overwrite, partial free and partial truncate are not supported.
 */
public class OffHeapStore extends FileStore {

    /**
     * The written regions, keyed by their start position ("file" offset) in
     * bytes. Each value holds exactly the bytes written at that position.
     */
    private final TreeMap<Long, ByteBuffer> memory = new TreeMap<Long, ByteBuffer>();

    @Override
    public void open(String fileName, boolean readOnly, char[] encryptionKey) {
        // there is no file to open - memory is allocated lazily on write
    }

    @Override
    public String toString() {
        return memory.toString();
    }

    @Override
    public void readFully(long pos, ByteBuffer dst) {
        Entry<Long, ByteBuffer> mem = memory.floorEntry(pos);
        if (mem == null) {
            throw DataUtils.newIllegalStateException(DataUtils.ERROR_READING_FAILED,
                    "Could not read from position {0}", pos);
        }
        readCount++;
        ByteBuffer buff = mem.getValue();
        // duplicate so position/limit of the stored buffer are not disturbed
        ByteBuffer read = buff.duplicate();
        int offset = (int) (pos - mem.getKey());
        read.position(offset);
        read.limit(dst.remaining() + offset);
        dst.put(read);
        dst.rewind();
    }

    /**
     * Mark the space as free and discard the backing buffer. Only a complete
     * region may be freed.
     *
     * @param pos the position in bytes (must be the start of a region)
     * @param length the number of bytes (must match the region length)
     */
    @Override
    public void free(long pos, int length) {
        freeSpace.free(pos, length);
        ByteBuffer buff = memory.remove(pos);
        if (buff == null) {
            throw DataUtils.newIllegalStateException(DataUtils.ERROR_READING_FAILED,
                    "Could not find entry at position {0}", pos);
        } else if (buff.remaining() != length) {
            throw DataUtils.newIllegalStateException(DataUtils.ERROR_READING_FAILED,
                    "Partial remove is not supported at position {0}", pos);
        }
    }

    @Override
    public void writeFully(long pos, ByteBuffer src) {
        fileSize = Math.max(fileSize, pos + src.remaining());
        Entry<Long, ByteBuffer> mem = memory.floorEntry(pos);
        if (mem == null) {
            // nothing at or before this position: create a new entry
            writeNewEntry(pos, src);
            return;
        }
        long prevPos = mem.getKey();
        ByteBuffer buff = mem.getValue();
        int prevLength = buff.capacity();
        int length = src.remaining();
        if (prevPos == pos) {
            // overwrite an existing entry: only a full overwrite is supported
            if (prevLength != length) {
                throw DataUtils.newIllegalStateException(DataUtils.ERROR_READING_FAILED,
                        "Could not write to position {0}; partial overwrite is not supported", pos);
            }
            writeCount++;
            buff.rewind();
            buff.put(src);
            return;
        }
        if (prevPos + prevLength > pos) {
            throw DataUtils.newIllegalStateException(DataUtils.ERROR_READING_FAILED,
                    "Could not write to position {0}; partial overwrite is not supported", pos);
        }
        // NOTE(review): overlap with a *following* entry is not checked here;
        // callers are expected to write only into space returned by allocate()
        writeNewEntry(pos, src);
    }

    /**
     * Create a new off-heap region and copy the source data into it.
     *
     * @param pos the position in bytes
     * @param src the source data
     */
    private void writeNewEntry(long pos, ByteBuffer src) {
        writeCount++;
        int length = src.remaining();
        // allocate off-heap, so large stores do not increase the GC cost
        ByteBuffer buff = ByteBuffer.allocateDirect(length);
        buff.put(src);
        buff.rewind();
        memory.put(pos, buff);
    }

    @Override
    public void truncate(long size) {
        writeCount++;
        if (size == 0) {
            fileSize = 0;
            memory.clear();
            return;
        }
        // keep fileSize consistent with FileStore.truncate
        fileSize = Math.min(fileSize, size);
        // a region that starts before the new size but extends beyond it
        // cannot be cut in half
        Entry<Long, ByteBuffer> last = memory.lowerEntry(size);
        if (last != null && last.getKey() + last.getValue().capacity() > size) {
            throw DataUtils.newIllegalStateException(DataUtils.ERROR_READING_FAILED,
                    "Could not truncate to {0}; partial truncate is not supported", size);
        }
        // remove all regions that start at or after the new size
        // (the previous loop iterated the keys in ascending order and stopped
        // at the first key below the new size, so the regions that should
        // have been removed were never reached)
        memory.tailMap(size).clear();
    }

    @Override
    public void close() {
        // discard all data and reset the free space management
        truncate(0);
        freeSpace.clear();
    }

    @Override
    public void sync() {
        // in-memory storage: nothing to flush
    }

    /**
     * Off-heap memory has no write buffers that need time to be flushed to
     * disk, so old chunks can be overwritten immediately.
     *
     * @return 0 (no retention)
     */
    @Override
    public int getDefaultRetentionTime() {
        return 0;
    }
}
......@@ -172,7 +172,7 @@ public class Page {
buff = ByteBuffer.allocate(128);
fileStore.readFully(filePos, buff);
maxLength = buff.getInt();
//read the first bytes again
// read the first bytes again
}
buff = ByteBuffer.allocate(length);
fileStore.readFully(filePos, buff);
......
......@@ -18,6 +18,7 @@ import org.h2.mvstore.Cursor;
import org.h2.mvstore.DataUtils;
import org.h2.mvstore.MVMap;
import org.h2.mvstore.MVStore;
import org.h2.mvstore.OffHeapStore;
import org.h2.mvstore.type.DataType;
import org.h2.mvstore.type.ObjectDataType;
import org.h2.mvstore.type.StringDataType;
......@@ -46,6 +47,7 @@ public class TestMVStore extends TestBase {
public void test() throws Exception {
FileUtils.deleteRecursive(getBaseDir(), true);
FileUtils.createDirectories(getBaseDir());
testOffHeapStorage();
testNewerWriteVersion();
testCompactFully();
testBackgroundExceptionListener();
......@@ -92,6 +94,29 @@ public class TestMVStore extends TestBase {
testLargerThan2G();
}
/**
 * Test that an MVStore backed by an OffHeapStore can write data and that the
 * same OffHeapStore instance can be reopened and read back.
 */
private void testOffHeapStorage() throws Exception {
OffHeapStore offHeap = new OffHeapStore();
MVStore s = new MVStore.Builder().
fileStore(offHeap).
open();
Map<Integer, String> map = s.openMap("data");
for (int i = 0; i < 1000; i++) {
map.put(i, "Hello " + i);
s.store();
}
// each store() writes at least one chunk, so many writes are expected
assertTrue(1000 < offHeap.getWriteCount());
// intentionally NOT calling s.close() here: OffHeapStore.close() calls
// truncate(0), which clears the off-heap memory, so the data must still
// be present when the same store instance is reopened below
s = new MVStore.Builder().
fileStore(offHeap).
open();
map = s.openMap("data");
for (int i = 0; i < 1000; i++) {
assertEquals("Hello " + i, map.get(i));
}
s.close();
}
private void testNewerWriteVersion() throws Exception {
String fileName = getBaseDir() + "/testNewerWriteVersion.h3";
FileUtils.delete(fileName);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论