Commit 20e146a3 authored by Thomas Mueller

MVStore: the file format was changed slightly.

Parent 2e880b99
......@@ -43,6 +43,9 @@ MVStore
<a href="#exceptionHandling">- Exception Handling</a><br />
<a href="#storageEngine">- Storage Engine for H2</a><br />
<a href="#fileFormat">
File Format</a><br />
<a href="#differences">
Similar Projects and Differences to Other Storage Engines</a><br />
<a href="#current_state">
......@@ -473,6 +476,94 @@ it is recommended to use it together with the MVCC mode
(to do that, append <code>;MVCC=TRUE</code> to the database URL).
</p>
<h2 id="fileFormat">File Format</h2>
<p>
The data is stored in one file. The file contains two file headers (to be safe),
and a number of chunks. The file headers are one block each; a block is 4096 bytes.
Chunks are at least one block long, but typically 200 blocks or more.
There is one chunk for every version.
</p>
<pre>
[ file header 1 ]
[ file header 2 ]
[ chunk 1 ]
[ chunk 2 ]
[ chunk x ]
</pre>
<h3>File Header</h3>
<p>
There are two file headers, which normally contain exactly the same data.
But once in a while the file headers are updated, and such a write could fail halfway,
which would leave one header corrupt. That is why there is a second header.
The file headers are the only pieces of data that are updated in-place. They contain
the following data:
</p>
<pre>
H:2,block:2,blockSize:1000,chunk:7,created:1441235ef73,format:1,version:7,fletcher:3044e6cc
</pre>
<p>
The data is stored in the form of key-value pairs.
Each value is stored as a hexadecimal number. The entries are:
</p>
<ul><li>H:2 stands for the H2 database.
</li><li>block: the block number where one of the latest chunks starts.
</li><li>blockSize: the block size; currently always hex 1000, which is decimal 4096.
</li><li>chunk: the chunk id, which is normally the same value as version;
however, the chunk id might roll over to 0, while the version doesn't.
</li><li>created: the number of milliseconds since 1970 when the file was created.
</li><li>format: the file format number. Currently 1.
</li><li>version: the version number of the chunk.
</li><li>fletcher: the Fletcher-32 checksum of the header.
</li></ul>
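<p>
As an illustration of the format described above, the following sketch splits
such a header line into key-value pairs and reads a hexadecimal field. This is
illustrative only; the class and method names are hypothetical, and H2's actual
parser (in <code>DataUtils</code>) handles additional details such as quoting.
</p>

```java
import java.util.HashMap;
import java.util.Map;

// Minimal sketch of reading a "key:value,key:value" file header line.
// Illustrative only; not H2's actual parser.
public class HeaderSketch {

    static Map<String, String> parseHeader(String s) {
        Map<String, String> map = new HashMap<>();
        for (String pair : s.split(",")) {
            int colon = pair.indexOf(':');
            map.put(pair.substring(0, colon), pair.substring(colon + 1));
        }
        return map;
    }

    public static void main(String[] args) {
        String header = "H:2,block:2,blockSize:1000,chunk:7,"
                + "created:1441235ef73,format:1,version:7,fletcher:3044e6cc";
        Map<String, String> m = parseHeader(header);
        // numeric values are hexadecimal: 0x1000 = 4096
        long blockSize = Long.parseLong(m.get("blockSize"), 16);
        System.out.println(blockSize); // prints 4096
    }
}
```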
<p>
When opening the file, both headers are read and their checksums are verified.
Of the headers that are valid, the one with the newer version is used
to read the header of one of the latest chunks.
However, this might not be the newest chunk in the file; instead, each chunk header
contains a pointer to where the next chunk will probably be stored (the predicted position).
These pointers are followed until the newest chunk is found.
If a prediction turns out to be wrong (which is known once the next chunk is stored),
the file header is updated as well. Predicting the position reduces the number
of file header updates.
</p>
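<p>
The checksum verification can be sketched with a generic Fletcher-32 over
16-bit words. This is a sketch under assumptions: H2's
<code>DataUtils.getFletcher32</code> may group and pad bytes differently, so the
values computed here need not match the on-disk checksums.
</p>

```java
// Generic Fletcher-32 over big-endian 16-bit words, both sums initialised
// to 0xffff. Illustrative only; H2's exact variant may differ.
public class Fletcher {

    static int fletcher32(byte[] data, int len) {
        int s1 = 0xffff, s2 = 0xffff;
        for (int i = 0; i < len; i += 2) {
            int hi = (data[i] & 0xff) << 8;
            int lo = i + 1 < len ? data[i + 1] & 0xff : 0;
            s1 = (s1 + hi + lo) % 0xffff;
            s2 = (s2 + s1) % 0xffff;
        }
        return (s2 << 16) | s1;
    }

    public static void main(String[] args) {
        byte[] header = "blockSize:1000,format:1".getBytes();
        int sum = fletcher32(header, header.length);
        header[0] ^= 1; // a single flipped bit changes the checksum
        System.out.println(sum != fletcher32(header, header.length)); // prints true
    }
}
```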
<h3>Chunk Format</h3>
<p>
There is one chunk per version.
Each chunk consists of a header, a number of (B-tree) pages, and a footer.
The pages inside a chunk are stored next to each other (unaligned).
The pages contain the actual data of the maps; each map consists of a number of pages:
</p>
<pre>
[ chunk 1 header | page 1 | page 2 | ... | page x | chunk footer ]
[ chunk 2 header | page 1 | page 2 | ... | page x | chunk footer ]
[ chunk 3 header | page 1 | page 2 | ... | page x | chunk footer ]
</pre>
<p>
Each map is a B-tree, and the data is stored as (B-tree-) pages in the chunks.
Even though this is not required by the file format, each B-tree is stored
"upside down", that is: the leaf pages first, then the internal nodes, and finally the root page.
In addition to the user maps, there is one metadata map that contains names and
positions of user maps, and data about chunks (position, size, fill rate).
The very last page of a chunk contains the root page of the metadata map.
The exact position of that root page is stored in the chunk header.
This page (directly or indirectly) points to the root pages of all other maps.
</p>
<p>
In the example above, each chunk header contains the position
of page x (which is the root page of the metadata map), which points to the internal
nodes of the metadata map (for example pages 9-11; not shown), and each internal
node points to the leaf pages (for example pages 1-8).
Data is never updated in-place. Instead, each chunk contains whatever pages were
actually changed in that version (there is one chunk per version, see above),
plus all the parent nodes of those pages, recursively, up to the root page.
</p>
<p>
This is a copy-on-write design: new versions are written to new chunks,
and the pages of older versions are never modified.
</p>
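<p>
The copy-on-write behaviour described above can be sketched with a toy
immutable tree (illustrative only; H2's actual <code>Page</code> class works on
B-tree nodes with many children): updating a leaf creates new copies of the
leaf and its ancestors, while untouched subtrees are shared between versions.
</p>

```java
// Toy copy-on-write tree: updating the left leaf produces a new root and a
// new leaf, while the right subtree is shared (the same object) between the
// old and the new version. Illustrative; not H2's Page class.
public class CowTree {

    static final class Node {
        final String value;
        final Node left, right;
        Node(String value, Node left, Node right) {
            this.value = value;
            this.left = left;
            this.right = right;
        }
    }

    static Node setLeftLeaf(Node root, String newValue) {
        // copy only the path from the root to the changed leaf
        return new Node(root.value, new Node(newValue, null, null), root.right);
    }

    public static void main(String[] args) {
        Node v1 = new Node("root",
                new Node("a", null, null), new Node("b", null, null));
        Node v2 = setLeftLeaf(v1, "a2");
        System.out.println(v1.left.value);        // prints a (old version intact)
        System.out.println(v2.left.value);        // prints a2
        System.out.println(v1.right == v2.right); // prints true (shared subtree)
    }
}
```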
<h2 id="differences">Similar Projects and Differences to Other Storage Engines</h2>
<p>
Unlike similar storage engines like LevelDB and Kyoto Cabinet,
......
......@@ -17,6 +17,11 @@ import java.util.HashMap;
* each chunk is at most 2 GB large.
*/
public class Chunk {
/**
* The maximum chunk id.
*/
public static final int MAX_ID = (1 << 26) - 1;
/**
* The maximum length of a chunk header, in bytes.
......@@ -159,7 +164,7 @@ public class Chunk {
c.len = DataUtils.readHexInt(map, "len", 0);
c.pageCount = DataUtils.readHexInt(map, "pages", 0);
c.pageCountLive = DataUtils.readHexInt(map, "livePages", c.pageCount);
c.mapId = Integer.parseInt(map.get("map"), 16);
c.mapId = DataUtils.readHexInt(map, "map", 0);
c.maxLength = DataUtils.readHexLong(map, "max", 0);
c.maxLenLive = DataUtils.readHexLong(map, "liveMax", c.maxLength);
c.metaRootPos = DataUtils.readHexLong(map, "root", 0);
......
......@@ -850,13 +850,53 @@ public class DataUtils {
return (Long) v;
}
try {
return Long.parseLong((String) v, 16);
return parseHexLong((String) v);
} catch (NumberFormatException e) {
throw newIllegalStateException(ERROR_FILE_CORRUPT,
"Error parsing the value {0}", v, e);
}
}
/**
* Parse an unsigned, hex long.
*
* @param x the string
* @return the parsed value
* @throws IllegalStateException if parsing fails
*/
public static long parseHexLong(String x) {
try {
if (x.length() == 16) {
// avoid problems with overflow
// in Java 8, Long.parseUnsignedLong(x, 16) makes this special case unnecessary
return (Long.parseLong(x.substring(0, 8), 16) << 32) |
Long.parseLong(x.substring(8, 16), 16);
}
return Long.parseLong(x, 16);
} catch (NumberFormatException e) {
throw newIllegalStateException(ERROR_FILE_CORRUPT,
"Error parsing the value {0}", x, e);
}
}
/**
* Parse an unsigned, hex int.
*
* @param x the string
* @return the parsed value
* @throws IllegalStateException if parsing fails
*/
public static int parseHexInt(String x) {
try {
// avoid problems with overflow
// in Java 8, we can use Integer.parseUnsignedInt(x, 16);
return (int) Long.parseLong(x, 16);
} catch (NumberFormatException e) {
throw newIllegalStateException(ERROR_FILE_CORRUPT,
"Error parsing the value {0}", x, e);
}
}
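The split in parseHexLong matters because Long.parseLong treats its input as signed:
a 16-digit hex string with the top bit set, such as ffffffffffffffff, is out of its
range. A standalone sketch of the same workaround (the class name is hypothetical):

```java
// Demonstrates why 16-digit hex strings need special handling:
// Long.parseLong is signed, so "ffffffffffffffff" (2^64 - 1, i.e. -1 as a
// signed long) throws NumberFormatException. Parsing the two 32-bit halves
// separately and recombining them avoids the overflow.
public class HexParse {

    static long parseHex(String x) {
        if (x.length() == 16) {
            return (Long.parseLong(x.substring(0, 8), 16) << 32)
                    | Long.parseLong(x.substring(8, 16), 16);
        }
        return Long.parseLong(x, 16);
    }

    public static void main(String[] args) {
        System.out.println(parseHex("ffffffffffffffff")); // prints -1
        try {
            Long.parseLong("ffffffffffffffff", 16);
        } catch (NumberFormatException e) {
            System.out.println("signed parse overflows");
        }
    }
}
```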
/**
* Read a hex int value from a map.
*
......
......@@ -66,6 +66,9 @@ TransactionStore:
MVStore:
- ensure data is overwritten eventually if the system doesn't have a
real-time clock (Raspberry Pi) and if there are few writes per startup
- test chunk id rollover
- document and review the file format
- automated 'kill process' and 'power failure' test
......@@ -81,8 +84,6 @@ MVStore:
- use a small object value cache (StringCache), test on Android
for default serialization
- MVStoreTool.dump: dump values (using a callback)
- ensure data is overwritten eventually if the system doesn't have a
real-time clock (Raspberry Pi) and if there are few writes per startup
- close the file on out of memory or disk write error (out of disk space or so)
- implement a sharded map (in one store, multiple stores)
to support concurrent updates and writes, and very large maps
......@@ -312,6 +313,7 @@ public class MVStore {
creationTime = 0;
creationTime = getTime();
lastCommitTime = creationTime;
fileHeader.put("H", 2);
fileHeader.put("blockSize", BLOCK_SIZE);
fileHeader.put("format", FORMAT_WRITE);
fileHeader.put("created", creationTime);
......@@ -404,7 +406,7 @@ public class MVStore {
HashMap<String, Object> c;
M map;
if (x != null) {
id = Integer.parseInt(x, 16);
id = DataUtils.parseHexInt(x);
@SuppressWarnings("unchecked")
M old = (M) maps.get(id);
if (old != null) {
......@@ -541,8 +543,7 @@ public class MVStore {
if (check != checksum) {
continue;
}
int chunk = DataUtils.readHexInt(m, "chunk", 0);
long version = DataUtils.readHexLong(m, "version", chunk);
long version = DataUtils.readHexLong(m, "version", 0);
if (version > newestVersion) {
newestVersion = version;
fileHeader.putAll(m);
......@@ -676,15 +677,11 @@ public class MVStore {
}
private void writeFileHeader() {
StringBuilder buff = new StringBuilder("H:2");
StringBuilder buff = new StringBuilder();
if (lastChunk != null) {
fileHeader.put("chunk", lastChunk.id);
if (lastChunk.version != lastChunk.id) {
fileHeader.put("version", lastChunk.version);
} else {
fileHeader.remove("version");
}
fileHeader.put("block", lastChunk.block);
fileHeader.put("chunk", lastChunk.id);
fileHeader.put("version", lastChunk.version);
}
DataUtils.appendMap(buff, fileHeader);
byte[] bytes = buff.toString().getBytes(DataUtils.LATIN);
......@@ -896,8 +893,11 @@ public class MVStore {
// never go backward in time
time = Math.max(lastChunk.time, time);
}
Chunk c;
c = new Chunk(lastChunkId + 1);
int newChunkId = lastChunkId;
do {
newChunkId = (newChunkId + 1) % Chunk.MAX_ID;
} while (chunks.containsKey(newChunkId));
Chunk c = new Chunk(newChunkId);
c.pageCount = Integer.MAX_VALUE;
c.pageCountLive = Integer.MAX_VALUE;
......@@ -1798,7 +1798,7 @@ public class MVStore {
public int getStoreVersion() {
checkOpen();
String x = meta.get("setting.storeVersion");
return x == null ? 0 : Integer.parseInt(x, 16);
return x == null ? 0 : DataUtils.parseHexInt(x);
}
/**
......@@ -1926,7 +1926,7 @@ public class MVStore {
private static long getRootPos(MVMap<String, String> map, int mapId) {
String root = map.get("root." + Integer.toHexString(mapId));
return root == null ? 0 : Long.parseLong(root, 16);
return root == null ? 0 : DataUtils.parseHexLong(root);
}
private void revertTemp(long storeVersion) {
......
......@@ -12,6 +12,7 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;
import org.h2.mvstore.Chunk;
import org.h2.mvstore.DataUtils;
import org.h2.mvstore.WriteBuffer;
import org.h2.test.TestBase;
......@@ -32,6 +33,7 @@ public class TestDataUtils extends TestBase {
@Override
public void test() {
testParse();
testWriteBuffer();
testEncodeLength();
testFletcher();
......@@ -224,11 +226,40 @@ public class TestDataUtils extends TestBase {
// 1000... xor 0 = 1000...
assertEquals((short) (1 << 15), DataUtils.getCheckValue(1 << 31));
}
private void testParse() {
for (long i = -1; i != 0; i >>>= 1) {
String x = Long.toHexString(i);
assertEquals(i, DataUtils.parseHexLong(x));
x = Long.toHexString(-i);
assertEquals(-i, DataUtils.parseHexLong(x));
int j = (int) i;
x = Integer.toHexString(j);
assertEquals(j, DataUtils.parseHexInt(x));
j = (int) -i;
x = Integer.toHexString(j);
assertEquals(j, DataUtils.parseHexInt(x));
}
}
private void testPagePos() {
assertEquals(0, DataUtils.PAGE_TYPE_LEAF);
assertEquals(1, DataUtils.PAGE_TYPE_NODE);
for (int i = 0; i < 67000000; i++) {
long max = DataUtils.getPagePos(Chunk.MAX_ID, Integer.MAX_VALUE,
Integer.MAX_VALUE, DataUtils.PAGE_TYPE_NODE);
String hex = Long.toHexString(max);
assertEquals(max, DataUtils.parseHexLong(hex));
assertEquals(Chunk.MAX_ID, DataUtils.getPageChunkId(max));
assertEquals(Integer.MAX_VALUE, DataUtils.getPageOffset(max));
assertEquals(Integer.MAX_VALUE, DataUtils.getPageMaxLength(max));
assertEquals(DataUtils.PAGE_TYPE_NODE, DataUtils.getPageType(max));
long overflow = DataUtils.getPagePos(Chunk.MAX_ID + 1,
Integer.MAX_VALUE, Integer.MAX_VALUE, DataUtils.PAGE_TYPE_NODE);
assertTrue(Chunk.MAX_ID + 1 != DataUtils.getPageChunkId(overflow));
for (int i = 0; i < Chunk.MAX_ID; i++) {
long pos = DataUtils.getPagePos(i, 3, 128, 1);
assertEquals(i, DataUtils.getPageChunkId(pos));
assertEquals(3, DataUtils.getPageOffset(pos));
......@@ -236,7 +267,7 @@ public class TestDataUtils extends TestBase {
assertEquals(1, DataUtils.getPageType(pos));
}
for (int type = 0; type <= 1; type++) {
for (int chunkId = 0; chunkId < 67000000; chunkId += 670000) {
for (int chunkId = 0; chunkId < Chunk.MAX_ID; chunkId += Chunk.MAX_ID / 100) {
for (long offset = 0; offset < Integer.MAX_VALUE; offset += Integer.MAX_VALUE / 100) {
for (int length = 0; length < 2000000; length += 200000) {
long pos = DataUtils.getPagePos(chunkId, (int) offset, length, type);
......