Commit d3bd6b1d authored by Thomas Mueller

MVStore: power failure could corrupt the store, if writes were re-ordered.

Parent ba429036
......@@ -21,6 +21,8 @@ Change Log
<h2>Next Version (unreleased)</h2>
<ul>
<li>MVStore: power failure could corrupt the store, if writes were re-ordered.
</li>
<li>For compatibility with other databases, support for (double and float)
-0.0 has been removed. 0.0 is used instead.
</li>
......
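The core idea behind this fix, visible throughout the hunks below, is that during recovery a chunk may only be trusted if both its chunk header and its chunk footer reached the disk and agree. A minimal sketch of that rule, using hypothetical stand-in types rather than the real MVStore classes (it mirrors the new readChunkHeaderAndFooter() helper further down):

// Hypothetical stand-in for org.h2.mvstore.Chunk, for illustration only.
final class ChunkStub {
    final int id;
    final long version;
    ChunkStub(int id, long version) {
        this.id = id;
        this.version = version;
    }
}

final class RecoveryRule {
    /**
     * Trust a chunk only if its header and its footer were both written and
     * refer to the same chunk id; otherwise treat the chunk as never written.
     */
    static ChunkStub verify(ChunkStub header, ChunkStub footer) {
        if (header == null || footer == null) {
            // one of the two writes never reached the disk, for example
            // because it was re-ordered and lost in the power failure
            return null;
        }
        if (footer.id != header.id) {
            // the footer belongs to an older chunk that used to occupy
            // this position in the file
            return null;
        }
        return header;
    }
}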
......@@ -22,9 +22,9 @@ import java.util.concurrent.ConcurrentHashMap;
import org.h2.compress.CompressDeflate;
import org.h2.compress.CompressLZF;
import org.h2.compress.Compressor;
import org.h2.mvstore.Page.PageChildren;
import org.h2.mvstore.cache.CacheLongKeyLIRS;
import org.h2.mvstore.type.StringDataType;
import org.h2.mvstore.Page.PageChildren;
import org.h2.util.MathUtils;
import org.h2.util.New;
......@@ -240,6 +240,13 @@ public class MVStore {
* The time the store was created, in milliseconds since 1970.
*/
private long creationTime;
/**
* How long to retain old, persisted chunks, in milliseconds. If the value
* is larger than or equal to zero, an unused chunk is never directly
* overwritten; instead, its unused field is set. If the value is smaller
* than zero, unused chunks are directly overwritten.
*/
private int retentionTime;
private long lastCommitTime;
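A minimal sketch of the retention rule described in the javadoc above, mirroring the canOverwriteChunk() check that appears later in this diff (simplified signature, retainChunk check omitted; illustration only):

// Simplified version of the canOverwriteChunk() test shown later in this diff.
final class RetentionRule {
    static boolean mayOverwrite(long chunkTime, long unusedSince,
            int retentionTime, long now) {
        if (retentionTime < 0) {
            // retention disabled: unused chunks may be overwritten right away
            return true;
        }
        if (chunkTime + retentionTime > now) {
            // the chunk is still within the retention window
            return false;
        }
        // additionally require that the chunk has been marked unused
        // for at least half the retention time
        return unusedSince != 0 && unusedSince + retentionTime / 2 <= now;
    }
}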
......@@ -270,6 +277,8 @@ public class MVStore {
private IllegalStateException panicException;
private long lastTimeAbsolute;
/**
* Create and open the store.
*
......@@ -337,7 +346,7 @@ public class MVStore {
fileStore.open(fileName, readOnly, encryptionKey);
}
if (fileStore.size() == 0) {
creationTime = getTime();
creationTime = getTimeAbsolute();
lastCommitTime = creationTime;
storeHeader.put("H", 2);
storeHeader.put("blockSize", BLOCK_SIZE);
......@@ -354,7 +363,7 @@ public class MVStore {
Arrays.fill(encryptionKey, (char) 0);
}
}
lastCommitTime = getTime();
lastCommitTime = getTimeSinceCreation();
// setAutoCommitDelay starts the thread, but only if
// the parameter is different from the old value
......@@ -520,13 +529,15 @@ public class MVStore {
}
private Chunk getChunkForVersion(long version) {
Chunk c = lastChunk;
while (true) {
if (c == null || c.version <= version) {
return c;
Chunk newest = null;
for (Chunk c : chunks.values()) {
if (c.version <= version) {
if (newest == null || c.id > newest.id) {
newest = c;
}
}
c = chunks.get(c.id - 1);
}
return newest;
}
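Because chunk ids can now have gaps (a rollback may remove arbitrary chunks), the version lookup above scans all chunks instead of following id - 1 links. A small self-contained illustration with hypothetical data, not the real Chunk class:

import java.util.*;

class ChunkForVersionSketch {
    static final class C {
        final int id;
        final long version;
        C(int id, long version) { this.id = id; this.version = version; }
    }

    // same selection rule as getChunkForVersion(): the newest chunk (highest
    // id) whose version is not newer than the requested one
    static C chunkForVersion(Collection<C> chunks, long version) {
        C newest = null;
        for (C c : chunks) {
            if (c.version <= version && (newest == null || c.id > newest.id)) {
                newest = c;
            }
        }
        return newest;
    }

    public static void main(String[] args) {
        // chunk id 4 was removed earlier, so the id sequence has a gap
        List<C> chunks = Arrays.asList(new C(3, 5), new C(5, 7), new C(6, 9));
        System.out.println(chunkForVersion(chunks, 8).id);   // prints 5
    }
}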
/**
......@@ -546,10 +557,9 @@ public class MVStore {
}
private synchronized void readStoreHeader() {
boolean validHeader = false;
// we don't know yet which chunk and version are the newest
long newestVersion = -1;
long chunkBlock = -1;
Chunk newest = null;
boolean validStoreHeader = false;
// find out which chunk and version are the newest
// read the first two blocks
ByteBuffer fileHeaderBlocks = fileStore.readFully(0, 2 * BLOCK_SIZE);
byte[] buff = new byte[BLOCK_SIZE];
......@@ -578,18 +588,22 @@ public class MVStore {
continue;
}
long version = DataUtils.readHexLong(m, "version", 0);
if (version > newestVersion) {
newestVersion = version;
if (newest == null || version > newest.version) {
validStoreHeader = true;
storeHeader.putAll(m);
chunkBlock = DataUtils.readHexLong(m, "block", 0);
creationTime = DataUtils.readHexLong(m, "created", 0);
validHeader = true;
int chunkId = DataUtils.readHexInt(m, "chunk", 0);
long block = DataUtils.readHexLong(m, "block", 0);
Chunk test = readChunkHeaderAndFooter(block);
if (test != null && test.id == chunkId) {
newest = test;
}
}
} catch (Exception e) {
continue;
}
}
if (!validHeader) {
if (!validStoreHeader) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_FILE_CORRUPT,
"Store header is corrupt: {0}", fileStore);
......@@ -629,58 +643,51 @@ public class MVStore {
creationTime = now;
storeHeader.put("created", creationTime);
}
Chunk footer = readChunkFooter(fileStore.size());
if (footer != null) {
if (footer.version > newestVersion) {
newestVersion = footer.version;
chunkBlock = footer.block;
Chunk test = readChunkFooter(fileStore.size());
if (test != null) {
test = readChunkHeaderAndFooter(test.block);
if (test != null) {
if (newest == null || test.version > newest.version) {
newest = test;
}
}
if (chunkBlock <= 0) {
}
if (newest == null) {
// no chunk
return;
}
// read the chunk header and footer,
// and follow the chain of next chunks
lastChunk = null;
while (true) {
Chunk header;
try {
header = readChunkHeader(chunkBlock);
} catch (Exception e) {
// invalid chunk header: ignore, but stop
if (newest.next == 0 ||
newest.next >= fileStore.size() / BLOCK_SIZE) {
// no (valid) next
break;
}
if (header.version < newestVersion) {
// we have reached the end
test = readChunkHeaderAndFooter(newest.next);
if (test == null || test.id <= newest.id) {
break;
}
footer = readChunkFooter((chunkBlock + header.len) * BLOCK_SIZE);
if (footer == null || footer.id != header.id) {
// invalid chunk footer, or the wrong one
break;
newest = test;
}
lastChunk = header;
newestVersion = header.version;
if (header.next == 0 ||
header.next >= fileStore.size() / BLOCK_SIZE) {
// no (valid) next
break;
setLastChunk(newest);
loadChunkMeta();
// read all chunk headers and footers within the retention time,
// to detect unwritten data after a power failure
verifyLastChunks();
// build the free space list
for (Chunk c : chunks.values()) {
if (c.pageCountLive == 0) {
// remove this chunk in the next save operation
registerFreePage(currentVersion, c.id, 0, 0);
}
chunkBlock = header.next;
long start = c.block * BLOCK_SIZE;
int length = c.len * BLOCK_SIZE;
fileStore.markUsed(start, length);
}
if (lastChunk == null) {
// no valid chunk
return;
}
lastMapId = lastChunk.mapId;
currentVersion = lastChunk.version;
setWriteVersion(currentVersion);
chunks.put(lastChunk.id, lastChunk);
meta.setRootPos(lastChunk.metaRootPos, -1);
private void loadChunkMeta() {
// load the chunk metadata: we can load in any order,
// because loading chunk metadata might recursively load another chunk
for (Iterator<String> it = meta.keyIterator("chunk."); it.hasNext();) {
......@@ -699,18 +706,83 @@ public class MVStore {
chunks.put(c.id, c);
}
}
// build the free space list
for (Chunk c : chunks.values()) {
if (c.pageCountLive == 0) {
// remove this chunk in the next save operation
registerFreePage(currentVersion, c.id, 0, 0);
}
long start = c.block * BLOCK_SIZE;
int length = c.len * BLOCK_SIZE;
fileStore.markUsed(start, length);
private void setLastChunk(Chunk last) {
lastChunk = last;
if (last == null) {
// no valid chunk
lastMapId = 0;
currentVersion = 0;
meta.setRootPos(0, -1);
} else {
lastMapId = last.mapId;
currentVersion = last.version;
chunks.put(last.id, last);
meta.setRootPos(last.metaRootPos, -1);
}
setWriteVersion(currentVersion);
}
private void verifyLastChunks() {
long time = getTimeSinceCreation();
ArrayList<Integer> ids = new ArrayList<Integer>(chunks.keySet());
Collections.sort(ids);
int newestValidChunk = -1;
Chunk old = null;
for (Integer chunkId : ids) {
Chunk c = chunks.get(chunkId);
if (old != null && c.time < old.time) {
// old chunk (maybe leftover from a previous crash)
break;
}
old = c;
if (c.time + retentionTime < time) {
// old chunk, no need to verify
newestValidChunk = c.id;
continue;
}
Chunk test = readChunkHeaderAndFooter(c.block);
if (test == null || test.id != c.id) {
break;
}
newestValidChunk = chunkId;
}
Chunk newest = chunks.get(newestValidChunk);
if (newest != lastChunk) {
// to avoid re-using newer chunks later on, we could clear their
// headers and footers, but we might not know about all of them, so
// that could be incomplete; however, we verify that newer chunks are
// written after older chunks, so we are safe
rollbackTo(newest == null ? 0 : newest.version);
}
}
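A self-contained sketch of the scan above, using hypothetical data instead of real chunks (the retention-time shortcut is left out): the scan stops at the first chunk that cannot be read back consistently, and the store rolls back to the newest chunk before it, even if later chunks happen to be complete on disk.

import java.util.*;

class VerifyLastChunksSketch {
    public static void main(String[] args) {
        // chunk id -> "header and footer could be re-read and they match"
        Map<Integer, Boolean> onDisk = new TreeMap<Integer, Boolean>();
        onDisk.put(10, true);
        onDisk.put(11, true);
        onDisk.put(12, false);   // e.g. its footer never left the OS cache
        onDisk.put(13, true);    // complete, but written after the broken chunk

        int newestValidChunk = -1;
        for (Map.Entry<Integer, Boolean> e : onDisk.entrySet()) {
            if (!e.getValue()) {
                // data written after a broken chunk cannot be trusted when
                // writes may have been re-ordered
                break;
            }
            newestValidChunk = e.getKey();
        }
        System.out.println(newestValidChunk);   // 11 -> roll back to chunk 11's version
    }
}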
/**
* Read a chunk header and footer, and verify the stored data is consistent.
*
* @param block the block
* @return the chunk, or null if the header or footer is missing or they are
* not consistent with each other
*/
private Chunk readChunkHeaderAndFooter(long block) {
Chunk header;
try {
header = readChunkHeader(block);
} catch (Exception e) {
// invalid chunk header: ignore, but stop
return null;
}
if (header == null) {
return null;
}
Chunk footer = readChunkFooter((block + header.len) * BLOCK_SIZE);
if (footer == null || footer.id != header.id) {
return null;
}
return header;
}
/**
* Try to read a chunk footer.
*
......@@ -782,7 +854,8 @@ public class MVStore {
if (closed) {
return;
}
if (fileStore != null && !fileStore.isReadOnly()) {
FileStore f = fileStore;
if (f != null && !f.isReadOnly()) {
stopBackgroundThread();
if (hasUnsavedChanges()) {
commitAndSave();
......@@ -979,11 +1052,12 @@ public class MVStore {
private long storeNowTry() {
freeUnusedChunks();
int currentUnsavedPageCount = unsavedMemory;
long storeVersion = currentStoreVersion;
long version = ++currentVersion;
setWriteVersion(version);
long time = getTime();
long time = getTimeSinceCreation();
lastCommitTime = time;
retainChunk = null;
......@@ -1166,7 +1240,6 @@ public class MVStore {
// may only shrink after the store header was written
shrinkFileIfPossible(1);
}
for (MVMap<?, ?> m : changed) {
Page p = m.getRoot();
if (p.getTotalCount() > 0) {
......@@ -1192,7 +1265,7 @@ public class MVStore {
}
Set<Integer> referenced = collectReferencedChunks();
ArrayList<Chunk> free = New.arrayList();
long time = getTime();
long time = getTimeSinceCreation();
for (Chunk c : chunks.values()) {
if (!referenced.contains(c.id)) {
free.add(c);
......@@ -1347,12 +1420,14 @@ public class MVStore {
}
private boolean canOverwriteChunk(Chunk c, long time) {
if (retentionTime >= 0) {
if (c.time + retentionTime > time) {
return false;
}
if (c.unused == 0 || c.unused + retentionTime / 2 > time) {
return false;
}
}
Chunk r = retainChunk;
if (r != null && c.version > r.version) {
return false;
......@@ -1360,8 +1435,21 @@ public class MVStore {
return true;
}
private long getTime() {
return System.currentTimeMillis() - creationTime;
private long getTimeSinceCreation() {
return Math.max(0, getTimeAbsolute() - creationTime);
}
private long getTimeAbsolute() {
long now = System.currentTimeMillis();
if (lastTimeAbsolute != 0 && now < lastTimeAbsolute) {
// time seems to have run backwards - this can happen
// when the system time is adjusted, for example
// on a leap second
now = lastTimeAbsolute;
} else {
lastTimeAbsolute = now;
}
return now;
}
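The clamp above keeps the time reported by getTimeSinceCreation() monotonic, presumably so that checks which assume chunk times never decrease (for example the verifyLastChunks() scan earlier in this diff, which treats a chunk with a smaller time than its predecessor as a leftover from a crash) are not confused by a clock adjustment. A standalone illustration of the same idea:

// Standalone illustration of the getTimeAbsolute() clamp above.
final class MonotonicClock {
    private long last;

    synchronized long now() {
        long now = System.currentTimeMillis();
        if (last != 0 && now < last) {
            // the system clock was adjusted backwards: keep the old reading
            // so that elapsed-time calculations never go backwards
            now = last;
        } else {
            last = now;
        }
        return now;
    }
}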
/**
......@@ -1394,20 +1482,12 @@ public class MVStore {
c.maxLenLive += f.maxLenLive;
c.pageCountLive += f.pageCountLive;
if (c.pageCountLive < 0 && c.pageCountLive > -MARKED_FREE) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_INTERNAL,
"Corrupt page count {0}", c.pageCountLive);
// can happen after a rollback
c.pageCountLive = 0;
}
if (c.maxLenLive < 0 && c.maxLenLive > -MARKED_FREE) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_INTERNAL,
"Corrupt max length {0}", c.maxLenLive);
}
if (c.pageCountLive <= 0 && c.maxLenLive > 0 ||
c.maxLenLive <= 0 && c.pageCountLive > 0) {
throw DataUtils.newIllegalStateException(
DataUtils.ERROR_INTERNAL,
"Corrupt max length {0}", c.maxLenLive);
// can happen after a rollback
c.maxLenLive = 0;
}
modified.add(c);
}
......@@ -1441,6 +1521,9 @@ public class MVStore {
if (savedPercent < minPercent) {
return;
}
if (!closed) {
sync();
}
fileStore.truncate(end);
}
......@@ -1548,7 +1631,7 @@ public class MVStore {
int oldRetentionTime = retentionTime;
boolean oldReuse = reuseSpace;
try {
retentionTime = 0;
retentionTime = -1;
freeUnusedChunks();
if (fileStore.getFillRate() > targetFillRate) {
return false;
......@@ -1618,7 +1701,7 @@ public class MVStore {
buff.position(0);
c.writeChunkHeader(buff, chunkHeaderLen);
buff.position(length - Chunk.FOOTER_LENGTH);
buff.put(lastChunk.getFooterBytes());
buff.put(c.getFooterBytes());
buff.position(0);
write(end, buff.getBuffer());
releaseWriteBuffer(buff);
......@@ -1629,7 +1712,6 @@ public class MVStore {
// update the metadata (store at the end of the file)
reuseSpace = false;
commitAndSave();
sync();
// now re-use the empty space
......@@ -1655,7 +1737,7 @@ public class MVStore {
c.block = pos / BLOCK_SIZE;
c.writeChunkHeader(buff, chunkHeaderLen);
buff.position(length - Chunk.FOOTER_LENGTH);
buff.put(lastChunk.getFooterBytes());
buff.put(c.getFooterBytes());
buff.position(0);
write(pos, buff.getBuffer());
releaseWriteBuffer(buff);
......@@ -1674,7 +1756,11 @@ public class MVStore {
* implementation calls FileChannel.force(true).
*/
public void sync() {
fileStore.sync();
checkOpen();
FileStore f = fileStore;
if (f != null) {
f.sync();
}
}
/**
......@@ -1723,7 +1809,7 @@ public class MVStore {
long maxLengthSum = 0;
long maxLengthLiveSum = 0;
long time = getTime();
long time = getTimeSinceCreation();
for (Chunk c : chunks.values()) {
// ignore young chunks, because we don't optimize those
......@@ -2061,6 +2147,7 @@ public class MVStore {
if (oldMeta == null) {
return false;
}
try {
for (Iterator<String> it = oldMeta.keyIterator("chunk.");
it.hasNext();) {
String chunkKey = it.next();
......@@ -2068,8 +2155,19 @@ public class MVStore {
break;
}
if (!meta.containsKey(chunkKey)) {
String s = oldMeta.get(chunkKey);
Chunk c2 = Chunk.fromString(s);
Chunk test = readChunkHeaderAndFooter(c2.block);
if (test == null || test.id != c2.id) {
return false;
}
// we store this chunk
chunks.put(c2.id, c2);
}
}
} catch (IllegalStateException e) {
// the chunk where the metadata is stored is missing
return false;
}
return true;
}
......@@ -2183,43 +2281,43 @@ public class MVStore {
meta.rollbackTo(version);
metaChanged = false;
boolean loadFromFile = false;
// get the largest chunk with a version
// higher than or equal to the requested version
Chunk removeChunksNewerThan = null;
Chunk c = lastChunk;
while (true) {
if (c == null || c.version < version) {
break;
// find out which chunks to remove,
// and which is the newest chunk to keep
// (the chunk list can have gaps)
ArrayList<Integer> remove = new ArrayList<Integer>();
Chunk keep = null;
for (Chunk c : chunks.values()) {
if (c.version > version) {
remove.add(c.id);
} else if (keep == null || keep.id < c.id) {
keep = c;
}
removeChunksNewerThan = c;
c = chunks.get(c.id - 1);
}
Chunk last = lastChunk;
if (removeChunksNewerThan != null &&
last.version > removeChunksNewerThan.version) {
if (remove.size() > 0) {
// remove the youngest first, so we don't create gaps
// (in case we remove many chunks)
Collections.sort(remove, Collections.reverseOrder());
revertTemp(version);
loadFromFile = true;
while (true) {
last = lastChunk;
if (last == null) {
break;
} else if (last.version <= removeChunksNewerThan.version) {
break;
}
chunks.remove(lastChunk.id);
long start = last.block * BLOCK_SIZE;
int length = last.len * BLOCK_SIZE;
for (int id : remove) {
Chunk c = chunks.remove(id);
long start = c.block * BLOCK_SIZE;
int length = c.len * BLOCK_SIZE;
fileStore.free(start, length);
// need to overwrite the chunk,
// so it can not be used
// overwrite the chunk,
// so it is not used later on
WriteBuffer buff = getWriteBuffer();
buff.limit(length);
// buff.clear() does not set the data
Arrays.fill(buff.getBuffer().array(), (byte) 0);
write(start, buff.getBuffer());
releaseWriteBuffer(buff);
lastChunk = chunks.get(lastChunk.id - 1);
// only really needed if we remove many chunks, when writes are
// re-ordered - but we do it always, because rollback is not
// performance critical
sync();
}
lastChunk = keep;
writeStoreHeader();
readStoreHeader();
}
......@@ -2233,12 +2331,10 @@ public class MVStore {
m.setRootPos(getRootPos(meta, id), -1);
}
}
}
// rollback might have rolled back the stored chunk metadata as well
if (lastChunk != null) {
c = chunks.get(lastChunk.id - 1);
if (c != null) {
for (Chunk c : chunks.values()) {
meta.put(Chunk.getMetaKey(c.id), c.asString());
}
}
......@@ -2371,7 +2467,7 @@ public class MVStore {
// could also commit when there are many unsaved pages,
// but according to a test it doesn't really help
long time = getTime();
long time = getTimeSinceCreation();
if (time <= lastCommitTime + autoCommitDelay) {
return;
}
......
......@@ -795,6 +795,7 @@ kill -9 `jps -l | grep "org.h2.test." | cut -d " " -f 1`
addTest(new TestMVTableEngine());
addTest(new TestObjectDataType());
addTest(new TestRandomMapOps());
addTest(new TestReorderWrites());
addTest(new TestSpinLock());
addTest(new TestStreamStore());
addTest(new TestTransactionStore());
......
......@@ -13,6 +13,7 @@ import java.util.Map;
import java.util.Random;
import org.h2.mvstore.MVStore;
import org.h2.mvstore.MVStoreTool;
import org.h2.store.fs.FilePath;
import org.h2.store.fs.FileUtils;
import org.h2.test.TestBase;
......@@ -24,6 +25,8 @@ import org.h2.test.utils.FilePathReorderWrites;
*/
public class TestReorderWrites extends TestBase {
private static final boolean LOG = false;
/**
* Run just this test.
*
......@@ -35,16 +38,16 @@ public class TestReorderWrites extends TestBase {
@Override
public void test() throws Exception {
testMVStore();
testFileSystem();
// testMVStore();
}
private void testMVStore() {
FilePathReorderWrites fs = FilePathReorderWrites.register();
String fileName = "reorder:memFS:test.mv";
Random r = new Random(1);
for (int i = 0; i < 100; i++) {
System.out.println(i + " tst --------------------------------");
for (int i = 0; i < 1000; i++) {
log(i + " --------------------------------");
Random r = new Random(i);
fs.setPowerOffCountdown(100, i);
FileUtils.delete(fileName);
MVStore store = new MVStore.Builder().
......@@ -52,12 +55,12 @@ public class TestReorderWrites extends TestBase {
autoCommitDisabled().open();
// store.setRetentionTime(10);
Map<Integer, byte[]> map = store.openMap("data");
map.put(0, new byte[1]);
map.put(-1, new byte[1]);
store.commit();
// if (r.nextBoolean()) {
store.getFileStore().sync();
//}
fs.setPowerOffCountdown(4 + r.nextInt(20), i);
int stop = 4 + r.nextInt(20);
log("synched start");
fs.setPowerOffCountdown(stop, i);
try {
for (int j = 1; j < 100; j++) {
Map<Integer, Integer> newMap = store.openMap("d" + j);
......@@ -69,31 +72,61 @@ public class TestReorderWrites extends TestBase {
} else {
map.put(key, new byte[len]);
}
log("op " + j + ": ");
store.commit();
switch (r.nextInt(10)) {
case 0:
log("op compact");
store.compact(100, 10 * 1024);
break;
case 1:
log("op compactMoveChunks");
store.compactMoveChunks();
log("op compactMoveChunks done");
break;
}
}
// write has to fail at some point
fail();
} catch (IllegalStateException e) {
log("stop " + e);
// expected
}
try {
store.close();
System.out.println("-------------------------------- test");
} catch (IllegalStateException e) {
// expected
store.closeImmediately();
}
log("verify");
fs.setPowerOffCountdown(100, 0);
System.out.println("file size: " + FileUtils.size(fileName));
if (LOG) {
MVStoreTool.dump(fileName, true);
}
store = new MVStore.Builder().
fileName(fileName).
autoCommitDisabled().open();
map = store.openMap("data");
assertEquals(1, map.get(0).length);
if (!map.containsKey(-1)) {
fail("key not found, size=" + map.size() + " i=" + i);
} else {
assertEquals("i=" + i, 1, map.get(-1).length);
}
for (int j = 0; j < 100; j++) {
Map<Integer, Integer> newMap = store.openMap("d" + j);
newMap.get(j);
}
// map.keySet();
map.keySet();
store.close();
}
}
private static void log(String message) {
if (LOG) {
System.out.println(message);
}
}
private void testFileSystem() throws IOException {
FilePathReorderWrites fs = FilePathReorderWrites.register();
String fileName = "reorder:memFS:test";
......
......@@ -948,6 +948,10 @@ public class TestMVStore extends TestBase {
break;
}
}
// the last chunk is at the end
s.setReuseSpace(false);
map = s.openMap("test2");
map.put(1, new byte[1000]);
s.close();
FilePath f = FilePath.get(fileName);
int blockSize = 4 * 1024;
......@@ -976,6 +980,8 @@ public class TestMVStore extends TestBase {
s = openStore(fileName);
map = s.openMap("test");
assertEquals(100, map.get(0).length);
map = s.openMap("test2");
assertFalse(map.containsKey(1));
s.close();
} else {
// both headers are corrupt
......@@ -1414,7 +1420,7 @@ public class TestMVStore extends TestBase {
assertEquals(0, m.size());
s.commit();
// ensure only nodes are read, but not leaves
assertEquals(41, s.getFileStore().getReadCount());
assertEquals(45, s.getFileStore().getReadCount());
assertTrue(s.getFileStore().getWriteCount() < 5);
s.close();
}
......@@ -1579,7 +1585,6 @@ public class TestMVStore extends TestBase {
data.put("2", "World");
s.commit();
assertEquals(1, s.getCurrentVersion());
assertFalse(m.containsKey("chunk.2"));
assertEquals("[data]", s.getMapNames().toString());
assertEquals("data", s.getMapName(data.getId()));
......@@ -1599,8 +1604,6 @@ public class TestMVStore extends TestBase {
s.rollbackTo(1);
assertEquals("Hello", data.get("1"));
assertEquals("World", data.get("2"));
assertFalse(m.containsKey("chunk.1"));
assertFalse(m.containsKey("chunk.2"));
s.close();
}
......
......@@ -105,10 +105,11 @@ public class FilePathReorderWrites extends FilePathWrapper {
if (powerFailureCountdown == 0) {
return;
}
if (--powerFailureCountdown > 0) {
return;
if (powerFailureCountdown < 0) {
throw POWER_FAILURE;
}
if (powerFailureCountdown >= -1) {
powerFailureCountdown--;
if (powerFailureCountdown == 0) {
powerFailureCountdown--;
throw POWER_FAILURE;
}
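Based on the lines shown above, the countdown appears to work as follows (a rough, hypothetical re-statement; the authoritative version is FilePathReorderWrites.checkError itself): 0 means no simulated power failure is scheduled, a positive value counts down the remaining checked operations, and once the failure has fired the counter stays negative so that every later operation fails as well.

// Rough sketch of the countdown contract suggested by the diff above
// (illustrative, not the real FilePathReorderWrites implementation;
// IllegalStateException stands in for the POWER_FAILURE constant).
final class PowerFailureCountdown {
    private int countdown;

    PowerFailureCountdown(int countdown) {
        this.countdown = countdown;
    }

    void check() {
        if (countdown == 0) {
            return;                 // no failure scheduled
        }
        if (countdown < 0) {
            // a failure already happened: keep failing
            throw new IllegalStateException("simulated power failure");
        }
        countdown--;
        if (countdown == 0) {
            countdown--;            // stay negative so later calls fail as well
            throw new IllegalStateException("simulated power failure");
        }
    }
}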
......@@ -122,7 +123,7 @@ public class FilePathReorderWrites extends FilePathWrapper {
IOUtils.copy(in, out);
FileChannel base = getBase().open(mode);
FileChannel readBase = copy.open(mode);
return new FilePowerFailure(this, base, readBase);
return new FileReorderWrites(this, base, readBase);
}
@Override
......@@ -140,7 +141,7 @@ public class FilePathReorderWrites extends FilePathWrapper {
/**
* A file that checks for errors before each write operation.
*/
class FilePowerFailure extends FileBase {
class FileReorderWrites extends FileBase {
private final FilePathReorderWrites file;
/**
......@@ -162,7 +163,7 @@ class FilePowerFailure extends FileBase {
private int id;
FilePowerFailure(FilePathReorderWrites file, FileChannel base, FileChannel readBase) {
FileReorderWrites(FilePathReorderWrites file, FileChannel base, FileChannel readBase) {
this.file = file;
this.base = base;
this.readBase = readBase;
......