Commit f7da04b3 authored by Thomas Mueller

Page store: save a few percent disk space

Parent 827b5094
......@@ -99,7 +99,8 @@ Many thanks for those who helped by finding and reporting bugs, gave valuable fe
spread the word and have translated this project. Also many thanks to the donors who contributed
via PayPal:
</p>
<ul><li><a href="http://skycash.com">SkyCash, Poland</a>
<ul><li><a href="http://www.netsuxxess.de">NetSuxxess InfoSystems GmbH, Germany</a>
</li><li><a href="http://skycash.com">SkyCash, Poland</a>
</li><li>Donald Bleyl, USA
</li><li>lumber-mill.co.jp, Japan
</li><li>Frank Berger, Germany
......
......@@ -449,6 +449,8 @@ See also <a href="build.html#providing_patches">Providing Patches</a>.
if the file system is on a remote share (see Google Group 'Lock file modification time is in the future').
</li><li>Document internal features such as BELONGS_TO_TABLE, NULL_TO_DEFAULT, SEQUENCE.
</li><li>Issue 107: Prefer using the ORDER BY index if LIMIT is used.
</li><li>Support reading sequences using DatabaseMetaData.getTables(null, null, null, new String[]{"SEQUENCE"}).
See PostgreSQL.
</li></ul>
<h2>Not Planned</h2>
......
......@@ -36,7 +36,7 @@ public class BtreeNode extends BtreePage {
BtreeNode(BtreeIndex index, DataPage s) throws SQLException {
super(index);
int len = s.readInt();
int[] array = MemoryUtils.newInts(len);
int[] array = MemoryUtils.newIntArray(len);
for (int i = 0; i < array.length; i++) {
array[i] = s.readInt();
}
......
......@@ -24,7 +24,7 @@ import org.h2.store.PageStore;
* </li><li>4-4: page type
* </li><li>5-8: index id
* </li><li>9-10: entry count
* </li><li>11-: list of key / offset pairs (4 bytes key, 2 bytes offset)
* </li><li>11-: list offsets (2 bytes each)
* </li><li>data
* </li></ul>
*/
......
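A rough sense of scale for the leaf change above (my own estimate, not a figure from the commit): the key is dropped from each directory entry because it can be re-read from the row bytes that the 2-byte offset points at, so a leaf holding 100 entries saves about 400 bytes of directory per page; summed over all b-tree pages, this is part of where the "few percent" in the commit message comes from.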
......@@ -28,14 +28,14 @@ import org.h2.util.MemoryUtils;
* <li>9-10: entry count</li>
* <li>11-14: row count of all children (-1 if not known)</li>
* <li>15-18: rightmost child page id</li>
* <li>19- entries: 4 bytes leaf page id, 4 bytes offset to data</li>
* <li>19- entries: 4 bytes leaf page id, 2 bytes offset to data</li>
* </ul>
* The row is the largest row of the respective child, meaning
* row[0] is the largest row of child[0].
*/
public class PageBtreeNode extends PageBtree {
private static final int CHILD_OFFSET_PAIR_LENGTH = 8;
private static final int CHILD_OFFSET_PAIR_LENGTH = 6;
private static final int CHILD_OFFSET_PAIR_START = 19;
/**
......@@ -85,10 +85,10 @@ public class PageBtreeNode extends PageBtree {
childPageIds = new int[entryCount + 1];
childPageIds[entryCount] = data.readInt();
rows = PageStore.newSearchRows(entryCount);
offsets = MemoryUtils.newInts(entryCount);
offsets = MemoryUtils.newIntArray(entryCount);
for (int i = 0; i < entryCount; i++) {
childPageIds[i] = data.readInt();
offsets[i] = data.readInt();
offsets[i] = data.readShortInt();
}
check();
start = data.length();
......@@ -243,7 +243,7 @@ public class PageBtreeNode extends PageBtree {
entryCount = 0;
childPageIds = new int[] { page1.getPos() };
rows = new SearchRow[0];
offsets = MemoryUtils.EMPTY_INTS;
offsets = MemoryUtils.EMPTY_INT_ARRAY;
addChild(0, page2.getPos(), pivot);
rowCount = page1.getRowCount() + page2.getRowCount();
check();
......@@ -370,7 +370,7 @@ public class PageBtreeNode extends PageBtree {
data.writeInt(childPageIds[entryCount]);
for (int i = 0; i < entryCount; i++) {
data.writeInt(childPageIds[i]);
data.writeInt(offsets[i]);
data.writeShortInt(offsets[i]);
}
for (int i = 0; i < entryCount; i++) {
index.writeRow(data, offsets[i], rows[i], onlyPosition);
......@@ -395,7 +395,7 @@ public class PageBtreeNode extends PageBtree {
Message.throwInternalError();
}
SearchRow[] newRows = PageStore.newSearchRows(entryCount);
int[] newOffsets = MemoryUtils.newInts(entryCount);
int[] newOffsets = MemoryUtils.newIntArray(entryCount);
int[] newChildPageIds = new int[entryCount + 1];
System.arraycopy(offsets, 0, newOffsets, 0, Math.min(entryCount, i));
System.arraycopy(rows, 0, newRows, 0, Math.min(entryCount, i));
......
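The PageBtreeNode change shrinks each child/offset pair from 8 to 6 bytes by storing the row offset as a 2-byte short rather than a 4-byte int; that is safe because an offset never exceeds the page size, which stays well below 64 KB. A minimal sketch of such a 2-byte offset codec (my own illustration, assuming big-endian order like H2's short-int routines):

    public class ShortOffsetCodec {

        static void writeShortInt(byte[] page, int pos, int x) {
            page[pos] = (byte) (x >> 8); // high byte first
            page[pos + 1] = (byte) x;    // low byte
        }

        static int readShortInt(byte[] page, int pos) {
            return ((page[pos] & 0xff) << 8) | (page[pos + 1] & 0xff);
        }

        public static void main(String[] args) {
            byte[] page = new byte[4096];
            writeShortInt(page, 19, 4090);              // offset near the page end
            System.out.println(readShortInt(page, 19)); // prints 4090
        }
    }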
......@@ -50,7 +50,7 @@ abstract class PageData extends Page {
/**
* The row keys.
*/
protected int[] keys;
protected long[] keys;
/**
* Whether the data page is up-to-date.
......@@ -83,11 +83,11 @@ abstract class PageData extends Page {
* @param key the key (may not exist)
* @return the matching or next index
*/
int find(int key) {
int find(long key) {
int l = 0, r = entryCount;
while (l < r) {
int i = (l + r) >>> 1;
int k = keys[i];
long k = keys[i];
if (k == key) {
return i;
} else if (k > key) {
......@@ -122,7 +122,7 @@ abstract class PageData extends Page {
* @param index the index
* @return the key
*/
int getKey(int index) {
long getKey(int index) {
return keys[index];
}
......
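PageData.find, widened above from int to long keys, is a plain binary search that returns either the matching index or the insertion point for a missing key. A self-contained sketch (the branch bodies after the visible "k > key" test are my reconstruction of the standard pattern):

    public class KeySearch {

        static int find(long[] keys, int entryCount, long key) {
            int l = 0, r = entryCount;
            while (l < r) {
                int i = (l + r) >>> 1; // unsigned shift keeps the midpoint correct
                long k = keys[i];
                if (k == key) {
                    return i;
                } else if (k > key) {
                    r = i;
                } else {
                    l = i + 1;
                }
            }
            return l; // first index whose key is larger
        }

        public static void main(String[] args) {
            long[] keys = { 10, 20, 40 };
            System.out.println(find(keys, keys.length, 20)); // 1 (exact match)
            System.out.println(find(keys, keys.length, 30)); // 2 (next index)
        }
    }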
......@@ -26,13 +26,12 @@ import org.h2.store.PageStore;
* </li><li>5-8: table id
* </li><li>9-10: entry count
* </li><li>with overflow: 11-14: the first overflow page id
* </li><li>11- or 15-: list of key / offset pairs (4 bytes key, 2 bytes offset)
* </li><li>11- or 15-: list of key / offset pairs (varLong key, 2 bytes offset)
* </li><li>data
* </li></ul>
*/
public class PageDataLeaf extends PageData {
private static final int KEY_OFFSET_PAIR_LENGTH = 6;
private static final int KEY_OFFSET_PAIR_START = 11;
/**
......@@ -96,13 +95,13 @@ public class PageDataLeaf extends PageData {
}
entryCount = data.readShortInt();
offsets = new int[entryCount];
keys = new int[entryCount];
keys = new long[entryCount];
rows = new Row[entryCount];
if (type == Page.TYPE_DATA_LEAF) {
firstOverflowPageId = data.readInt();
}
for (int i = 0; i < entryCount; i++) {
keys[i] = data.readInt();
keys[i] = data.readVarInt();
offsets[i] = data.readShortInt();
}
start = data.length();
......@@ -112,7 +111,8 @@ public class PageDataLeaf extends PageData {
int rowLength = row.getByteCount(data);
int pageSize = index.getPageStore().getPageSize();
int last = entryCount == 0 ? pageSize : offsets[entryCount - 1];
if (entryCount > 0 && last - rowLength < start + KEY_OFFSET_PAIR_LENGTH) {
int keyOffsetPairLen = 2 + data.getVarLongLen(row.getPos());
if (entryCount > 0 && last - rowLength < start + keyOffsetPairLen) {
// split at the insertion point to better fill pages
// split in half would be:
// if (entryCount > 1) {
......@@ -122,7 +122,7 @@ public class PageDataLeaf extends PageData {
}
int offset = last - rowLength;
int[] newOffsets = new int[entryCount + 1];
int[] newKeys = new int[entryCount + 1];
long[] newKeys = new long[entryCount + 1];
Row[] newRows = new Row[entryCount + 1];
int x;
if (entryCount == 0) {
......@@ -148,7 +148,7 @@ public class PageDataLeaf extends PageData {
last = x == 0 ? pageSize : offsets[x - 1];
offset = last - rowLength;
entryCount++;
start += KEY_OFFSET_PAIR_LENGTH;
start += keyOffsetPairLen;
newOffsets[x] = offset;
newKeys[x] = row.getPos();
newRows[x] = row;
......@@ -205,8 +205,9 @@ public class PageDataLeaf extends PageData {
if (entryCount < 0) {
Message.throwInternalError();
}
int keyOffsetPairLen = 2 + data.getVarLongLen(keys[i]);
int[] newOffsets = new int[entryCount];
int[] newKeys = new int[entryCount];
long[] newKeys = new long[entryCount];
Row[] newRows = new Row[entryCount];
System.arraycopy(offsets, 0, newOffsets, 0, i);
System.arraycopy(keys, 0, newKeys, 0, i);
......@@ -218,7 +219,7 @@ public class PageDataLeaf extends PageData {
}
System.arraycopy(keys, i + 1, newKeys, i, entryCount - i);
System.arraycopy(rows, i + 1, newRows, i, entryCount - i);
start -= KEY_OFFSET_PAIR_LENGTH;
start -= keyOffsetPairLen;
offsets = newOffsets;
keys = newKeys;
rows = newRows;
......@@ -256,7 +257,7 @@ public class PageDataLeaf extends PageData {
}
data.setPos(offsets[at]);
r = index.readRow(data);
r.setPos(keys[at]);
r.setPos((int) keys[at]);
if (firstOverflowPageId != 0) {
rowRef = new SoftReference<Row>(r);
} else {
......@@ -386,7 +387,7 @@ public class PageDataLeaf extends PageData {
data.writeInt(firstOverflowPageId);
}
for (int i = 0; i < entryCount; i++) {
data.writeInt(keys[i]);
data.writeVarLong(keys[i]);
data.writeShortInt(offsets[i]);
}
for (int i = 0; i < entryCount; i++) {
......
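With KEY_OFFSET_PAIR_LENGTH removed, a PageDataLeaf directory entry now costs 2 bytes for the offset plus however many bytes the varLong key needs, so the does-this-row-fit check is computed per insert: the directory grows forward from 'start' while row data grows backward from the end of the page. A small sketch of that check, with hypothetical names (the getVarLongLen loop mirrors the Data method shown later in this commit):

    public class LeafFillCheck {

        static int getVarLongLen(long x) {
            int i = 1;
            while (true) {
                x >>>= 7;
                if (x == 0) {
                    return i;
                }
                i++;
            }
        }

        // true if a row of rowLength bytes with the given key still fits
        static boolean fits(int pageSize, int start, int lastOffset,
                int rowLength, long key) {
            int keyOffsetPairLen = 2 + getVarLongLen(key);
            return lastOffset - rowLength >= start + keyOffsetPairLen;
        }

        public static void main(String[] args) {
            // fresh 1 KB page: directory starts at byte 11, data at the end
            System.out.println(fits(1024, 11, 1024, 100, 1)); // true
            System.out.println(fits(1024, 11, 200, 200, 1));  // false: split
        }
    }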
......@@ -27,7 +27,7 @@ import org.h2.util.MemoryUtils;
* </li><li>9-10: entry count
* </li><li>11-14: row count of all children (-1 if not known)
* </li><li>15-18: rightmost child page id
* </li><li>19- entries: 4 bytes leaf page id, 4 bytes key
* </li><li>19- entries: 4 bytes leaf page id, varLong key
* </li></ul>
* The key is the largest key of the respective child, meaning
* key[0] is the largest key of child[0].
......@@ -36,8 +36,6 @@ public class PageDataNode extends PageData {
private static final int ENTRY_START = 19;
private static final int ENTRY_LENGTH = 8;
/**
* The page ids of the children.
*/
......@@ -47,6 +45,11 @@ public class PageDataNode extends PageData {
private int rowCount = UNKNOWN_ROWCOUNT;
/**
* The number of bytes used in the page.
*/
private int length = ENTRY_START;
PageDataNode(PageScanIndex index, int pageId, Data data) {
super(index, pageId, data);
}
......@@ -79,17 +82,18 @@ public class PageDataNode extends PageData {
rowCount = rowCountStored = data.readInt();
childPageIds = new int[entryCount + 1];
childPageIds[entryCount] = data.readInt();
keys = MemoryUtils.newInts(entryCount);
keys = MemoryUtils.newLongArray(entryCount);
for (int i = 0; i < entryCount; i++) {
childPageIds[i] = data.readInt();
keys[i] = data.readInt();
keys[i] = data.readVarLong();
}
length = data.length();
check();
}
private void addChild(int x, int childPageId, int key) {
private void addChild(int x, int childPageId, long key) {
written = false;
int[] newKeys = new int[entryCount + 1];
long[] newKeys = new long[entryCount + 1];
int[] newChildPageIds = new int[entryCount + 2];
if (childPageIds != null) {
System.arraycopy(childPageIds, 0, newChildPageIds, 0, x + 1);
......@@ -106,9 +110,11 @@ public class PageDataNode extends PageData {
keys = newKeys;
childPageIds = newChildPageIds;
entryCount++;
length += 4 + data.getVarLongLen(key);
}
int addRowTry(Row row) throws SQLException {
int keyOffsetPairLen = 4 + data.getVarLongLen(row.getPos());
while (true) {
int x = find(row.getPos());
PageData page = index.getPage(childPageIds[x], getPos());
......@@ -116,11 +122,10 @@ public class PageDataNode extends PageData {
if (splitPoint == -1) {
break;
}
int maxEntries = (index.getPageStore().getPageSize() - ENTRY_START) / ENTRY_LENGTH;
if (entryCount >= maxEntries) {
if (length + keyOffsetPairLen > index.getPageStore().getPageSize()) {
return entryCount / 2;
}
int pivot = splitPoint == 0 ? row.getPos() : page.getKey(splitPoint - 1);
long pivot = splitPoint == 0 ? row.getPos() : page.getKey(splitPoint - 1);
PageData page2 = page.split(splitPoint);
index.getPageStore().updateRecord(page, true, page.data);
index.getPageStore().updateRecord(page2, true, page2.data);
......@@ -178,10 +183,11 @@ public class PageDataNode extends PageData {
* @param pivot the pivot key
* @param page2 the last child page
*/
void init(PageData page1, int pivot, PageData page2) {
void init(PageData page1, long pivot, PageData page2) {
entryCount = 1;
childPageIds = new int[] { page1.getPos(), page2.getPos() };
keys = new int[] { pivot };
keys = new long[] { pivot };
length += 4 + data.getVarLongLen(pivot);
check();
}
......@@ -195,7 +201,7 @@ public class PageDataNode extends PageData {
* @param key the last key of the current page
* @return the next leaf page
*/
PageDataLeaf getNextPage(int key) throws SQLException {
PageDataLeaf getNextPage(long key) throws SQLException {
int i = find(key) + 1;
if (i > entryCount) {
if (parentPageId == PageData.ROOT) {
......@@ -304,7 +310,10 @@ public class PageDataNode extends PageData {
data.writeInt(childPageIds[entryCount]);
for (int i = 0; i < entryCount; i++) {
data.writeInt(childPageIds[i]);
data.writeInt(keys[i]);
data.writeVarLong(keys[i]);
}
if (length != data.length()) {
Message.throwInternalError("expected pos: " + length + " got: " + data.length());
}
written = true;
}
......@@ -312,10 +321,12 @@ public class PageDataNode extends PageData {
private void removeChild(int i) {
written = false;
entryCount--;
int removedKeyIndex = i < keys.length ? i : i - 1;
length -= 4 + data.getVarLongLen(keys[removedKeyIndex]);
if (entryCount < 0) {
Message.throwInternalError();
}
int[] newKeys = MemoryUtils.newInts(entryCount);
long[] newKeys = MemoryUtils.newLongArray(entryCount);
int[] newChildPageIds = new int[entryCount + 1];
System.arraycopy(keys, 0, newKeys, 0, Math.min(entryCount, i));
System.arraycopy(childPageIds, 0, newChildPageIds, 0, i);
......@@ -340,6 +351,7 @@ public class PageDataNode extends PageData {
p2.keys = keys;
p2.entryCount = entryCount;
p2.parentPageId = parentPageId;
p2.length = length;
store.updateRecord(p2, false, null);
if (parentPageId == ROOT) {
index.setRootPageId(session, newPos);
......
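Because its entries no longer have a fixed 8-byte size, PageDataNode now tracks a running 'length' in bytes: each entry costs a 4-byte child page id plus the varLong length of its key, and the node reports itself full once the next entry would push length past the page size, replacing the old maxEntries computation. A sketch of this bookkeeping (illustrative names, same varLong length loop as above):

    public class NodeLength {

        static final int ENTRY_START = 19; // fixed page header size
        private int length = ENTRY_START;

        static int getVarLongLen(long x) {
            int i = 1;
            while (true) {
                x >>>= 7;
                if (x == 0) {
                    return i;
                }
                i++;
            }
        }

        // false once the node must be split
        boolean addEntryTry(long key, int pageSize) {
            int entryLen = 4 + getVarLongLen(key);
            if (length + entryLen > pageSize) {
                return false;
            }
            length += entryLen;
            return true;
        }

        public static void main(String[] args) {
            NodeLength node = new NodeLength();
            int added = 0;
            while (node.addEntryTry(added, 1024)) {
                added++;
            }
            // small keys cost 5 bytes each, so far more entries fit than
            // the old fixed limit of (1024 - 19) / 8 = 125 would allow
            System.out.println(added); // 188
        }
    }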
......@@ -114,7 +114,7 @@ public class PageScanIndex extends PageIndex implements RowIndex {
if (trace.isDebugEnabled()) {
trace.debug("split " + splitPoint);
}
int pivot = splitPoint == 0 ? row.getPos() : root.getKey(splitPoint - 1);
long pivot = splitPoint == 0 ? row.getPos() : root.getKey(splitPoint - 1);
PageData page1 = root;
PageData page2 = root.split(splitPoint);
int rootPageId = root.getPos();
......
......@@ -99,7 +99,7 @@ public class ViewIndex extends BaseIndex {
}
public double getCost(Session session, int[] masks) throws SQLException {
IntArray masksArray = new IntArray(masks == null ? MemoryUtils.EMPTY_INTS : masks);
IntArray masksArray = new IntArray(masks == null ? MemoryUtils.EMPTY_INT_ARRAY : masks);
CostElement cachedCost = costCache.get(masksArray);
if (cachedCost != null) {
long time = System.currentTimeMillis();
......
......@@ -859,6 +859,12 @@ public class Data extends DataPage {
}
}
/**
* The number of bytes required for a variable size int.
*
* @param x the value
* @return the length in bytes
*/
private int getVarIntLen(int x) {
if ((x & (-1 << 7)) == 0) {
return 1;
......@@ -872,6 +878,11 @@ public class Data extends DataPage {
return 5;
}
/**
* Write a variable size int.
*
* @param x the value
*/
private void writeVarInt(int x) {
while ((x & ~0x7f) != 0) {
data[pos++] = (byte) (0x80 | (x & 0x7f));
......@@ -880,7 +891,12 @@ public class Data extends DataPage {
data[pos++] = (byte) x;
}
private int readVarInt() {
/**
* Read a variable size int.
*
* @return the value
*/
public int readVarInt() {
int b = data[pos++];
if (b >= 0) {
return b;
......@@ -903,7 +919,13 @@ public class Data extends DataPage {
return x | ((b & 0x7f) << 21) | (data[pos++] << 28);
}
private int getVarLongLen(long x) {
/**
* The number of bytes required for a variable size long.
*
* @param x the value
* @return the length in bytes
*/
public int getVarLongLen(long x) {
int i = 1;
while (true) {
x >>>= 7;
......@@ -914,7 +936,12 @@ public class Data extends DataPage {
}
}
private void writeVarLong(long x) {
/**
* Write a variable size long.
*
* @param x the value
*/
public void writeVarLong(long x) {
while ((x & ~0x7f) != 0) {
data[pos++] = (byte) ((x & 0x7f) | 0x80);
x >>>= 7;
......@@ -922,7 +949,12 @@ public class Data extends DataPage {
data[pos++] = (byte) x;
}
private long readVarLong() {
/**
* Read a variable size long.
*
* @return the value
*/
public long readVarLong() {
long x = data[pos++];
if (x >= 0) {
return x;
......
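Underpinning all of this is the variable-size encoding in Data, now public: each byte carries 7 payload bits and the high bit marks a continuation, so a small row key costs 1 or 2 bytes where a plain int always cost 4. A self-contained round-trip sketch along the same lines (not the Data class itself, which reads and writes its own internal buffer):

    public class VarLongCodec {

        // returns the position after the last byte written
        static int writeVarLong(byte[] buf, int pos, long x) {
            while ((x & ~0x7fL) != 0) {
                buf[pos++] = (byte) ((x & 0x7f) | 0x80); // continuation byte
                x >>>= 7;
            }
            buf[pos++] = (byte) x; // final byte, high bit clear
            return pos;
        }

        static long readVarLong(byte[] buf, int pos) {
            long x = 0;
            for (int shift = 0;; shift += 7) {
                long b = buf[pos++];
                x |= (b & 0x7f) << shift;
                if (b >= 0) { // high bit clear: last byte
                    return x;
                }
            }
        }

        public static void main(String[] args) {
            byte[] buf = new byte[10];
            int end = writeVarLong(buf, 0, 300);
            System.out.println(end);                 // 2 bytes instead of 4
            System.out.println(readVarLong(buf, 0)); // 300
        }
    }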
......@@ -77,27 +77,23 @@ import org.h2.value.ValueString;
*/
public class PageStore implements CacheWriter {
// TODO a correctly closed database should not contain log pages
// TODO shrinking: a way to load pages centrally
// TODO shrinking: Page.moveTo(int pageId).
// TODO implement checksum; 0 for empty pages
// TODO in log, don't store empty space
// TODO utf-x: test if it's faster
// TODO after opening the database, delay writing until required
// TODO scan index: support long keys, and use var long
// TODO don't save the direct parent (only root); remove setPageId
// TODO implement checksum; 0 for empty pages
// TODO remove parent, use tableId if required
// TODO maybe remove parent pointer
// TODO replace CRC32
// TODO optimization: try to avoid allocating a byte array per page
// TODO optimization: check if calling Data.getValueLen slows things down
// TODO PageBtreeNode: 4 bytes offset - others use only 2
// TODO undo pages: don't store the middle zeroes
// TODO undo pages compression: try http://en.wikipedia.org/wiki/LZJB
// TODO order pages so that searching for a key only seeks forward
// TODO completely re-use keys of deleted rows; maybe
// remember last page with deleted keys (in the root page?),
// and chain such pages
// TODO delete: only log the key
// TODO update: only log the key and changed values
// TODO detect circles in linked lists
// (input stream, free list, extend pages...)
......@@ -107,7 +103,6 @@ public class PageStore implements CacheWriter {
// TODO recover tool: don't re-do uncommitted operations
// TODO no need to log old page if it was always empty
// TODO don't store default values (store a special value)
// TODO maybe split at the last insertion point
// TODO split files (1 GB max size)
// TODO add a setting (that can be changed at runtime) to call fsync
// and delay on each commit
......@@ -115,11 +110,8 @@ public class PageStore implements CacheWriter {
// TODO test running out of disk space (using a special file system)
// TODO check for file size (exception if not exact size expected)
// TODO implement missing code for STORE_BTREE_ROWCOUNT (maybe enable)
// TODO delete: only log the key
// TODO update: only log the key and changed values
// TODO store dates differently in Data; test moving db to another timezone
// TODO online backup using bsdiff
// TODO trying to insert duplicate key can split a page: not in recovery
// TODO online backup using bsdiff
// TODO when removing DiskFile:
// remove CacheObject.blockCount
......
......@@ -141,7 +141,7 @@ public class TableView extends Table {
public PlanItem getBestPlanItem(Session session, int[] masks) throws SQLException {
PlanItem item = new PlanItem();
item.cost = index.getCost(session, masks);
IntArray masksArray = new IntArray(masks == null ? MemoryUtils.EMPTY_INTS : masks);
IntArray masksArray = new IntArray(masks == null ? MemoryUtils.EMPTY_INT_ARRAY : masks);
ViewIndex i2 = indexCache.get(masksArray);
if (i2 == null || i2.getSession() != session) {
i2 = new ViewIndex(this, index, session, masks);
......
......@@ -83,6 +83,10 @@ public class Recover extends Tool implements DataHandler {
private HashMap<Integer, String> tableMap;
private boolean remove;
private long pageDataEmpty;
private int pageDataRows;
private int pageDataHead;
/**
* Options are case sensitive. Supported options are:
* <table>
......@@ -722,6 +726,9 @@ public class Recover extends Tool implements DataHandler {
PrintWriter writer = null;
int[] pageTypeCount = new int[Page.TYPE_STREAM_DATA + 2];
int emptyPages = 0;
pageDataEmpty = 0;
pageDataRows = 0;
pageDataHead = 0;
try {
writer = getWriter(fileName, ".sql");
writer.println("CREATE ALIAS IF NOT EXISTS READ_CLOB FOR \"" + this.getClass().getName() + ".readClob\";");
......@@ -869,6 +876,7 @@ public class Recover extends Tool implements DataHandler {
// ignore
}
writer.println("-- page count: " + pageCount + " empty: " + emptyPages + " free: " + free);
writer.println("-- page data head: " + pageDataHead + " empty: " + pageDataEmpty + " rows: " + pageDataRows);
for (int i = 0; i < pageTypeCount.length; i++) {
int count = pageTypeCount[i];
if (count > 0) {
......@@ -1160,20 +1168,23 @@ public class Recover extends Tool implements DataHandler {
}
private void dumpPageDataLeaf(FileStore store, int pageSize, PrintWriter writer, Data s, boolean last, long pageId, int entryCount) throws SQLException {
int[] keys = new int[entryCount];
long[] keys = new long[entryCount];
int[] offsets = new int[entryCount];
long next = 0;
if (!last) {
next = s.readInt();
}
int empty = Integer.MAX_VALUE;
int empty = pageSize;
for (int i = 0; i < entryCount; i++) {
keys[i] = s.readInt();
keys[i] = s.readVarLong();
int off = s.readShortInt();
empty = Math.min(off, empty);
offsets[i] = off;
}
pageDataRows += pageSize - empty;
empty = empty - s.length();
pageDataHead += s.length();
pageDataEmpty += empty;
writer.println("-- empty: " + empty);
if (!last) {
DataPage s2 = DataPage.create(this, pageSize);
......@@ -1204,7 +1215,7 @@ public class Recover extends Tool implements DataHandler {
}
}
for (int i = 0; i < entryCount; i++) {
int key = keys[i];
long key = keys[i];
int off = offsets[i];
writer.println("-- [" + i + "] storage: " + storageId + " key: " + key + " off: " + off);
s.setPos(off);
......
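The three counters added to Recover split every data leaf page into regions: pageDataHead is the header plus the key/offset directory, pageDataRows is the row data packed at the end of the page, and pageDataEmpty is the unused gap in between. A toy computation with made-up numbers:

    public class LeafSpaceStats {

        public static void main(String[] args) {
            int pageSize = 1024;
            int headEnd = 40;    // read position after the key/offset directory
            int minOffset = 700; // smallest row offset seen on the page
            int head = headEnd;
            int rows = pageSize - minOffset; // 324 bytes of row data
            int empty = minOffset - headEnd; // 660 bytes wasted
            System.out.println("head: " + head + " empty: " + empty
                    + " rows: " + rows);
        }
    }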
......@@ -21,7 +21,12 @@ public class MemoryUtils {
/**
* A 0-size int array.
*/
public static final int[] EMPTY_INTS = new int[0];
public static final int[] EMPTY_INT_ARRAY = new int[0];
/**
* A 0-size long array.
*/
public static final long[] EMPTY_LONG_ARRAY = new long[0];
private static long lastGC;
private static final int GC_DELAY = 50;
......@@ -113,16 +118,29 @@ public class MemoryUtils {
}
/**
* Create an array of ints with the given size.
* Create an int array with the given size.
*
* @param len the number of elements requested
* @return the int array
*/
public static int[] newInts(int len) {
public static int[] newIntArray(int len) {
if (len == 0) {
return EMPTY_INTS;
return EMPTY_INT_ARRAY;
}
return new int[len];
}
/**
* Create a long array with the given size.
*
* @param len the number of elements requested
* @return the long array
*/
public static long[] newLongArray(int len) {
if (len == 0) {
return EMPTY_LONG_ARRAY;
}
return new long[len];
}
}
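The renamed helpers keep MemoryUtils' pattern of handing out a single shared zero-length array instead of allocating a fresh one for every empty page; a hypothetical usage sketch:

    import org.h2.util.MemoryUtils;

    public class EmptyArrayDemo {
        public static void main(String[] args) {
            long[] keys = MemoryUtils.newLongArray(0);
            // the shared instance comes back: no allocation for the empty case
            System.out.println(keys == MemoryUtils.EMPTY_LONG_ARRAY); // true
        }
    }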