提交 eff9d795 authored 作者: Thomas Mueller's avatar Thomas Mueller

CSV reading is now faster.

上级 f733494f
......@@ -18,7 +18,8 @@ Change Log
<h1>Change Log</h1>
<h2>Next Version (unreleased)</h2>
<ul><li>-
<ul><li>SimpleResultSet.newInstance(SimpleRowSource rs) did not work.
</li><li>CSV reading is now faster.
</li></ul>
<h2>Version 1.1.116 (2009-07-18)</h2>
......
......@@ -6,7 +6,6 @@
*/
package org.h2.bnf;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
......@@ -81,7 +80,6 @@ public class Bnf {
}
private void parse(Reader csv) throws SQLException, IOException {
csv = new BufferedReader(csv);
Rule functions = null;
statements = New.arrayList();
ResultSet rs = Csv.getInstance().read(csv, null);
......
......@@ -6,7 +6,6 @@
*/
package org.h2.table;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
......@@ -920,7 +919,7 @@ public class MetaTable extends Table {
String resource = "/org/h2/res/help.csv";
try {
byte[] data = Resources.get(resource);
Reader reader = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(data)));
Reader reader = new InputStreamReader(new ByteArrayInputStream(data));
ResultSet rs = Csv.getInstance().read(reader, null);
for (int i = 0; rs.next(); i++) {
add(rows, new String[] {
......
......@@ -8,7 +8,6 @@ package org.h2.tools;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
......@@ -25,12 +24,12 @@ import java.sql.Statement;
import java.sql.Types;
import java.util.ArrayList;
import org.h2.constant.SysProperties;
import org.h2.engine.Constants;
import org.h2.message.Message;
import org.h2.util.FileUtils;
import org.h2.util.IOUtils;
import org.h2.util.JdbcUtils;
import org.h2.util.New;
import org.h2.util.StringCache;
/**
* A facility to read from and write to CSV (comma separated values) files.
......@@ -40,7 +39,6 @@ import org.h2.util.StringCache;
public class Csv implements SimpleRowSource {
private String streamCharset = SysProperties.FILE_ENCODING;
private int bufferSize = 8 * 1024;
private String[] columnNames;
private char fieldSeparatorRead = ',';
private char commentLineStart = '#';
......@@ -52,8 +50,11 @@ public class Csv implements SimpleRowSource {
private String nullString = "";
private String fileName;
private Reader input;
private char[] inputBuffer;
private int inputBufferPos;
private int inputBufferStart = -1;
private int inputBufferEnd;
private Writer output;
private int back;
private boolean endOfLine, endOfFile;
private Csv() {
......@@ -229,7 +230,7 @@ public class Csv implements SimpleRowSource {
if (output == null) {
try {
OutputStream out = FileUtils.openFileOutputStream(fileName, false);
out = new BufferedOutputStream(out, bufferSize);
out = new BufferedOutputStream(out, Constants.IO_BUFFER_SIZE);
output = new BufferedWriter(new OutputStreamWriter(out, streamCharset));
} catch (SQLException e) {
close();
......@@ -250,7 +251,6 @@ public class Csv implements SimpleRowSource {
if (escapeCharacter != 0) {
if (fieldDelimiter != 0) {
output.write(fieldDelimiter);
}
output.write(escape(s));
if (fieldDelimiter != 0) {
......@@ -290,14 +290,14 @@ public class Csv implements SimpleRowSource {
if (input == null) {
try {
InputStream in = FileUtils.openFileInputStream(fileName);
in = new BufferedInputStream(in, bufferSize);
in = new BufferedInputStream(in, Constants.IO_BUFFER_SIZE);
input = new InputStreamReader(in, streamCharset);
input = new BufferedReader(input);
} catch (IOException e) {
close();
throw e;
}
}
inputBuffer = new char[Constants.IO_BUFFER_SIZE * 2];
if (columnNames == null) {
readHeader();
}
......@@ -317,132 +317,149 @@ public class Csv implements SimpleRowSource {
}
} else {
list.add(v);
if (endOfLine) {
break;
}
}
}
columnNames = new String[list.size()];
list.toArray(columnNames);
}
private void pushBack(int ch) {
back = ch;
private void pushBack() {
inputBufferPos--;
}
private int readChar() throws IOException {
int ch = back;
if (ch != -1) {
back = -1;
return ch;
} else if (endOfFile) {
if (inputBufferPos >= inputBufferEnd) {
return readBuffer();
}
return inputBuffer[inputBufferPos++];
}
private int readBuffer() throws IOException {
if (endOfFile) {
return -1;
}
ch = input.read();
if (ch < 0) {
int keep;
if (inputBufferStart >= 0) {
keep = inputBufferPos - inputBufferStart;
if (keep > 0) {
char[] src = inputBuffer;
if (keep + Constants.IO_BUFFER_SIZE > src.length) {
inputBuffer = new char[src.length * 2];
}
System.arraycopy(src, inputBufferStart, inputBuffer, 0, keep);
}
inputBufferStart = 0;
} else {
keep = 0;
}
inputBufferPos = keep;
int len = input.read(inputBuffer, keep, Constants.IO_BUFFER_SIZE);
if (len == -1) {
// ensure bufferPos > bufferEnd
// even after pushBack
inputBufferEnd = -1024;
endOfFile = true;
close();
// ensure the right number of characters are read
// in case the input buffer is still used
inputBufferPos++;
return -1;
}
return ch;
inputBufferEnd = keep + len;
return inputBuffer[inputBufferPos++];
}
private String readValue() throws IOException {
endOfLine = false;
String value = null;
outer:
inputBufferStart = inputBufferPos;
while (true) {
int ch = readChar();
if (ch < 0 || ch == '\r' || ch == '\n') {
endOfLine = true;
break;
} else if (ch == fieldSeparatorRead) {
// null
break;
} else if (ch <= ' ') {
// ignore spaces
continue;
} else if (ch == fieldDelimiter) {
if (ch == fieldDelimiter) {
// delimited value
StringBuilder buff = new StringBuilder();
boolean containsEscape = false;
inputBufferStart = inputBufferPos;
int sep;
while (true) {
ch = readChar();
if (ch < 0) {
value = buff.toString();
break outer;
} else if (ch == fieldDelimiter) {
ch = readChar();
if (ch == fieldDelimiter) {
buff.append((char) ch);
} else {
pushBack(ch);
ch = readChar();
if (ch != fieldDelimiter) {
sep = 2;
break;
}
containsEscape = true;
} else if (ch == escapeCharacter) {
buff.append((char) ch);
ch = readChar();
if (ch < 0) {
sep = 1;
break;
}
containsEscape = true;
buff.append((char) ch);
} else {
buff.append((char) ch);
} else if (ch < 0) {
sep = 1;
break;
}
}
value = buff.toString();
String s = new String(inputBuffer, inputBufferStart, inputBufferPos - inputBufferStart - sep);
if (containsEscape) {
value = unEscape(value);
s = unEscape(s);
}
inputBufferStart = -1;
while (true) {
ch = readChar();
if (ch < 0) {
if (ch == fieldSeparatorRead) {
break;
} else if (ch == fieldSeparatorRead) {
} else if (ch == '\n' || ch < 0 || ch == '\r') {
endOfLine = true;
break;
} else if (ch == ' ' || ch == '\t') {
// ignore
} else if (ch == '\r' || ch == '\n') {
pushBack(ch);
endOfLine = true;
break;
} else {
pushBack(ch);
pushBack();
break;
}
ch = readChar();
}
break;
return s;
} else if (ch == '\n' || ch < 0 || ch == '\r') {
endOfLine = true;
return null;
} else if (ch == fieldSeparatorRead) {
// null
return null;
} else if (ch <= ' ') {
// ignore spaces
continue;
} else if (ch == commentLineStart) {
// comment until end of line
inputBufferStart = -1;
while (true) {
ch = readChar();
if (ch < 0 || ch == '\r' || ch == '\n') {
if (ch == '\n' || ch < 0 || ch == '\r') {
break;
}
}
endOfLine = true;
break;
return null;
} else {
// un-delimited value
StringBuilder buff = new StringBuilder();
buff.append((char) ch);
while (true) {
ch = readChar();
if (ch == fieldSeparatorRead) {
break;
} else if (ch == '\r' || ch == '\n') {
pushBack(ch);
} else if (ch == '\n' || ch < 0 || ch == '\r') {
endOfLine = true;
break;
} else if (ch < 0) {
break;
}
buff.append((char) ch);
}
String s = new String(inputBuffer, inputBufferStart, inputBufferPos - inputBufferStart - 1);
inputBufferStart = -1;
// check un-delimited value for nullString
value = readNull(buff.toString().trim());
break;
return readNull(s.trim());
}
}
// save memory
return StringCache.get(value);
}
private String readNull(String s) {
......@@ -455,9 +472,12 @@ public class Csv implements SimpleRowSource {
char[] chars = null;
while (true) {
int idx = s.indexOf(escapeCharacter, start);
if (idx < 0) {
idx = s.indexOf(fieldDelimiter, start);
if (idx < 0) {
break;
}
}
if (chars == null) {
chars = s.toCharArray();
}
......@@ -486,11 +506,11 @@ public class Csv implements SimpleRowSource {
while (true) {
String v = readValue();
if (v == null) {
if (endOfFile && i == 0) {
return null;
}
if (endOfLine) {
if (i == 0) {
if (endOfFile) {
return null;
}
// empty line
continue;
}
......@@ -500,6 +520,9 @@ public class Csv implements SimpleRowSource {
if (i < row.length) {
row[i++] = v;
}
if (endOfLine) {
break;
}
}
} catch (IOException e) {
throw convertException("IOException reading from " + fileName, e);
......
......@@ -732,7 +732,7 @@ public class Recover extends Tool implements DataHandler {
} catch (Exception e) {
writeError(writer, e);
}
DataPage s = DataPage.create(this, 128);
Data s = Data.create(this, 128);
store.seek(0);
store.readFully(s.getBytes(), 0, 128);
s.setPos(48);
......@@ -747,7 +747,7 @@ public class Recover extends Tool implements DataHandler {
writer.println("-- ERROR: page size; using " + pageSize);
}
int pageCount = (int) (length / pageSize);
s = DataPage.create(this, pageSize);
s = Data.create(this, pageSize);
int logFirstTrunkPage = 0, logFirstDataPage = 0;
for (int i = 1;; i++) {
if (i == 3) {
......@@ -778,10 +778,10 @@ public class Recover extends Tool implements DataHandler {
writer.println("-- firstTrunkPage: " + logFirstTrunkPage +
" firstDataPage: " + logFirstDataPage);
s = DataPage.create(this, pageSize);
s = Data.create(this, pageSize);
int free = 0;
for (long page = 3; page < pageCount; page++) {
s = DataPage.create(this, pageSize);
s = Data.create(this, pageSize);
store.seek(page * pageSize);
store.readFully(s.getBytes(), 0, pageSize);
int parentPageId = s.readInt();
......@@ -814,7 +814,7 @@ public class Recover extends Tool implements DataHandler {
break;
}
case Page.TYPE_BTREE_NODE:
writer.println("-- page " + page + ": btree node" + (last ? "(last)" : ""));
writer.println("-- page " + page + ": b-tree node" + (last ? "(last)" : ""));
if (trace) {
dumpPageBtreeNode(writer, s, !last);
}
......@@ -822,7 +822,7 @@ public class Recover extends Tool implements DataHandler {
case Page.TYPE_BTREE_LEAF: {
setStorage(s.readInt());
int entries = s.readShortInt();
writer.println("-- page " + page + ": btree leaf " + (last ? "(last)" : "") + " table: " + storageId + " entries: " + entries);
writer.println("-- page " + page + ": b-tree leaf " + (last ? "(last)" : "") + " table: " + storageId + " entries: " + entries);
if (trace) {
dumpPageBtreeLeaf(writer, s, entries, !last);
}
......@@ -1030,16 +1030,21 @@ public class Recover extends Tool implements DataHandler {
}
}
private void dumpPageBtreeNode(PrintWriter writer, DataPage s, boolean positionOnly) {
private void dumpPageBtreeNode(PrintWriter writer, Data s, boolean positionOnly) {
int entryCount = s.readShortInt();
int rowCount = s.readInt();
int[] children = new int[entryCount + 1];
int[] offsets = new int[entryCount];
children[entryCount] = s.readInt();
int empty = Integer.MAX_VALUE;
for (int i = 0; i < entryCount; i++) {
children[i] = s.readInt();
offsets[i] = s.readInt();
int off = s.readInt();
empty = Math.min(off, empty);
offsets[i] = off;
}
empty = empty - s.length();
writer.println("-- empty: " + empty);
for (int i = 0; i < entryCount; i++) {
int off = offsets[i];
s.setPos(off);
......@@ -1060,7 +1065,7 @@ public class Recover extends Tool implements DataHandler {
writer.println("-- [" + entryCount + "] child: " + children[entryCount] + " rowCount: " + rowCount);
}
private int dumpPageFreeList(PrintWriter writer, DataPage s, int pageSize, long pageId, long pageCount) {
private int dumpPageFreeList(PrintWriter writer, Data s, int pageSize, long pageId, long pageCount) {
int pagesAddressed = PageFreeList.getPagesAddressed(pageSize);
BitField used = new BitField();
for (int i = 0; i < pagesAddressed; i += 8) {
......@@ -1087,11 +1092,16 @@ public class Recover extends Tool implements DataHandler {
return free;
}
private void dumpPageBtreeLeaf(PrintWriter writer, DataPage s, int entryCount, boolean positionOnly) {
private void dumpPageBtreeLeaf(PrintWriter writer, Data s, int entryCount, boolean positionOnly) {
int[] offsets = new int[entryCount];
int empty = Integer.MAX_VALUE;
for (int i = 0; i < entryCount; i++) {
offsets[i] = s.readShortInt();
int off = s.readShortInt();
empty = Math.min(off, empty);
offsets[i] = off;
}
empty = empty - s.length();
writer.println("-- empty: " + empty);
for (int i = 0; i < entryCount; i++) {
int off = offsets[i];
s.setPos(off);
......@@ -1111,17 +1121,22 @@ public class Recover extends Tool implements DataHandler {
}
}
private void dumpPageDataLeaf(FileStore store, int pageSize, PrintWriter writer, DataPage s, boolean last, long pageId, int entryCount) throws SQLException {
private void dumpPageDataLeaf(FileStore store, int pageSize, PrintWriter writer, Data s, boolean last, long pageId, int entryCount) throws SQLException {
int[] keys = new int[entryCount];
int[] offsets = new int[entryCount];
long next = 0;
if (!last) {
next = s.readInt();
}
int empty = Integer.MAX_VALUE;
for (int i = 0; i < entryCount; i++) {
keys[i] = s.readInt();
offsets[i] = s.readShortInt();
int off = s.readShortInt();
empty = Math.min(off, empty);
offsets[i] = off;
}
empty = empty - s.length();
writer.println("-- empty: " + empty);
if (!last) {
DataPage s2 = DataPage.create(this, pageSize);
s.setPos(pageSize);
......
......@@ -92,18 +92,18 @@ public class TestCsv extends TestBase {
csv.setNullString("\\N");
ResultSet rs = csv.read(f.getPath(), null, "UTF8");
ResultSetMetaData meta = rs.getMetaData();
assertEquals(meta.getColumnCount(), 4);
assertEquals(meta.getColumnLabel(1), "A");
assertEquals(meta.getColumnLabel(2), "B");
assertEquals(meta.getColumnLabel(3), "C");
assertEquals(meta.getColumnLabel(4), "D");
assertEquals(4, meta.getColumnCount());
assertEquals("A", meta.getColumnLabel(1));
assertEquals("B", meta.getColumnLabel(2));
assertEquals("C", meta.getColumnLabel(3));
assertEquals("D", meta.getColumnLabel(4));
assertTrue(rs.next());
assertEquals(rs.getString(1), null);
assertEquals(rs.getString(2), "");
assertEquals(null, rs.getString(1));
assertEquals("", rs.getString(2));
// null is never quoted
assertEquals(rs.getString(3), "\\N");
assertEquals("\\N", rs.getString(3));
// an empty string is always parsed as null
assertEquals(rs.getString(4), null);
assertEquals(null, rs.getString(4));
assertFalse(rs.next());
Connection conn = getConnection("csv");
......@@ -177,12 +177,12 @@ public class TestCsv extends TestBase {
assertEquals("ID|NAME 1|Hello", text);
ResultSet rs = stat.executeQuery("select * from csvread('" + baseDir + "/test.csv', null, null, '|', '')");
ResultSetMetaData meta = rs.getMetaData();
assertEquals(meta.getColumnCount(), 2);
assertEquals(meta.getColumnLabel(1), "ID");
assertEquals(meta.getColumnLabel(2), "NAME");
assertEquals(2, meta.getColumnCount());
assertEquals("ID", meta.getColumnLabel(1));
assertEquals("NAME", meta.getColumnLabel(2));
assertTrue(rs.next());
assertEquals(rs.getString(1), "1");
assertEquals(rs.getString(2), "Hello");
assertEquals("1", rs.getString(1));
assertEquals("Hello", rs.getString(2));
assertFalse(rs.next());
conn.close();
FileUtils.delete(baseDir + "/test.csv");
......@@ -198,22 +198,22 @@ public class TestCsv extends TestBase {
Statement stat = conn.createStatement();
ResultSet rs = stat.executeQuery("select * from csvread('" + baseDir + "/test.csv', null, null, ';', '''', '\\')");
ResultSetMetaData meta = rs.getMetaData();
assertEquals(meta.getColumnCount(), 2);
assertEquals(meta.getColumnLabel(1), "A");
assertEquals(meta.getColumnLabel(2), "B");
assertEquals(2, meta.getColumnCount());
assertEquals("A", meta.getColumnLabel(1));
assertEquals("B", meta.getColumnLabel(2));
assertTrue(rs.next());
assertEquals(rs.getString(1), "It's nice");
assertEquals(rs.getString(2), "\nHello*\n");
assertEquals("It's nice", rs.getString(1));
assertEquals("\nHello*\n", rs.getString(2));
assertFalse(rs.next());
stat.execute("call csvwrite('" + baseDir + "/test2.csv', 'select * from csvread(''" + baseDir + "/test.csv'', null, null, '';'', '''''''', ''\\'')', null, '+', '*', '#')");
rs = stat.executeQuery("select * from csvread('" + baseDir + "/test2.csv', null, null, '+', '*', '#')");
meta = rs.getMetaData();
assertEquals(meta.getColumnCount(), 2);
assertEquals(meta.getColumnLabel(1), "A");
assertEquals(meta.getColumnLabel(2), "B");
assertEquals(2, meta.getColumnCount());
assertEquals("A", meta.getColumnLabel(1));
assertEquals("B", meta.getColumnLabel(2));
assertTrue(rs.next());
assertEquals(rs.getString(1), "It's nice");
assertEquals(rs.getString(2), "\nHello*\n");
assertEquals("It's nice", rs.getString(1));
assertEquals("\nHello*\n", rs.getString(2));
assertFalse(rs.next());
conn.close();
FileUtils.delete(baseDir + "/test.csv");
......@@ -227,8 +227,8 @@ public class TestCsv extends TestBase {
stat.execute("call csvwrite('" + baseDir + "/test.csv', 'select 1 id, ''Hello'' name', 'utf-8', '|')");
ResultSet rs = stat.executeQuery("select * from csvread('" + baseDir + "/test.csv', null, 'utf-8', '|')");
assertTrue(rs.next());
assertEquals(rs.getInt(1), 1);
assertEquals(rs.getString(2), "Hello");
assertEquals(1, rs.getInt(1));
assertEquals("Hello", rs.getString(2));
assertFalse(rs.next());
new File(baseDir + "/test.csv").delete();
......@@ -250,12 +250,12 @@ public class TestCsv extends TestBase {
stat.execute("call csvwrite('" + baseDir + "/test.csv', 'select 1 id, ''Hello'' name')");
ResultSet rs = stat.executeQuery("select name from csvread('" + baseDir + "/test.csv')");
assertTrue(rs.next());
assertEquals(rs.getString(1), "Hello");
assertEquals("Hello", rs.getString(1));
assertFalse(rs.next());
rs = stat.executeQuery("call csvread('" + baseDir + "/test.csv')");
assertTrue(rs.next());
assertEquals(rs.getInt(1), 1);
assertEquals(rs.getString(2), "Hello");
assertEquals(1, rs.getInt(1));
assertEquals("Hello", rs.getString(2));
assertFalse(rs.next());
new File(baseDir + "/test.csv").delete();
conn.close();
......@@ -269,31 +269,31 @@ public class TestCsv extends TestBase {
file.close();
ResultSet rs = Csv.getInstance().read(baseDir + "/test.csv", null, "UTF8");
ResultSetMetaData meta = rs.getMetaData();
assertEquals(meta.getColumnCount(), 4);
assertEquals(meta.getColumnLabel(1), "a");
assertEquals(meta.getColumnLabel(2), "b");
assertEquals(meta.getColumnLabel(3), "c");
assertEquals(meta.getColumnLabel(4), "d");
assertEquals(4, meta.getColumnCount());
assertEquals("a", meta.getColumnLabel(1));
assertEquals("b", meta.getColumnLabel(2));
assertEquals("c", meta.getColumnLabel(3));
assertEquals("d", meta.getColumnLabel(4));
assertTrue(rs.next());
assertEquals(rs.getString(1), "201");
assertEquals(rs.getString(2), "-2");
assertEquals(rs.getString(3), "0");
assertEquals(rs.getString(4), "18");
assertEquals("201", rs.getString(1));
assertEquals("-2", rs.getString(2));
assertEquals("0", rs.getString(3));
assertEquals("18", rs.getString(4));
assertTrue(rs.next());
assertEquals(rs.getString(1), null);
assertEquals(rs.getString(2), "abc\"");
assertEquals(rs.getString(3), null);
assertEquals(rs.getString(4), "");
assertEquals(null, rs.getString(1));
assertEquals("abc\"", rs.getString(2));
assertEquals(null, rs.getString(3));
assertEquals("", rs.getString(4));
assertTrue(rs.next());
assertEquals(rs.getString(1), "1");
assertEquals(rs.getString(2), "2");
assertEquals(rs.getString(3), "3");
assertEquals(rs.getString(4), "4");
assertEquals("1", rs.getString(1));
assertEquals("2", rs.getString(2));
assertEquals("3", rs.getString(3));
assertEquals("4", rs.getString(4));
assertTrue(rs.next());
assertEquals(rs.getString(1), "5");
assertEquals(rs.getString(2), "6");
assertEquals(rs.getString(3), "7");
assertEquals(rs.getString(4), "8");
assertEquals("5", rs.getString(1));
assertEquals("6", rs.getString(2));
assertEquals("7", rs.getString(3));
assertEquals("8", rs.getString(4));
assertFalse(rs.next());
// a,b,c,d
......@@ -324,8 +324,8 @@ public class TestCsv extends TestBase {
assertEquals(2, meta.getColumnCount());
for (int i = 0; i < len; i++) {
rs.next();
assertEquals(rs.getString("ID"), "" + (i + 1));
assertEquals(rs.getString("NAME"), "Ruebezahl");
assertEquals("" + (i + 1), rs.getString("ID"));
assertEquals("Ruebezahl", rs.getString("NAME"));
}
assertFalse(rs.next());
rs.close();
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论