提交 c89e1934 作者: Thomas Mueller

More accurate calculation of variance and standard deviation for large number of…

More accurate calculation of variance and standard deviation for a large number of samples with low variance (now using Welford's method).
上级 e62cf0ff
...@@ -31,7 +31,7 @@ class AggregateData { ...@@ -31,7 +31,7 @@ class AggregateData {
private IntIntHashMap distinctHashes; private IntIntHashMap distinctHashes;
private ValueHashMap<AggregateData> distinctValues; private ValueHashMap<AggregateData> distinctValues;
private Value value; private Value value;
private double sum, vpn; private double m2, mean;
private ArrayList<Value> list; private ArrayList<Value> list;
AggregateData(int aggregateType, int dataType) { AggregateData(int aggregateType, int dataType) {
...@@ -55,7 +55,7 @@ class AggregateData { ...@@ -55,7 +55,7 @@ class AggregateData {
int size = distinctHashes.size(); int size = distinctHashes.size();
if (size > Constants.SELECTIVITY_DISTINCT_COUNT) { if (size > Constants.SELECTIVITY_DISTINCT_COUNT) {
distinctHashes = new IntIntHashMap(); distinctHashes = new IntIntHashMap();
sum += size; m2 += size;
} }
int hash = v.hashCode(); int hash = v.hashCode();
// the value -1 is not supported // the value -1 is not supported
...@@ -117,16 +117,17 @@ class AggregateData { ...@@ -117,16 +117,17 @@ class AggregateData {
case Aggregate.STDDEV_SAMP: case Aggregate.STDDEV_SAMP:
case Aggregate.VAR_POP: case Aggregate.VAR_POP:
case Aggregate.VAR_SAMP: { case Aggregate.VAR_SAMP: {
// Using Welford's method, see also
// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
// http://www.johndcook.com/standard_deviation.html // http://www.johndcook.com/standard_deviation.html
double x = v.getDouble(); double x = v.getDouble();
if (count == 1) { if (count == 1) {
sum = x; mean = x;
vpn = 0; m2 = 0;
} else { } else {
double xs = sum - (x * (count - 1)); double delta = x - mean;
vpn += (xs * xs) / count / (count - 1); mean += delta / count;
sum += x; m2 += delta * (x - mean);
} }
break; break;
} }
...@@ -174,9 +175,9 @@ class AggregateData { ...@@ -174,9 +175,9 @@ class AggregateData {
if (count == 0) { if (count == 0) {
s = 0; s = 0;
} else { } else {
sum += distinctHashes.size(); m2 += distinctHashes.size();
sum = 100 * sum / count; m2 = 100 * m2 / count;
s = (int) sum; s = (int) m2;
s = s <= 0 ? 1 : s > 100 ? 100 : s; s = s <= 0 ? 1 : s > 100 ? 100 : s;
} }
v = ValueInt.get(s); v = ValueInt.get(s);
...@@ -204,28 +205,28 @@ class AggregateData { ...@@ -204,28 +205,28 @@ class AggregateData {
if (count < 1) { if (count < 1) {
return ValueNull.INSTANCE; return ValueNull.INSTANCE;
} }
v = ValueDouble.get(Math.sqrt(vpn / count)); v = ValueDouble.get(Math.sqrt(m2 / count));
break; break;
} }
case Aggregate.STDDEV_SAMP: { case Aggregate.STDDEV_SAMP: {
if (count < 2) { if (count < 2) {
return ValueNull.INSTANCE; return ValueNull.INSTANCE;
} }
v = ValueDouble.get(Math.sqrt(vpn / (count - 1))); v = ValueDouble.get(Math.sqrt(m2 / (count - 1)));
break; break;
} }
case Aggregate.VAR_POP: { case Aggregate.VAR_POP: {
if (count < 1) { if (count < 1) {
return ValueNull.INSTANCE; return ValueNull.INSTANCE;
} }
v = ValueDouble.get(vpn / count); v = ValueDouble.get(m2 / count);
break; break;
} }
case Aggregate.VAR_SAMP: { case Aggregate.VAR_SAMP: {
if (count < 2) { if (count < 2) {
return ValueNull.INSTANCE; return ValueNull.INSTANCE;
} }
v = ValueDouble.get(vpn / (count - 1)); v = ValueDouble.get(m2 / (count - 1));
break; break;
} }
default: default:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论