// # simple-statistics
let ss = {};
// # [Linear Regression](http://en.wikipedia.org/wiki/Linear_regression)
//
// [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
// is a simple way to find a fitted line
// between a set of coordinates.
ss.linear_regression = function() {
let linreg = {},
data = [];
// Assign data to the model. Data is assumed to be an array.
linreg.data = function(x) {
if (!arguments.length) return data;
data = x.slice();
return linreg;
};
// ## Fitting The Regression Line
//
// This is called after `.data()` and returns the
// equation `y = f(x)` which gives the position
// of the regression line at each point in `x`.
linreg.line = function() {
//if there's only one point, arbitrarily choose a slope of 0
//and a y-intercept of whatever the y of the initial point is
if (data.length == 1) {
m = 0;
b = data[0][1];
} else {
// Initialize our sums and scope the `m` and `b`
// variables that define the line.
let sum_x = 0,
sum_y = 0,
sum_xx = 0,
sum_xy = 0,
m,
b;
// Gather the sum of all x values, the sum of all
// y values, and the sum of x^2 and (x*y) for each
// value.
//
// In math notation, these would be SS_x, SS_y, SS_xx, and SS_xy
for (let i = 0; i < data.length; i++) {
sum_x += data[i][0];
sum_y += data[i][1];
sum_xx += data[i][0] * data[i][0];
sum_xy += data[i][0] * data[i][1];
}
// `m` is the slope of the regression line
m =
(data.length * sum_xy - sum_x * sum_y) /
(data.length * sum_xx - sum_x * sum_x);
// `b` is the y-intercept of the line.
b = sum_y / data.length - (m * sum_x) / data.length;
}
// Return a function that computes a `y` value for each
// x value it is given, based on the values of `b` and `a`
// that we just computed.
return function(x) {
return b + m * x;
};
};
return linreg;
};
// # [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
//
// The r-squared value of data compared with a function `f`
// is the sum of the squared differences between the prediction
// and the actual value.
ss.r_squared = function(data, f) {
if (data.length < 2) return 1;
// Compute the average y value for the actual
// data set in order to compute the
// _total sum of squares_
let sum = 0,
average;
for (let i = 0; i < data.length; i++) {
sum += data[i][1];
}
average = sum / data.length;
// Compute the total sum of squares - the
// squared difference between each point
// and the average of all points.
let sum_of_squares = 0;
for (let j = 0; j < data.length; j++) {
sum_of_squares += Math.pow(average - data[j][1], 2);
}
// Finally estimate the error: the squared
// difference between the estimate and the actual data
// value at each point.
let err = 0;
for (let k = 0; k < data.length; k++) {
err += Math.pow(data[k][1] - f(data[k][0]), 2);
}
// As the error grows larger, it's ratio to the
// sum of squares increases and the r squared
// value grows lower.
return 1 - err / sum_of_squares;
};
// # [Bayesian Classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
//
// This is a naïve bayesian classifier that takes
// singly-nested objects.
ss.bayesian = function() {
// The `bayes_model` object is what will be exposed
// by this closure, with all of its extended methods, and will
// have access to all scope variables, like `total_count`.
let bayes_model = {},
// The number of items that are currently
// classified in the model
total_count = 0,
// Every item classified in the model
data = {};
// ## Train
// Train the classifier with a new item, which has a single
// dimension of Javascript literal keys and values.
bayes_model.train = function(item, category) {
// If the data object doesn't have any values
// for this category, create a new object for it.
if (!data[category]) data[category] = {};
// Iterate through each key in the item.
for (let k in item) {
let v = item[k];
// Initialize the nested object `data[category][k][item[k]]`
// with an object of keys that equal 0.
if (data[category][k] === undefined) data[category][k] = {};
if (data[category][k][v] === undefined) data[category][k][v] = 0;
// And increment the key for this key/value combination.
data[category][k][item[k]]++;
}
// Increment the number of items classified
total_count++;
};
// ## Score
// Generate a score of how well this item matches all
// possible categories based on its attributes
bayes_model.score = function(item) {
// Initialize an empty array of odds per category.
let odds = {},
category;
// Iterate through each key in the item,
// then iterate through each category that has been used
// in previous calls to `.train()`
for (let k in item) {
let v = item[k];
for (category in data) {
// Create an empty object for storing key - value combinations
// for this category.
if (odds[category] === undefined) odds[category] = {};
// If this item doesn't even have a property, it counts for nothing,
// but if it does have the property that we're looking for from
// the item to categorize, it counts based on how popular it is
// versus the whole population.
if (data[category][k]) {
odds[category][k + "_" + v] =
(data[category][k][v] || 0) / total_count;
} else {
odds[category][k + "_" + v] = 0;
}
}
}
// Set up a new object that will contain sums of these odds by category
let odds_sums = {};
for (category in odds) {
// Tally all of the odds for each category-combination pair -
// the non-existence of a category does not add anything to the
// score.
for (let combination in odds[category]) {
if (odds_sums[category] === undefined) odds_sums[category] = 0;
odds_sums[category] += odds[category][combination];
}
}
return odds_sums;
};
// Return the completed model.
return bayes_model;
};
// # sum
//
// is simply the result of adding all numbers
// together, starting from zero.
//
// This runs on `O(n)`, linear time in respect to the array
ss.sum = function(x) {
let sum = 0;
for (let i = 0; i < x.length; i++) {
sum += x[i];
}
return sum;
};
// # mean
//
// is the sum over the number of values
//
// This runs on `O(n)`, linear time in respect to the array
ss.mean = function(x) {
// The mean of no numbers is null
if (x.length === 0) return null;
return ss.sum(x) / x.length;
};
// # geometric mean
//
// a mean function that is more useful for numbers in different
// ranges.
//
// this is the nth root of the input numbers multipled by each other
//
// This runs on `O(n)`, linear time in respect to the array
ss.geometric_mean = function(x) {
// The mean of no numbers is null
if (x.length === 0) return null;
// the starting value.
let value = 1;
for (let i = 0; i < x.length; i++) {
// the geometric mean is only valid for positive numbers
if (x[i] <= 0) return null;
// repeatedly multiply the value by each number
value *= x[i];
}
return Math.pow(value, 1 / x.length);
};
// Alias this into its common name
ss.average = ss.mean;
// # min
//
// This is simply the minimum number in the set.
//
// This runs on `O(n)`, linear time in respect to the array
ss.min = function(x) {
let min;
for (let i = 0; i < x.length; i++) {
// On the first iteration of this loop, min is
// undefined and is thus made the minimum element in the array
if (x[i] < min || min === undefined) min = x[i];
}
return min;
};
// # max
//
// This is simply the maximum number in the set.
//
// This runs on `O(n)`, linear time in respect to the array
ss.max = function(x) {
let max;
for (let i = 0; i < x.length; i++) {
// On the first iteration of this loop, min is
// undefined and is thus made the minimum element in the array
if (x[i] > max || max === undefined) max = x[i];
}
return max;
};
// # [variance](http://en.wikipedia.org/wiki/Variance)
//
// is the sum of squared deviations from the mean
ss.variance = function(x) {
// The variance of no numbers is null
if (x.length === 0) return null;
let mean = ss.mean(x),
deviations = [];
// Make a list of squared deviations from the mean.
for (let i = 0; i < x.length; i++) {
deviations.push(Math.pow(x[i] - mean, 2));
}
// Find the mean value of that list
return ss.mean(deviations);
};
// # [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
//
// is just the square root of the variance.
ss.standard_deviation = function(x) {
// The standard deviation of no numbers is null
if (x.length === 0) return null;
return Math.sqrt(ss.variance(x));
};
ss.sum_squared_deviations = function(x) {
// The variance of no numbers is null
if (x.length <= 1) return null;
let mean = ss.mean(x),
sum = 0;
// Make a list of squared deviations from the mean.
for (let i = 0; i < x.length; i++) {
sum += Math.pow(x[i] - mean, 2);
}
return sum;
};
// # [variance](http://en.wikipedia.org/wiki/Variance)
//
// is the sum of squared deviations from the mean
ss.sample_variance = function(x) {
let sum_squared_deviations = ss.sum_squared_deviations(x);
if (sum_squared_deviations === null) return null;
// Find the mean value of that list
return sum_squared_deviations / (x.length - 1);
};
// # [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
//
// is just the square root of the variance.
ss.sample_standard_deviation = function(x) {
// The standard deviation of no numbers is null
if (x.length <= 1) return null;
return Math.sqrt(ss.sample_variance(x));
};
// # [covariance](http://en.wikipedia.org/wiki/Covariance)
//
// sample covariance of two datasets:
// how much do the two datasets move together?
// x and y are two datasets, represented as arrays of numbers.
ss.sample_covariance = function(x, y) {
// The two datasets must have the same length which must be more than 1
if (x.length <= 1 || x.length != y.length) {
return null;
}
// determine the mean of each dataset so that we can judge each
// value of the dataset fairly as the difference from the mean. this
// way, if one dataset is [1, 2, 3] and [2, 3, 4], their covariance
// does not suffer because of the difference in absolute values
let xmean = ss.mean(x),
ymean = ss.mean(y),
sum = 0;
// for each pair of values, the covariance increases when their
// difference from the mean is associated - if both are well above
// or if both are well below
// the mean, the covariance increases significantly.
for (let i = 0; i < x.length; i++) {
sum += (x[i] - xmean) * (y[i] - ymean);
}
// the covariance is weighted by the length of the datasets.
return sum / (x.length - 1);
};
// # [correlation](http://en.wikipedia.org/wiki/Correlation_and_dependence)
//
// Gets a measure of how correlated two datasets are, between -1 and 1
ss.sample_correlation = function(x, y) {
let cov = ss.sample_covariance(x, y),
xstd = ss.sample_standard_deviation(x),
ystd = ss.sample_standard_deviation(y);
if (cov === null || xstd === null || ystd === null) {
return null;
}
return cov / xstd / ystd;
};
// # [median](http://en.wikipedia.org/wiki/Median)
ss.median = function(x) {
// The median of an empty list is null
if (x.length === 0) return null;
// Sorting the array makes it easy to find the center, but
// use `.slice()` to ensure the original array `x` is not modified
let sorted = x.slice().sort(function(a, b) {
return a - b;
});
// If the length of the list is odd, it's the central number
if (sorted.length % 2 === 1) {
return sorted[(sorted.length - 1) / 2];
// Otherwise, the median is the average of the two numbers
// at the center of the list
} else {
let a = sorted[sorted.length / 2 - 1];
let b = sorted[sorted.length / 2];
return (a + b) / 2;
}
};
// # [mode](http://bit.ly/W5K4Yt)
// This implementation is inspired by [science.js](https://github.com/jasondavies/science.js/blob/master/src/stats/mode.js)
ss.mode = function(x) {
// Handle edge cases:
// The median of an empty list is null
if (x.length === 0) return null;
else if (x.length === 1) return x[0];
// Sorting the array lets us iterate through it below and be sure
// that every time we see a new number it's new and we'll never
// see the same number twice
let sorted = x.slice().sort(function(a, b) {
return a - b;
});
// This assumes it is dealing with an array of size > 1, since size
// 0 and 1 are handled immediately. Hence it starts at index 1 in the
// array.
let last = sorted[0],
// store the mode as we find new modes
mode,
// store how many times we've seen the mode
max_seen = 0,
// how many times the current candidate for the mode
// has been seen
seen_this = 1;
// end at sorted.length + 1 to fix the case in which the mode is
// the highest number that occurs in the sequence. the last iteration
// compares sorted[i], which is undefined, to the highest number
// in the series
for (let i = 1; i < sorted.length + 1; i++) {
// we're seeing a new number pass by
if (sorted[i] !== last) {
// the last number is the new mode since we saw it more
// often than the old one
if (seen_this > max_seen) {
max_seen = seen_this;
seen_this = 1;
mode = last;
}
last = sorted[i];
// if this isn't a new number, it's one more occurrence of
// the potential mode
} else {
seen_this++;
}
}
return mode;
};
// # [t-test](http://en.wikipedia.org/wiki/Student's_t-test)
//
// This is to compute a one-sample t-test, comparing the mean
// of a sample to a known value, x.
//
// in this case, we're trying to determine whether the
// population mean is equal to the value that we know, which is `x`
// here. usually the results here are used to look up a
// [p-value](http://en.wikipedia.org/wiki/P-value), which, for
// a certain level of significance, will let you determine that the
// null hypothesis can or cannot be rejected.
ss.t_test = function(sample, x) {
// The mean of the sample
let sample_mean = ss.mean(sample);
// The standard deviation of the sample
let sd = ss.standard_deviation(sample);
// Square root the length of the sample
let rootN = Math.sqrt(sample.length);
// Compute the known value against the sample,
// returning the t value
return (sample_mean - x) / (sd / rootN);
};
// # quantile
// This is a population quantile, since we assume to know the entire
// dataset in this library. Thus I'm trying to follow the
// [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
// algorithm from wikipedia.
//
// Sample is a one-dimensional array of numbers,
// and p is a decimal number from 0 to 1. In terms of a k/q
// quantile, p = k/q - it's just dealing with fractions or dealing
// with decimal values.
ss.quantile = function(sample, p) {
// We can't derive quantiles from an empty list
if (sample.length === 0) return null;
// invalid bounds. Microsoft Excel accepts 0 and 1, but
// we won't.
if (p >= 1 || p <= 0) return null;
// Sort a copy of the array. We'll need a sorted array to index
// the values in sorted order.
let sorted = sample.slice().sort(function(a, b) {
return a - b;
});
// Find a potential index in the list. In Wikipedia's terms, this
// is I<sub>p</sub>.
let idx = sorted.length * p;
// If this isn't an integer, we'll round up to the next value in
// the list.
if (idx % 1 !== 0) {
return sorted[Math.ceil(idx) - 1];
} else if (sample.length % 2 === 0) {
// If the list has even-length and we had an integer in the
// first place, we'll take the average of this number
// and the next value, if there is one
return (sorted[idx - 1] + sorted[idx]) / 2;
} else {
// Finally, in the simple case of an integer value
// with an odd-length list, return the sample value at the index.
return sorted[idx];
}
};
// Compute the matrices required for Jenks breaks. These matrices
// can be used for any classing of data with `classes <= n_classes`
ss.jenksMatrices = function(data, n_classes) {
// in the original implementation, these matrices are referred to
// as `LC` and `OP`
//
// * lower_class_limits (LC): optimal lower class limits
// * variance_combinations (OP): optimal variance combinations for all classes
let lower_class_limits = [],
variance_combinations = [],
// loop counters
i,
j,
// the variance, as computed at each step in the calculation
variance = 0;
// Initialize and fill each matrix with zeroes
for (i = 0; i < data.length + 1; i++) {
let tmp1 = [],
tmp2 = [];
for (j = 0; j < n_classes + 1; j++) {
tmp1.push(0);
tmp2.push(0);
}
lower_class_limits.push(tmp1);
variance_combinations.push(tmp2);
}
for (i = 1; i < n_classes + 1; i++) {
lower_class_limits[1][i] = 1;
variance_combinations[1][i] = 0;
// in the original implementation, 9999999 is used but
// since Javascript has `Infinity`, we use that.
for (j = 2; j < data.length + 1; j++) {
variance_combinations[j][i] = Infinity;
}
}
for (let l = 2; l < data.length + 1; l++) {
// `SZ` originally. this is the sum of the values seen thus
// far when calculating variance.
let sum = 0,
// `ZSQ` originally. the sum of squares of values seen
// thus far
sum_squares = 0,
// `WT` originally. This is the number of
w = 0,
// `IV` originally
i4 = 0;
// in several instances, you could say `Math.pow(x, 2)`
// instead of `x * x`, but this is slower in some browsers
// introduces an unnecessary concept.
for (let m = 1; m < l + 1; m++) {
// `III` originally
let lower_class_limit = l - m + 1,
val = data[lower_class_limit - 1];
// here we're estimating variance for each potential classing
// of the data, for each potential number of classes. `w`
// is the number of data points considered so far.
w++;
// increase the current sum and sum-of-squares
sum += val;
sum_squares += val * val;
// the variance at this point in the sequence is the difference
// between the sum of squares and the total x 2, over the number
// of samples.
variance = sum_squares - (sum * sum) / w;
i4 = lower_class_limit - 1;
if (i4 !== 0) {
for (j = 2; j < n_classes + 1; j++) {
if (
variance_combinations[l][j] >=
variance + variance_combinations[i4][j - 1]
) {
lower_class_limits[l][j] = lower_class_limit;
variance_combinations[l][j] =
variance + variance_combinations[i4][j - 1];
}
}
}
}
lower_class_limits[l][1] = 1;
variance_combinations[l][1] = variance;
}
return {
lower_class_limits: lower_class_limits,
variance_combinations: variance_combinations
};
};
// # [Jenks natural breaks optimization](http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization)
//
// Implementations: [1](http://danieljlewis.org/files/2010/06/Jenks.pdf) (python),
// [2](https://github.com/vvoovv/djeo-jenks/blob/master/main.js) (buggy),
// [3](https://github.com/simogeo/geostats/blob/master/lib/geostats.js#L407) (works)
ss.jenks = function(data, n_classes) {
// sort data in numerical order
data = data.slice().sort(function(a, b) {
return a - b;
});
// get our basic matrices
let matrices = ss.jenksMatrices(data, n_classes),
// we only need lower class limits here
lower_class_limits = matrices.lower_class_limits,
k = data.length - 1,
kclass = [],
countNum = n_classes;
// the calculation of classes will never include the upper and
// lower bounds, so we need to explicitly set them
kclass[n_classes] = data[data.length - 1];
kclass[0] = data[0];
// the lower_class_limits matrix is used as indexes into itself
// here: the `k` variable is reused in each iteration.
while (countNum > 1) {
kclass[countNum - 1] = data[lower_class_limits[k][countNum] - 2];
k = lower_class_limits[k][countNum] - 1;
countNum--;
}
return kclass;
};
// # Mixin
//
// Mixin simple_statistics to the Array native object. This is an optional
// feature that lets you treat simple_statistics as a native feature
// of Javascript.
ss.mixin = function() {
let support = !!(Object.defineProperty && Object.defineProperties);
if (!support)
throw new Error(
"without defineProperty, simple-statistics cannot be mixed in"
);
// only methods which work on basic arrays in a single step
// are supported
let arrayMethods = [
"median",
"standard_deviation",
"sum",
"mean",
"min",
"max",
"quantile",
"geometric_mean"
];
// create a closure with a method name so that a reference
// like `arrayMethods[i]` doesn't follow the loop increment
function wrap(method) {
return function() {
// cast any arguments into an array, since they're
// natively objects
let args = Array.prototype.slice.apply(arguments);
// make the first argument the array itself
args.unshift(this);
// return the result of the ss method
return ss[method].apply(ss, args);
};
}
// for each array function, define a function off of the Array
// prototype which automatically gets the array as the first
// argument. We use [defineProperty](https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/Object/defineProperty)
// because it allows these properties to be non-enumerable:
// `for (let in x)` loops will not run into problems with this
// implementation.
for (let i = 0; i < arrayMethods.length; i++) {
Object.defineProperty(Array.prototype, arrayMethods[i], {
value: wrap(arrayMethods[i]),
configurable: true,
enumerable: false,
writable: true
});
}
};
export default ss;
统计学分段方法(JS版)
最新推荐文章于 2024-05-31 09:55:42 发布