// 统计学分段方法（JS 版）— statistical classing methods (JavaScript version)

// # simple-statistics
// Namespace object that collects all of the statistics functions
// defined below; `const` because the binding is never reassigned.
const ss = {};
// # [Linear Regression](http://en.wikipedia.org/wiki/Linear_regression)
//
// [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
// is a simple way to find a fitted line
// between a set of coordinates.
ss.linear_regression = function() {
  let linreg = {},
    // the [x, y] pairs the model is fitted against
    data = [];

  // Assign data to the model. Data is assumed to be an array of
  // [x, y] pairs. Called with no arguments it returns the current
  // data; otherwise it stores a copy and returns the model for
  // chaining.
  linreg.data = function(x) {
    if (!arguments.length) return data;
    data = x.slice();
    return linreg;
  };

  // ## Fitting The Regression Line
  //
  // This is called after `.data()` and returns the
  // equation `y = f(x)` which gives the position
  // of the regression line at each point in `x`.
  linreg.line = function() {
    // Declare the slope `m` and intercept `b` here so both branches
    // below AND the returned closure share the same bindings.
    // (Previously they were declared with `let` inside the `else`
    // block only, so the single-point branch assigned undeclared
    // globals and the returned closure read out-of-scope names —
    // a ReferenceError in strict/module code.)
    let m, b;

    // If there's only one point, arbitrarily choose a slope of 0
    // and a y-intercept of whatever the y of the initial point is.
    if (data.length === 1) {
      m = 0;
      b = data[0][1];
    } else {
      // Initialize the running sums used by the least-squares
      // formulas.
      let sum_x = 0,
        sum_y = 0,
        sum_xx = 0,
        sum_xy = 0;

      // Gather the sum of all x values, the sum of all
      // y values, and the sum of x^2 and (x*y) for each
      // value.
      //
      // In math notation, these would be SS_x, SS_y, SS_xx, and SS_xy
      for (let i = 0; i < data.length; i++) {
        sum_x += data[i][0];
        sum_y += data[i][1];

        sum_xx += data[i][0] * data[i][0];
        sum_xy += data[i][0] * data[i][1];
      }

      // `m` is the slope of the regression line
      m =
        (data.length * sum_xy - sum_x * sum_y) /
        (data.length * sum_xx - sum_x * sum_x);

      // `b` is the y-intercept of the line.
      b = sum_y / data.length - (m * sum_x) / data.length;
    }

    // Return a function that computes a `y` value for each
    // x value it is given, based on the `m` and `b` computed above.
    return function(x) {
      return b + m * x;
    };
  };

  return linreg;
};

// # [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
//
// How well a function `f` predicts the y values of `data`
// ([x, y] pairs): 1 minus the ratio of the residual error to the
// total sum of squares.
ss.r_squared = function(data, f) {
  // with fewer than two points, any line fits perfectly
  if (data.length < 2) return 1;

  // mean of the observed y values, needed below for the
  // total sum of squares
  let y_total = 0;
  for (let i = 0; i < data.length; i++) {
    y_total += data[i][1];
  }
  const y_mean = y_total / data.length;

  // total sum of squares: squared distance of every observed y
  // from the mean
  let ss_total = 0;
  for (let i = 0; i < data.length; i++) {
    ss_total += Math.pow(y_mean - data[i][1], 2);
  }

  // residual error: squared distance of every observed y from the
  // value `f` predicts at the same x
  let ss_err = 0;
  for (let i = 0; i < data.length; i++) {
    ss_err += Math.pow(data[i][1] - f(data[i][0]), 2);
  }

  // as the residual error grows relative to the total sum of
  // squares, the r-squared value falls
  return 1 - ss_err / ss_total;
};

// # [Bayesian Classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
//
// A naïve bayesian classifier for singly-nested objects:
// train it with `{key: value}` items per category, then score new
// items against all seen categories.
ss.bayesian = function() {
  // The object exposed by this closure; it keeps its state
  // (`total_count`, `data`) private.
  let bayes_model = {},
    // number of items trained so far
    total_count = 0,
    // per-category counts: data[category][key][value] -> count
    data = {};

  // ## Train
  // Record one item (a flat object of literal keys and values)
  // under the given category.
  bayes_model.train = function(item, category) {
    // lazily create the bucket for a category seen for the first time
    if (!data[category]) data[category] = {};

    for (let key in item) {
      let value = item[key];
      // lazily create the nested count objects, then bump the
      // count for this key/value combination
      if (data[category][key] === undefined) data[category][key] = {};
      if (data[category][key][value] === undefined) {
        data[category][key][value] = 0;
      }
      data[category][key][value]++;
    }

    total_count++;
  };

  // ## Score
  // Rate how well `item` matches every category trained so far;
  // returns an object of per-category score sums.
  bayes_model.score = function(item) {
    // odds[category][key_value] -> popularity of that combination
    let odds = {};

    for (let key in item) {
      let value = item[key];
      for (let category in data) {
        if (odds[category] === undefined) odds[category] = {};

        // a key the category has never seen contributes nothing;
        // otherwise the contribution is how popular this key/value
        // pair is relative to the whole population
        let counts = data[category][key];
        odds[category][key + "_" + value] = counts
          ? (counts[value] || 0) / total_count
          : 0;
      }
    }

    // sum the per-combination odds into one score per category;
    // categories with no combinations simply don't appear
    let odds_sums = {};
    for (let category in odds) {
      for (let combination in odds[category]) {
        if (odds_sums[category] === undefined) odds_sums[category] = 0;
        odds_sums[category] += odds[category][combination];
      }
    }

    return odds_sums;
  };

  // Return the completed model.
  return bayes_model;
};

// # sum
//
// The result of adding all numbers together, starting from zero.
//
// This runs in `O(n)`, linear time with respect to the array.
ss.sum = function(x) {
  // fold the array left-to-right into a single total
  return x.reduce(function(total, value) {
    return total + value;
  }, 0);
};

// # mean
//
// The sum divided by the number of values.
//
// This runs in `O(n)`, linear time with respect to the array.
ss.mean = function(x) {
  // the mean of no numbers is null
  return x.length === 0 ? null : ss.sum(x) / x.length;
};

// # geometric mean
//
// A mean that is more appropriate when values span different
// ranges: the nth root of the product of the input numbers.
//
// This runs in `O(n)`, linear time with respect to the array.
ss.geometric_mean = function(x) {
  // the geometric mean of no numbers is null
  if (x.length === 0) return null;

  // accumulate the product of every value
  let product = 1;
  for (const value of x) {
    // the geometric mean is only valid for positive numbers
    if (value <= 0) return null;
    product *= value;
  }

  // take the nth root of the product
  return Math.pow(product, 1 / x.length);
};

// # average
//
// `average` is simply an alias for `ss.mean`.
ss.average = ss.mean;

// # min
//
// The minimum number in the set; `undefined` for an empty array.
//
// This runs in `O(n)`, linear time with respect to the array.
ss.min = function(x) {
  let smallest;
  for (const value of x) {
    // the first element seeds the minimum; afterwards keep
    // whichever value is smaller
    if (smallest === undefined || value < smallest) smallest = value;
  }
  return smallest;
};

// # max
//
// The maximum number in the set; `undefined` for an empty array.
//
// This runs in `O(n)`, linear time with respect to the array.
ss.max = function(x) {
  let largest;
  for (const value of x) {
    // the first element seeds the maximum; afterwards keep
    // whichever value is larger
    if (largest === undefined || value > largest) largest = value;
  }
  return largest;
};

// # [variance](http://en.wikipedia.org/wiki/Variance)
//
// The population variance: the mean of the squared deviations
// from the mean.
ss.variance = function(x) {
  // the variance of no numbers is null
  if (x.length === 0) return null;

  const mean_value = ss.mean(x);

  // squared deviation of every value from the mean
  const squared_deviations = x.map(function(value) {
    return Math.pow(value - mean_value, 2);
  });

  // the population variance is the mean of those squared deviations
  return ss.mean(squared_deviations);
};

// # [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
//
// The square root of the population variance; null for an
// empty list.
ss.standard_deviation = function(x) {
  return x.length === 0 ? null : Math.sqrt(ss.variance(x));
};

// # sum of squared deviations
//
// The sum of the squared differences between each value and the
// mean — the numerator of the sample variance.
ss.sum_squared_deviations = function(x) {
  // requires at least two values, since the result feeds
  // sample_variance's division by n - 1
  if (x.length <= 1) return null;

  const mean_value = ss.mean(x);
  let total = 0;

  // accumulate each squared deviation from the mean
  for (const value of x) {
    total += Math.pow(value - mean_value, 2);
  }

  return total;
};

// # [sample variance](http://en.wikipedia.org/wiki/Variance)
//
// The sum of squared deviations divided by n - 1
// (Bessel's correction).
ss.sample_variance = function(x) {
  const deviations = ss.sum_squared_deviations(x);
  // propagate null for lists that are too short
  if (deviations === null) return null;

  return deviations / (x.length - 1);
};

// # [sample standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
//
// The square root of the sample variance; null when `x` has fewer
// than two values (matching sample_variance).
ss.sample_standard_deviation = function(x) {
  return x.length <= 1 ? null : Math.sqrt(ss.sample_variance(x));
};

// # [covariance](http://en.wikipedia.org/wiki/Covariance)
//
// The sample covariance of two datasets: how much the two datasets
// move together. `x` and `y` are arrays of numbers.
ss.sample_covariance = function(x, y) {
  // both datasets must have the same length, which must be more
  // than one
  if (x.length <= 1 || x.length !== y.length) {
    return null;
  }

  // compare each value against its dataset's mean so that datasets
  // on different absolute scales (e.g. [1,2,3] vs [2,3,4]) are
  // judged fairly
  const x_mean = ss.mean(x);
  const y_mean = ss.mean(y);
  let total = 0;

  // pairs that deviate from their means in the same direction push
  // the covariance up; opposite directions push it down
  for (let i = 0; i < x.length; i++) {
    total += (x[i] - x_mean) * (y[i] - y_mean);
  }

  // weight by n - 1 (sample covariance)
  return total / (x.length - 1);
};

// # [correlation](http://en.wikipedia.org/wiki/Correlation_and_dependence)
//
// A measure between -1 and 1 of how correlated two datasets are.
ss.sample_correlation = function(x, y) {
  const cov = ss.sample_covariance(x, y);
  const x_sd = ss.sample_standard_deviation(x);
  const y_sd = ss.sample_standard_deviation(y);

  // if any piece is undefined, so is the correlation
  if (cov === null || x_sd === null || y_sd === null) {
    return null;
  }

  // covariance normalized by both standard deviations
  return cov / x_sd / y_sd;
};

// # [median](http://en.wikipedia.org/wiki/Median)
//
// The central value of the sorted list, or the average of the two
// central values for an even-length list.
ss.median = function(x) {
  // the median of an empty list is null
  if (x.length === 0) return null;

  // sort a copy so the caller's array is left untouched
  const sorted = x.slice().sort(function(a, b) {
    return a - b;
  });

  const middle = Math.floor(sorted.length / 2);

  // odd length: the single central element is the median
  if (sorted.length % 2 === 1) return sorted[middle];

  // even length: average the two elements around the center
  return (sorted[middle - 1] + sorted[middle]) / 2;
};

// # [mode](http://bit.ly/W5K4Yt)
// This implementation is inspired by [science.js](https://github.com/jasondavies/science.js/blob/master/src/stats/mode.js)
//
// Returns the most frequently occurring value; on a tie, the
// smallest of the tied values wins (the list is scanned in sorted
// order).
ss.mode = function(x) {
  // Handle edge cases:
  // the mode of an empty list is null; a single value is its own mode
  if (x.length === 0) return null;
  else if (x.length === 1) return x[0];

  // Sorting the array groups equal values into contiguous runs, so
  // a single pass can count each run's length.
  let sorted = x.slice().sort(function(a, b) {
    return a - b;
  });

  // Size > 1 is guaranteed by the guards above, so start at index 1.
  let last = sorted[0],
    // the best candidate found so far
    mode,
    // how many times the best candidate was seen
    max_seen = 0,
    // how many times the current run's value has been seen
    seen_this = 1;

  // Iterate to sorted.length + 1 so the final run (the largest
  // value) is also closed out: on the last iteration sorted[i] is
  // undefined, which never equals `last`.
  for (let i = 1; i < sorted.length + 1; i++) {
    // a run just ended
    if (sorted[i] !== last) {
      // promote the finished run to the mode if it's the longest yet
      if (seen_this > max_seen) {
        max_seen = seen_this;
        mode = last;
      }
      // Reset the run counter unconditionally. (Previously it was
      // only reset when a new mode was found, so a run that didn't
      // beat the current maximum leaked its count into the next
      // run, which could report the wrong mode — e.g. for
      // [1,1,1,2,2,3,3,3] the final run of 3s was credited with the
      // leftover count from the 2s and wrongly won.)
      seen_this = 1;
      last = sorted[i];
    } else {
      // same value again: one more occurrence in the current run
      seen_this++;
    }
  }
  return mode;
};

// # [t-test](http://en.wikipedia.org/wiki/Student's_t-test)
//
// A one-sample t-test, comparing the mean of `sample` to the known
// value `x`. The resulting t statistic is typically looked up
// against a [p-value](http://en.wikipedia.org/wiki/P-value) table
// at a chosen significance level to decide whether the null
// hypothesis (population mean equals `x`) can be rejected.
ss.t_test = function(sample, x) {
  // t = (sample mean - hypothesized mean) / (sd / sqrt(n))
  const sample_mean = ss.mean(sample);
  const sd = ss.standard_deviation(sample);
  return (sample_mean - x) / (sd / Math.sqrt(sample.length));
};

// # quantile
// A population quantile, following the
// [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
// algorithm from wikipedia (we assume the whole dataset is known).
//
// `sample` is a one-dimensional array of numbers and `p` is a
// decimal strictly between 0 and 1; in k/q-quantile terms, p = k/q.
ss.quantile = function(sample, p) {
  // quantiles of an empty list are undefined
  if (sample.length === 0) return null;

  // reject out-of-range p; Microsoft Excel accepts 0 and 1, but
  // this implementation does not
  if (p <= 0 || p >= 1) return null;

  // index into a sorted copy so the input array isn't mutated
  const sorted = sample.slice().sort(function(a, b) {
    return a - b;
  });

  // the candidate index — I_p in Wikipedia's notation
  const idx = sorted.length * p;

  if (idx % 1 !== 0) {
    // non-integer index: round up and take that element
    return sorted[Math.ceil(idx) - 1];
  }
  if (sample.length % 2 === 0) {
    // integer index into an even-length list: average this element
    // with the next one
    return (sorted[idx - 1] + sorted[idx]) / 2;
  }
  // integer index, odd-length list: take the element directly
  return sorted[idx];
};

// Compute the matrices required for Jenks breaks. These matrices
// can be used for any classing of data with `classes <= n_classes`.
//
// `data` is expected to be sorted in ascending order (`ss.jenks`
// sorts before calling this). Returns the lower-class-limit and
// variance-combination matrices that `ss.jenks` walks back through.
ss.jenksMatrices = function(data, n_classes) {
  // in the original implementation, these matrices are referred to
  // as `LC` and `OP`
  //
  // * lower_class_limits (LC): optimal lower class limits
  // * variance_combinations (OP): optimal variance combinations for all classes
  let lower_class_limits = [],
    variance_combinations = [],
    // loop counters
    i,
    j,
    // the variance, as computed at each step in the calculation
    variance = 0;

  // Initialize and fill each matrix with zeroes. Both matrices are
  // (data.length + 1) x (n_classes + 1) and are indexed from 1.
  for (i = 0; i < data.length + 1; i++) {
    let tmp1 = [],
      tmp2 = [];
    for (j = 0; j < n_classes + 1; j++) {
      tmp1.push(0);
      tmp2.push(0);
    }
    lower_class_limits.push(tmp1);
    variance_combinations.push(tmp2);
  }

  // Base case: a one-point prefix always has a lower limit of 1 and
  // zero variance for every class count; longer prefixes start at
  // Infinity so any computed variance will beat them.
  for (i = 1; i < n_classes + 1; i++) {
    lower_class_limits[1][i] = 1;
    variance_combinations[1][i] = 0;
    // in the original implementation, 9999999 is used but
    // since Javascript has `Infinity`, we use that.
    for (j = 2; j < data.length + 1; j++) {
      variance_combinations[j][i] = Infinity;
    }
  }

  // Dynamic-programming pass: `l` is the number of data points in
  // the prefix currently being considered.
  for (let l = 2; l < data.length + 1; l++) {
    // `SZ` originally. this is the sum of the values seen thus
    // far when calculating variance.
    let sum = 0,
      // `ZSQ` originally. the sum of squares of values seen
      // thus far
      sum_squares = 0,
      // `WT` originally. the number of data points counted in the
      // current trailing window
      w = 0,
      // `IV` originally
      i4 = 0;

    // in several instances, you could say `Math.pow(x, 2)`
    // instead of `x * x`, but this is slower in some browsers
    // introduces an unnecessary concept.
    for (let m = 1; m < l + 1; m++) {
      // `III` originally: the candidate lower limit of the last
      // class, scanning backwards from `l`.
      let lower_class_limit = l - m + 1,
        val = data[lower_class_limit - 1];

      // here we're estimating variance for each potential classing
      // of the data, for each potential number of classes. `w`
      // is the number of data points considered so far.
      w++;

      // increase the current sum and sum-of-squares
      sum += val;
      sum_squares += val * val;

      // the variance at this point in the sequence is the difference
      // between the sum of squares and the squared total over the
      // number of samples (the shortcut variance formula).
      variance = sum_squares - (sum * sum) / w;

      i4 = lower_class_limit - 1;

      if (i4 !== 0) {
        // If splitting here — this class's variance plus the best
        // (j - 1)-class solution for the preceding i4 points — is at
        // least as good as the current best for (l, j), record the
        // new limit and combined variance.
        for (j = 2; j < n_classes + 1; j++) {
          if (
            variance_combinations[l][j] >=
            variance + variance_combinations[i4][j - 1]
          ) {
            lower_class_limits[l][j] = lower_class_limit;
            variance_combinations[l][j] =
              variance + variance_combinations[i4][j - 1];
          }
        }
      }
    }

    // the single-class solution always starts at the first value
    // and carries the variance of the whole prefix
    lower_class_limits[l][1] = 1;
    variance_combinations[l][1] = variance;
  }

  return {
    lower_class_limits: lower_class_limits,
    variance_combinations: variance_combinations
  };
};

// # [Jenks natural breaks optimization](http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization)
//
// Classes `data` into `n_classes` groups, returning an array of
// class boundaries (index 0 is the minimum value, index n_classes
// is the maximum).
//
// Implementations: [1](http://danieljlewis.org/files/2010/06/Jenks.pdf) (python),
// [2](https://github.com/vvoovv/djeo-jenks/blob/master/main.js) (buggy),
// [3](https://github.com/simogeo/geostats/blob/master/lib/geostats.js#L407) (works)

ss.jenks = function(data, n_classes) {
  // sort a copy of the data in numerical order, as jenksMatrices
  // requires; the caller's array is left untouched
  data = data.slice().sort(function(a, b) {
    return a - b;
  });

  // get our basic matrices
  let matrices = ss.jenksMatrices(data, n_classes),
    // we only need lower class limits here
    lower_class_limits = matrices.lower_class_limits,
    k = data.length - 1,
    kclass = [],
    countNum = n_classes;

  // the calculation of classes will never include the upper and
  // lower bounds, so we need to explicitly set them
  kclass[n_classes] = data[data.length - 1];
  kclass[0] = data[0];

  // Walk the lower_class_limits matrix backwards from the full
  // dataset, reading off the break between each adjacent pair of
  // classes; `k` is reused as the row index in each iteration.
  while (countNum > 1) {
    kclass[countNum - 1] = data[lower_class_limits[k][countNum] - 2];
    k = lower_class_limits[k][countNum] - 1;
    countNum--;
  }

  return kclass;
};

// # Mixin
//
// Optionally mix simple_statistics into the native Array prototype,
// so arrays can call methods like `[1, 2, 3].mean()` directly.
ss.mixin = function() {
  // defineProperty is required so the mixed-in methods can be made
  // non-enumerable; bail out on environments without it
  if (!(Object.defineProperty && Object.defineProperties)) {
    throw new Error(
      "without defineProperty, simple-statistics cannot be mixed in"
    );
  }

  // only methods which work on basic arrays in a single step
  // are supported
  let arrayMethods = [
    "median",
    "standard_deviation",
    "sum",
    "mean",
    "min",
    "max",
    "quantile",
    "geometric_mean"
  ];

  // Build a wrapper that calls `ss[method]` with the array itself
  // prepended to the caller's arguments. The closure pins the
  // method name so it doesn't track a loop variable.
  function wrap(method) {
    return function() {
      // `arguments` is array-like; copy it into a real array
      let args = Array.prototype.slice.apply(arguments);
      // the array the method was called on becomes the first argument
      args.unshift(this);
      return ss[method].apply(ss, args);
    };
  }

  // Attach each wrapper to Array.prototype as a non-enumerable
  // property, so `for (let key in someArray)` loops are unaffected
  // by the mixin.
  arrayMethods.forEach(function(method) {
    Object.defineProperty(Array.prototype, method, {
      value: wrap(method),
      configurable: true,
      enumerable: false,
      writable: true
    });
  });
};
export default ss;
// (CSDN blog page boilerplate removed — not part of the source code.)