classification的算法还有一些,不过还是打算先进入clustering的阶段。后续再回去补。
这一篇主要看看kmeans。kmeans是最简单的一种聚类算法,思路很清晰,就是EM式的迭代。它的主要缺陷有两点:聚类个数k无法自动确定(只能靠人为设定),并且结果受初始中心点的影响较大。
下面直接看代码。按照国际惯例,先看buildClusterer:
getCapabilities().testWithFail(data);
m_Iterations = 0;
m_ReplaceMissingFilter = new ReplaceMissingValues();
Instances instances = new Instances(data);
instances.setClassIndex(-1);
if (!m_dontReplaceMissing) {
m_ReplaceMissingFilter.setInputFormat(instances);
instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
}
m_FullMissingCounts = new int[instances.numAttributes()];
if (m_displayStdDevs) {
m_FullStdDevs = new double[instances.numAttributes()];
}
m_FullNominalCounts = new int[instances.numAttributes()][0];
m_FullMissingCounts用来记录每个属性的缺失值个数(在下面的循环中逐个属性填充)。
m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
for (int i = 0; i < instances.numAttributes(); i++) {
m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
if (instances.attribute(i).isNumeric()) {
if (m_displayStdDevs) {
m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
}
if (m_FullMissingCounts[i] == instances.numInstances()) {
m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
}
} else {
m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
if (m_FullMissingCounts[i]
> m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
m_FullMeansOrMediansOrModes[i] =