在完成了预处理和特征提取后,下一步就是用 聚类算法 进行文本聚类。在聚类算法中 距离函数的选择很重要,文本挖掘中最好的距离函数就是 余弦距离,但是Weka 3.6.10中 尚不支持 余弦距离,需要自己实现。
我们可以在 Eclipse 中创建一个文本挖掘的项目,引入 weka.jar,然后实现一个计算余弦距离的类,让这个类继承自 Weka 中用于计算欧氏距离的类,代码如下:
package cn.csdn.test;
import java.util.Enumeration;
import weka.core.Attribute;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.neighboursearch.PerformanceStats;
//public class CosineDistance implements DistanceFunction, OptionHandler, Serializable, RevisionHandler{
/**
 * Cosine distance function for Weka text clustering.
 *
 * Weka 3.6.x ships no cosine distance, so this class plugs one in by
 * extending EuclideanDistance (which lets it be passed to e.g.
 * SimpleKMeans.setDistanceFunction). The returned value is
 * 1 - cos(first, second); for non-negative vectors such as TF-IDF term
 * weights this lies in [0, 1], where 0 means identical direction.
 */
public class CosineDistance extends EuclideanDistance {

    // Kept public for backward compatibility with existing callers.
    public Instances m_Data = null;
    public String version = "1.0";

    /** Distance with no cutoff and no performance statistics. */
    @Override
    public double distance(Instance arg0, Instance arg1) {
        return distance(arg0, arg1, Double.POSITIVE_INFINITY, null);
    }

    /** Distance with performance statistics but no cutoff. */
    @Override
    public double distance(Instance arg0, Instance arg1, PerformanceStats arg2) {
        return distance(arg0, arg1, Double.POSITIVE_INFINITY, arg2);
    }

    /** Distance with a cutoff but no performance statistics. */
    @Override
    public double distance(Instance arg0, Instance arg1, double arg2) {
        return distance(arg0, arg1, arg2, null);
    }

    /**
     * Computes 1 - cosine similarity of the two (sparse) instances.
     *
     * The class attribute, if any, is skipped. The two sparse index lists
     * are walked in parallel (merge-join on attribute index) and the dot
     * product plus both squared norms are accumulated in a single pass.
     *
     * @param first       first instance
     * @param second      second instance
     * @param cutOffValue if the final distance exceeds this value,
     *                    Double.POSITIVE_INFINITY is returned instead
     * @param arg3        optional statistics collector (may be null)
     * @return cosine distance in [0, 1], or POSITIVE_INFINITY past the cutoff
     */
    @Override
    public double distance(Instance first, Instance second, double cutOffValue,
            PerformanceStats arg3) {
        double dot = 0;
        double normA = 0;
        double normB = 0;
        int firstNumValues = first.numValues();
        int secondNumValues = second.numValues();
        int numAttributes = m_Data.numAttributes();
        int classIndex = m_Data.classIndex();

        for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
            int firstI = (p1 >= firstNumValues) ? numAttributes : first.index(p1);
            int secondI = (p2 >= secondNumValues) ? numAttributes : second.index(p2);
            // The class attribute never participates in the distance.
            if (firstI == classIndex) {
                p1++;
                continue;
            }
            if (secondI == classIndex) {
                p2++;
                continue;
            }
            double diff;
            if (firstI == secondI) { // attribute present in both vectors
                double a = first.valueSparse(p1);
                double b = second.valueSparse(p2);
                diff = difference(firstI, a, b);
                normA += a * a;
                normB += b * b;
                p1++;
                p2++;
            } else if (firstI > secondI) { // attribute only in 'second'
                double b = second.valueSparse(p2);
                diff = difference(secondI, 0, b); // contributes 0 to the dot product
                normB += b * b;
                p2++;
            } else { // attribute only in 'first'
                double a = first.valueSparse(p1);
                diff = difference(firstI, a, 0);
                normA += a * a;
                p1++;
            }
            if (arg3 != null) {
                arg3.incrCoordCount();
            }
            dot = updateDistance(dot, diff);
        }

        // BUG FIX: the original divided by zero (-> NaN) when either vector
        // was all zeros; treat a zero vector as maximally dissimilar.
        if (normA == 0 || normB == 0) {
            return 1.0;
        }
        double distance = 1 - dot / (Math.sqrt(normA) * Math.sqrt(normB));
        // Clamp tiny floating-point drift just outside [0, 1]; anything
        // larger is genuinely unexpected and is still reported below.
        if (distance < 0 && distance > -1e-9) {
            distance = 0;
        } else if (distance > 1 && distance < 1 + 1e-9) {
            distance = 1;
        }
        if (distance < 0 || distance > 1) {
            System.err.println("unknown: " + distance);
        }
        // BUG FIX: the original compared the raw, still-accumulating dot
        // product (a similarity, where bigger means closer) against
        // cutOffValue inside the loop -- inverted semantics. Apply the
        // cutoff to the final distance instead. With the default cutoff of
        // POSITIVE_INFINITY, behavior is unchanged.
        if (distance > cutOffValue) {
            return Double.POSITIVE_INFINITY;
        }
        return distance;
    }

    /** Accumulator step; kept as a separate method for API compatibility. */
    public double updateDistance(double currDist, double diff) {
        return currDist + diff;
    }

    /**
     * Per-attribute contribution to the dot product: val1 * val2 for
     * numeric attributes, NaN for nominal attributes (cosine similarity
     * is undefined for nominal data).
     */
    public double difference(int index, double val1, double val2) {
        switch (m_Data.attribute(index).type()) {
            case Attribute.NOMINAL:
                return Double.NaN;
            case Attribute.NUMERIC:
                return val1 * val2;
            default:
                return Double.NaN;
        }
    }

    @Override
    public String getAttributeIndices() {
        return null; // attribute ranges are not supported
    }

    @Override
    public Instances getInstances() {
        return m_Data;
    }

    @Override
    public boolean getInvertSelection() {
        return false; // attribute ranges are not supported
    }

    @Override
    public void postProcessDistances(double[] arg0) {
        // Nothing to do: returned distances are already final.
    }

    @Override
    public void setAttributeIndices(String arg0) {
        // Attribute ranges are not supported; silently ignored.
    }

    @Override
    public void setInstances(Instances arg0) {
        m_Data = arg0;
    }

    @Override
    public void setInvertSelection(boolean arg0) {
        // do nothing
    }

    @Override
    public void update(Instance arg0) {
        // do nothing
    }

    @Override
    public String[] getOptions() {
        // BUG FIX: returning null here makes Weka's option-handling
        // utilities throw NullPointerException; return an empty array.
        return new String[0];
    }

    @Override
    public Enumeration listOptions() {
        // No options; an empty enumeration avoids NPEs in callers.
        return new java.util.Vector().elements();
    }

    @Override
    public void setOptions(String[] arg0) throws Exception {
        // no options to set
    }

    @Override
    public String getRevision() {
        // Typo fixed: "writtern" -> "written".
        return "Cosine Distance function written by Tom, version " + version;
    }

    /** Smoke test: prints two pairwise distances from a CSV data file. */
    public static void main(String[] args) throws Exception {
        // Generalized: the data file may be passed as the first argument;
        // defaults to the original hard-coded "sample.csv".
        String src = (args.length > 0) ? args[0] : "sample.csv";
        DataSource source = new DataSource(src);
        Instances data = source.getDataSet();
        CosineDistance cd = new CosineDistance();
        cd.setInstances(data);
        System.out.println(cd.distance(data.instance(0), data.instance(1)));
        System.out.println(cd.distance(data.instance(1), data.instance(2)));
    }
}
然后再建立一个 MyTextCluster.java 文件,使用该类作为计算距离的实例,进行 K 均值聚类。代码如下:
package cn.csdn.test;
import java.io.File;
import weka.clusterers.ClusterEvaluation;
import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ArffLoader;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;
/**
 * K-means text clustering demo: loads an ARFF file of raw text, converts
 * it to TF-IDF word vectors with StringToWordVector, clusters it with
 * SimpleKMeans using the custom CosineDistance, then prints the cluster
 * evaluation and each instance's assigned cluster.
 */
public class MyTextCluster {

    // Path of the input ARFF file (configure here).
    public static String arffpath = "E://2.arff";

    /**
     * Runs the whole pipeline: load -> vectorize -> cluster -> report.
     *
     * @param args unused
     * @throws Exception if loading, filtering or clustering fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Load the raw documents.
        ArffLoader loader = new ArffLoader();
        loader.setFile(new File(arffpath));
        Instances dataRaw = loader.getDataSet();

        // 2. Turn the string attribute into TF-IDF word-count vectors.
        // BUG FIX: per the weka.filters.Filter contract, setInputFormat()
        // must be called AFTER all filter options are set; the original
        // set the options afterwards, so they could be silently ignored.
        StringToWordVector filter = new StringToWordVector();
        filter.setWordsToKeep(1000);
        filter.setIDFTransform(true);
        filter.setOutputWordCounts(true);
        filter.setInputFormat(dataRaw);
        Instances dataFiltered = Filter.useFilter(dataRaw, filter);

        // 3. Cluster with k-means (k = 3) using cosine distance.
        SimpleKMeans skm = new SimpleKMeans();
        skm.setDisplayStdDevs(false);
        skm.setDistanceFunction(new CosineDistance());
        skm.setMaxIterations(500);
        skm.setDontReplaceMissingValues(true);
        skm.setNumClusters(3);
        skm.setPreserveInstancesOrder(false);
        skm.setSeed(100); // fixed seed for reproducible runs
        skm.buildClusterer(dataFiltered);

        // 4. Report the clustering and every instance's assignment.
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(skm);
        eval.evaluateClusterer(dataFiltered);
        System.out.println(eval.clusterResultsToString());
        for (int i = 0; i < dataFiltered.numInstances(); i++) {
            System.out.println("Instance" + String.valueOf(i) + " is in cluster" + skm.clusterInstance(dataFiltered.instance(i)));
        }
    }
}
运行结果如下:
kMeans
======
Number of iterations: 2
Within cluster sum of squared errors: 2.6483113613228255
Cluster centroids:
Cluster#
Attribute Full Data 0 1 2
(7) (1) (2) (4)
=============================================================
# 1.9459 0 0 3.4053
+ 13.3434 0 0 23.3509
- 3.557 0.6729 0.3365 5.8883
-- 0.3579 1.2528 0 0.3132
0 5.6761 21.2654 0 4.6168
01 0.7159 3.7583 0 0.3132
1 2.4783 0 0 4.337
10 0.8473 0 0 1.4828
11 0.4797 0 0 0.8394
12 1.2104 1.6946 0 1.6946
13 0.4842 0.8473 0 0.6355
14 0.3631 0 0 0.6355
15 0.3579 1.2528 0 0.3132
16 0.3579 0 0 0.6264
17 0.3579 0 0 0.6264
19 0.556 0 0 0.973
2 1.9187 0.5596 0 3.2178
20 0.3579 0 0 0.6264
2004 0.8948 0 0.6264 1.2528
2005 3.0579 0 0 5.3513
2006 3.4376 11.1923 0 3.2178
21 0.3631 0 0 0.6355
22 0.3579 0 0 0.6264
23 0.3579 1.2528 0 0.3132
24 0.5369 0 0 0.9396
25 0.3579 0 0 0.6264
26 0.7159 0 0 1.2528
27 0.7159 3.7583 0 0.3132
28 1.0393 5.0365 0 0.5596
29 0.3198 0.5596 0 0.4197
3 0.6249 0.3365 0 1.0094
30 0.5287 0.6729 0.3365 0.5888
31 0.4797 1.1192 0 0.5596
36 0.556 0 0 0.973
37 0.3579 0 0 0.6264
3rd 0.3579 0 0 0.6264
4 0.9593 0 0.2798 1.5389
40 0.3579 0 0 0.6264
49 0.7159 0 0 1.2528
5 1.0894 0 0 1.9064
55 0.556 0 0 0.973
6 1.6107 0 0 2.8187
7 0.7263 0 0 1.2709
7-inch 0.556 0 0 0.973
7/26/06 16.9572 0 0 29.6751
7/27/06 4.7258 0 0 8.2701
8 0.9683 0 0 1.6946
9 0.9683 0 0 1.6946
@ 4.8321 0 0 8.4562
A 0.721 0.6729 0.5047 0.8412
AM 0.5369 0 0 0.9396
API 2.2239 15.5673 0 0
About 0.3845 0.3365 0 0.5888
Actually 0.834 0 0 1.4594
Add 0.556 0 0 0.973
All 0.8948 0 0 1.566
Allah 0.556 0 0 0.973
Also 0.4842 0.8473 0 0.6355
Amazon 0.556 3.8918 0 0
America 0.556 0 0 0.973
An 0.4842 1.6946 0.4236 0.2118
And 0.6729 0 0.1682 1.0935
Annoucements 0.556 0 0 0.973
Antihexe 0.556 0 0 0.973
Anyway 0.3579 0 0 0.6264
April 0.3579 0 0 0.6264
Arbogast 0.834 5.8377 0 0
Archives 0.3631 0.8473 0 0.4236
As 0.3631 0 0 0.6355
Atom 0.5369 2.5055 0 0.3132
August 0.9683 2.5419 0 1.0591
Baker 0.556 3.8918 0 0
Bands 0.556 0 0 0.973
Be 0.3579 0 0.6264 0.3132
Because 0.5369 0 0 0.9396
Best 0.834 0 0 1.4594
Bill 0.556 3.8918 0 0
Blog 0.8473 0 0 1.4828
Blogroll 0.3579 1.2528 0 0.3132
Boondoggle 0.834 0 0 1.4594
Brainiac 1.9459 0 0 3.4053
(略)
Clustered Instances
0 1 ( 14%)
1 2 ( 29%)
2 4 ( 57%)
Instance0 is in cluster 2
Instance1 is in cluster 1
Instance2 is in cluster 0
Instance3 is in cluster 2
Instance4 is in cluster 1
Instance5 is in cluster 2
Instance6 is in cluster 2