本篇主要是根据KMedoids算法实现文档集的聚类。首先要将需要聚类的文档进行向量化处理,这里采用TFIDF值来表示;文档之间的距离选用余弦距离,其余步骤与标准KMedoids流程一致。聚类完成之后发现结果不是很理想,后来发现将数据降维后,结果还是比较理想的。
Java实现代码如下:
/**
 * Clusters a document set with the KMedoids algorithm, using TF-IDF
 * vectors and cosine distance (see initData / genInitCluster below).
 */
public class DocKMediodsCluster extends AbstractCluster {
// Convergence threshold for the clustering iterations.
public static final double THRESHOLD = 0.028;
// Maximum number of KMedoids iterations.
public static final int ITER_NUM = 10;
/*
* 初始化数据
*/
/**
 * Loads the test corpus from the classpath resource "测试", weights each
 * document's terms by TF-IDF, and wraps every document as a {@code DataPoint}
 * carrying its TF-IDF vector and category label.
 *
 * @return the data points built from the corpus; an empty list if the
 *         resource is missing or its URI cannot be resolved (best-effort)
 */
public List<DataPoint> initData() {
    List<DataPoint> dataPoints = new ArrayList<DataPoint>();
    try {
        // getResource may return null when the "测试" directory is absent
        // from the classpath; check explicitly instead of risking an NPE
        // on the toURI() call, which the catch below would not handle.
        java.net.URL resource = DocKMediodsCluster.class.getClassLoader().getResource("测试");
        if (resource == null) {
            System.err.println("classpath resource \"测试\" not found; returning empty data set");
            return dataPoints;
        }
        String path = resource.toURI().getPath();
        DocumentSet documentSet = DocumentLoader.loadDocumentSetByThread(path);
        List<Document> documents = documentSet.getDocuments();
        // Compute TF-IDF weights in place before vectorizing the documents.
        DocumentUtils.calculateTFIDF_0(documents);
        for (Document document : documents) {
            DataPoint dataPoint = new DataPoint();
            dataPoint.setValues(document.getTfidfWords());
            dataPoint.setCategory(document.getCategory());
            dataPoints.add(dataPoint);
        }
    } catch (URISyntaxException e) {
        // Best-effort loader: log the failure and fall through to return
        // whatever (possibly nothing) was collected so far.
        e.printStackTrace();
    }
    return dataPoints;
}
//随机生成中心点,并生成初始的K个聚类
public List<DataPointCluster> genInitCluster(List<DataPoint> points, int k) {
List<DataPointCluster> clusters = new ArrayList<DataPointCluster>();
Random random = new Random();
Set<String> categories = new HashSet<String>();
while