结合NLP中的多种字符串距离与增量聚类思想,筛选信息量大的文本

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;


public class CountDistance {
private  ArrayList<String> dataSet = new ArrayList<>();
private  ArrayList<ArrayList<String>>   cluster = new ArrayList<>();
private  double threshold = 0.6; 
/*
* 数据初始化
*/
public void setDataSet(ArrayList<String> text) {
this.dataSet=text;
}
/*
* 相似聚类
*/
public ArrayList<ArrayList<String>>  getCluster (ArrayList<String> dataSet,double threshold) {
for(int i = 0;i < dataSet.size();i++) {
if(cluster.isEmpty()) {
ArrayList<String> temp = new ArrayList<>();
temp.add(dataSet.get(i));
cluster.add(temp);
}
else {
double maxSim = 0;
int loc = 0;
boolean mark = false;
for(int j = 0;j<cluster.size();j++) {
int size = cluster.get(j).size();
for(int k = 0;k < size;k++) {
if(computeSimilarity(dataSet.get(i), cluster.get(j).get(k)) > threshold) {
// System.out.println(computeSimilarity(dataSet.get(i), cluster.get(j).get(k)));
double Sim = computeSimilarity(dataSet.get(i), cluster.get(j).get(k));
if(Sim > maxSim) {
maxSim = Sim;
loc = j;
mark = true;
}
}
}
}
if(mark == false) {
ArrayList<String> temp = new ArrayList<>();
temp.add(dataSet.get(i));
cluster.add(temp);
}
else {
    cluster.get(loc).add(dataSet.get(i));
}
}
}
System.out.println(cluster.size());
return cluster;
}
/*
* 计算相似度
*/
public double computeSimilarity (String str1, String str2) {
@SuppressWarnings("deprecation")
double dis = StringUtils.getJaroWinklerDistance(str1, str2);
return dis;
}
/*
* 判断字符串是否包含数字
*/
public boolean containNumber(String str) {
boolean result = false;
for(int i = 0;i < str.length();i++ ) {
if(str.charAt(i)>=48&&str.charAt(i)<=57) {
result = true;
}
}
return result;
}
/*
* 判断字符串是否全部数字
*/
public boolean isDigit(String str) {
boolean result = true;
for(int i = 0;i < str.length();i++) {
if((str.charAt(i)<48 || str.charAt(i)>57))
result = false;
}
return result;
}
/*
* 祛除字符串中的非法字符
*/
public  String removeIllegalString(String str) {
String result = str;
String reg = "[\\u4e00-\\u9fa5]+";
boolean temp = true;
if(str.length()>1) {
   temp = str.substring(0, str.length()-1).matches(reg); 
}
if(str==""||isDigit(str)||str.contains("E+")||str.contains("<img")||containNumber(str) || temp == false)
result = null;
return result;
}
/*
* 文件的读取
*/
public ArrayList<String> readTxt(String textPath) {
ArrayList<String>  text = new ArrayList<>();
InputStream is = null;
try {
    is = new FileInputStream(textPath);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
    String temp = null;
    int a=0;
    // 读取一行,存储于字符串列表中
    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        line = line.trim();
        line = line.replaceAll("#", "");
        line = line.replaceAll("!", "");
        line = line.replaceAll("?", "");
        temp = removeIllegalString(line);
        if(temp!=null&&temp!=""&&temp!=" "&&temp.length()>2) {
        // if(a<1000){
        // a++;
                text.add(temp);
                System.out.println(temp);
        // }
        }
    }
    System.out.println(text.size());   
}catch (FileNotFoundException fnfe){
    fnfe.printStackTrace();
}catch (IOException ioe){
    ioe.printStackTrace();
} finally {
    try {
        if (is != null) {
            is.close();
            is = null;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
return text;
}
/*
* 写入文本文件
*/
public void writeTxt(String filepath, ArrayList<String> result) {
try {
FileWriter writer = new FileWriter(filepath);
for(int i = 0;i < result.size();i++) {
writer.write(result.get(i));
writer.write("\n");
}
writer.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}  
}
public static void main(String[] args) {
// TODO Auto-generated method stub
CountDistance countDistance = new CountDistance();
ArrayList< String> text = new ArrayList<>();
text = countDistance.readTxt("D:\\USERS\\test.txt");
String filepath = "D:\\USERS\\test_01.txt";
countDistance.setDataSet(text);
countDistance.getCluster(countDistance.dataSet, countDistance.threshold);
// countDistance.writeTxt(filepath, text);
}
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值