import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import org.apache.commons.lang3.StringUtils;
/**
 * Greedy, single-pass text clustering driven by Jaro-Winkler string similarity.
 *
 * <p>Typical flow (see {@link #main(String[])}): read and filter lines from a UTF-8 text file with
 * {@link #readTxt(String)}, then group them with {@link #getCluster(ArrayList, double)}.
 */
public class CountDistance {
    /** Regex matching a non-empty run of characters in the range U+4E00..U+9FA5. */
    private static final String CJK_RUN = "[\\u4e00-\\u9fa5]+";

    private ArrayList<String> dataSet = new ArrayList<>();
    private ArrayList<ArrayList<String>> cluster = new ArrayList<>();
    private double threshold = 0.6;

    /**
     * Stores the list of strings to be clustered.
     *
     * @param text the data set; the reference is stored as-is, not copied
     */
    public void setDataSet(ArrayList<String> text) {
        this.dataSet = text;
    }

    /**
     * Assigns every string of {@code dataSet}, in order, to a cluster.
     *
     * <p>Each string is compared with every member of every existing cluster. It joins the cluster
     * holding the member with the highest similarity, provided that similarity is strictly greater
     * than {@code threshold}; on ties the earliest such cluster wins. Otherwise the string starts
     * a new cluster. The number of clusters is printed to standard output before returning.
     *
     * <p>Clusters are kept in this instance's internal list, which is not cleared between calls,
     * so repeated calls keep adding to the same clusters.
     *
     * @param dataSet   strings to cluster
     * @param threshold similarity a string must exceed to join an existing cluster
     * @return the internal list of clusters (not a copy)
     */
    public ArrayList<ArrayList<String>> getCluster(ArrayList<String> dataSet, double threshold) {
        for (String item : dataSet) {
            double maxSim = 0;
            int loc = -1; // index of the best matching cluster, -1 while none qualifies
            for (int j = 0; j < cluster.size(); j++) {
                for (String member : cluster.get(j)) {
                    // Compute the similarity once per pair and reuse it for both comparisons.
                    double sim = computeSimilarity(item, member);
                    if (sim > threshold && sim > maxSim) {
                        maxSim = sim;
                        loc = j;
                    }
                }
            }
            if (loc < 0) {
                // No member is similar enough (or no cluster exists yet): open a new cluster.
                ArrayList<String> created = new ArrayList<>();
                created.add(item);
                cluster.add(created);
            } else {
                cluster.get(loc).add(item);
            }
        }
        System.out.println(cluster.size());
        return cluster;
    }

    /**
     * Computes the similarity of two strings.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the value of {@code StringUtils.getJaroWinklerDistance(str1, str2)}; this class
     *         treats larger values as "more similar"
     */
    public double computeSimilarity(String str1, String str2) {
        @SuppressWarnings("deprecation")
        double dis = StringUtils.getJaroWinklerDistance(str1, str2);
        return dis;
    }

    /**
     * Tells whether the string contains at least one ASCII digit.
     *
     * @param str string to inspect
     * @return {@code true} if any character is in {@code '0'..'9'}
     */
    public boolean containNumber(String str) {
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= '0' && c <= '9') {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the string consists only of ASCII digits.
     *
     * @param str string to inspect
     * @return {@code true} if every character is in {@code '0'..'9'}; note that this is also
     *         {@code true} for the empty string
     */
    public boolean isDigit(String str) {
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /**
     * Filters out strings that are not acceptable as clustering input.
     *
     * <p>A string is rejected when it is empty, contains any ASCII digit, contains {@code "E+"} or
     * {@code "<img"}, or - for strings longer than one character - when anything other than its
     * last character falls outside the range U+4E00..U+9FA5.
     *
     * @param str candidate string
     * @return {@code str} itself when accepted, {@code null} when rejected
     */
    public String removeIllegalString(String str) {
        // isEmpty() replaces the original identity comparison (str == ""), which is unreliable.
        if (str.isEmpty() || isDigit(str) || containNumber(str)
                || str.contains("E+") || str.contains("<img")) {
            return null;
        }
        // Everything except the final character must match CJK_RUN.
        if (str.length() > 1 && !str.substring(0, str.length() - 1).matches(CJK_RUN)) {
            return null;
        }
        return str;
    }

    /**
     * Reads a UTF-8 text file and returns its acceptable lines.
     *
     * <p>Each line is trimmed, stripped of {@code '#'}, {@code '!'} and {@code '?'} characters,
     * and passed through {@link #removeIllegalString(String)}. Lines that survive and are longer
     * than two characters are collected and echoed to standard output; the number of collected
     * lines is printed at the end.
     *
     * @param textPath path of the file to read
     * @return the accepted lines; if an I/O error occurs its stack trace is printed and the lines
     *         collected so far (possibly none) are returned
     */
    public ArrayList<String> readTxt(String textPath) {
        ArrayList<String> text = new ArrayList<>();
        // try-with-resources closes both the reader and the underlying stream on every path.
        try (InputStream is = new FileInputStream(textPath);
             BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512)) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                // Literal replace(): replaceAll("?", "") would treat '?' as a regex
                // metacharacter and throw PatternSyntaxException.
                String cleaned = line.trim()
                        .replace("#", "")
                        .replace("!", "")
                        .replace("?", "");
                String accepted = removeIllegalString(cleaned);
                if (accepted != null && accepted.length() > 2) {
                    text.add(accepted);
                    System.out.println(accepted);
                }
            }
            System.out.println(text.size());
        } catch (IOException ioe) {
            // Also covers FileNotFoundException, which is a subclass of IOException.
            ioe.printStackTrace();
        }
        return text;
    }

    /**
     * Writes the given strings to a text file, one per line (terminated by {@code "\n"}).
     *
     * <p>An I/O error is reported by printing its stack trace; it is not rethrown.
     *
     * @param filepath path of the file to create or overwrite
     * @param result   strings to write
     */
    public void writeTxt(String filepath, ArrayList<String> result) {
        // try-with-resources guarantees the writer is closed even when write() fails.
        try (FileWriter writer = new FileWriter(filepath)) {
            for (String entry : result) {
                writer.write(entry);
                writer.write("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the input file, filters its lines and clusters them with the default threshold.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        String inputPath = args.length > 0 ? args[0] : "D:\\USERS\\test.txt";
        CountDistance countDistance = new CountDistance();
        ArrayList<String> text = countDistance.readTxt(inputPath);
        countDistance.setDataSet(text);
        countDistance.getCluster(countDistance.dataSet, countDistance.threshold);
    }
}
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import org.apache.commons.lang3.StringUtils;
/**
 * Greedy, single-pass text clustering driven by Jaro-Winkler string similarity.
 *
 * <p>Typical flow (see {@link #main(String[])}): read and filter lines from a UTF-8 text file with
 * {@link #readTxt(String)}, then group them with {@link #getCluster(ArrayList, double)}.
 */
public class CountDistance {
    /** Regex matching a non-empty run of characters in the range U+4E00..U+9FA5. */
    private static final String CJK_RUN = "[\\u4e00-\\u9fa5]+";

    private ArrayList<String> dataSet = new ArrayList<>();
    private ArrayList<ArrayList<String>> cluster = new ArrayList<>();
    private double threshold = 0.6;

    /**
     * Stores the list of strings to be clustered.
     *
     * @param text the data set; the reference is stored as-is, not copied
     */
    public void setDataSet(ArrayList<String> text) {
        this.dataSet = text;
    }

    /**
     * Assigns every string of {@code dataSet}, in order, to a cluster.
     *
     * <p>Each string is compared with every member of every existing cluster. It joins the cluster
     * holding the member with the highest similarity, provided that similarity is strictly greater
     * than {@code threshold}; on ties the earliest such cluster wins. Otherwise the string starts
     * a new cluster. The number of clusters is printed to standard output before returning.
     *
     * <p>Clusters are kept in this instance's internal list, which is not cleared between calls,
     * so repeated calls keep adding to the same clusters.
     *
     * @param dataSet   strings to cluster
     * @param threshold similarity a string must exceed to join an existing cluster
     * @return the internal list of clusters (not a copy)
     */
    public ArrayList<ArrayList<String>> getCluster(ArrayList<String> dataSet, double threshold) {
        for (String item : dataSet) {
            double maxSim = 0;
            int loc = -1; // index of the best matching cluster, -1 while none qualifies
            for (int j = 0; j < cluster.size(); j++) {
                for (String member : cluster.get(j)) {
                    // Compute the similarity once per pair and reuse it for both comparisons.
                    double sim = computeSimilarity(item, member);
                    if (sim > threshold && sim > maxSim) {
                        maxSim = sim;
                        loc = j;
                    }
                }
            }
            if (loc < 0) {
                // No member is similar enough (or no cluster exists yet): open a new cluster.
                ArrayList<String> created = new ArrayList<>();
                created.add(item);
                cluster.add(created);
            } else {
                cluster.get(loc).add(item);
            }
        }
        System.out.println(cluster.size());
        return cluster;
    }

    /**
     * Computes the similarity of two strings.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the value of {@code StringUtils.getJaroWinklerDistance(str1, str2)}; this class
     *         treats larger values as "more similar"
     */
    public double computeSimilarity(String str1, String str2) {
        @SuppressWarnings("deprecation")
        double dis = StringUtils.getJaroWinklerDistance(str1, str2);
        return dis;
    }

    /**
     * Tells whether the string contains at least one ASCII digit.
     *
     * @param str string to inspect
     * @return {@code true} if any character is in {@code '0'..'9'}
     */
    public boolean containNumber(String str) {
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= '0' && c <= '9') {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the string consists only of ASCII digits.
     *
     * @param str string to inspect
     * @return {@code true} if every character is in {@code '0'..'9'}; note that this is also
     *         {@code true} for the empty string
     */
    public boolean isDigit(String str) {
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /**
     * Filters out strings that are not acceptable as clustering input.
     *
     * <p>A string is rejected when it is empty, contains any ASCII digit, contains {@code "E+"} or
     * {@code "<img"}, or - for strings longer than one character - when anything other than its
     * last character falls outside the range U+4E00..U+9FA5.
     *
     * @param str candidate string
     * @return {@code str} itself when accepted, {@code null} when rejected
     */
    public String removeIllegalString(String str) {
        // isEmpty() replaces the original identity comparison (str == ""), which is unreliable.
        if (str.isEmpty() || isDigit(str) || containNumber(str)
                || str.contains("E+") || str.contains("<img")) {
            return null;
        }
        // Everything except the final character must match CJK_RUN.
        if (str.length() > 1 && !str.substring(0, str.length() - 1).matches(CJK_RUN)) {
            return null;
        }
        return str;
    }

    /**
     * Reads a UTF-8 text file and returns its acceptable lines.
     *
     * <p>Each line is trimmed, stripped of {@code '#'}, {@code '!'} and {@code '?'} characters,
     * and passed through {@link #removeIllegalString(String)}. Lines that survive and are longer
     * than two characters are collected and echoed to standard output; the number of collected
     * lines is printed at the end.
     *
     * @param textPath path of the file to read
     * @return the accepted lines; if an I/O error occurs its stack trace is printed and the lines
     *         collected so far (possibly none) are returned
     */
    public ArrayList<String> readTxt(String textPath) {
        ArrayList<String> text = new ArrayList<>();
        // try-with-resources closes both the reader and the underlying stream on every path.
        try (InputStream is = new FileInputStream(textPath);
             BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512)) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                // Literal replace(): replaceAll("?", "") would treat '?' as a regex
                // metacharacter and throw PatternSyntaxException.
                String cleaned = line.trim()
                        .replace("#", "")
                        .replace("!", "")
                        .replace("?", "");
                String accepted = removeIllegalString(cleaned);
                if (accepted != null && accepted.length() > 2) {
                    text.add(accepted);
                    System.out.println(accepted);
                }
            }
            System.out.println(text.size());
        } catch (IOException ioe) {
            // Also covers FileNotFoundException, which is a subclass of IOException.
            ioe.printStackTrace();
        }
        return text;
    }

    /**
     * Writes the given strings to a text file, one per line (terminated by {@code "\n"}).
     *
     * <p>An I/O error is reported by printing its stack trace; it is not rethrown.
     *
     * @param filepath path of the file to create or overwrite
     * @param result   strings to write
     */
    public void writeTxt(String filepath, ArrayList<String> result) {
        // try-with-resources guarantees the writer is closed even when write() fails.
        try (FileWriter writer = new FileWriter(filepath)) {
            for (String entry : result) {
                writer.write(entry);
                writer.write("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the input file, filters its lines and clusters them with the default threshold.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        String inputPath = args.length > 0 ? args[0] : "D:\\USERS\\test.txt";
        CountDistance countDistance = new CountDistance();
        ArrayList<String> text = countDistance.readTxt(inputPath);
        countDistance.setDataSet(text);
        countDistance.getCluster(countDistance.dataSet, countDistance.threshold);
    }
}