一、问题描述
这是一些文本数据:每行开头的User表示用户,后面形如2:1的各项中,冒号前面表示用户所属的社区,冒号后面表示用户属于该社区的次数。例如第一行表示User1属于社区2共1次,属于社区3共2次,属于社区4共7次。能不能编写一个Java程序,计算出每个User在各个社区的概率?例如对第一行数据,User1在社区2的概率为0.1,在社区3的概率为0.2,在社区4的概率为0.7。同时设置一个阈值来去除小概率社区,只输出概率大于阈值的社区:例如阈值设置为0.1时,输出社区3和4(社区2的概率恰好等于阈值,因此不输出)。
【PS:代码仅供参考,未做优化】
二、实现代码
package com.test;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
public class JustTest {
public static void main(String[] args) {
String path = "E:\\";
String text = readTxt(path + "users.txt", 0.4f);
writeText(text, path + "users_new.txt");
}
/**
* 写入文件
* @param text
* @param path
*/
public static void writeText(String text, String path) {
System.out.println("保存的文件路径为:"+path);
FileWriter fileWrite = null;
try {
fileWrite = new FileWriter(path);
fileWrite.write(text);
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
fileWrite.flush();
fileWrite.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* 匹配
* @param regex
* @param data
* @return
*/
public static String[] match(String regex, String data) {
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(data);
StringBuilder result = new StringBuilder(45);
while (m.find()) {
result.append(m.group() + ",");
}
String[] arr = result.toString().split(",");
return arr;
}
/**
* 数据处理
* @param data
* @param thresholdValue
* @param path
* @return
* @throws IOException
*/
public static String processingData(String data, float thresholdValue,
String path) throws IOException {
String[] timeArr = match("((?<=(\\d+:))\\d+)", data);// 匹配次数
int timeArrLen = timeArr.length;
int countTime = 0;
// 统计总次数
for (String time : timeArr) {
if(StringUtils.isNotBlank(time)){
countTime += Integer.parseInt(time);
}
}
// 计算概率
float[] probability = new float[timeArrLen];
for (int i = 0; i < timeArrLen; i++) {
if(StringUtils.isNotBlank(timeArr[i])){
probability[i] = Float.parseFloat(timeArr[i]) / countTime;
}
}
// 封装数据
String[] communityArr = match("-?\\d+(?=:)", data);// 匹配社区
StringBuilder resultRow = new StringBuilder(150);
resultRow.append(data).append("\t\t概率");
for (int i = 0; i < timeArrLen; i++) {
if (probability[i] > thresholdValue) {
resultRow.append(communityArr[i]).append(":")
.append(probability[i]).append("\t");
}
}
resultRow.append("\r\n");
return resultRow.toString();
}
/**
* 读取txt文本
*
* @param path
*/
public static String readTxt(String path, float thresholdValue) {
System.out.println("读取的文件路径为:"+path);
File file = new File(path);
BufferedReader reader = null;
String tempString = null;
StringBuilder resultTxt = new StringBuilder(3000);
try {
reader = new BufferedReader(new FileReader(file));
while ((tempString = reader.readLine()) != null) {
//System.out.println(tempString);
if (!tempString.isEmpty()) {
resultTxt.append(processingData(tempString, thresholdValue,
path));
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return resultTxt.toString();
}
}