从网页获取单词并统计数量,然后输出到CSV文件中。
这是第一次写这类代码,逻辑比较混乱,功能也不太完善。
试了几个网页,只对纯文本类的网页有效。
虽然只是个练习,而且做得不太成功,还是传上来权当纪念。
下面是部分代码。
package com.test.dataCount;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import com.test.bean.DataBean;
import com.test.bean.ParmBean;
/**
 * Fetches a page over a URL and collects the text segments that start with a
 * target string.
 *
 * <p>The response is read line by line and always decoded as UTF-8. Each
 * non-blank line is split on HTML-like tags, and every resulting segment that
 * starts with the target is kept, in the order encountered.
 */
public class DataCount {

    /** Matches one HTML-like tag; quoted attribute values may contain any character. */
    private static final Pattern SPLIT_PATTERN =
            Pattern.compile("<(\".*?\"|'.*?'|[^'\"])*?>");

    /**
     * Downloads the page named by {@code parmBean.getUrl()} and stores the
     * segments starting with {@code parmBean.getTarget()} in a new bean.
     *
     * @param parmBean supplies the URL to read and the target prefix
     * @return a new {@code DataBean} whose data list holds the matching segments
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public DataBean execute(ParmBean parmBean) throws Exception {
        DataBean dataBean = new DataBean();
        URLConnection connection = new URL(parmBean.getUrl()).openConnection();
        InputStream inputStream = connection.getInputStream();
        try {
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
            dataBean.setDataList(this.getCounts(reader, parmBean.getTarget()));
        } finally {
            // The readers only wrap this stream, so closing it once releases everything.
            inputStream.close();
        }
        return dataBean;
    }

    /**
     * Reads every line from the reader and collects the tag-free segments that
     * start with {@code target}.
     *
     * @param bufferedReader source of the page text; not closed by this method
     * @param target prefix a segment must start with to be collected
     * @return the matching segments in the order they were read
     * @throws Exception if reading from the reader fails
     */
    private List<String> getCounts(BufferedReader bufferedReader, String target) throws Exception {
        List<String> list = new ArrayList<String>();
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            if (line.trim().isEmpty()) {
                continue;
            }
            // Segments are not trimmed: one with leading whitespace does not match.
            for (String segment : SPLIT_PATTERN.split(line)) {
                if (segment.startsWith(target)) {
                    list.add(segment);
                }
            }
        }
        return list;
    }
}
上面这个class用来读取网页内容:按HTML标签把每一行切开,收集以目标单词开头的文本片段,供后面统计数量。
package com.test.fileWrite;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.List;
import com.test.bean.DataBean;
/**
 * Writes the data list of a {@code DataBean} to a UTF-8 text file, one numbered
 * row per item with tab-separated columns.
 *
 * <p>Typical use is a single call to {@link #execute(DataBean)}. The
 * {@code init}/{@code execProcess}/{@code close} steps are public so they can
 * also be driven individually.
 */
public class CSVFileWriter {
    private File file = null;
    protected FileOutputStream fileOutputStream = null;
    private OutputStreamWriter outputStreamWriter = null;
    protected PrintWriter printWriter = null;

    /**
     * Opens the output file, writes all rows and closes the file again.
     *
     * @param dataBean supplies the output file name and the items to write
     * @throws Exception if the file cannot be opened, written or closed
     */
    public void execute(DataBean dataBean) throws Exception {
        try {
            init(dataBean);
            execProcess(dataBean);
        } finally {
            // Always release the file, including after a partial init or a failed write.
            close();
        }
    }

    /**
     * Opens the file named by {@code dataBean.getFileName()} for writing as
     * UTF-8 and prints its absolute path to standard output.
     *
     * @param dataBean supplies the output file name
     * @throws Exception if the file cannot be opened
     */
    public void init(DataBean dataBean) throws Exception {
        this.file = new File(dataBean.getFileName());
        System.out.println("ファイルのパス: " + this.file.getAbsolutePath());
        this.fileOutputStream = new FileOutputStream(this.file);
        this.outputStreamWriter = new OutputStreamWriter(this.fileOutputStream, "UTF-8");
        this.printWriter = new PrintWriter(this.outputStreamWriter);
    }

    /**
     * Writes the header row followed by one row per item: a 1-based number, a
     * tab, then the item. A {@code null} data list is written as header only.
     * {@link #init(DataBean)} must have been called first.
     *
     * @param dataBean supplies the items to write
     */
    public void execProcess(DataBean dataBean) {
        List<String> list = dataBean.getDataList();
        printWriter.println("No.\tItem\t");
        if (list == null) {
            return;
        }
        int rowNumber = 1;
        for (String item : list) {
            printWriter.println(rowNumber + "\t" + item);
            rowNumber++;
        }
    }

    /**
     * Flushes and closes every stream that was opened, outermost first, and
     * clears the stream fields. Safe to call when nothing was opened.
     *
     * @throws Exception if closing fails or an earlier write failed
     */
    public void close() throws Exception {
        boolean writeFailed = false;
        try {
            if (this.printWriter != null) {
                // PrintWriter swallows I/O errors; checkError() flushes and reports them.
                writeFailed = this.printWriter.checkError();
                this.printWriter.close();
            }
        } finally {
            try {
                // Repeated close is harmless; this covers a partially completed init.
                if (this.outputStreamWriter != null) {
                    this.outputStreamWriter.close();
                }
            } finally {
                try {
                    if (this.fileOutputStream != null) {
                        this.fileOutputStream.close();
                    }
                } finally {
                    this.printWriter = null;
                    this.outputStreamWriter = null;
                    this.fileOutputStream = null;
                }
            }
        }
        if (writeFailed) {
            throw new java.io.IOException("Error while writing output file: " + this.file);
        }
    }
}
上面这段代码用来把结果输出到CSV文件(各列之间实际是用制表符分隔的)。
main函数和两个Bean在此省略:main函数负责调用上面这两个class,两个Bean则声明了单词、数字以及URL等成员变量。
全程没有注释,自己回过头来看也很费劲(主要是代码很乱),等有时间再整理一下。