最近因为论文的原因,需要爬取一些文本数据。所以找了本书看了一下,写了一个爬虫程序,和对原始爬取数据的清洗程序。
第一版程序没有用到线程的概念,所以比较慢,第二版用了线程,速度确实提升了很多。
爬虫用到了几个包:
commons-httpclient-3.0.1.jar
htmlparser.jar
第一个是模拟浏览器http的包,第二个是解析网页的包。
一般情况下,爬虫用的策略为广度优先,实现用的是一个队列,首先把要爬取的初始url进队列,然后出队列,解析出队列的url中的<a>标签,做一下筛选,把符合条件的标签url依次进队列,对队列进行出队列操作,直到满足条件退出。退出条件一定设好,否则容易死循环,一般情况下的退出条件是队列为空。
看一下单线程版的主程序:
public class MyCrawler {
    /**
     * Seed the shared URL queue with the starting URL.
     *
     * @param seeds seed URL to enqueue as unvisited
     */
    private void initCrawlerWithSeeds(String seeds) {
        LinkQueue.addUnvisitedUrl(seeds);
    }

    /**
     * Crawl one round: extract matching links from the seed page, enqueue
     * them, then download every queued page (breadth-first via LinkQueue).
     *
     * @param seeds seed URL for this crawl round
     */
    public void crawling(String seeds) {
        // Filter: keep only links starting with the target site prefix.
        LinkFilter filter = new LinkFilter() {
            public boolean accept(String url) {
                // idiomatic boolean return instead of if/else true/false
                return url.startsWith("http://xxxx");
            }
        };
        // Initialize the URL queue with the seed.
        initCrawlerWithSeeds(seeds);
        // Extract <a> links from the seed page and enqueue the matches.
        Set<String> links = HtmlParserTool.extracLinks(seeds, filter);
        for (String link : links) {
            LinkQueue.addUnvisitedUrl(link);
        }
        // Drain the queue: dequeue and download until empty (the exit
        // condition — an empty queue — prevents an endless loop).
        while (!LinkQueue.unVisitedUrlsEmpty()) {
            String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();
            if (visitUrl == null) {
                continue; // defensive: skip spurious null entries
            }
            DownLoadFile downLoader = new DownLoadFile();
            downLoader.downloadFile(visitUrl);
        }
    }

    /** Clear both visited and unvisited queues before a new round. */
    private void initCrawl() {
        LinkQueue.removeAllUnvisited();
        LinkQueue.removeAllVisited();
    }

    // Entry point: crawl pages 1..200, resetting the queues each round.
    public static void main(String[] args) {
        MyCrawler crawler = new MyCrawler();
        for (int j = 1; j < 201; j++) {
            crawler.initCrawl();
            crawler.crawling("http://xxxx" + j + ".htm");
        }
    }
}
我用的网页结构比较简单,所以直接用了循环来做, LinkQueue是定义的队列。
第二版用的多线程:
public class MyCrawlerMultiThread {
    /** Child threads spawned during the crawl (bookkeeping only). */
    public static List<Thread> childThread = new ArrayList<Thread>();
    private final static int FROM = 1;
    private final static int TO = 201;

    /**
     * Seed the shared URL queue with the starting URL.
     *
     * @param seeds seed URL to enqueue as unvisited
     */
    private static void initCrawlerWithSeeds(String seeds) {
        LinkQueue.addUnvisitedUrl(seeds);
    }

    /**
     * Single-threaded fallback: drain the queue on the calling thread and
     * download each page. The multithreaded path lives in main()/BThread.
     *
     * @param seeds unused here; the queue must already be populated
     */
    public void crawling(String seeds) {
        while (!LinkQueue.unVisitedUrlsEmpty()) {
            String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();
            if (visitUrl == null) {
                continue; // defensive: skip spurious null entries
            }
            DownLoadFile downLoader = new DownLoadFile();
            downLoader.downloadFile(visitUrl);
        }
    }

    /** Clear both visited and unvisited queues before a new round. */
    private void initCrawl() {
        LinkQueue.removeAllUnvisited();
        LinkQueue.removeAllVisited();
    }

    // Entry point: for each seed page, enqueue its links, then hand the
    // queue to a BThread dispatcher and wait for it before the next round.
    public static void main(String[] args) {
        MyCrawlerMultiThread crawler = new MyCrawlerMultiThread();
        for (int j = FROM; j < TO; j++) {
            crawler.initCrawl();
            // Filter: keep only links starting with the target site prefix.
            LinkFilter filter = new LinkFilter() {
                public boolean accept(String url) {
                    return url.startsWith("http://xxx");
                }
            };
            String seeds = "http://xxxx" + j + ".htm";
            // Initialize the URL queue with this round's seed.
            initCrawlerWithSeeds(seeds);
            Set<String> links = HtmlParserTool.extracLinks(seeds, filter);
            // Enqueue the newly found, unvisited URLs.
            for (String link : links) {
                LinkQueue.addUnvisitedUrl(link);
            }
            BThread bt = new BThread();
            AThread at = new AThread(bt);
            try {
                bt.start();
                at.start();
                bt.join(); // wait for this round's dispatcher before the next seed
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Preserve the interrupt status instead of swallowing it.
                    Thread.currentThread().interrupt();
                }
                // Report the actual cause instead of a bare fixed message.
                System.out.println("Exception from main: " + e);
            }
        }
    }
}
/** Worker thread: downloads a single URL and then exits. */
class CThread extends Thread {
    private String visitUrl;

    /**
     * @param url the URL this worker will download in run()
     */
    public CThread(String url) {
        super("[CThread] Thread");
        this.visitUrl = url;
    }

    public void run() {
        String threadName = Thread.currentThread().getName();
        try {
            // Download the page assigned to this worker.
            DownLoadFile downLoader = new DownLoadFile();
            downLoader.downloadFile(visitUrl);
        } catch (Exception e) {
            // Include the exception itself: the original message dropped the
            // cause entirely, making failures undiagnosable.
            System.out.println("Exception from " + threadName + ".run: " + e);
        }
    }
}
/**
 * Dispatcher thread: drains the shared URL queue and starts one CThread
 * per URL. NOTE(review): this spawns an unbounded number of threads — a
 * fixed-size thread pool (ExecutorService) would bound resource use; kept
 * as-is to preserve the original design.
 */
class BThread extends Thread {
    public BThread() {
        super("[BThread] Thread");
    }

    public void run() {
        String threadName = Thread.currentThread().getName();
        System.out.println(threadName + " start.");
        try {
            // Exit when the queue is empty; each URL gets its own worker.
            while (!LinkQueue.unVisitedUrlsEmpty()) {
                String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();
                if (visitUrl == null) {
                    continue; // defensive: skip spurious null entries
                }
                new CThread(visitUrl).start();
            }
        } catch (Exception e) {
            // Include the cause instead of swallowing it silently.
            System.out.println("Exception from " + threadName + ".run: " + e);
        }
    }
}
/** Watcher thread: waits for the dispatcher (BThread) and logs its end. */
class AThread extends Thread {
    BThread bt;

    /**
     * @param bt the dispatcher thread to wait for
     */
    public AThread(BThread bt) {
        super("[AThread] Thread");
        this.bt = bt;
    }

    public void run() {
        String threadName = Thread.currentThread().getName();
        System.out.println(threadName + " start.");
        try {
            bt.join();
            System.out.println(threadName + " end.");
        } catch (InterruptedException e) {
            // Re-interrupt: never swallow an InterruptedException.
            Thread.currentThread().interrupt();
            System.out.println("Exception from " + threadName + ".run: " + e);
        }
    }
}
这样可以把我们需要的网页下载下来。下一步需要对下载的网页进行清洗去噪等工作。
public class FileUtil {
    private static String filePath = "temp"; // directory holding the downloaded HTML files

    /**
     * List all files directly under the given directory.
     *
     * @param filePath directory to list
     * @return the files, or an empty array if the directory is missing
     *         (listFiles() returns null in that case, which would NPE callers)
     */
    public static File[] getAllFiles(String filePath) {
        File root = new File(filePath);
        File[] files = root.listFiles();
        return files != null ? files : new File[0];
    }

    /**
     * Read a whole text file into a string, keeping line breaks as '\n'.
     *
     * @param fileName file to read
     * @param encode   character encoding of the file (e.g. "GBK")
     * @return file content, or "" on any error
     */
    public static String openFile(File fileName, String encode) {
        BufferedReader bis = null;
        try {
            bis = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), encode));
            // StringBuilder avoids the O(n^2) cost of += on String in a loop.
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = bis.readLine()) != null) {
                content.append(line).append("\n");
            }
            return content.toString();
        } catch (Exception e) {
            return ""; // best-effort: callers treat unreadable files as empty
        } finally {
            // Close in finally so the reader is not leaked on exceptions.
            if (bis != null) {
                try {
                    bis.close();
                } catch (IOException ignored) {
                    // nothing sensible to do on close failure
                }
            }
        }
    }

    /**
     * Extract the title, patient info, description and doctor replies from a
     * downloaded consultation page and format them into '|'-separated lines.
     *
     * @param file HTML file (GBK-encoded) to parse
     * @return one formatted line per doctor reply
     * @throws ParserException if htmlparser fails on the content
     */
    public static String getContent(File file) throws ParserException {
        // Date-time pattern without a separator: spaces are stripped from the
        // text before matching, so "2015-01-0112:00:00" is the expected form.
        String eL1 = "[0-9]{4}-[0-9]{2}-[0-9]{2}[0-9]{2}:[0-9]{2}:[0-9]{2}";
        String eL2 = "[0-9]{1,2}岁"; // age like "23岁"
        NodeFilter titleFilter = new HasAttributeFilter("class", "fl dib fb");
        NodeFilter infoFilter = new HasAttributeFilter("class", "f12 graydeep Userinfo clearfix pl29");
        NodeFilter describeFilter = new HasAttributeFilter("class", "graydeep User_quecol pt10 mt10"); // 病人自己的描述与想获得的帮助
        NodeFilter answerFilter = new HasAttributeFilter("class", "Doc_dochf mb15 bc"); // 普通回复
        NodeFilter adoptFilter = new HasAttributeFilter("class", "Doc_dochf Best_dochf bc"); // 被患者采纳的回复
        // One parser per filter: extractAllNodesThatMatch consumes the input,
        // so the same parser cannot be reused for a second filter.
        Parser parser1 = new Parser();
        Parser parser2 = new Parser();
        Parser parser3 = new Parser();
        Parser parser5 = new Parser();
        Parser parser6 = new Parser();
        Pattern p1 = Pattern.compile(eL1);
        Pattern p2 = Pattern.compile(eL2);
        String fileContent = FileUtil.openFile(file, "GBK");
        parser1.setInputHTML(fileContent);
        parser2.setInputHTML(fileContent);
        parser3.setInputHTML(fileContent);
        parser5.setInputHTML(fileContent);
        parser6.setInputHTML(fileContent);
        // Collect matches in a fixed order: title, info, description, replies.
        NodeList nodes = new NodeList();
        nodes.add(parser1.extractAllNodesThatMatch(titleFilter));
        nodes.add(parser2.extractAllNodesThatMatch(infoFilter));
        nodes.add(parser3.extractAllNodesThatMatch(describeFilter));
        nodes.add(parser5.extractAllNodesThatMatch(answerFilter));
        nodes.add(parser6.extractAllNodesThatMatch(adoptFilter));
        StringBuffer textLine = new StringBuffer();   // shared prefix (title/info/description)
        StringBuffer splitLine = new StringBuffer();  // one output line per doctor reply
        String date = "";
        HtmlParser.totalFileNum++; // global record counter, used as the record id
        for (int j = 0; j < nodes.size(); j++) {
            Node textNode = (Node) nodes.elementAt(j);
            if (j == 0) {
                // Record id + title.
                textLine.append(HtmlParser.totalFileNum + "|" + textNode.toPlainTextString() + "|");
            } else if (j == 1) { // 获取一部分:病人信息
                NodeList infoList = new NodeList();
                infoList = textNode.getChildren();
                int nodeNeed = 0;
                // The child list contains many whitespace-only nodes; keep the
                // first 4 non-empty ones.
                for (int m = 0; m < infoList.size(); m++) {
                    Node tmp = (Node) infoList.elementAt(m);
                    String textTmp = tmp.toPlainTextString();
                    if (nodeNeed == 4)
                        break;
                    String trimTextTmp = textTmp.replace("\n", "").replaceAll("\r", "").replaceAll(" ", "");
                    if (trimTextTmp.length() != 0) {
                        Matcher matcher = p1.matcher(trimTextTmp);
                        Matcher matcher2 = p2.matcher(trimTextTmp);
                        if (matcher2.matches()) { // normalize age: drop the "岁" suffix
                            trimTextTmp = trimTextTmp.replaceFirst("岁", "");
                        }
                        if (matcher.matches()) { // a date-time field: remember it, don't append
                            date = textTmp.replace("\n", "").replaceAll("\r", "");
                        } else {
                            textLine.append(trimTextTmp + "|");
                        }
                        nodeNeed++;
                    }
                }
            } else if (j == 2) { // 病情描述,与想获得的帮助
                textLine.append("健康咨询描述:" + textNode.toPlainTextString().replaceAll("\n", "") + "|null|" + date + "|");
            } else if (j >= 3) { // 医生诊断,可能有好几个
                // Children 1 and 3 hold the reply body and advice; the even
                // indices are whitespace nodes. One output line per reply.
                NodeList docAns = new NodeList();
                docAns = textNode.getChildren();
                splitLine.append(textLine.toString() + "医生" + j + "|null|"
                        + docAns.elementAt(1).toPlainTextString().trim().replaceAll("\n", "") + "|"
                        + docAns.elementAt(3).toPlainTextString().trim().replaceAll("\n", "") + "|\n");
            }
        }
        return splitLine.toString();
    }

    /**
     * Parse every downloaded file and append the extracted records, UTF-8
     * encoded, to data\data_xywy.txt.
     *
     * @throws ParserException if htmlparser fails on a file
     */
    public static void writeContent() throws ParserException {
        File[] files = FileUtil.getAllFiles(filePath);
        FileOutputStream out = null;
        try {
            String path = "data\\data_xywy.txt";
            File dataFile = new File(path);
            if (!dataFile.exists())
                dataFile.createNewFile();
            out = new FileOutputStream(dataFile, true); // true = append mode
            for (File file : files) {
                String content = FileUtil.getContent(file);
                if (content == null)
                    break; // defensive: getContent currently never returns null
                System.out.println(HtmlParser.totalFileNum);
                // Convert to UTF-8 explicitly; the source pages are GBK.
                out.write(content.getBytes("utf-8"));
            }
        } catch (IOException ex) {
            // printStackTrace prints the full trace; the original
            // println(ex.getStackTrace()) only printed an array reference.
            ex.printStackTrace();
        } finally {
            // Close in finally so the stream is not leaked on exceptions.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException ignored) {
                    // nothing sensible to do on close failure
                }
            }
        }
    }
}
源码:http://download.csdn.net/detail/zbuger/9173757