实现功能:优化解析代码
当 XPath 路径经常需要改变时,可以通过两种方法减少工作量:
一是使用配置文件,将 XPath 路径都保存在配置文件中;
二是将路径保存在数据库中,使用时再读取。
此外,将大量重复的代码抽取成不同的方法放到工具包中,
以减少代码的重复。
CSDNProcessService.java
package work.spider.service.impl;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import work.spider.entity.Page;
import work.spider.service.IProcessService;
import work.spider.util.HtmlUtil;
import work.spider.util.LoadPropertyUtil;
import work.spider.util.RegexUtil;
/**
 * {@link IProcessService} implementation that parses a CSDN page.
 *
 * @author lwr
 * created 2020-03-13
 */
public class CSDNProcessService implements IProcessService {

    /**
     * Cleans the page's HTML content and fills in the total read count, the
     * comment count and the publish time. Each field is located by an XPath
     * and narrowed by a regular expression, both looked up by key through
     * {@link LoadPropertyUtil#getCSDN(String)}.
     *
     * @param page the page whose content is parsed and whose fields are set
     */
    public void process(Page page) {
        String html = page.getContent();
        TagNode root = new HtmlCleaner().clean(html);

        // Total read count.
        String readPath = LoadPropertyUtil.getCSDN("readNumberPath");
        String readRegex = LoadPropertyUtil.getCSDN("allnumberRegex");
        String readCount = HtmlUtil.getFieldByRegex(root, readPath, readRegex);
        page.setAllnumber(readCount);

        // Comment count.
        String commentPath = LoadPropertyUtil.getCSDN("commentNumberPath");
        String commentRegex = LoadPropertyUtil.getCSDN("commentnumberRegex");
        String commentCount = HtmlUtil.getFieldByRegex(root, commentPath, commentRegex);
        page.setCommentNuber(commentCount);

        // Publish time.
        String timePath = LoadPropertyUtil.getCSDN("publishTimePath");
        String timeRegex = LoadPropertyUtil.getCSDN("publishTime");
        String publishedAt = HtmlUtil.getFieldByRegex(root, timePath, timeRegex);
        page.setPublishtime(publishedAt);
    }
}
HtmlUtil.java
package work.spider.util;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
/*
 * Page-parsing helpers.
 * @author lwr
 * created 2020-03-14
 */
public class HtmlUtil {

    /**
     * Evaluates an XPath against a cleaned HTML tree and extracts a value from
     * the first hit with a regular expression.
     *
     * @param rootnode root of the cleaned document to search
     * @param xpath    XPath expression selecting the node that holds the value
     * @param regex    regular expression applied to the selected text; the
     *                 whole match (group 0) is returned
     * @return whatever {@code RegexUtil.getPageInfoByRegex} returns for the
     *         first hit's text, or an empty string when the XPath selects
     *         nothing or cannot be evaluated
     */
    public static String getFieldByRegex(TagNode rootnode, String xpath, String regex) {
        String result = "";
        try {
            Object[] hits = rootnode.evaluateXPath(xpath);
            if (hits != null && hits.length > 0 && hits[0] != null) {
                Object first = hits[0];
                // The XPath comes from configuration, so the first hit is not
                // guaranteed to be an element: expressions selecting an
                // attribute or text() yield plain character data. Casting
                // blindly to TagNode would throw ClassCastException for those.
                String text;
                if (first instanceof TagNode) {
                    text = ((TagNode) first).getText().toString();
                } else {
                    text = first.toString();
                }
                result = RegexUtil.getPageInfoByRegex(text, regex, 0);
            }
        } catch (XPatherException e) {
            // Best effort: a malformed XPath leaves the field empty rather
            // than aborting the whole page.
            e.printStackTrace();
        }
        return result;
    }
}
读取properties文件的工具类
package work.spider.util;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
/*
 *
 * Utility for reading values from configuration (.properties) files.
 * @author lwer
 * created 2020-03-14
 */
public class LoadPropertyUtil {

    /**
     * Looks up a key in the CSDN configuration file.
     *
     * <p>The file is re-read on every call, so edits to it are picked up
     * without a restart. The reader is closed before returning.
     *
     * @param key property name to look up
     * @return the property value, or an empty string when the key is absent or
     *         the file cannot be read
     */
    public static String getCSDN(String key) {
        String value = "";
        Properties properties = new Properties();
        // try-with-resources closes the reader (and the underlying file handle)
        // on every path; previously it was never closed.
        try (BufferedReader bufferedReader = new BufferedReader(
                new FileReader("D:\\JAVA\\workplace\\spider\\src\\main\\resources\\csdn.properties"))) {
            properties.load(bufferedReader);
            // Default to "" so a missing key behaves like an unreadable file
            // instead of handing null to callers.
            value = properties.getProperty(key, "");
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers
            // both the missing-file and the read-failure case.
            e.printStackTrace();
        }
        return value;
    }
}
正则表达式匹配工具类
package work.spider.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/*
 * Regular-expression matching helper.
 * @author lwr
 * created 2020-03-14
 */
public class RegexUtil { // regular-expression matching

    /**
     * Finds the first occurrence of {@code regex} in {@code content} and
     * returns the requested capture group. The pattern is compiled with
     * {@link Pattern#DOTALL}.
     *
     * @param content text to search
     * @param regex   regular expression to look for
     * @param groupNo capture group to return; 0 is the whole match
     * @return the text of the requested group from the first match, or the
     *         literal string {@code "fail to match"} when nothing matches
     */
    public static String getPageInfoByRegex(String content, String regex, int groupNo) {
        Matcher firstHit = Pattern.compile(regex, Pattern.DOTALL).matcher(content);
        return firstHit.find() ? firstHit.group(groupNo) : "fail to match";
    }
}
csdn.properties 文件
# Regex applied to the text of the node selected by readNumberPath (total read count).
# NOTE(review): '|' has the lowest precedence, so this parses as "^([1-9][0-9]*)" OR "0$",
# not as "^([1-9][0-9]*|0)$" - confirm which one is intended.
allnumberRegex=^([1-9][0-9]*)|0$
# Regex applied to the text of the node selected by publishTimePath (publish timestamp).
# This key has no "Regex" suffix, unlike the other two patterns; CSDNProcessService looks it up
# as "publishTime". The unescaped '.' between the date and time parts matches any single character.
publishTime=[0-9]*-[0-9]*-[0-9]*.[0-9]*:[0-9]*:[0-9]*
# Regex applied to the text of the node selected by commentNumberPath (comment count).
# The properties loader turns "\\" into one backslash, so the pattern seen by Java is [\d]+.
commentnumberRegex=[\\d]+
# XPath expressions locating each field in the cleaned page.
# The properties loader drops the backslash in \" so each value contains plain double quotes.
readNumberPath=//*[@id=\"mainBox\"]/main/div[2]/div[1]/div/p[3]/span/span
commentNumberPath=//*[@id=\"mainBox\"]/main/div[2]/div[1]/div/p[5]/span/span
publishTimePath=//*[@id=\"mainBox\"]/main/div[2]/div[1]/div/p[1]/span