A recent project involved some web crawling. This post walks through how to obtain the link and title of each anchor tag with crawler4j:
There are two parts to the crawler side:
1. A crawler class that extends WebCrawler and implements the shouldVisit and visit methods.
2. An entry class that launches the crawl job and processes the results (a minimal sketch of it follows the crawler class below).
Point 1, the crawler class:
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
/**
 * Data collection - crawler page handling.
 * shouldVisit: decides whether a link qualifies for crawling.
 * visit: reads the content of a fetched page.
 *
 * @author liuzc
 * @date 2015-2-12
 */
public class GdhhCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(GdhhCrawler.class);

    /**
     * Filters out binary and static resource URLs
     */
    private static final Pattern FILTERS = Pattern
            .compile(".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf"
                    + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    private GdhhCrawlController myController = null;
    private DaJob daJob = null;// the crawl job
    private Pattern keyWordPattern = null;// regex built from the crawl keywords
    private GdhhCrawlResult crawResult = null;

    public GdhhCrawler() {
        crawResult = new GdhhCrawlResult();
    }
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // lazily fetch the CrawlController and its DaJob
        if (myController == null) {
            myController = (GdhhCrawlController) this.getMyController();
            daJob = myController.getDaJob();
        }
        String href = url.getURL().toLowerCase();// the link
        String anchor = url.getAnchor();// the anchor text shown on the linking page
        logger.info("href: {}", href);
        logger.info("anchor: {}", anchor);
        // no crawl keyword configured: apply only the default filter
        if (daJob == null || NullUtils.isNull(daJob.getCrawlKeyWord())) {
            return !FILTERS.matcher(href).matches();
        }
        // skip seed pages themselves when they are configured not to be recorded
        if (daJob.isSeedUrl(href) /*&& daJob.getFlgStoreSeedPage().equals("N")*/) {
            return false;
        }
        // keywords are configured but the anchor text is empty: skip
        if (NullUtils.isNull(anchor))
            return false;
        boolean isMatches;
        if ("Y".equals(daJob.getIsGroup())) {
            // grouped keywords: apply the AND/OR strategy below
            isMatches = getIsMatches(anchor, daJob.getRegPattern());
        } else {
            // compile once; crawlKeyWord is used verbatim, so "a|b" matches either keyword
            if (keyWordPattern == null)
                keyWordPattern = Pattern.compile("(" + daJob.getCrawlKeyWord() + ")");
            isMatches = keyWordPattern.matcher(anchor).find();
        }
        return !FILTERS.matcher(href).matches() && isMatches;
    }
    /**
     * Whether the keyword group matches the anchor text of a page.
     * author: lhail
     * @param anchor the anchor text to test
     * @param regPattern the keyword-group definition
     * @return true if the anchor satisfies the group strategy
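     * Example (hypothetical values): relatedOfKeyWord = ["A"], strategy = ["1", "0"],
     * keyWords = ["crawler", "ads"] means the anchor must contain "crawler" AND must
     * not contain "ads"; with relatedOfKeyWord = ["O"], either condition alone suffices.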
     */
    public boolean getIsMatches(String anchor, Map<String, List<String>> regPattern) {
        List<String> listA = regPattern.get("relatedOfKeyWord");// relation between keywords: "A" = AND, "O" = OR
        List<String> listB = regPattern.get("strategy");// per-keyword strategy: "0" = must not contain, else must contain
        List<String> listC = regPattern.get("keyWords");// the keywords themselves
        boolean isMatches = true;
        boolean flag;
        if (!NullUtils.isEmpty(listA) && "A".equals(listA.get(0))) {// AND relation
            for (int i = 0; i < listB.size(); i++) {
                if ("0".equals(listB.get(i))) {// must not contain
                    flag = !Pattern.compile("(" + listC.get(i) + ")").matcher(anchor).find();
                } else {// must contain
                    flag = Pattern.compile("(" + listC.get(i) + ")").matcher(anchor).find();
                }
                if (!flag) {// one failed condition fails the whole AND group
                    isMatches = false;
                    break;
                }
            }
        } else if (!NullUtils.isEmpty(listA) && "O".equals(listA.get(0))) {// OR relation
            isMatches = false;
            for (int i = 0; i < listB.size(); i++) {
                if ("0".equals(listB.get(i))) {// must not contain
                    flag = !Pattern.compile("(" + listC.get(i) + ")").matcher(anchor).find();
                } else {// must contain
                    flag = Pattern.compile("(" + listC.get(i) + ")").matcher(anchor).find();
                }
                if (flag) {// one satisfied condition satisfies the whole OR group
                    isMatches = true;
                    break;
                }
            }
        } else {
            isMatches = false;
        }
        return isMatches;
    }
    @Override
    public void visit(Page page) {
        try {
            // lazily fetch the CrawlController and its DaJob
            if (myController == null || daJob == null) {
                myController = (GdhhCrawlController) this.getMyController();
                daJob = myController.getDaJob();
            }
            WebURL webUrl = page.getWebURL();
            String linkUrl = webUrl.getURL();
            /*
             * linkUrl may well exceed the corresponding database column, so cap
             * its length before it is stored in the database.
             * @author lhlong
             */
            if (linkUrl.length() < 1000) {
                logger.info("Visited: {}", linkUrl);
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
                // current time, rounded down to whole seconds
                Date today = dateFormat.parse(dateFormat.format(new Date()));
                DaResult daResult = new DaResult();
                daResult.setUrlname(webUrl.getDomain());// site domain
                daResult.setLinkUrl(linkUrl);// link URL
                daResult.setCreateTime(today);
                crawResult.incProcessedPages();
                if (page.getParseData() instanceof HtmlParseData) {
                    HtmlParseData parseData = (HtmlParseData) page.getParseData();
                    Set<WebURL> links = parseData.getOutgoingUrls();
                    crawResult.incTotalLinks(links.size());
                    // save the downloaded page content locally as an .xml file
                    String content = parseData.getHtml();// page content
                    String htmlTitle = parseData.getTitle();// <title> of the fetched page
                    String tmpTagTitle = modifyHtml(htmlTitle, webUrl.getAnchor());
                    if (!NullUtils.isNull(tmpTagTitle)) {
                        daResult.setLinkTitle(tmpTagTitle);// link title from <title>
                    } else {
                        daResult.setLinkTitle(webUrl.getAnchor());// fall back to the anchor text
                    }
                    // file name: the link address stripped of prefix, suffix and separators
                    String pageName = daResult.getPageName();
                    String filePath = daJob.getCrawlStorageFolder()
                            + File.separator + pageName + ".xml";
                    logger.info("pageName: {}", pageName);
                    // logger.info("content: {}", content);
                    logger.info("filePath: {}", filePath);
                    FileOutputStream out = null;
                    try {
                        out = new FileOutputStream(new File(filePath));
                        out.write(content.getBytes());
                        out.flush();
                    } catch (IOException e) {
                        logger.error("failed to write {}", filePath, e);
                    } finally {
                        if (out != null) {// guard against the constructor having failed
                            try {
                                out.close();
                            } catch (IOException e) {
                                // ignore close failure
                            }
                        }
                    }
                }
                crawResult.addDaResult(daResult);
            }
        } catch (Exception e) {
            logger.error("visit failed", e);
        }
    }
    /**
     * Resolves the full title for a link: takes the first "-"-separated segment of the
     * page's <title>. (A jsoup-based lookup of the <a> tag's title attribute is kept
     * below, commented out, as an alternative.)
     * @param title the <title> of the fetched page
     * @param anchorTmp the anchor text from the linking page
     * @return the resolved title, or "" if none was found
     * @author lhlong
     * @date 2015-12-17
     */
    public String modifyHtml(String title, String anchorTmp) {
        // <title> is often "headline - site name"; keep the part before the first "-"
        String[] aTitle = title.split("-");
        String tmpTagTitle = "";
        if (!NullUtils.isNull(anchorTmp)) {
            // the anchor text is present, so try to resolve the full title
            if (!NullUtils.isNull(aTitle[0])) {
                tmpTagTitle = aTitle[0];
            }
        }
        /*
        // alternative: re-parse the page with jsoup and read the <a> tag's title attribute
        try {
            for (WebURL webURL : urls) {
                URL url = new URL(webURL.getURL());
                Document document = Jsoup.parse(url, 10000);
                Elements elements = document.getElementsByTag("a");
                for (Element element : elements) {
                    if (!NullUtils.isNull(element.text())) {
                        // the link text is present
                        if (element.text().equals(anchorTmp)) {
                            if (!NullUtils.isNull(element.attr("title"))) {
                                // the title attribute is present
                                tmpTagTitle = element.attr("title");
                            }
                        }
                    }
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        */
        return tmpTagTitle;
    }
    /**
     * Called by the controller to collect this crawler's local data once the job is finished.
     */
    @Override
    public Object getMyLocalData() {
        return crawResult;
    }
    /**
     * Called by the controller before finishing the job. Put any clean-up or
     * final reporting you need here.
     */
    @Override
    public void onBeforeExit() {
        dumpMyData();
    }
    public void dumpMyData() {
        int id = getMyId();
        // You can configure the log to output to a file
        logger.info("Crawler {} > Processed Pages: {}", id,
                crawResult.getTotalProcessedPages());
        logger.info("Crawler {} > Total Links Found: {}", id,
                crawResult.getTotalLinks());
        logger.info("Crawler {} > Total Text Size: {}", id,
                crawResult.getTotalTextSize());
    }
}
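Point 2 above is the entry class. The project's GdhhCrawlController (which extends crawler4j's CrawlController and exposes getDaJob()) is not shown in this post, so the launcher below is only a minimal sketch built on crawler4j's stock API. It assumes GdhhCrawlController keeps the parent (config, pageFetcher, robotstxtServer) constructor; the storage folder, seed URL, and thread count are placeholders.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class GdhhCrawlStarter {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/data/crawl/root");// placeholder path

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        // assumption: GdhhCrawlController keeps CrawlController's constructor; how the
        // DaJob is attached is project-specific and not shown here. Without one,
        // shouldVisit() above simply falls back to the default binary-file filter.
        GdhhCrawlController controller = new GdhhCrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("http://www.example.com/");// placeholder seed

        // blocks until the crawl finishes; each crawler's getMyLocalData()
        // (a GdhhCrawlResult) is then available via controller.getCrawlersLocalData()
        controller.start(GdhhCrawler.class, 3);
    }
}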
The part of visit() shown a moment ago,
String content = parseData.getHtml();// page content
String htmlTitle = parseData.getTitle();// <title> of the fetched page
is how the title of the current page is obtained. With it you get the full title (the text a site usually also puts in the <a> tag's title attribute) instead of the anchor, which is the possibly truncated link text shown on the referring page: the anchor simply displays whatever the page displays.
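To make the difference concrete, here is a small hypothetical example (the URL, anchor text, and titles are all made up):

// Suppose a list page contains:
//   <a href="http://example.com/news/1.html" title="Full headline of the story">Full headline of the st...</a>
// and the target page's <head> contains:
//   <title>Full headline of the story - Example News</title>
// Then, for that link:
webUrl.getAnchor();     // "Full headline of the st..." (the truncated display text)
parseData.getTitle();   // "Full headline of the story - Example News"
modifyHtml(htmlTitle, anchor); // the part before the first "-": "Full headline of the story"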