crawler4j爬虫技术获取a标签的title及链接

最近的项目用到了爬虫技术,这里主要说明下crawler4j技术的获取对应的标签的链接及title的实现:

首先是抓取类:
1、必须继承于WebCrawler,实现shouldVisit和visit两个方法。
2、使用一个入口类进行采集任务,并进行处理。

第一点:


import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.lowagie.text.pdf.codec.Base64.InputStream;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Data-collection crawler page handler.
 * <p>
 * {@code shouldVisit}: decides whether a discovered link qualifies for crawling.
 * {@code visit}: processes the content of a crawled page and stores it locally.
 *
 * @author liuzc
 * @date 2015-2-12
 */
public class GdhhCrawler extends WebCrawler {
    private static final Logger logger = LoggerFactory.getLogger(GdhhCrawler.class);

    /**
     * Filters out links to binary/static resources by file extension.
     */
    private static final Pattern FILTERS = Pattern
            .compile(".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf"
                    + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    private GdhhCrawlController myController = null;
    private DaJob daJob = null;            // crawl job configuration
    private Pattern keyWordPattern = null; // regex built from the crawl keyword, compiled lazily once
    private GdhhCrawlResult crawResult = null;

    public GdhhCrawler() {
        crawResult = new GdhhCrawlResult();
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {

        // Lazily fetch the controller and its crawl job on first use.
        if (myController == null) {
            myController = (GdhhCrawlController) this.getMyController();
            daJob = myController.getDaJob();
        }

        String href = url.getURL().toLowerCase(); // link URL
        String anchor = url.getAnchor();          // anchor text of the link

        logger.info("href: {}", href);
        logger.info("anchor: {}", anchor);

        // No crawl keyword configured: fall back to the default extension filter.
        // Checking for null first fixes an NPE the original code had when
        // getCrawlKeyWord() returned null (it called .equals("") on it directly).
        String keyWord = (daJob == null) ? null : daJob.getCrawlKeyWord();
        if (keyWord == null || keyWord.equals("")) {
            return !FILTERS.matcher(href).matches();
        }

        // Seed pages themselves are not recorded.
        if (daJob.isSeedUrl(href) /*&& daJob.getFlgStoreSeedPage().equals("N")*/) {
            return false;
        }

        // A crawl keyword is configured but the link has no anchor text: skip it.
        if (NullUtils.isNull(anchor)) {
            return false;
        }

        boolean isMatches;
        if ("Y".equals(daJob.getIsGroup())) {
            // Keyword-group mode: apply the AND/OR rule set.
            isMatches = getIsMatches(anchor, daJob.getRegPattern());
        } else {
            // Single-keyword mode: compile the pattern once and reuse it.
            if (keyWordPattern == null) {
                keyWordPattern = Pattern.compile("(" + daJob.getCrawlKeyWord() + ")");
            }
            isMatches = keyWordPattern.matcher(anchor).find();
        }

        return !FILTERS.matcher(href).matches() && isMatches;
    }

    /**
     * Tests whether a keyword group matches the page anchor text.
     * <p>
     * {@code regPattern} is expected to carry three parallel lists:
     * "relatedOfKeyWord" (relation between keywords: "A" = AND, "O" = OR),
     * "strategy" (per-keyword rule: "0" = must NOT contain, otherwise must contain)
     * and "keyWords" (the keywords themselves).
     * author: lhail
     *
     * @param anchor     anchor text to test
     * @param regPattern keyword rule set as described above
     * @return true when the anchor satisfies the configured rules
     */
    public boolean getIsMatches(String anchor, Map<String, List<String>> regPattern) {
        List<String> relations = regPattern.get("relatedOfKeyWord"); // relation between keywords
        List<String> strategies = regPattern.get("strategy");        // per-keyword strategy
        List<String> keyWords = regPattern.get("keyWords");          // keywords

        // Unknown or missing relation: no match.
        if (NullUtils.isEmpty(relations)) {
            return false;
        }

        if ("A".equals(relations.get(0))) {
            // AND relation: every keyword rule must hold.
            for (int i = 0; i < strategies.size(); i++) {
                if (!matchesKeyword(anchor, strategies.get(i), keyWords.get(i))) {
                    return false;
                }
            }
            return true;
        }

        if ("O".equals(relations.get(0))) {
            // OR relation: at least one keyword rule must hold.
            for (int i = 0; i < strategies.size(); i++) {
                if (matchesKeyword(anchor, strategies.get(i), keyWords.get(i))) {
                    return true;
                }
            }
            return false;
        }

        return false;
    }

    /**
     * Applies a single keyword rule: strategy "0" means the anchor must NOT
     * contain the keyword; any other strategy means it must contain it.
     */
    private boolean matchesKeyword(String anchor, String strategy, String keyWord) {
        boolean found = Pattern.compile("(" + keyWord + ")").matcher(anchor).find();
        return "0".equals(strategy) ? !found : found;
    }

    /**
     * Processes a crawled page: records its metadata in the crawl result and
     * saves the raw HTML to a local .xml file.
     *
     * @param page the page crawler4j fetched and parsed
     */
    @Override
    public void visit(Page page) {
        try {

            // Lazily fetch the controller and its crawl job on first use.
            if (myController == null || daJob == null) {
                myController = (GdhhCrawlController) this.getMyController();
                daJob = myController.getDaJob();
            }

            WebURL webUrl = page.getWebURL();
            String linkUrl = webUrl.getURL();
            /*
             * The DB column backing linkUrl is size-limited, so URLs of 1000+
             * characters are skipped rather than truncated.
             * @author lhlong
             */
            if (linkUrl.length() < 1000) {

                logger.info("Visited: {}", linkUrl);

                // Round "now" down to whole seconds by formatting and re-parsing.
                SimpleDateFormat dateFormat = new SimpleDateFormat(
                        "yyyy/MM/dd HH:mm:ss");
                Date today = dateFormat.parse(dateFormat.format(new Date()));

                DaResult daResult = new DaResult();

                if (!NullUtils.isNull(daResult.getLinkTitle())) {
                    daResult.setUrlname(webUrl.getDomain());
                }

                daResult.setLinkUrl(linkUrl); // link URL
                daResult.setCreateTime(today);

                crawResult.incProcessedPages();

                if (page.getParseData() instanceof HtmlParseData) {
                    HtmlParseData parseData = (HtmlParseData) page
                            .getParseData();
                    Set<WebURL> links = parseData.getOutgoingUrls();
                    crawResult.incTotalLinks(links.size());

                    // Save the page content to a local .xml file.
                    String content = parseData.getHtml();       // page HTML
                    String htmlTitle = parseData.getTitle();    // document title
                    String tmpTagTitle = modifyHtml(htmlTitle, webUrl.getAnchor());

                    if (!NullUtils.isNull(tmpTagTitle)) {
                        daResult.setLinkTitle(tmpTagTitle);     // link title from document title
                    } else {
                        daResult.setLinkTitle(webUrl.getAnchor()); // fall back to anchor text
                    }

                    // File name: the link address stripped of prefix, suffix
                    // and separators (computed by DaResult).
                    String pageName = daResult.getPageName();
                    String filePath = daJob.getCrawlStorageFolder()
                            + File.separator + pageName + ".xml";
                    logger.info("pageName: {}", pageName);
                    logger.info("filePath: {}", filePath);
                    // try-with-resources closes the stream even when opening
                    // fails (the original finally block could NPE there), and
                    // UTF-8 is pinned instead of the platform-default charset.
                    try (FileOutputStream out = new FileOutputStream(new File(filePath))) {
                        out.write(content.getBytes(StandardCharsets.UTF_8));
                        out.flush();
                    } catch (IOException e) {
                        logger.error("Failed to write page file: {}", filePath, e);
                    }
                }

                crawResult.addDaResult(daResult);
            }
        } catch (Exception e) {
            logger.error("Error while visiting page", e);
        }
    }

    /**
     * Derives a short title from the document title: when the link has anchor
     * text, the document title is split on "-" and its first segment is used.
     * Returns an empty string when no title can be derived.
     *
     * @param title     document title (may be null)
     * @param anchorTmp anchor text of the link (may be null)
     * @return the derived title, or "" when unavailable
     * @author lhlong
     * @date 2015-12-17
     */
    public String modifyHtml(String title, String anchorTmp) {

        String tmpTagTitle = "";
        // Null guard on title fixes an NPE the original had: it called
        // title.split("-") before any check.
        if (!NullUtils.isNull(anchorTmp) && title != null) {
            String[] aTitle = title.split("-");
            if (!NullUtils.isNull(aTitle[0])) {
                tmpTagTitle = aTitle[0];
            }
        }

        return tmpTagTitle;
    }

    /**
     * Called by the controller to collect this crawler's local data when the
     * job is finished.
     */
    @Override
    public Object getMyLocalData() {
        return crawResult;
    }

    /**
     * Called by the controller before finishing the job.
     */
    @Override
    public void onBeforeExit() {
        dumpMyData();
    }

    /** Logs this crawler's aggregate statistics. */
    public void dumpMyData() {
        int id = getMyId();
        logger.info("Crawler {} > Processed Pages: {}", id,
                crawResult.getTotalProcessedPages());
        logger.info("Crawler {} > Total Links Found: {}", id,
                crawResult.getTotalLinks());
        logger.info("Crawler {} > Total Text Size: {}", id,
                crawResult.getTotalTextSize());
    }
}

刚才的这一部分
String content = parseData.getHtml();// 链接内容
String htmlTitle = parseData.getTitle();//链接的title

就是获取当前页面的title属性的方法。

使用这个方法就能获取到a标签里的title属性,而不是网页上所截取的锚文本(anchor)。anchor是"网页上显示什么就展示什么"的属性,而title属性往往包含更完整的标题。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值