Java WebCollector爬虫采集数据

最新推荐文章于 2021-02-13 15:28:08 发布

yunshouhu

最新推荐文章于 2021-02-13 15:28:08 发布

阅读量8.5k

点赞数 1

分类专栏： java

本文链接：https://blog.csdn.net/earbao/article/details/48678281

版权

java 专栏收录该内容

309 篇文章 5 订阅

订阅专栏

package junit.test;


import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.util.RegexRule;

import org.apache.commons.io.IOUtils;
import org.jsoup.nodes.Document;

/**
 * Crawler that saves the HTML of every visited page into a local directory,
 * using the page title as the file name, and follows the links that match
 * {@link #regexRule}.
 *
 * <p>Saved files are named {@code <title>.html}; pages that share a title
 * overwrite each other.
 */
public class BaikeCrawler extends DeepCrawler {

    /** Directory that receives the saved HTML files. */
    private final String htmlPath;

    /*
     * In 2.x the traversal is defined by the user (it is still breadth-first
     * in essence, but the URLs produced from each page, i.e. the children of
     * each node of the traversal tree, are chosen by the user).
     *
     * In 1.x every link of a page that satisfied the regex constraints was
     * treated as a URL to crawl, which made it possible to crawl within a
     * given scope (for example a whole site).
     *
     * So in 2.x, extracting the URLs of a page that satisfy the regex and
     * returning them as Links reproduces the 1.x BreadthCrawler behaviour.
     */
    RegexRule regexRule = new RegexRule();

    /**
     * Creates the crawler and makes sure the output directory exists.
     *
     * @param crawlPath folder in which the crawler keeps its crawl information
     * @param htmlPath  directory in which the downloaded pages are saved
     */
    public BaikeCrawler(String crawlPath, String htmlPath) {
        super(crawlPath);

        this.htmlPath = htmlPath;
        File outputDir = new File(htmlPath);
        // Report a directory that cannot be created now instead of failing
        // later with one I/O error per page.
        if (!outputDir.exists() && !outputDir.mkdirs()) {
            System.err.println("Cannot create output directory: " + htmlPath);
        }

        regexRule.addRule("http://.*baidu.com/.*");
        // NOTE(review): the leading '-' presumably marks an exclusion rule
        // (skip URLs containing "jpg") - confirm against RegexRule.
        regexRule.addRule("-.*jpg.*");
    }

    /**
     * Writes the page's HTML to {@code htmlPath/<title>.html} and returns the
     * links of the page that match {@link #regexRule}.
     *
     * <p>An I/O failure while saving is reported on stderr and does not stop
     * link extraction.
     *
     * @param page the page that was just fetched
     * @return the links to crawl in the next round
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        Document doc = page.getDoc();
        String title = doc.title();

        // The title comes from the remote page, so it must be cleaned before
        // it is used as a file name.
        String outputpath = htmlPath + File.separator + toSafeFileName(title) + ".html";

        System.out.println(outputpath);
        try {
            FileWriter writer = new FileWriter(new File(outputpath));
            try {
                IOUtils.copy(new StringReader(doc.toString()), writer);
            } finally {
                // Closing flushes the writer's buffer; without it the saved
                // file can be truncated and the file handle leaks.
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        /* The following is new in version 2.0. */
        /* Extract the links of the page and return them; they are crawled in
         the next round. There is no need to worry about duplicate URLs, the
         crawler filters them out automatically. */
        Links nextLinks = new Links();

        /* Only URLs that satisfy the regex constraints are wanted;
         Links.addAllFromDocument provides exactly that. */
        nextLinks.addAllFromDocument(doc, regexRule);

        /* Links extends ArrayList<String>, so add/addAll can be used to add
         URLs by hand. If the current page has no link worth crawling, null
         may be returned, for example when the task is only to crawl the
         links of the seed list. */
        return nextLinks;
    }

    /**
     * Turns a page title into a name that is valid as a single file name.
     *
     * <p>Path separators, the characters Windows forbids in file names and
     * control characters are replaced by {@code _}; an empty result becomes
     * {@code "untitled"}.
     */
    private static String toSafeFileName(String title) {
        if (title == null) {
            return "untitled";
        }
        String cleaned = title.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        return cleaned.length() == 0 ? "untitled" : cleaned;
    }

    public static void main(String[] args) throws Exception {
        /* The first constructor argument is the crawlPath: the crawler's
           crawl information is stored in that folder. Use a different
           crawlPath for each crawler. */
        BaikeCrawler crawler = new BaikeCrawler("D:\\java\\crawer","D:\\java\\htmlPath");
        crawler.setThreads(50);
        crawler.addSeed("http://baike.baidu.com/lishi");
        /* Whether to resume a previous crawl. */
        crawler.setResumable(false);

        // NOTE(review): the argument is presumably the crawl depth - confirm
        // against DeepCrawler.start.
        crawler.start(3);
    }

}

/*
 * Copyright (C) 2014 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.example.util.JDBCHelper;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;
import cn.edu.hfut.dmic.webcollector.net.Proxys;
import cn.edu.hfut.dmic.webcollector.net.RandomProxyGenerator;
import cn.edu.hfut.dmic.webcollector.util.RegexRule;
import java.net.Proxy;
import org.jsoup.nodes.Document;
import org.springframework.jdbc.core.JdbcTemplate;

/**
 * Tutorial for WebCollector 2.x.
 *
 * <p>Features of the 2.x line:
 * <ol>
 *   <li>user-defined traversal strategy, which allows more complex crawls
 *       such as pagination and AJAX</li>
 *   <li>built-in Berkeley DB for URL management, able to handle a larger
 *       number of pages</li>
 *   <li>selenium integration, to extract information generated by
 *       javascript</li>
 *   <li>direct support for random switching between several proxies</li>
 *   <li>spring jdbc and mysql connection integration, for easy data
 *       persistence</li>
 *   <li>integrated json parser</li>
 *   <li>slf4j as the logging facade</li>
 *   <li>reworked http request interface, making custom http requests
 *       easier</li>
 * </ol>
 *
 * <p>Examples (demos) can be found in the
 * cn.edu.hfut.dmic.webcollector.example package.
 *
 * <p>This crawler prints the URL and title of every visited page, stores the
 * page in the MySQL table {@code tb_content} when a database connection could
 * be set up, and follows the links that match {@link #regexRule}.
 *
 * @author hu
 */
public class TutorialCrawler extends DeepCrawler {

    /*
     * In 2.x the traversal is defined by the user (it is still breadth-first
     * in essence, but the URLs produced from each page, i.e. the children of
     * each node of the traversal tree, are chosen by the user).
     *
     * In 1.x every link of a page that satisfied the regex constraints was
     * treated as a URL to crawl, which made it possible to crawl within a
     * given scope (for example a whole site).
     *
     * So in 2.x, extracting the URLs of a page that satisfy the regex and
     * returning them as Links reproduces the 1.x BreadthCrawler behaviour.
     */
    RegexRule regexRule = new RegexRule();

    /** Database access object; {@code null} when MySQL could not be set up. */
    JdbcTemplate jdbcTemplate = null;

    /**
     * Creates the crawler, registers the URL rules and tries to prepare the
     * MySQL table used for persistence. A database failure is reported on
     * stdout and disables persistence; the crawl itself still runs.
     *
     * @param crawlPath folder in which the crawler keeps its crawl information
     */
    public TutorialCrawler(String crawlPath) {
        super(crawlPath);

        regexRule.addRule("http://.*zhihu.com/.*");
        // NOTE(review): the leading '-' presumably marks an exclusion rule
        // (skip URLs containing "jpg") - confirm against RegexRule.
        regexRule.addRule("-.*jpg.*");

        /* Create a JdbcTemplate object. "mysql1" is a user-chosen name; the
         object can later be fetched with JDBCHelper.getJdbcTemplate("mysql1").
         The arguments are: name, connection URL, user name, password, initial
         number of connections, maximum number of connections.

         The JdbcTemplate object manages the connection pool itself, so the
         crawler threads can share one JdbcTemplate object (each thread gets
         the same object through JDBCHelper.getJdbcTemplate("name")).
         */

        try {
            jdbcTemplate = JDBCHelper.createMysqlTemplate("mysql1",
                    "jdbc:mysql://localhost/testdb?useUnicode=true&characterEncoding=utf8",
                    "root", "password", 5, 30);

            /* Create the table. */
            jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS tb_content ("
                    + "id int(11) NOT NULL AUTO_INCREMENT,"
                    + "title varchar(50),url varchar(200),html longtext,"
                    + "PRIMARY KEY (id)"
                    + ") ENGINE=MyISAM DEFAULT CHARSET=utf8;");
            System.out.println("成功创建数据表 tb_content");
        } catch (Exception ex) {
            jdbcTemplate = null;
            System.out.println("mysql未开启或JDBCHelper.createMysqlTemplate中参数配置不正确!");
            // Show the actual cause; the generic hint above is not enough to
            // tell a stopped server from wrong credentials or a missing driver.
            System.out.println("cause: " + ex);
        }
    }

    /**
     * Prints the URL and title of the page, stores the page in MySQL when a
     * connection is available, and returns the links of the page that match
     * {@link #regexRule}.
     *
     * <p>A failed insert is reported on stdout and does not stop link
     * extraction.
     *
     * @param page the page that was just fetched
     * @return the links to crawl in the next round
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        Document doc = page.getDoc();
        String title = doc.title();
        System.out.println("URL:" + page.getUrl() + "  标题:" + title);

        /* Insert the data into mysql. */
        if (jdbcTemplate != null) {
            try {
                int updates=jdbcTemplate.update("insert into tb_content (title,url,html) value(?,?,?)",
                        title, page.getUrl(), page.getHtml());
                if(updates==1){
                    System.out.println("mysql插入成功");
                }
            } catch (RuntimeException ex) {
                // JdbcTemplate reports failures as unchecked exceptions (for
                // example a title longer than the varchar(50) column). Losing
                // one row must not lose the outgoing links of the page.
                System.out.println("mysql insert failed for " + page.getUrl() + ": " + ex);
            }
        }

        /* The following is new in version 2.0. */
        /* Extract the links of the page and return them; they are crawled in
         the next round. There is no need to worry about duplicate URLs, the
         crawler filters them out automatically. */
        Links nextLinks = new Links();

        /* Only URLs that satisfy the regex constraints are wanted;
         Links.addAllFromDocument provides exactly that. */
        nextLinks.addAllFromDocument(doc, regexRule);

        /* Links extends ArrayList<String>, so add/addAll can be used to add
         URLs by hand. If the current page has no link worth crawling, null
         may be returned, for example when the task is only to crawl the
         links of the seed list. */
        return nextLinks;
    }

    public static void main(String[] args) throws Exception {
        /* The constructor argument is the crawlPath: the crawler's crawl
           information is stored in that folder. Use a different crawlPath
           for each crawler. */
        TutorialCrawler crawler = new TutorialCrawler("/home/hu/data/wb");
        crawler.setThreads(50);
        crawler.addSeed("http://www.zhihu.com/");


        /*
        //The requester is the plugin that sends the http requests; its methods can be used to configure an http/socks proxy
        HttpRequesterImpl requester=(HttpRequesterImpl) crawler.getHttpRequester();    
       
        //single proxy
        requester.setProxy("127.0.0.1", 1080,Proxy.Type.SOCKS);
        
        //several proxies, picked at random
        RandomProxyGenerator proxyGenerator=new RandomProxyGenerator();
        proxyGenerator.addProxy("127.0.0.1",8080,Proxy.Type.SOCKS);
        requester.setProxyGenerator(proxyGenerator);
        */


        /* Whether to resume a previous crawl. */
        crawler.setResumable(false);

        // NOTE(review): the argument is presumably the crawl depth - confirm
        // against DeepCrawler.start.
        crawler.start(5);
    }

}

<dependencies>
        <!-- Maven dependencies for the crawler examples. -->
        <!-- WebCollector: provides the cn.edu.hfut.dmic.webcollector classes. -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.12</version>
        </dependency>
        <!-- Selenium: presumably for extracting javascript-generated content; not referenced by the example classes. -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>2.44.0</version>
        </dependency>
        <!-- MySQL JDBC driver, needed for the jdbc:mysql:// connection URL. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.31</version>
        </dependency>
        <!-- Spring JDBC: provides org.springframework.jdbc.core.JdbcTemplate. -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.0.5.RELEASE</version>
        </dependency>

        <!-- Commons DBCP connection pool; presumably used by JDBCHelper - TODO confirm. -->
        <dependency>
            <groupId>commons-dbcp</groupId>
            <artifactId>commons-dbcp</artifactId>
            <version>1.4</version>
        </dependency>
    </dependencies>