SPRING BOOT+WEBMAGIC

    最近想自学 Hadoop,但缺少可用的文本数据,所以需要先爬取一些数据。因为不会写 Python,就直接找了一个 Java 爬虫框架 WebMagic。
    WebMagic 的原理图如下,结构简单,也很容易上手:

(图:WebMagic 架构原理图,原文图片缺失)

POM.xml

    <!-- MyBatis + MySQL persistence: start -->
        <!-- MyBatis integration starter for Spring Boot.
             NOTE(review): ${mybatis-version} is a property placeholder; it must be
             defined in this POM's <properties> section (not shown here). -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis-version}</version>
        </dependency>

        <!-- MySQL JDBC driver. No <version> is declared here;
             presumably it is managed by the parent POM / dependencyManagement - confirm. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <!-- MyBatis + MySQL persistence: end -->

        <!-- WebMagic crawler framework: start -->
        <!-- Core module. Both WebMagic artifacts share ${webMagic-version}
             so that they stay on the same release. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webMagic-version}</version>
        </dependency>

        <!-- Extension module of WebMagic. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webMagic-version}</version>
        </dependency>
        <!-- WebMagic crawler framework: end -->

GlobeFishWebMagicApplication.java

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.support.SpringBootServletInitializer;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Bootstrap class of the crawler application.
 *
 * <p>It can be started in two ways: directly through {@link #main(String[])},
 * or by a servlet container through
 * {@link #configure(SpringApplicationBuilder)}. Scheduling support is switched
 * on here so that {@code @Scheduled} methods in the application are executed.
 */
@EnableScheduling
@SpringBootApplication
public class GlobeFishWebMagicApplication extends SpringBootServletInitializer {

    /**
     * Starts the application as a stand-alone process.
     *
     * @param args command-line arguments, handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(GlobeFishWebMagicApplication.class, args);
    }

    /**
     * Registers this class as the configuration source for a servlet-container
     * (WAR) deployment.
     *
     * @param builder the builder supplied by the servlet initializer
     * @return the builder with this class added as a source
     */
    @Override
    protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
        final SpringApplicationBuilder withSources = builder.sources(GlobeFishWebMagicApplication.class);
        return withSources;
    }
}

CSDNProcessor.java

import java.util.Date;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicInteger;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import com.panchen.globeFishWebMagic.entity.CSDNMessage;
import com.panchen.globeFishWebMagic.mapper.CSDNMessageMapper;
import com.panchen.globeFishWebMagic.util.SpringContextUtil;
import com.panchen.globeFishWebMagic.util.UUIDUtil;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Crawler for CSDN blog articles.
 *
 * <p>The class plays two roles:
 * <ul>
 *   <li>As a Spring bean it is the WebMagic {@link PageProcessor}: it extracts
 *       article data from article pages, stores it through
 *       {@link CSDNMessageMapper}, and queues follow-up requests found on
 *       listing pages. The scheduled entry point {@link #scheduled()} also
 *       lives on the bean.</li>
 *   <li>Instances created with {@code new} inside {@link #scheduled()} are
 *       plain worker threads, one per blog category. They are not managed by
 *       Spring (their mapper is not injected), so each worker looks the bean
 *       up via {@link SpringContextUtil} and uses it as the page processor of
 *       its own {@link Spider}.</li>
 * </ul>
 *
 * <p>Because the single bean serves every spider concurrently,
 * {@link #process(Page)} keeps no per-crawl state in instance fields; the only
 * shared mutable state is the pair of atomic statistics counters.
 *
 * @author pc
 */
@Component
public class CSDNProcessor extends Thread implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(CSDNProcessor.class);

    /**
     * Blog categories to crawl; each entry is both the URL path segment and the
     * task name of one worker thread.
     */
    private static final String[] CATEGORIES = {
            "mobile",     // mobile development
            "web",        // web front-end
            "software",   // R&D management
            "enterprise", // architecture design
            "code",       // programming languages
            "www",        // internet
            "database",   // databases
            "cloud",      // cloud
            "other"       // everything else
    };

    /** URL of an article detail page. */
    private static final String ARTICLE_URL_REGEX = "http://blog\\.csdn\\.net/(.*)/article/details/(.*)";

    /** URL of any page of a category's "new articles" listing. */
    private static final String LIST_URL_REGEX = "http://blog\\.csdn\\.net/(.*)/newarticle\\.html(.*)";

    /** URL of the first listing page, i.e. one without a query string. */
    private static final String FIRST_LIST_URL_REGEX = "http://blog\\.csdn\\.net/(.*)/newarticle\\.html";

    /** Links to article pages found on a listing page. */
    private static final String ARTICLE_LINKS_XPATH =
            "//*[@class=\"blog_list clearfix\"]/dd/*[@class=\"tracking-ad\"]/a/@href";

    /** Number of article records stored during the current scheduled run. */
    private static final AtomicInteger urlCount = new AtomicInteger(0);

    /** Number of listing pages processed during the current scheduled run. */
    private static final AtomicInteger pageCount = new AtomicInteger(0);

    @Autowired
    private CSDNMessageMapper csdnMessageMapper;

    private String originalUrl;

    private String taskName;

    /**
     * Latch this worker counts down when its crawl ends; {@code null} for
     * instances that are not part of a scheduled run (e.g. the Spring bean).
     */
    private final CountDownLatch cdl;

    // Crawl configuration shared by every spider that uses this processor.
    private final Site site = Site.me().setSleepTime(1000).setRetryTimes(30).setCharset("utf-8").setTimeOut(300000)
            .setUserAgent(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /** Creates an instance without a completion latch (used by Spring). */
    public CSDNProcessor() {
        this(null);
    }

    /**
     * Creates a worker that reports completion through the given latch.
     *
     * @param cdl latch to count down when {@link #run()} finishes; may be
     *            {@code null}
     */
    public CSDNProcessor(CountDownLatch cdl) {
        this.cdl = cdl;
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page.
     *
     * <p>Article pages are parsed and stored. Listing pages contribute the
     * next listing page and every article link they contain as new requests.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String currentUrl = page.getUrl().get();

        if (page.getUrl().regex(ARTICLE_URL_REGEX).match()) {
            storeArticle(page);
        }

        List<String> urls = page.getHtml().xpath(ARTICLE_LINKS_XPATH).all();
        boolean hasArticleLinks = null != urls && !urls.isEmpty();

        // Pagination.
        if (currentUrl.matches(LIST_URL_REGEX)) {
            pageCount.getAndIncrement();
            // A listing page without article links is treated as the end of the
            // listing; requesting further pages would keep the spider (and the
            // scheduled run waiting for it) alive indefinitely.
            if (hasArticleLinks) {
                String nextPageUrl = nextListPageUrl(currentUrl);
                if (null != nextPageUrl) {
                    page.addTargetRequest(nextPageUrl);
                }
            }
        }

        if (hasArticleLinks) {
            for (String url : urls) {
                // Drop a previously stored record so the re-crawled article replaces it.
                if (null != csdnMessageMapper.getMessageByUrl(url)) {
                    csdnMessageMapper.deleteCSDNMessageByUrl(url);
                }
                page.addTargetRequest(url);
            }
        }
    }

    /** Extracts the article fields from an article page and stores them. */
    private void storeArticle(Page page) {
        CSDNMessage newCSDNMessage = new CSDNMessage(UUIDUtil.getUUID(), page.getUrl().get(),
                page.getHtml().xpath("//*[@id=\"blog_userface\"]/span/a/text()").get(),
                page.getHtml().xpath("//*[@class=\"article_title\"]/h1/span/text()").get(),
                page.getHtml().xpath("//*[@id=\"article_content\"]").get(),
                page.getHtml().xpath("//*[@class=\"link_postdate\"]/text()").get(), new Date(), 1, null, null,
                page.getHtml().xpath("//*[@id=\"btnDigg\"]/dd/text()").get(),
                page.getHtml().xpath("//*[@id=\"btnBury\"]/dd/text()").get(),
                page.getHtml().xpath("//*[@class=\"link_view\"]/text()").get(),
                page.getHtml().xpath("//*[@class=\"link_comments\"]/text()").get(),
                page.getHtml().xpath("//*[@class=\"category_r\"]/label/span/text()").get());
        csdnMessageMapper.addCSDNMessage(newCSDNMessage);
        urlCount.getAndIncrement();
    }

    /**
     * Builds the URL of the listing page that follows {@code currentUrl}.
     *
     * <p>The page number is taken from the URL itself rather than from a shared
     * counter, so concurrently running category spiders cannot disturb each
     * other's pagination.
     *
     * @return the next page's URL, or {@code null} if the current page number
     *         cannot be determined
     */
    private static String nextListPageUrl(String currentUrl) {
        if (currentUrl.matches(FIRST_LIST_URL_REGEX)) {
            return currentUrl + "?&page=2";
        }
        int valueStart = currentUrl.lastIndexOf('=') + 1;
        if (valueStart <= 0) {
            return null;
        }
        try {
            int currentPage = Integer.parseInt(currentUrl.substring(valueStart).trim());
            return currentUrl.substring(0, valueStart) + (currentPage + 1);
        } catch (NumberFormatException e) {
            logger.warn("Cannot determine page number of {}", currentUrl, e);
            return null;
        }
    }

    /**
     * Crawls {@link #getOriginalUrl()} with a single-threaded spider and blocks
     * until that spider stops. The completion latch, if any, is always counted
     * down, even when the crawl fails, so a waiting scheduler cannot hang.
     */
    @Override
    public void run() {
        logger.info("{}START!!!!!", taskName);
        long startTime = System.currentTimeMillis();
        try {
            // This worker was created with "new", so Spring did not inject its
            // mapper; the Spring-managed bean is used as the page processor instead.
            Spider.create(SpringContextUtil.getBeanByClass(CSDNProcessor.class)).addUrl(originalUrl).thread(1)
                    .run();
            long endTime = System.currentTimeMillis();
            logger.info("{}END!!!!!,耗时约{}秒,抓取了{}页、{}条记录", taskName, (endTime - startTime) / 1000,
                    pageCount.get(), urlCount.get());
        } finally {
            if (null != cdl) {
                cdl.countDown();
            }
        }
    }

    /**
     * Crawls all categories in parallel, one worker thread per category, and
     * waits until every worker has finished. Triggered daily at 16:46:00.
     */
    @Scheduled(cron = "0 46 16 ? * *")
    public void scheduled() {
        logger.info("START!!!!!");
        long startTime = System.currentTimeMillis();

        // Statistics are per run.
        urlCount.set(0);
        pageCount.set(0);

        // A CountDownLatch cannot be reset, so every run needs its own.
        CountDownLatch runLatch = new CountDownLatch(CATEGORIES.length);
        CSDNProcessor[] workers = new CSDNProcessor[CATEGORIES.length];
        for (int i = 0; i < CATEGORIES.length; i++) {
            CSDNProcessor worker = new CSDNProcessor(runLatch);
            worker.setOriginalUrl("http://blog.csdn.net/" + CATEGORIES[i] + "/newarticle.html");
            worker.setTaskName(CATEGORIES[i]);
            workers[i] = worker;
        }

        // Start the workers.
        for (CSDNProcessor worker : workers) {
            worker.start();
        }

        // Wait for all workers.
        try {
            runLatch.await();
        } catch (InterruptedException e) {
            logger.warn("Interrupted while waiting for crawl workers", e);
            // Preserve the interrupt status for the caller.
            Thread.currentThread().interrupt();
        }

        long endTime = System.currentTimeMillis();
        logger.info("END!!!!!,总耗时约{}秒,抓取了{}页、{}条记录", (endTime - startTime) / 1000, pageCount.get(),
                urlCount.get());
    }

    /** @return the URL this worker starts crawling from */
    public String getOriginalUrl() {
        return originalUrl;
    }

    /** @param originalUrl the URL this worker starts crawling from */
    public void setOriginalUrl(String originalUrl) {
        this.originalUrl = originalUrl;
    }

    /** @return the name used to label this worker's log messages */
    public String getTaskName() {
        return taskName;
    }

    /** @param taskName the name used to label this worker's log messages */
    public void setTaskName(String taskName) {
        this.taskName = taskName;
    }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值