spider爬取腾讯网:娱乐版块内容

1.准备工作(依赖包):redis和mysql请自行准备,此案例涉及到爬取数据的保存和去重(通过redis)

 <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.6</version>
         </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>

        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.6</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>

        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.12</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.25</version>
        </dependency>

        <!-- SLF4J-to-log4j binding. It must be on the runtime classpath (not only the test
             classpath): App is started from main code, and without a binding SLF4J falls
             back to a no-op logger so log4j.properties would have no effect. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>

        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>3.0.1</version>
        </dependency>

2.配置文件

    2.1.db.properties

driver=com.mysql.jdbc.Driver
url=jdbc:mysql:///spider?useUnicode=true&characterEncoding=UTF-8
username=root
password=admin

    2.2.log4j.properties

# Global logging configuration
log4j.rootLogger=ERROR, stdout
# MyBatis logging configuration...
log4j.logger.com.test.spider=TRACE
# Console output...
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n

   2.3.mybatis.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- Load the JDBC settings (driver, url, username, password) from db.properties -->
    <properties resource="db.properties"/>
    <environments default="development">
        <environment id="development">
            <!-- Transaction manager: plain JDBC commit/rollback -->
            <transactionManager type="JDBC"/>
            <!-- Data source: MyBatis built-in connection pool -->
            <dataSource type="POOLED">
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>

    <mappers>
        <!-- Must point at the mapper file that actually exists in this project
             (NewsMapper.xml, namespace com.test.spider.mapper.NewsMapper); otherwise
             session.getMapper(NewsMapper.class) cannot be resolved. -->
        <mapper resource="com/test/spider/mapper/NewsMapper.xml"/>
    </mappers>
</configuration>

3.实体类News:com.test.spider.bean;

package com.test.spider.bean;

import java.util.Date;

/**
 * Plain data holder for one crawled news article.
 *
 * <p>All properties are nullable and exposed through conventional getters and setters.
 */
public class News {
    private Integer id;
    private String title;
    private String intro;
    private String source;
    private String vurl;
    private Date publishTime;

    /** @return the identifier of this article, or {@code null} if none has been set */
    public Integer getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the headline */
    public String getTitle() {
        return this.title;
    }

    /** @param title the headline to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the introduction text */
    public String getIntro() {
        return this.intro;
    }

    /** @param intro the introduction text to store */
    public void setIntro(String intro) {
        this.intro = intro;
    }

    /** @return the name of the publishing source */
    public String getSource() {
        return this.source;
    }

    /** @param source the name of the publishing source to store */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the article URL */
    public String getVurl() {
        return this.vurl;
    }

    /** @param vurl the article URL to store */
    public void setVurl(String vurl) {
        this.vurl = vurl;
    }

    /** @return the publication time */
    public Date getPublishTime() {
        return this.publishTime;
    }

    /** @param publishTime the publication time to store */
    public void setPublishTime(Date publishTime) {
        this.publishTime = publishTime;
    }

    /**
     * Renders every property in the form {@code News{id=..., title='...', ...}}.
     * Unset properties are printed as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("News{");
        text.append("id=").append(id);
        text.append(", title='").append(title).append('\'');
        text.append(", intro='").append(intro).append('\'');
        text.append(", source='").append(source).append('\'');
        text.append(", vurl='").append(vurl).append('\'');
        text.append(", publishTime=").append(publishTime);
        return text.append('}').toString();
    }
}

4.NewsMapper接口:com.test.spider.mapper;

package com.test.spider.mapper;

import com.test.spider.bean.News;
import org.apache.ibatis.annotations.Param;
import java.util.List;

/**
 * MyBatis mapper for {@link News} rows; its statements are defined in the XML mapper
 * with namespace {@code com.test.spider.mapper.NewsMapper}.
 */
public interface NewsMapper {
    /**
     * Executes the {@code batchSave} insert statement for the given articles.
     *
     * @param list the articles to insert; exposed to the XML statement under the name
     *             {@code news}
     * @return the result MyBatis reports for the insert (presumably the number of
     *         affected rows; confirm against the MyBatis documentation)
     */
    Integer batchSave(@Param("news") List<News> list);
}

5.NewsMapper.xml配置文件:com.test.spider.mapper;

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.test.spider.mapper.NewsMapper">
    <!-- Inserts all given articles with a single multi-row INSERT statement.
         The collection name "news" matches @Param("news") on NewsMapper.batchSave.
         The list must not be empty: with no elements the foreach emits nothing
         after VALUES and the resulting SQL is incomplete. -->
    <insert id="batchSave">
        insert into news(title,intro,source,vurl,publish_time)
        VALUES
        <foreach collection="news" separator="," item="item">
            (#{item.title},#{item.intro},#{item.source},#{item.vurl},#{item.publishTime})
        </foreach>
    </insert>
</mapper>

6.MybatisUtil工具类:com.test.spider.util;

package com.test.spider.util;

import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;

import java.io.IOException;
import java.io.InputStream;

/**
 * Builds one {@link SqlSessionFactory} from the classpath resource {@code mybatis.xml}
 * when the class is loaded and opens sessions from it.
 */
public class MybatisUtil {

    private static final SqlSessionFactory factory;

    static {
        String resource = "mybatis.xml";
        // Fail fast with the real cause if the configuration cannot be read. Previously
        // the IOException was only printed and a null stream was handed to the builder,
        // which hid the actual problem behind a secondary error.
        try (InputStream in = Resources.getResourceAsStream(resource)) {
            factory = new SqlSessionFactoryBuilder().build(in);
        } catch (IOException e) {
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Opens a new session from the shared factory.
     *
     * @return a new {@link SqlSession}; the caller is responsible for committing
     *         and closing it
     */
    public static SqlSession getSession() {
        return factory.openSession();
    }
}

7.EntertainmentSpider主要功能:com.test.spider;

package com.test.spider;

import com.alibaba.fastjson.JSON;
import com.sun.org.apache.bcel.internal.generic.NEW;
import com.test.spider.bean.News;
import com.test.spider.mapper.NewsMapper;
import com.test.spider.util.MybatisUtil;
import jdk.nashorn.internal.scripts.JD;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.ibatis.session.SqlSession;
import org.junit.Test;
import redis.clients.jedis.Jedis;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Crawls the Tencent (qq.com) entertainment feed page by page, drops articles whose URL
 * has already been recorded in a Redis set, and stores the remaining ones through
 * {@link NewsMapper}.
 *
 * Created by thinkpad on 2019/9/18.
 */
public class EntertainmentSpider {

    /** Redis set that records every article URL already handed to the crawler. */
    private static final String SEEN_URL_KEY = "bigdata:0701:spider:news:url";

    /** Stop crawling after this many non-200 responses in a row instead of looping forever. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Runs one full crawl: requests page 0, 1, 2, ... until the feed reports no more data
     * (or the server keeps failing), saving the new articles of each page.
     *
     * @throws Exception if a request, the response parsing or the persistence step fails
     */
    public void main() throws Exception {
        int page = 0;
        int consecutiveFailures = 0;
        // One client for the whole crawl; try-with-resources releases it and every
        // response even when a page fails half-way.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            while (true) {
                // 1. Build the URL of the current page.
                String indexUrl = "https://pacaio.match.qq.com/irs/rcd?cid=146&token=49cbb2154853ef1a74ff4e53723372ce&ext=ent&page=" + page + "&callback=__jp7";
                // 2. Send the request.
                HttpGet get = new HttpGet(indexUrl);
                try (CloseableHttpResponse response = httpClient.execute(get)) {
                    if (response.getStatusLine().getStatusCode() == 200) {
                        consecutiveFailures = 0;
                        HttpEntity entity = response.getEntity();
                        String res = EntityUtils.toString(entity, "utf-8");
                        // 3. Parse: strip the callback wrapper, then read the JSON object.
                        res = toJsonString(res);
                        Map<String, Object> map = jsonToMap(res);
                        Object dataNum = map.get("datanum");
                        Object data = map.get("data");
                        // No (more) data on this page: the crawl is finished.
                        if (dataNum == null || data == null
                                || Integer.parseInt(dataNum.toString()) == 0) {
                            break;
                        }
                        @SuppressWarnings({"unchecked", "rawtypes"})
                        List<News> list = mapToBean((List<Map>) data);
                        // 4. Persist the articles that have not been seen before.
                        if (!list.isEmpty()) {
                            saveNews(list);
                        }
                        System.out.println(page);
                    } else if (++consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                        System.err.println("Stopping crawl: " + consecutiveFailures
                                + " consecutive non-200 responses, last at page " + page);
                        break;
                    }
                }
                page++;
            }
        }
        System.out.println("执行结束....");
    }

    /**
     * Cuts the JSON object out of a response such as {@code callback({...})}.
     *
     * @param src the raw response body
     * @return the text from the first <code>{</code> to the last <code>}</code>, inclusive
     * @throws IllegalArgumentException if {@code src} contains no such span
     */
    public String toJsonString(String src) {
        int start = src.indexOf("{");
        int end = src.lastIndexOf("}") + 1;
        if (start < 0 || end <= start) {
            throw new IllegalArgumentException("Response does not contain a JSON object: " + src);
        }
        return src.substring(start, end);
    }

    /**
     * Parses a JSON object into a map.
     *
     * @param src JSON text of a single object
     * @return the parsed key/value pairs
     */
    public Map<String, Object> jsonToMap(String src) {
        return JSON.parseObject(src, HashMap.class);
    }

    /**
     * Converts raw feed entries into {@link News} objects and keeps only those whose URL
     * has not been recorded before.
     *
     * <p>All entries are converted first and only then checked against Redis, so a
     * malformed entry cannot leave URLs marked as seen without their articles being
     * returned. Entries without a {@code vurl} are skipped because they cannot be
     * de-duplicated.
     *
     * @param src entries with the keys {@code vurl}, {@code title}, {@code intro},
     *            {@code source} and {@code publish_time}
     * @return the new articles, in feed order
     * @throws ParseException if a {@code publish_time} is not in
     *                        {@code yyyy-MM-dd HH:mm:ss} format
     */
    public List<News> mapToBean(List<Map> src) throws ParseException {
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        List<News> candidates = new ArrayList<News>();
        for (Map map : src) {
            String url = asString(map.get("vurl"));
            if (url == null) {
                continue;
            }
            News news = new News();
            news.setTitle(asString(map.get("title")));
            news.setIntro(asString(map.get("intro")));
            news.setSource(asString(map.get("source")));
            news.setVurl(url);
            String publishTime = asString(map.get("publish_time"));
            if (publishTime != null) {
                news.setPublishTime(format.parse(publishTime));
            }
            candidates.add(news);
        }

        // NOTE(review): a URL is recorded here, before the database insert. If the insert
        // fails afterwards, the article is skipped on later runs.
        List<News> list = new ArrayList<News>();
        for (News news : candidates) {
            if (checkUrl(news.getVurl())) {
                list.add(news);
            }
        }
        return list;
    }

    /**
     * Inserts the given articles in one batch and commits.
     *
     * @param list the articles to store; must not be empty
     * @return the value returned by {@link NewsMapper#batchSave}
     */
    public int saveNews(List<News> list) {
        // The session is closed even if the insert or the commit throws.
        try (SqlSession session = MybatisUtil.getSession()) {
            NewsMapper mapper = session.getMapper(NewsMapper.class);
            Integer res = mapper.batchSave(list);
            session.commit();
            return res;
        }
    }

    /**
     * Records the URL in the Redis set of seen URLs.
     *
     * @param url the article URL
     * @return {@code true} if the URL was newly added, {@code false} if it was already
     *         in the set
     */
    public Boolean checkUrl(String url) {
        // The connection is closed even if the command throws.
        try (Jedis jedis = new Jedis("localhost", 6379)) {
            Long count = jedis.sadd(SEEN_URL_KEY, url);
            return count > 0;
        }
    }

    /** Returns {@code value.toString()}, or {@code null} when the value is absent. */
    private static String asString(Object value) {
        return value == null ? null : value.toString();
    }
}

8.App(启动类,定时运行爬虫):com.test.spider;

package com.test.spider;

import com.test.spider.EntertainmentSpider;

import java.util.Timer;
import java.util.TimerTask;

/**
 * Entry point: runs {@link EntertainmentSpider#main()} repeatedly on a {@link Timer}.
 */
public class App {

    /** Period passed to the timer, in milliseconds (20 minutes). */
    private static final long CRAWL_PERIOD_MILLIS = 1200000L;

    /**
     * Schedules the crawl to start immediately and then repeat with the configured period.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final EntertainmentSpider crawler = new EntertainmentSpider();

        TimerTask crawlTask = new TimerTask() {
            @Override
            public void run() {
                // Catch everything the crawl throws so that one failed run does not
                // cancel the following ones.
                try {
                    crawler.main();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };

        Timer scheduler = new Timer();
        scheduler.schedule(crawlTask, 0, CRAWL_PERIOD_MILLIS);
    }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值