java+jsoup实现简单的爬虫:简单爬取百度实时热点

架构:Maven + MyBatis + MySQL + Mapper + Jsoup

先上整体架子 

数据库表设计

 

 下面就开始上代码了

Day01_BaiduNewsCrawler

package edu.xawl.main;

import edu.xawl.mapper.BaiduNewsMapper;
import edu.xawl.po.BaiduNews;
import edu.xawl.utils.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the Baidu top-buzz ranking pages with Jsoup and stores every ranked
 * keyword through {@link BaiduNewsMapper}.
 *
 * <p>The start page is stored under the type "实时热点"; every further category
 * linked from the {@code #flist} list is fetched and stored under the link's
 * {@code title} attribute.
 */
public class Day01_BaiduNewsCrawler {

    /** Site root prepended to the relative category links found on the start page. */
    private static final String BASE_URL = "http://top.baidu.com";

    /**
     * Fetches the start page and all linked category pages and inserts their rows.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or a database operation fails
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources guarantees the session is closed even when a page
        // fetch or an insert throws; commit is only reached when every page
        // has been processed.
        try (SqlSession sqlsessionLocal = MybatisHelper.getSqlsessionLocal()) {
            // Mapper for the table the crawled rows are written to
            BaiduNewsMapper baiduNewsMapper = sqlsessionLocal.getMapper(BaiduNewsMapper.class);
            // Page to crawl first
            String url = BASE_URL + "/buzz?b=1";
            Document document = Jsoup.connect(url).get();
            getElmentAndInsert(document, baiduNewsMapper, "实时热点");

            // Category list on the start page
            Elements lis = document.select("#flist li");
            // Starts at index 2: the first two entries are skipped
            // (presumably not separate category links -- TODO confirm).
            for (int i = 2; i < lis.size(); i++) {
                Elements anchor = lis.get(i).select("a");
                String title = anchor.attr("title");
                String relativeHref = anchor.attr("href");
                if (relativeHref.isEmpty()) {
                    // No link in this entry; substring(1) below would throw on "".
                    continue;
                }
                // Drop the first character of the relative href before prefixing the site root
                String href = BASE_URL + relativeHref.substring(1);
                Document categoryPage = Jsoup.connect(href).get();
                getElmentAndInsert(categoryPage, baiduNewsMapper, title);
            }
            sqlsessionLocal.commit();
        }
    }

    /**
     * Extracts the ranking rows of one page and inserts one {@link BaiduNews} per row.
     *
     * <p>Rows whose {@code .last} cell is not a plain integer are skipped.
     *
     * @param document        parsed ranking page
     * @param baiduNewsMapper mapper used to insert the rows
     * @param type            value stored in the {@code type} property of every inserted row
     */
    public static void getElmentAndInsert(Document document, BaiduNewsMapper baiduNewsMapper, String type) {
        Elements trs = document.select("#main > div.mainBody > div > table tr");
        for (Element tr : trs) {
            String keyword = tr.select(".list-title").text();
            String clazz = tr.select(".tc").text();
            String tempNum = tr.select(".last").text();
            int num;
            try {
                num = Integer.parseInt(tempNum);
            } catch (NumberFormatException ignored) {
                // Not a data row (no numeric value in the .last cell); skip it.
                continue;
            }
            // Copy the scraped values into the persistent object
            BaiduNews baiduNews = new BaiduNews();
            baiduNews.setKeyword(keyword);
            baiduNews.setType(type);
            baiduNews.setClazz(clazz);
            baiduNews.setSerchNm(num);
            baiduNewsMapper.insert(baiduNews);
        }
    }

}

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the crawler project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>edu.xawl</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.20</version>
        </dependency>

        <!-- MyBatis core -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.6</version>
        </dependency>

        <!-- tk.mybatis generic Mapper (base interface extended by the project's mappers) -->
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper</artifactId>
            <version>4.0.4</version>
        </dependency>

        <!-- jsoup, used to fetch and parse the crawled pages -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!-- Lombok: generates the getters/setters of the entity classes -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.2</version>
        </dependency>
    </dependencies>
</project>

BaiduNewsMapper

通用 Mapper(tk.mybatis 的 Mapper<T> 接口)中已经实现了单表的增删改查,我们直接继承就可以使用

package edu.xawl.mapper;

import edu.xawl.po.BaiduNews;
import tk.mybatis.mapper.common.Mapper;

/**
 * MyBatis mapper for {@link BaiduNews}.
 *
 * <p>Declares no methods of its own; everything it offers is inherited from
 * the tk.mybatis generic {@link Mapper} interface.
 */
public interface BaiduNewsMapper extends Mapper<BaiduNews> {
}

BaiduNews   po对象

package edu.xawl.po;

import lombok.Data;

import javax.persistence.Column;
import javax.persistence.Id;
import javax.persistence.Table;

/**
 * Persistent object for one row of the {@code baidu_news} table.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}.
 */
@Data
@Table(name = "baidu_news")
public class BaiduNews {
    /**
     * Primary key. Marked with {@code @Id} so the generic mapper's
     * {@code *ByPrimaryKey} operations key on this column only instead of
     * treating every field as part of a composite key.
     */
    @Id
    private Integer id;
    /** Ranked keyword text scraped from the page. */
    private String keyword;
    /** Name of the ranking category the keyword was found under. */
    private String type;
    /** Text of the row's {@code .tc} cell. */
    private String clazz;
    /**
     * Numeric value of the row's {@code .last} cell, stored in column {@code search_num}.
     * The misspelled field name is kept because callers use the generated
     * {@code setSerchNm}/{@code getSerchNm} accessors.
     */
    @Column(name = "search_num")
    private Integer serchNm;
}

MybatisHelper

package edu.xawl.utils;

import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
import tk.mybatis.mapper.entity.Config;
import tk.mybatis.mapper.mapperhelper.MapperHelper;


/**
 * Holds the application's single MyBatis {@link SqlSessionFactory}, built from
 * {@code mybatis-config.xml} (environment {@code local}) and post-processed by
 * the tk.mybatis {@link MapperHelper} so that the generic mapper interfaces work.
 */
public class MybatisHelper {

    private static final SqlSessionFactory sqlSessionFactory;

    static {
        try {
            SqlSessionFactory factory = new SqlSessionFactoryBuilder()
                    .build(Resources.getResourceAsReader("mybatis-config.xml"), "local");

            // Register the generic mapper base interfaces and let the helper
            // process the statements already present in the configuration.
            MapperHelper mapperHelper = new MapperHelper();
            Config config = new Config();
            config.setEnableMethodAnnotation(true);
            mapperHelper.setConfig(config);
            mapperHelper.registerMapper(Mapper.class);
            mapperHelper.registerMapper(MySqlMapper.class);
            // The factory exposes its Configuration directly; no session is needed for this.
            mapperHelper.processConfiguration(factory.getConfiguration());

            sqlSessionFactory = factory;
        } catch (Exception e) {
            // Fail fast: a missing or half-configured factory would otherwise only
            // surface later as a NullPointerException or a broken mapper call.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Opens a new session on the {@code local} environment.
     *
     * @return a new {@link SqlSession}; the caller is responsible for committing and closing it
     */
    public static SqlSession getSqlsessionLocal() {
        return sqlSessionFactory.openSession();
    }

}

mybatis-config.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- Global settings. Underscore-to-camelCase mapping is left commented out (not configured here). -->
    <settings>
        <!--<setting name="mapUnderscoreToCamelCase" value="false"/>-->
        <setting name="defaultStatementTimeout" value="60"/>
    </settings>

    <!-- Alias the entity classes; must name the package the po classes actually live in. -->
    <typeAliases>
        <package name="edu.xawl.po"/>
    </typeAliases>

    <!-- Environments: database connection settings -->
    <environments default="local">
        <environment id="local">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="com.mysql.jdbc.Driver"/>
                <property name="url" value="jdbc:mysql://127.0.0.1:3306/crawler"/>
                <property name="username" value="root"/>
                <property name="password" value="123"/>
            </dataSource>
        </environment>
    </environments>

    <!-- Register every mapper interface found in this package -->
    <mappers>
        <package name="edu.xawl.mapper"/>
    </mappers>
</configuration>


ok,完成

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值