使用jsoup抓取网页数据并存入数据库

该项目的思路很简单:先把通过jsoup抓取到的数据存入ArrayList,然后再通过JDBC写入数据库。

本项目是一个Maven项目,需要在pom.xml中添加以下两个依赖:

        <!-- jsoup: used by the scraper to fetch the page and query it with CSS selectors -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!-- MySQL Connector/J: JDBC driver (com.mysql.jdbc.Driver) used to write the rows to MySQL -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>

创建实体类:

package dao;

/**
 * Mutable data holder for a single blog post entry.
 *
 * <p>All four properties are kept as plain strings exactly as supplied by the caller; no
 * validation or conversion is performed.
 */
public class Blog {

    /** Post title. */
    private String title;
    /** Link to the post. */
    private String href;
    /** Post author. */
    private String author;
    /** Read count, stored as text. */
    private String reads;

    /** Creates an entry with every property left {@code null}. */
    public Blog() {
        super();
    }

    /**
     * Creates a fully populated entry.
     *
     * @param title  post title
     * @param href   link to the post
     * @param author post author
     * @param reads  read count as text
     */
    public Blog(String title, String href, String author, String reads) {
        super();
        this.title = title;
        this.href = href;
        this.author = author;
        this.reads = reads;
    }

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link to the post */
    public String getHref() {
        return this.href;
    }

    /** @param href the link to store */
    public void setHref(String href) {
        this.href = href;
    }

    /** @return the post author */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author to store */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the read count as text */
    public String getReads() {
        return this.reads;
    }

    /** @param reads the read count text to store */
    public void setReads(String reads) {
        this.reads = reads;
    }

    /**
     * Renders all four properties as {@code Blog{title='..', href='..', author='..', reads='..'}}.
     * Unset properties appear as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Blog{");
        text.append("title='").append(title).append('\'');
        text.append(", href='").append(href).append('\'');
        text.append(", author='").append(author).append('\'');
        text.append(", reads='").append(reads).append('\'');
        text.append('}');
        return text.toString();
    }
}

然后编写抓取网页数据并将其存入数据库的代码:

package control;

import dao.Blog;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;

/**
 * Downloads the cnblogs.com home page with jsoup, extracts the listed posts and inserts each one
 * into the MySQL table {@code content} through JDBC.
 */
public class Grab {

    /** Parsed home page; assigned by {@link #main} and {@code null} until the download succeeds. */
    private static Document doc;

    /**
     * Entry point: fetches the page, then scrapes and stores its posts.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Page to scrape
            doc = Jsoup.connect("https://www.cnblogs.com/").get();
        } catch (IOException e) {
            e.printStackTrace();
            // Nothing was downloaded, so there is nothing to parse; continuing would only
            // dereference the null document.
            return;
        }
        // Extract the posts and store them
        BlogZhua();
    }

    /**
     * Extracts every post item from the downloaded page and stores each one via
     * {@link #AddBlog(Blog)}.
     *
     * @throws IllegalStateException if no page has been downloaded yet
     */
    public static void BlogZhua() {
        if (doc == null) {
            throw new IllegalStateException("No document loaded; the page must be fetched first");
        }
        // One element per post in the list
        Elements eles = doc.select("div#post_list>div.post_item");
        // Collect everything first, then write to the database
        ArrayList<Blog> arrayList = new ArrayList<>();
        for (Element ele : eles) {
            // Title text
            String txt = ele.select("div.post_item_body>h3>a.titlelnk").text();
            // Link target of the title
            String href = ele.select("div.post_item_body>h3>a.titlelnk").attr("href");
            // Author name
            String author = ele.select("div.post_item_foot > a.lightblue").text();
            // Text of the view-count link (stored in the `reads` column)
            String reads = ele.select("div.post_item_foot > span.article_view > a.gray").text();

            Blog blog = new Blog();
            blog.setTitle(txt);
            blog.setHref(href);
            blog.setAuthor(author);
            blog.setReads(reads);
            arrayList.add(blog);
        }
        // Persist every collected post
        for (Blog test : arrayList) {
            AddBlog(test);
        }
    }

    /**
     * Inserts one post into the {@code content} table.
     *
     * <p>A database failure is reported on the console and does not propagate to the caller.
     * The connection and statement are always closed before this method returns.
     *
     * @param blog the post to store
     */
    public static void AddBlog(Blog blog) {
        try {
            // Load the MySQL driver
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        String sql = "INSERT INTO `content`(`title`, `href`, `author`, `reads`) VALUES (?,?,?,?)";
        // NOTE(review): URL and credentials are hard-coded; consider moving them to configuration.
        // try-with-resources closes the statement and the connection even when the insert fails.
        try (Connection connection = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/jsoup?useSSL=true", "root", "1234");
             PreparedStatement pstmt = connection.prepareStatement(sql)) {
            pstmt.setString(1, blog.getTitle());
            pstmt.setString(2, blog.getHref());
            pstmt.setString(3, blog.getAuthor());
            pstmt.setString(4, blog.getReads());
            int result = pstmt.executeUpdate();
            // Report success only when a row was actually written
            if (result > 0) {
                System.out.println("数据添加成功!");
            }
        } catch (SQLException throwables) {
            throwables.printStackTrace();
            System.out.println("数据库访问失败!");
        }
    }
}
  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值