使用 IDEA 编写简单的 Java 网络爬虫程序（Maven）

本文链接：https://blog.csdn.net/xgysimida/article/details/109681021

本文介绍了如何使用Java进行网络爬虫开发，包括在pom.xml中添加依赖，编写解析程序来抓取和解析网页内容，以及创建一个工具类用于发送HTTP请求。示例代码展示了如何从指定URL获取网页，解析HTML并提取文章标题、URL、简介和作者信息。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

在练习Java的时候，涉及到了网络爬虫，根据网上的一些博客，编写了一点程序，希望对大家有用。

第一步：在pom.xml文件中导入依赖

        <!-- Apache HttpClient: packages used to send the HTTP requests -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.12</version>
        </dependency>
        <!-- Guava: provides the google.common packages -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>23.0</version>
        </dependency>

        <!-- jsoup: used mainly to parse the content of web pages -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

第二步：编写解析程序

package test;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Demo entry point: downloads the blog list page, selects every {@code div[class=card]}
 * element and prints the title, article URL, introduction and author found in each one.
 *
 * @ClassName: parseHtml
 * @Author: Wxz
 * @Date: 2020/11/13 15:23
 * @Version: V1.0
 */
public class parseHtml {
    /** Site root; also the prefix prepended to every article {@code href} read from the page. */
    private static final String BASE_URL = "http://www.datalearner.com";

    /**
     * Fetches the blog list page and prints one line per article card to standard output.
     * If the page cannot be fetched, the error is reported on standard error and the method
     * returns without attempting to parse anything.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // URL of the page to crawl
        String uri = BASE_URL + "/blog_list";

        // Raw page content, obtained as a String through the Utils helper class
        String html;
        try {
            html = Utils.getHtmlContent(uri);
        } catch (IOException e) {
            // Without a page there is nothing to parse; continuing would hand null to Jsoup.parse.
            System.err.println("Failed to fetch " + uri + ": " + e);
            e.printStackTrace();
            return;
        }
        if (html == null) {
            System.err.println("No content returned for " + uri);
            return;
        }

        // Parse the downloaded string into a DOM-like document
        Document doc = Jsoup.parse(html);
        // The selectors below match this particular page; other sites need their own selectors
        Elements cards = doc.select("div[class=card]");
        for (Element card : cards) {
            Elements titleNode = card.select("h5[class=card-title]");
            // Article title
            String title = titleNode.text();
            // Article URL: site root + href of the link that is a direct child of the title node
            String articleUrl = BASE_URL + titleNode.select(">a").attr("href");
            // Article introduction
            String introduction = card.select("p[class=card-text text-justify]").text();
            // Author name
            String author = card.select("span[class=fa fa-user]").text();
            // Print the combined record; it could equally be stored in a database, depending on
            // what the application needs
            System.out.println("标题： "+title+"   路径:  "+articleUrl+"  简介:"+introduction+"  作者:"+author);
            System.out.println("--------------");
        }
    }
}

第三步：编写Java爬虫的工具类

package test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Helper class for the crawler: issues an HTTP GET and returns the response body as text.
 *
 * @ClassName: Utils
 * @Author: Wxz
 * @Date: 2020/11/13 15:25
 * @Version: V1.0
 */
public class Utils {
    /**
     * Sends an HTTP GET request to the given address and returns the response body as a string.
     * The HTTP client and the response are closed before this method returns, whether it
     * completes normally or throws.
     *
     * @param url address to request
     * @return the response body decoded as text, or an empty string when the response carries
     *         no entity
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String getHtmlContent(String url) throws IOException {
        // try-with-resources closes the response first and then the client, releasing the
        // underlying connection even when reading the body throws.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // A response may legitimately have no body; report that as empty content.
                return "";
            }
            // "UTF-8" is only the fallback used when the response declares no charset; the
            // one-argument overload would fall back to ISO-8859-1 and garble non-Latin pages.
            return EntityUtils.toString(entity, "UTF-8");
        }
    }
}

参考博客：网络爬虫