Java Web Crawling with HttpClient and Jsoup

Java Web Crawler

Crawling data means automatically fetching information from the internet according to certain rules.

Required Jar Packages

Environment

  • JDK1.8

  • IDEA

  • Maven

Jar Package

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.8</version>
</dependency>

1. A Simple Demo

We simulate the way a browser opens and visits a web page in order to fetch some data.

Note: some pages cannot be fetched this way, for example sites that reject requests without a browser-like User-Agent or that render their content with JavaScript; setting a request header as a workaround is sketched after the demo below.

package com.kj.Util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Sends the simplest possible request.
 */
public class HtmlParseUtil {
    public static void main(String[] args) throws IOException {
        String url = "https://www.itcast.cn";
        // 1. "Open the browser": create an HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // 2. "Type in the address": create a GET request
        HttpGet httpGet = new HttpGet(url);

        // 3. "Press Enter": send the request with the HttpClient
        CloseableHttpResponse response = httpClient.execute(httpGet);

        // 4. Parse the response
        // Check the status code first
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String s = EntityUtils.toString(entity, "utf-8");
            System.out.println(s);
        }

        // Release resources
        response.close();
        httpClient.close();
    }
}
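
As noted above, some sites reject requests that do not look like they come from a browser. A common workaround is to set a browser-like User-Agent header on the request before executing it. The following is only a minimal sketch, not part of the original tutorial: the class name, URL, and header value are placeholders.

package com.kj.Util;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Minimal sketch: the same kind of GET request as above, but with a
 * browser-like User-Agent header set. URL and header value are placeholders.
 */
public class HtmlParseUtilWithHeader {
    public static void main(String[] args) throws IOException {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("https://www.example.com");
        // Many sites block the default HttpClient User-Agent, so pretend to be a browser
        httpGet.setHeader("User-Agent", "Mozilla/5.0");

        CloseableHttpResponse response = httpClient.execute(httpGet);
        if (response.getStatusLine().getStatusCode() == 200) {
            System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
        }

        // Release resources
        response.close();
        httpClient.close();
    }
}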

2. Sending a GET Request with Parameters

package com.kj.Util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * GET request with parameters.
 */
public class HttpGet_2 {
    public static void main(String[] args) throws URISyntaxException {
        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // 2. Build the URL
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        // 3. Set the query parameters
        uriBuilder.setParameter("name", "value");
        // 4. Create the HttpGet object
        HttpGet httpGet = new HttpGet(uriBuilder.build());

        CloseableHttpResponse response = null;
        try {
            // 5. Send the request
            response = httpClient.execute(httpGet);

            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String s = EntityUtils.toString(entity, "utf-8");
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release resources (response may be null if the request failed)
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

3. Sending a POST Request with Parameters

package com.kj.Util;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class HttpPost_3 {
    public static void main(String[] args) throws Exception {
        // 1. Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();

        HttpPost post = new HttpPost("http://yun.itheima.com/search");

        // 2. Declare a list that holds the form parameters
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("so", "java"));
        // 3. Create the form entity: first argument is the parameter list, second is the charset
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(params, "utf-8");
        // 4. Attach the form entity to the POST request
        post.setEntity(entity);

        CloseableHttpResponse response = null;
        try {
            // 5. Send the request
            response = httpClient.execute(post);

            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release resources (response may be null if the request failed)
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

4. Connection Pool

Much like a JDBC connection pool, a pooling connection manager can be used to manage HttpClient connections.

package com.kj.Util;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class Pool_4 {
    public static void main(String[] args) {
        // Create the pooling connection manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

        // Maximum total number of connections in the pool
        cm.setMaxTotal(100);

        // Maximum number of connections per route (per host), e.g. at most 10 connections to one site
        cm.setDefaultMaxPerRoute(10);

        // Send a request using the connection pool
        doGet(cm);
    }

    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // Instead of creating a brand-new HttpClient every time, obtain one backed by the pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

        HttpGet get = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(get);
            if (response.getStatusLine().getStatusCode() == 200) {
                String s = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Do not close httpClient here: its connections are managed by the pool
        }
    }
}

5. Page Parsing

To extract data from the fetched pages we need a tool for processing the HTML string.

5.1 The HTML Parser Jsoup

Jsoup is a Java HTML parser that can parse HTML directly from a URL or from a string of HTML content. It also provides a very convenient API for extracting and manipulating data through DOM traversal, CSS selectors, and jQuery-like operations.

Maven dependencies

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!-- supporting utilities -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.13</version>
    <scope>test</scope>
</dependency>

<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>

<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.10</version>
</dependency>

5.2 Simple Data Fetching

package com.kj.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;

import java.net.URL;


public class _6UrlForJsoup {

    @Test
    public void testUrl() throws Exception {
        // Parse the URL directly (the second argument is the timeout in milliseconds)
        Document document = Jsoup.parse(new URL("http://www.bilibili.com/"), 10000);
        // Get the content of the <title> tag
        String title = document.getElementsByTag("title").first().text();
        System.out.println(title);
    }
}

Note: although Jsoup can also send requests itself and thus replace HttpClient, real crawlers rely on multi-threading, connection pools, proxies, and similar techniques, which Jsoup does not support well. In practice Jsoup is therefore used only as an HTML parsing tool, while HttpClient does the fetching.

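As a minimal sketch of this division of labour (the URL and the test method name are placeholders, and the imports are the same as in the earlier HttpClient examples plus org.jsoup.Jsoup and org.jsoup.nodes.Document), HttpClient fetches the page and Jsoup parses the returned HTML string:

// Minimal sketch: HttpClient does the fetching, Jsoup does the parsing.
// The URL and method name below are placeholders, not from the original article.
@Test
public void testHttpClientWithJsoup() throws Exception {
    CloseableHttpClient httpClient = HttpClients.createDefault();
    HttpGet get = new HttpGet("https://www.example.com");
    try (CloseableHttpResponse response = httpClient.execute(get)) {
        if (response.getStatusLine().getStatusCode() == 200) {
            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            Document document = Jsoup.parse(html);   // Jsoup only parses here, it does not fetch
            System.out.println(document.title());
        }
    } finally {
        httpClient.close();
    }
}
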
5.3 Parsing an HTML String

@Test
public void testHtml() throws Exception{
    // Read the HTML file into a string with commons-io FileUtils
    String content = FileUtils.readFileToString(new File("./src/main/resources/KJ_Study.html"), "utf-8");

    // Parse the string
    Document document = Jsoup.parse(content);

    String title = document.getElementsByTag("title").first().text();
    System.out.println(title);
}

5.4 Parsing a File

@Test
public void testFile() throws Exception{
    // Parse the file directly
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"),"utf-8");

    String title = document.getElementsByTag("title").first().text();
    System.out.println(title);
}

5.5 [Advanced] Getting Data from an Element

@Test
public void testData() throws Exception {
    // Get the Document
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");

    Element element = document.getElementById("userSkin");

    /*
     The element being parsed looks roughly like this:
     <div id="userSkin">
       <div class="user-skin-box">
         <p class="user-skin-title">
           <span>自定义博客皮肤<span class="vip-get">VIP专享</span></span>
     ...
    */
    // 1. Get the id
    System.out.println(element.id());
    // 2. Get the class name
    System.out.println(element.child(0).className());
    // If there are multiple class names, use classNames()
    System.out.println(element.child(0).classNames());
    // 3. Get an attribute value by attribute name
    System.out.println(element.attr("id"));
    // 4. Get all attributes of the element
    Attributes attributes = element.attributes();
    System.out.println(attributes);
    // 5. Get the text content
    Elements elementsByClass = document.getElementsByClass("user-skin-title");
    System.out.println(elementsByClass.first().text());
}

5.6 [Advanced] Selectors

@Test
public void testSelector() throws Exception {
    // Parse the file to get the Document
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");

    // tagname: find by tag name
    Elements elements = document.select("span");
    for (Element e : elements) {
        System.out.println(e.text());
    }

    // #id: find by id
    Elements e2 = document.select("#userSkin");
    for (Element e : e2) {
        System.out.println(e.child(0).className());
    }

    // .class: find by class name
    Elements e3 = document.select(".user-skin-box");
    for (Element e : e3) {
        System.out.println(e.child(0).className());
    }

    // [attribute]: find by attribute name
    Elements e4 = document.select("[viewBox]");
    for (Element e : e4) {
        System.out.println(e.className());
    }

    // [attribute=value]: find by attribute name and value
    Elements e5 = document.select("[t=1567152543821]");
    for (Element e : e5) {
        System.out.println(e.className());
    }
}

5.7 [Advanced] Combining Selectors

@Test
public void testSelector2() throws Exception {
    // Parse the file
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");

    // el#id: element + id
    Elements elements = document.select("div#cropBox");
    for (Element e : elements) {
        System.out.println(e.child(0).className());
    }

    // el.class: element + class
    Elements e2 = document.select("span.close-bt");
    for (Element e : e2) {
        System.out.println(e.className());
    }

    // el[attr]: element + attribute name
    Elements e3 = document.select("svg[t]");
    for (Element e : e3) {
        System.out.println(e.className());
    }

    // Any combination of the above
    Elements e4 = document.select("svg[t].icon");
    for (Element e : e4) {
        System.out.println(e.className());
    }

    // ancestor child: find descendants of an element
    Elements e5 = document.select("p[class] span");
    for (Element e : e5) {
        System.out.println(e.text());
    }

    // parent > child: find direct children of a parent element ("direct" means immediately nested)
    Elements e6 = document.select("p[class] > span > svg");
    for (Element e : e6) {
        System.out.println(e.className());
    }

    // parent > *: find all direct children of a parent element
    Elements e7 = document.select("p[class] > span > *");
    for (Element e : e7) {
        System.out.println(e.tagName());
    }
}