网络爬虫和Jsoup

最新推荐文章于 2024-10-01 09:29:45 发布

「已注销」

最新推荐文章于 2024-10-01 09:29:45 发布

阅读量195

点赞数

文章标签： java

本文链接：https://blog.csdn.net/qq_45041521/article/details/105907355

版权

本文介绍了网络爬虫的基本概念和功能，强调了Jsoup在数据采集中的作用，主要作为HTML解析工具。文章详细讲解了Jsoup的Document对象、解析URL、选择器的使用等关键操作，为实现高效网页数据提取提供了指南。

摘要由CSDN通过智能技术生成

网络爬虫

从功能上来讲，爬虫一般分为数据采集、数据处理、数据存储三个部分。

为什么学习爬虫

实现搜索引擎
大数据时代，让我们获取更多数据源
更好的进行搜索引擎优化

网络爬虫（Web crawler），是一种按照一定规则，自动抓取万维网信息的程序或者脚本。

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- HTTP client library used by the request examples in this article. -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>

1. 入门程序

 		//1. 打开浏览器/创建一个HttpClient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();

        //2. 输入网址/发起get请求创建HttpGet对象
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");

        //3. 按回车，发起请求，返回响应/使用HttpClient对象发起请求
        CloseableHttpResponse response = httpClient.execute(httpGet);

        //4. 解析响应获取数据
        //判断我们的状态码
        if(response.getStatusLine().getStatusCode() == 200){
            HttpEntity httpEntity = response.getEntity();
            String string = EntityUtils.toString(httpEntity, "utf-8");
            System.out.println(string);
        }

2. 带参数的Get请求

CloseableHttpClient httpClient = HttpClients.createDefault();

        URIBuilder uriBuilder = new URIBuilder("http://www.itcast.cn/search");
        uriBuilder.addParameter("keys", "Java");
        /*URI uri = new URIBuilder().setScheme("http").setHost("localhost").setPort(12345).setPath("/doGetControllerTwo").setParameters(params).build();*/

        HttpGet httpGet = new HttpGet(uriBuilder.build());

        CloseableHttpResponse response = httpClient.execute(httpGet);

        if (response.getStatusLine().getStatusCode() == 200) {
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(result);
        }

3. 不带参数的Post请求

		 //1. 打开浏览器/创建一个HttpClient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();

        //2. 输入网址/发起get请求创建HttpGet对象
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");

        //3. 按回车，发起请求，返回响应/使用HttpClient对象发起请求
        CloseableHttpResponse response = httpClient.execute(httpPost);

        //4. 解析响应获取数据
        //判断我们的状态码
        if(response.getStatusLine().getStatusCode() == 200){
            HttpEntity httpEntity = response.getEntity();
            String string = EntityUtils.toString(httpEntity, "utf-8");
            System.out.println(string);
        }

4. 带参数的Post请求

CloseableHttpClient httpClient = HttpClients.createDefault();

        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");

        //声明List集合，封装表单中的参数
        List<NameValuePair> paras = new ArrayList<NameValuePair>();
        paras.add(new BasicNameValuePair("key","value"));

        //创建表单Entity对象
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(paras,"utf-8");

        //设置表单的Entity对象到Post请求
        httpPost.setEntity(formEntity);

        CloseableHttpResponse response = httpClient.execute(httpPost);

        if (response.getStatusLine().getStatusCode() == 200){
            String string = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(string);
        }else {
            System.out.println("error");
        }

5. 连接池

// Create the connection pool manager.
PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
// Limit the total number of connections and the number of connections per route (host).
cm.setMaxTotal(100);
cm.setDefaultMaxPerRoute(10);
// Build a client that obtains its connections from the pool.
CloseableHttpClient httpClient = HttpClients.custom()
    .setConnectionManager(cm)
    .build();

// Fixed: the original "localhost://8080/ifram/login.html" had the scheme and host swapped
// and is not a valid http URL.
HttpGet httpGet = new HttpGet("http://localhost:8080/ifram/login.html");

// Always close the response: an unclosed response keeps its pooled connection leased,
// and after enough requests the pool would have nothing left to hand out.
try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
    // Read the status / entity of the response here.
}
// The client itself is deliberately NOT closed here: it is backed by the pool and is meant
// to be reused for further requests.
//httpClient.close();

6. 配置信息

// Build the per-request configuration. All timeouts are in milliseconds.
RequestConfig config = RequestConfig.custom()
    // Fixed: the original called setConnectionRequestTimeout twice, so the 1000 ms value
    // intended for establishing the connection was silently overwritten by the 500 ms one.
    .setConnectTimeout(1000)            // max time to establish the connection
    .setConnectionRequestTimeout(500)   // max time to obtain a connection from the connection manager
    .setSocketTimeout(10*1000)          // max time for data transfer
    .build();
// Apply the configuration to the request.
httpGet.setConfig(config);

Jsoup

虽然使用Jsoup可以替代HttpClient直接发起请求解析数据，但是往往不会这样用，因为实际的开发过程中，需要使用到多线程，连接池，代理等等方式，而jsoup对这些的支持并不是很好，所以我们一般把jsoup 仅仅作为Html解析工具使用。

1. Document对象

在 Jsoup 中，Document 类（org.jsoup.nodes.Document）表示整个 HTML 或 XML 文档。从概念上讲，它是文档树的根，并提供对文档数据的基本访问。

2. 坐标

<!-- jsoup: the HTML parsing library used by the Jsoup examples in this article -->
<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

<!-- FileUtils (used below to read a file into a String) -->
<dependency>
	<groupId>commons-io</groupId>
	<artifactId>commons-io</artifactId>
	<version>2.6</version>
</dependency>

<!-- StringUtils -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.7</version>
</dependency>

3. 解析url

// Parse a page straight from its URL: the first argument is the address to fetch,
// the second is the timeout passed to Jsoup.parse.
URL pageUrl = new URL("http://www.itcast.cn");
int timeout = 1000;
Document document = Jsoup.parse(pageUrl, timeout);

// Tag selector: take the first <title> element and read its text.
String title = document
    .getElementsByTag("title")
    .first()
    .text();

4. 解析字符串

// Read the HTML file into a String with the FileUtils helper.
// Fixed: java.io.File has no no-argument constructor, so the original "new File()" did not
// compile. "test.html" is a placeholder; replace it with the path of the file to parse.
String content = FileUtils.readFileToString(new File("test.html"), "utf-8");

// Parse the HTML held in the String.
Document document = Jsoup.parse(content);

// Tag selector: take the first <title> element and read its text.
String title = document.getElementsByTag("title").first().text();

5. 解析文件

// Parse an HTML file directly: the first argument is the file, the second its character encoding.
// Fixed: java.io.File has no no-argument constructor, so the original "new File()" did not
// compile. "test.html" is a placeholder; replace it with the path of the file to parse.
Document document = Jsoup.parse(new File("test.html"), "utf-8");

// Tag selector: take the first <title> element and read its text.
String title = document.getElementsByTag("title").first().text();