Java爬虫

最新推荐文章于 2022-04-28 16:16:13 发布

Uluoyu

最新推荐文章于 2022-04-28 16:16:13 发布

阅读量191

点赞数

分类专栏： Java 文章标签： java 爬虫

本文链接：https://blog.csdn.net/Uluoyu/article/details/117470136

版权

Java 专栏收录该内容

32 篇文章 0 订阅

订阅专栏

Java爬虫

1.小的爬虫例子

1）Maven依赖

引入httpclient对象来爬取数据，日志使用slf4j

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
    <scope>test</scope>
</dependency>

2）日志配置

log4j.rootLogger=DEBUG,A1
log4j.logger.com.zhq=DEBUG

log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

3）测试代码CrawlerTest,爬取58同城的租房信息

package com.zhq.test;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Minimal crawler example: fetches one 58.com rental listing page with
 * Apache HttpClient and prints the response body to standard output.
 */
public class CrawlerTest {
    /**
     * Sends a single GET request and prints the body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws Exception {
        // Step 2 of the "browser" analogy: the address to visit, wrapped in an HttpGet
        HttpGet httpGet = new HttpGet("https://hf.58.com/chuzu/?PGTID=0d100000-0034-50f6-b943-9ee1684b96c4&ClickID=2");

        // Steps 1 and 3: open the "browser" (HttpClient) and execute the request.
        // try-with-resources closes the response first and then the client,
        // even when reading the body throws, so no connection is leaked.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only parse the body when the server answered 200
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content);
            }
        }
    }
}

2.HttpClient

1）HttpClient-Get

package com.zhq.test;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * GET example with explicit error handling: requests one page and prints the
 * length of the response body.
 */
public class HttpGetTest {
    /**
     * Executes the GET request; any I/O failure is reported on standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The address the request is sent to
        HttpGet httpGet = new HttpGet("https://hf.58.com/chuzu/?PGTID=0d100000-0034-50f6-b943-9ee1684b96c4&ClickID=2");

        // try-with-resources replaces the manual finally block. The old code called
        // response.close() unconditionally, which threw a NullPointerException
        // whenever execute() failed and left response null.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Parse the response only on status 200
            if (response.getStatusLine().getStatusCode() == 200) {
                // Decode the entity as UTF-8 text
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            // Covers failures from execute(), from reading the body and from close()
            e.printStackTrace();
        }
    }
}

<dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
            <!-- 去除作用域，这样除了测试以外，也可以使用日志 -->
<!--            <scope>test</scope>-->
        </dependency>

2)HttpClient-Get带参数

// Target address: https://hf.58.com/baohe/chuzu/?PGTID=0d3090a7-0034-8a5a-9823-045eab7825ba&ClickID=2
// Start from the bare path and let URIBuilder assemble the query string
URIBuilder uriBuilder = new URIBuilder("https://hf.58.com/baohe/chuzu/");
// One call per query parameter; further parameters are added the same way
uriBuilder.setParameter("PGTID", "0d3090a7-0034-8a5a-9823-045eab7825ba");
uriBuilder.setParameter("ClickID", "2");
// Build the final URI and wrap it in the GET request
HttpGet httpGet = new HttpGet(uriBuilder.build());
System.out.println("请求的地址" + httpGet);

3）HttpClient-Post

//Create the HttpPost object for the target address; compared with the GET example only the request class changes
HttpPost httpPost = new HttpPost("https://hf.58.com/chuzu/?PGTID=0d100000-0034-50f6-b943-9ee1684b96c4&ClickID=2");

只有这一点区别。

4)HttpClient-Post带参数

// Create the HttpClient object
CloseableHttpClient httpClient = HttpClients.createDefault();
// Create the HttpPost object for the target address (parameters travel in the form body)
HttpPost httpPost = new HttpPost("https://hf.58.com/chuzu/");
// List that collects the form parameters
List<NameValuePair> params = new ArrayList<NameValuePair>();
// Equivalent request address: https://hf.58.com/chuzu/?PGTID=0d100000-0034-50f6-b943-9ee1684b96c4&ClickID=2
// Each parameter is one more add() on the same list; the original sent only PGTID
params.add(new BasicNameValuePair("PGTID","0d100000-0034-50f6-b943-9ee1684b96c4"));
params.add(new BasicNameValuePair("ClickID","2"));
// Form entity: first argument is the parameter list, second is the encoding
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
// Attach the form entity to the POST request
httpPost.setEntity(formEntity);

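这里的问题是如何传两个参数，而不是一个：只需对同一个List再调用一次 params.add(new BasicNameValuePair(参数名, 参数值))，每个参数对应一个BasicNameValuePair。如果参数已经放在Map中，也可以像下面这样遍历后逐个添加。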

// 4. Add the parameters: copy every key/value pair of the params map into the form list
List<NameValuePair> parameters = new ArrayList<NameValuePair>();
params.forEach((key, value) -> parameters.add(new BasicNameValuePair(key, value)));

这是网上的一个参考。

3.HttpClient 连接池

每次请求都要创建HTTP Client，需要频繁的创建和销毁，为了解决这个问题引入了连接池。

package com.zhq.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Connection-pool example: two GET requests share one
 * {@link PoolingHttpClientConnectionManager} instead of each opening and
 * tearing down its own connections.
 */
public class HttpClientPoolTest {
    /**
     * Configures the pool, issues two requests through it and shuts the pool down.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the connection pool manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        try {
            // Maximum number of connections in the whole pool
            cm.setMaxTotal(100);
            // Default maximum number of connections per route (target host)
            cm.setDefaultMaxPerRoute(10);
            // Each call builds its own HttpClient object, but both share the pool cm
            doGet(cm);
            doGet(cm);
        } finally {
            // Release pooled connections once all requests are done;
            // the original never shut the manager down.
            cm.shutdown();
        }
    }

    /**
     * Sends one GET request over a connection taken from the given pool and
     * prints the length of the response body when the status code is 200.
     *
     * @param cm shared connection manager that supplies the connections
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // A new HttpClient object is built per call; what is reused is the pool behind it.
        // The client is deliberately NOT closed here: closing it would also shut down
        // the shared connection manager (unless built with setConnectionManagerShared(true)).
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        // The address the request is sent to
        HttpGet httpGet = new HttpGet("https://hf.58.com/chuzu/?PGTID=0d100000-0034-50f6-b943-9ee1684b96c4&ClickID=2");

        // Closing the response hands its connection back to the pool
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Parse the response only on status 200
            if (response.getStatusLine().getStatusCode() == 200) {
                // Decode the entity as UTF-8 text
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

两个注意。

//The client is built on top of the shared connection manager cm, so its connections come from the pool
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
//Do not close the HttpClient here; its connections are managed by the connection pool.
//httpClient.close();

为什么要设置每个主机的最大连接数

//Maximum number of connections in the whole pool
cm.setMaxTotal(100);
//Default maximum number of connections per route (target host)
cm.setDefaultMaxPerRoute(10);

当抓取一个网站时，里面可能有其他网站的链接，例如新浪、百度、腾讯之类，它们分别对应不同的主机。如果不加限制，连接池中的100个连接可能全部被其中一个主机占用；设置之后，每个主机最多只能占用10个连接，其余连接可以留给其他主机使用。

4.请求参数

自定义相关的时间

// Assemble the per-request settings step by step (all values in milliseconds)
RequestConfig.Builder configBuilder = RequestConfig.custom();
// Longest time allowed for establishing the connection
configBuilder.setConnectTimeout(1000);
// Longest time allowed for obtaining a connection
configBuilder.setConnectionRequestTimeout(500);
// Longest time allowed for transferring data
configBuilder.setSocketTimeout(10 * 1000);
RequestConfig config = configBuilder.build();
// Apply the settings to this request
httpGet.setConfig(config);

也可以定义与其他相关的请求参数。

5.Jsoup

jsoup 是一款Java 的HTML解析器，可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API，可通过DOM，CSS以及类似于jQuery的操作方法来取出和操作数据。

1）从一个URL，文件或字符串中解析HTML

2）使用DOM或CSS选择器来查找、取出数据

3）可操作HTML元素、属性、文本

1）加入Maven依赖

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<!-- Jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/junit/junit -->
<!-- 测试 -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.13.1</version>
    <scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<!-- 工具 操作文件-->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.8.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<!-- 工具 StringUtils-->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.11</version>
</dependency>

2) Jsoup解析URL

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-57CJVkXc-1622618536121)(C:\Users\U落雨\Desktop\1610428225423.png)]

package jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;

import java.net.URL;

/** First jsoup example: parse a page straight from its URL. */
public class JsoupFirstTest {
    /**
     * Downloads and parses the listing page, then prints the text of its
     * first {@code <title>} element.
     *
     * @throws Exception if the page cannot be fetched or parsed
     */
    @Test
    public void testUrl() throws Exception {
        // First argument of Jsoup.parse: the address to visit
        URL pageUrl = new URL("https://hf.58.com/chuzu/?PGTID=0d100000-0034-5fcb-50cd-6f38b3c43a53&ClickID=2");
        // Second argument: timeout for the visit
        int timeout = 10*1000;
        Document doc = Jsoup.parse(pageUrl, timeout);
        // Tag selector: take the first <title> element and print its text
        System.out.println(doc.getElementsByTag("title").first().text());
    }
}

3) Jsoup解析字符串

/**
 * Reads an HTML file into a string, parses that string and prints the text
 * of the first {@code <title>} element.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testString() throws Exception{
    // Load the whole file as a UTF-8 string with the commons-io helper
    File htmlFile = new File("html页面的地址");
    String html = FileUtils.readFileToString(htmlFile, "utf8");
    // Parse the in-memory string
    Document doc = Jsoup.parse(html);
    // Tag selector: take the first <title> element and print its text
    System.out.println(doc.getElementsByTag("title").first().text());
}

结果和上面相同。

4）Jsoup解析文件

@Test
public void testFile() throws Exception{
    //解析文件
    Document doc = Jsoup.parse(new File("文件的地址"),"utf8")
    //使用标签选择器获取标签里面的内容,获取第一个标签
    String title = doc.getElementsByTag("title").first().text();
    System.out.println(title);
}

5）使用DOM方式遍历文档获取元素

/**
 * Demonstrates the DOM-style lookup methods of jsoup. One lookup is active;
 * the commented lines are the alternatives to try in its place.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
@Test
public void testDOM() throws Exception {
    // Address to visit and timeout for the visit
    URL pageUrl = new URL("https://hf.58.com/chuzu/?PGTID=0d100000-0034-5fcb-50cd-6f38b3c43a53&ClickID=2");
    Document doc = Jsoup.parse(pageUrl, 10*1000);
    // By id: getElementById
    //Element element = doc.getElementById("header-wrap");
    // By tag name: getElementsByTag
    //Element element = doc.getElementsByTag("title").first();
    // By class: getElementsByClass
    //Element element = doc.getElementsByClass("top-publish-news fr").first();
    // By attribute name: getElementsByAttribute
    //Element element = doc.getElementsByAttribute("onClick").first();
    // By attribute name and value: getElementsByAttributeValue (active example)
    Element element = doc
            .getElementsByAttributeValue("onClick", "clickLog('from=fcpc_list_bj_fabu')")
            .first();
    System.out.println(element.text());
}

6）从元素中获取数据

/**
 * Demonstrates how to read data out of an element. Printing all attributes
 * is the active example; the commented lines are the alternatives.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
@Test
public void testData() throws Exception {
    // Address to visit and timeout for the visit
    URL pageUrl = new URL("https://hf.58.com/chuzu/?PGTID=0d100000-0034-5fcb-50cd-6f38b3c43a53&ClickID=2");
    Document doc = Jsoup.parse(pageUrl, 10 * 1000);
    Element element = doc.getElementById("secitem-rent");
    // Stays empty unless one of the commented assignments below is enabled
    String str = "";
    // Read the id
    //str=element.id();
    // Read the class name
    //str=element.className();
    //Set<String> classNames = element.classNames();
    //for (String s : classNames){
    //    System.out.println(s);
    //}
    // Read one attribute value: attr
    //str = element.attr("class");
    // Read all attributes: attributes (active example)
    System.out.println(element.attributes().toString());
    // Reading the text content via text() was covered in the earlier examples
    System.out.println("获取的数据" + str);
}

7)使用Selector选择器获取数据

/**
 * Demonstrates the basic selector syntax of {@code select}. The
 * attribute-value selector is active; the commented lines are the alternatives.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
@Test
public void testSelector() throws Exception {
    // Address to visit and timeout for the visit
    URL pageUrl = new URL("https://hf.58.com/chuzu/?PGTID=0d100000-0034-5fcb-50cd-6f38b3c43a53&ClickID=2");
    Document doc = Jsoup.parse(pageUrl, 10 * 1000);
    // tagname: find elements by tag
    //Elements elements = doc.select("span");
    // #id: find an element by id
    //Element element = doc.select("#secitem-rent").first();
    // .class: find elements by class name
    //Element element = doc.select(".secitem").first();
    // [attribute]: find elements that carry an attribute
    //Element element = doc.select("[para]").first();
    // [attr=value]: find elements by attribute value (active example)
    for (Element match : doc.select("[name=b_link]")) {
        System.out.println(match.text());
    }
    //System.out.println(element.toString());
    //System.out.println(element.text());
}

8）使用Selector组合选择器获取元素

/**
 * Demonstrates combined selectors. The "all direct children" form is active;
 * the commented lines are the alternatives.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
@Test
public void testSelector2() throws Exception {
    // Address to visit and timeout for the visit
    URL pageUrl = new URL("https://hf.58.com/chuzu/?PGTID=0d100000-0034-5fcb-50cd-6f38b3c43a53&ClickID=2");
    Document doc = Jsoup.parse(pageUrl, 10 * 1000);
    // el#id: element + id
    //Element element = doc.select("dl#secitem-rent").first();
    // el.class: element + class
    //Element element = doc.select("span.dev").first();
    // el[attr]: element + attribute name
    //Element element = doc.select("a[para]").first();
    // Arbitrary combinations
    // ancestor child: descendants of an element
    //Elements elements = doc.select(".secitem dt");
    // parent > child: direct children of a parent that match
    //Elements elements = doc.select(".secitem > dd > a");
    // parent > *: every direct child of a parent (active example)
    for (Element child : doc.select(".secitem > dd > *")) {
        System.out.println(child.text());
    }
    //System.out.println(element.text());
}

6.爬虫案例

一个小的Demo,有网站反爬取的问题，解决方法，代理IP。

7.Webmagic

1）加入maven依赖

<dependencies>
    <!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.4</version>
    </dependency>
</dependencies>

2)日志配置

webmagic默认使用slf4j-log4j12作为slf4j的实现，核心包导入后，只需要配置日志即可。

log4j.rootLogger=INFO,A1

log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

3）小的案例

package com.zhq.webmagic;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

public class JobProcessor implements PageProcessor {
    //解析页面
    public void process(Page page) {
        //解析返回数据的page,并把解析的结果放到ResultIteams中
        page.putField("div",page.getHtml().css("span.voted-tip").all());
        //XPath
        page.putField("div2",page.getHtml().xpath("//span[@class=voted-tip]"));
        //正则表达式，组合使用也可以。
        page.putField("div3",page.getHtml().css("div.para").regex(".*优化.*").all());
        //处理结果API
        page.putField("div4",page.getHtml().css("div.para").regex(".*优化.*").get());
        page.putField("div5",page.getHtml().css("div.para").regex(".*优化.*").toString());
        //获取链接
        //page.addTargetRequests(page.getHtml().css("div a").links().regex(".*百科.*").all());
        //page.putField("div5",page.getHtml().css("span.topic").all());
    }
    private Site site = Site.me()
            .setCharset("utf8")//编码方式
            .setTimeOut(10000)//设置超时时间
            .setRetryTimes(1000)//设置重试间隔时间
            .setSleepTime(3)//设置重试的次数
            ;
    public Site getSite() {
        return site;
    }
    //主函数执行爬虫
    public static void main(String[] args) {
        Spider.create(new JobProcessor())
                .addUrl("https://baike.baidu.com/item/%E7%99%BE%E5%BA%A6%E6%96%B0%E9%97%BB/107020?fr=aladdin") //设置爬取数据的页面
                .addPipeline(new FilePipeline("C:\\Users\\U落雨\\Desktop\\rs"))//保存到文件中
                .thread(5)//设置5个线程
                .run();//执行爬虫
    }
}

/设置超时时间
.setRetryTimes(1000)//设置重试间隔时间
.setSleepTime(3)//设置重试的次数
;
public Site getSite() {
return site;
}
//主函数执行爬虫
public static void main(String[] args) {
Spider.create(new JobProcessor())
.addUrl(“https://baike.baidu.com/item/%E7%99%BE%E5%BA%A6%E6%96%B0%E9%97%BB/107020?fr=aladdin”) //设置爬取数据的页面
.addPipeline(new FilePipeline(“C:\Users\U落雨\Desktop\rs”))//保存到文件中
.thread(5)//设置5个线程
.run();//执行爬虫
}
}

Uluoyu

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Java爬虫

Java爬虫1.小的爬虫例子1）Maven依赖引入httpclient对象来爬取数据，日志使用slf4j<dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclie
复制链接

扫一扫

专栏目录