使用Jsoup实现简单的网页爬虫

1 篇文章 0 订阅
1 篇文章 0 订阅
  • 爬取某旅游网站信息
package com.itheima._02spider;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Small crawler for a travel site's search results.
 *
 * <p>Starting from a search URL it prints, for every route on every result page, the route
 * name, introduction and price, downloads the route's thumbnail into {@link #IMAGE_DIR},
 * and then follows the "next page" link until there is none.
 */
public class TravelSpider {
    /** Local directory (with trailing separator) that downloaded route images are written to. */
    private static final String IMAGE_DIR = "E:\\63\\travelImages\\";

    public static void main(String[] args) throws Exception {
        String url = "http://www.jinmalvyou.com/search/index/view_type/1/keyword/%E5%9B%BD%E5%86%85";

        fetchTravelData(url);
    }

    /**
     * Crawls the result page at {@code url} and every following page reachable through the
     * {@code a.next} link.
     *
     * <p>Pages are walked iteratively, so the number of result pages does not affect stack depth.
     *
     * @param url URL of the first result page
     * @throws IOException if a page or an image cannot be fetched, or an image cannot be written
     */
    private static void fetchTravelData(String url) throws IOException {
        String pageUrl = url;
        while (pageUrl != null) {
            // 1. Fetch and parse the page.
            Document document = Jsoup.connect(pageUrl).get();

            // 2-3. Handle every route entry on this page.
            for (Element element : document.select(".rl-b-li")) {
                printRoute(element);
            }

            // 4. Move on to the next page. "abs:href" lets Jsoup resolve the link against the
            //    page's base URI, so relative and absolute hrefs are both handled. An empty
            //    result means there is no usable next link.
            String nextUrl = document.select("a.next").attr("abs:href");
            if (nextUrl.isEmpty() || nextUrl.equals(pageUrl)) {
                // No next link, or a link pointing back at the current page: stop instead of
                // looping forever.
                pageUrl = null;
            } else {
                pageUrl = nextUrl;
            }
        }
    }

    /**
     * Prints one route's name, introduction, price and downloaded image path.
     *
     * @param element the {@code .rl-b-li} element describing a single route
     * @throws IOException if downloading or saving the route image fails
     */
    private static void printRoute(Element element) throws IOException {
        // 3.1 Route name
        String rname = element.select(".pro-title>a").text();
        System.out.println("路线名称:" + rname);

        // 3.2 Route introduction: only the first matching node is used; a route without one
        //     yields an empty string rather than an IndexOutOfBoundsException.
        Element introduceElement = element.select(".pro-colomn").first();
        String routeIntroduce = introduceElement == null ? "" : introduceElement.text();
        System.out.println("路线介绍:" + routeIntroduce);

        // 3.3 Route price
        String price = element.select(".price>strong").text();
        System.out.println("路线价格:" + price);

        // 3.4 Route image: "abs:src" resolves protocol-relative, relative and absolute src
        //     values alike. Skip the download when the route has no image.
        String rimageSrc = element.select(".pro-img img").attr("abs:src");
        String localPath = rimageSrc.isEmpty() ? "" : saveImage(rimageSrc);
        System.out.println("路线图片:" +localPath);

        System.out.println("-----------------------------------");
    }

    /**
     * Downloads an image over HTTP and stores it under {@link #IMAGE_DIR}.
     *
     * <p>The file name is the part of the URL after the last {@code '/'}, e.g.
     * {@code http://img.jinmalvyou.com/20190516/goods_thumb_22624_330_195.jpeg} is saved as
     * {@code goods_thumb_22624_330_195.jpeg}. The file is only written when the server answers
     * with status 200 and a body; the target path is returned either way.
     *
     * @param rimageSrc absolute URL of the image
     * @return the local path the image is (or would have been) saved to
     * @throws IOException if the request fails or the file cannot be written
     */
    private static String saveImage(String rimageSrc) throws IOException {
        String rimageName = rimageSrc.substring(rimageSrc.lastIndexOf("/") + 1);
        String localPath = IMAGE_DIR + rimageName;

        HttpGet get = new HttpGet(rimageSrc);
        // try-with-resources releases the client and the response (and with it the underlying
        // connection) even when the request or the copy throws.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                // A response is allowed to carry no entity at all.
                if (entity != null) {
                    try (InputStream inputStream = entity.getContent();
                         FileOutputStream outputStream = new FileOutputStream(localPath)) {
                        IOUtils.copy(inputStream, outputStream);
                    }
                }
            }
        }
        return localPath;
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值