使用HttpClient和Jsoup爬取某网的妹子图片

工具:

 - HttpClient     模拟发送请求,获取网站Html数据
 - Jsoup          解析Html数据,获取图片链接
 - Firebug        查看页面信息,寻找爬取规律

代码:

package ren.hz.spider.mzitu;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class GetInfoFromMz {

    /** Base directory under which downloaded images are saved (one sub-directory per album). */
    private static final String SAVE_DIR = "E:/mzitu/";

    /**
     * Crawls the album index page, then walks every album and every page inside
     * each album, downloading the page's main image to {@code SAVE_DIR + albumTitle}.
     *
     * @return always {@code null} (kept for backward compatibility with existing callers)
     */
    public String getInfo() {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try {
            Document listPage = fetchDocument(httpClient, "http://www.mzitu.com/all");
            if (listPage == null) {
                return null; // non-200 response; nothing to crawl
            }
            // Every album is an <a> inside <div class="all">.
            Elements albumLinks = listPage.select("div.all").select("a");
            // Print how many albums were found.
            System.out.println(albumLinks.size());
            for (Element album : albumLinks) {
                try {
                    crawlAlbum(httpClient, album);
                } catch (IOException e) {
                    // One broken album should not abort the whole crawl.
                    e.printStackTrace();
                }
            }
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so one catch suffices.
            e.printStackTrace();
        } finally {
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Downloads every page of a single album.
     *
     * @param httpClient shared client (owned by the caller)
     * @param album      the album's anchor element: text = title, href = album URL
     */
    private void crawlAlbum(CloseableHttpClient httpClient, Element album) throws IOException {
        String title = album.text().trim();   // album title doubles as the directory name
        String link = album.attr("href");     // album front-page URL
        String path = SAVE_DIR + title;

        Document front = fetchDocument(httpClient, link);
        if (front == null) {
            return;
        }
        // Max page number is read from the 11th <span> (index 10).
        // NOTE(review): the original comment claimed "the 21st span" while the code
        // used index 10 — verify this index against the current page markup.
        String maxSpan = front.select("span").get(10).text();
        int maxPage = Integer.parseInt(maxSpan);

        for (int i = 1; i <= maxPage; i++) {
            // Page 1 uses the bare album URL; later pages append "/<n>".
            String url = (i == 1) ? link : link + "/" + i;
            System.out.println(url);
            Document page = fetchDocument(httpClient, url);
            if (page == null) {
                continue;
            }
            String imgUrl = page.select("div.main-image").select("img").attr("src");
            if (!imgUrl.isEmpty()) {
                saveImage(httpClient, imgUrl, path);
            }
        }
    }

    /**
     * Executes a GET request and parses the response body as HTML.
     * The response (and its pooled connection) is always released via
     * try-with-resources — the original leaked every response it opened.
     *
     * @return the parsed document, or {@code null} when the server does not answer 200
     */
    private Document fetchDocument(CloseableHttpClient httpClient, String url) throws IOException {
        try (CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            HttpEntity entity = response.getEntity();
            return Jsoup.parse(EntityUtils.toString(entity));
        }
    }

    /**
     * Streams one image to disk under {@code dir}, creating directories as needed.
     * All three resources (response, input stream, output stream) are closed even
     * on failure; the original leaked the stream and left the file handle open on
     * exception. The "+ 1" drops the leading '/' that the original kept, which
     * produced paths like {@code dir//name.jpg}.
     */
    private void saveImage(CloseableHttpClient httpClient, String imgUrl, String dir) throws IOException {
        File file = new File(dir + "/" + imgUrl.substring(imgUrl.lastIndexOf("/") + 1));
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        // No explicit createNewFile() needed — FileOutputStream creates the file.
        try (CloseableHttpResponse response = httpClient.execute(new HttpGet(imgUrl));
             InputStream in = response.getEntity().getContent();
             FileOutputStream out = new FileOutputStream(file)) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = in.read(buf)) != -1) {
                out.write(buf, 0, n);
            }
            out.flush();
        }
    }

    public static void main(String[] args) {
        new GetInfoFromMz().getInfo();
    }

}
  • 3
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值