java爬取表单post提交参数后返回页面的图片

java爬取表单post提交参数后返回页面的图片

爬取的网站链接

link

分析网站

在这里插入图片描述
输入查询条件并点击搜索后,我们对该请求进行抓包
在这里插入图片描述
在这里插入图片描述
抓包得到的请求参数,就是下面代码中需要以键值对形式放入 POST 请求体的内容。代码如下:

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;

import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
    //获取图片的url地址
    /**
     * Posts the search form to {@code newUrl} and collects the image URLs found in the
     * HTML page that comes back.
     *
     * <p>The form always searches for the fixed term {@code "麻雀"}; only the page number
     * varies between calls. Each accepted URL is also printed to standard output.
     *
     * @param newUrl address the search form is posted to
     * @param i page number sent as the {@code page} form field
     * @return the {@code src} value of every {@code img} element whose {@code src} starts
     *     with {@code https}, in document order; empty if the response carries no body
     * @throws IOException if the request fails or the response body cannot be read
     */
    public static List<String> getUrl(String newUrl,int i) throws IOException{
        // Closing the client releases its connection manager and any connection still held.
        try (CloseableHttpClient client = HttpClientBuilder.create().build()) {
            HttpPost post = new HttpPost(newUrl);

            // Browser-like request headers. setHeader (rather than addHeader) guarantees each
            // header is sent exactly once, so no duplicate or conflicting values go out.
            post.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " +
                    "Chrome/76.0.3809.132 Safari/537.36");
            post.setHeader("Referer","https://www.birdnet.cn/atlas.php?mod=show&action=atlaslist");
            post.setHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
            post.setHeader("Connection","keep-alive");
            post.setHeader("Content-Type","application/x-www-form-urlencoded");
            // NOTE(review): this looks like a session cookie captured from a browser; it
            // presumably expires and may need refreshing - confirm before relying on it.
            post.setHeader("Cookie","Uy6T_2132_saltkey=XOdZD1pI; Uy6T_2132_lastvisit=1571735180; UM_distinctid=16df2ee85841c-06858747280452-5373e62-100200-16df2ee85851a1; Hm_lvt_44bc9d6e0d240a547107872c37798d70=1571738781; CNZZDATA1277640886=1938566344-1571737312-%7C1571743680; Uy6T_2132_sendmail=1; Uy6T_2132_sid=SpbQ31; Uy6T_2132_lastact=1571746309%09atlas.php%09show; Hm_lpvt_44bc9d6e0d240a547107872c37798d70=1571746310");
            post.setHeader("Upgrade-Insecure-Requests","1");
            post.setHeader("Sec-Fetch-Mode","navigate");
            post.setHeader("Sec-Fetch-Site","same-origin");

            // Form fields of the search request, sent as key/value pairs in the POST body.
            List<BasicNameValuePair> pairList = new ArrayList<BasicNameValuePair>();
            pairList.add(new BasicNameValuePair("all_name", "麻雀"));
            pairList.add(new BasicNameValuePair("mod", "show"));
            pairList.add(new BasicNameValuePair("action", "atlaslist"));
            pairList.add(new BasicNameValuePair("searchType", "1"));
            pairList.add(new BasicNameValuePair("page", Integer.toString(i)));
            // The target page is GBK-encoded, so the form body is encoded the same way.
            post.setEntity(new UrlEncodedFormEntity(pairList, "GBK"));

            HttpResponse httpResponse = client.execute(post);
            HttpEntity body = httpResponse.getEntity();
            List<String> urls = new ArrayList<String>();
            if (body == null) {
                return urls;
            }
            // Fall back to GBK when the response does not declare a charset itself.
            String content = EntityUtils.toString(body, "GBK");

            // The src attribute of each img element is the image link.
            Document dc = Jsoup.parse(content);
            Elements elements = dc.select("img");
            for (Element e : elements) {
                String src = e.attr("src");
                if (src.startsWith("https")) {
                    urls.add(src);
                    System.out.println(src);
                }
            }
            return urls;
        }
    }

   //    下载图片,这里的形参url是图片的地址,i是用来给下载的图片命名的
    /**
     * Downloads the image at {@code url} and saves it as {@code D:\bird\<i><ext>}, where
     * {@code <ext>} is the file extension taken from the URL.
     *
     * <p>Prints the target file name before the download and the elapsed time in
     * milliseconds afterwards.
     *
     * @param url address of the image to download
     * @param i number used as the base name of the saved file
     * @throws IOException if the request fails, the response has no body, or the file
     *     cannot be written
     */
    public static void down_img(String url,int i) throws IOException {
        String ext = extensionOf(url);
        System.out.println("下载图片:" + i + ext);

        long startedAt = System.currentTimeMillis();
        // Both the client and the output file are closed even if the transfer fails midway.
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            HttpResponse response = httpClient.execute(new HttpGet(url));
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body for " + url);
            }
            try (OutputStream out = new FileOutputStream("D:\\bird\\" + i + ext)) {
                entity.writeTo(out);
            }
        }
        long time = System.currentTimeMillis() - startedAt;
        System.out.println("耗时:" + time + "ms");
    }

    /**
     * Returns the file extension (including the leading dot) of the last path segment of
     * {@code url}, ignoring any query string or fragment; empty if that segment has no dot.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int dot = path.lastIndexOf('.');
        // A dot before the last slash belongs to the host or a directory, not the file name.
        if (dot <= path.lastIndexOf('/')) {
            return "";
        }
        return path.substring(dot);
    }
       /**
        * Collects the image URLs from result pages 1 through 10 of the search, prints how
        * many were found, and downloads each one, naming the files with consecutive numbers
        * starting at 401.
        *
        * @param args command-line arguments; not used
        * @throws IOException if fetching a result page or downloading an image fails
        */
       public static void main(String[] args) throws IOException {
           List<String> list = new ArrayList<String>();
           for (int page = 1; page <= 10; page++) {
               // Pass the loop counter so each iteration requests a different result page.
               List<String> listUrl = getUrl("https://www.birdnet.cn/atlas.php?mod=show&action=atlaslist", page);
               list.addAll(listUrl);
           }
           System.out.println(list.size());

           // Separate counter used only to name the downloaded files.
           int fileIndex = 401;
           for (String url : list) {
               down_img(url, fileIndex);
               fileIndex++;
           }
       }

ok,到这里就完成了
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值