爬取的网站链接
分析网站
输入查询条件并点击搜索后,我们对浏览器发出的请求进行抓包
在这里插入图片描述
抓包得到的请求参数,需要在下面的代码中以键值对的形式放入POST请求的表单里,代码如下
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Fetches one page of search results and collects the image URLs found in it.
 *
 * <p>Sends a form-encoded POST to {@code newUrl} carrying a fixed search query
 * ({@code all_name}, {@code mod=show}, {@code action=atlaslist}, {@code searchType=1})
 * plus the requested page number, parses the returned HTML with Jsoup, and returns the
 * {@code src} attribute of every {@code <img>} element whose value starts with
 * {@code "https"}. Each collected URL is also printed to standard output.
 *
 * @param newUrl the endpoint the search form is posted to
 * @param i the page number, sent as the {@code page} form field
 * @return the matching image URLs in document order; empty if the response has no body
 * @throws IOException if the request fails or the response body cannot be read
 */
public static List<String> getUrl(String newUrl, int i) throws IOException {
    List<String> urls = new ArrayList<String>();

    // The client owns a connection pool; closing it releases the connection used below.
    try (CloseableHttpClient client = HttpClientBuilder.create().build()) {
        HttpPost post = new HttpPost(newUrl);

        // Browser-like request headers. setHeader (not addHeader) is used so that each
        // header name is sent exactly once.
        post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                + "Chrome/76.0.3809.132 Safari/537.36");
        post.setHeader("Referer", "https://www.birdnet.cn/atlas.php?mod=show&action=atlaslist");
        post.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        post.setHeader("Connection", "keep-alive");
        post.setHeader("Content-Type", "application/x-www-form-urlencoded");
        // NOTE(review): this is a session cookie captured from a browser; it will expire.
        post.setHeader("Cookie", "Uy6T_2132_saltkey=XOdZD1pI; Uy6T_2132_lastvisit=1571735180; UM_distinctid=16df2ee85841c-06858747280452-5373e62-100200-16df2ee85851a1; Hm_lvt_44bc9d6e0d240a547107872c37798d70=1571738781; CNZZDATA1277640886=1938566344-1571737312-%7C1571743680; Uy6T_2132_sendmail=1; Uy6T_2132_sid=SpbQ31; Uy6T_2132_lastact=1571746309%09atlas.php%09show; Hm_lpvt_44bc9d6e0d240a547107872c37798d70=1571746310");
        post.setHeader("Upgrade-Insecure-Requests", "1");
        post.setHeader("Sec-Fetch-Mode", "navigate");
        post.setHeader("Sec-Fetch-Site", "same-origin");

        // Form fields, sent as key/value pairs in the POST body (not as JSON).
        List<BasicNameValuePair> pairList = new ArrayList<BasicNameValuePair>();
        pairList.add(new BasicNameValuePair("all_name", "麻雀"));
        pairList.add(new BasicNameValuePair("mod", "show"));
        pairList.add(new BasicNameValuePair("action", "atlaslist"));
        pairList.add(new BasicNameValuePair("searchType", "1"));
        pairList.add(new BasicNameValuePair("page", Integer.toString(i)));
        // The form values are encoded as GBK because that is the page's encoding.
        post.setEntity(new UrlEncodedFormEntity(pairList, "GBK"));

        HttpResponse httpResponse = client.execute(post);
        HttpEntity body = httpResponse.getEntity();
        if (body == null) {
            // No response body means there is nothing to parse.
            return urls;
        }
        // "GBK" is only a fallback, used when the response does not declare a charset.
        String content = EntityUtils.toString(body, "GBK");

        // Every <img> whose src is an absolute https link is treated as a picture URL.
        Document dc = Jsoup.parse(content);
        for (Element img : dc.select("img")) {
            String src = img.attr("src");
            if (src.startsWith("https")) {
                urls.add(src);
                System.out.println(src);
            }
        }
    }
    return urls;
}
/**
 * Downloads one image and writes it into the hard-coded {@code D:\bird\} directory.
 *
 * <p>The file is named {@code i} followed by the extension taken from the URL. The file
 * name and the elapsed time in milliseconds are printed to standard output.
 *
 * @param url the address of the image to download
 * @param i the number used as the base name of the saved file
 * @throws IOException if the request fails, the response has no body, or the file cannot
 *     be written
 */
public static void down_img(String url, int i) throws IOException {
    String ext = imageExtension(url);
    System.out.println("下载图片:" + i + ext);

    long startedAt = System.nanoTime();
    // Both the client and the output file are closed even when the transfer fails.
    try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
        HttpResponse response = httpClient.execute(new HttpGet(url));
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            throw new IOException("No response body for " + url);
        }
        try (OutputStream out = new FileOutputStream("D:\\bird\\" + i + ext)) {
            entity.writeTo(out);
        }
    }
    long time = (System.nanoTime() - startedAt) / 1000000L;
    System.out.println("耗时:" + time + "ms");
}

// Returns the extension (leading dot included) of the URL's last path segment, ignoring any
// query string or fragment; returns "" when that segment has no dot.
private static String imageExtension(String url) {
    int end = url.length();
    int query = url.indexOf('?');
    if (query >= 0) {
        end = Math.min(end, query);
    }
    int fragment = url.indexOf('#');
    if (fragment >= 0) {
        end = Math.min(end, fragment);
    }
    String path = url.substring(0, end);
    String lastSegment = path.substring(path.lastIndexOf('/') + 1);
    int dot = lastSegment.lastIndexOf('.');
    return dot >= 0 ? lastSegment.substring(dot) : "";
}
/**
 * Collects the image URLs from result pages 1 through 10, prints how many were found,
 * and then downloads each one, numbering the saved files from 401 upward.
 *
 * @param args unused
 * @throws IOException if fetching a result page or downloading an image fails
 */
public static void main(String[] args) throws IOException {
    List<String> list = new ArrayList<String>();
    // Pass the loop counter as the page number so each iteration fetches a different page.
    for (int page = 1; page <= 10; page++) {
        List<String> listUrl = getUrl("https://www.birdnet.cn/atlas.php?mod=show&action=atlaslist", page);
        list.addAll(listUrl);
    }
    System.out.println(list.size());

    // Separate counter used only to name the downloaded files.
    int fileNumber = 401;
    for (String url : list) {
        down_img(url, fileNumber);
        fileNumber++;
    }
}
ok,到这里就完成了