Java 爬虫(HttpClient + Jsoup)爬取高清大图的简单实例
爬取的网站:
http://www.netbian.com/
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.Scanner;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
/**
 * 彼岸桌面壁纸爬虫(HttpClient 4.x + Jsoup)。
 *
 * 流程:打印类别菜单 → 读取用户选择 → 抓取该类别的列表页 →
 * 逐个进入详情页解析出高清图地址 → 下载到 C:/爬虫图片/<类别>/ 下。
 * 选择 26 时自动依次下载全部类别;选择 0 或非法编号时退出。
 */
public class Main {
    // 累计已下载的图片张数(跨类别、跨页累加)
    static int sum = 0;

    // 各类别在站点 URL 中对应的路径片段;下标 = 菜单编号 - 1
    private static final String[] CATEGORIES = {
            "rili", "dongman", "fengjing", "meinv", "youxi", "yingshi", "dongtai",
            "weimei", "sheji", "keai", "qiche", "huahui", "dongwu", "jieri",
            "renwu", "meishi", "shuiguo", "jianzhu", "tiyou", "junshi",
            "feizhuliu", "qita", "s/wangzherongyao", "s/huyan", "s/lol"
    };

    public static void main(String[] args) throws ClientProtocolException, IOException, InterruptedException {
        Scanner in = new Scanner(System.in);   // 只创建一次,循环内复用
        boolean selectAll = false;             // 为真时逐个自动下载全部类别
        int ch = 0;                            // 当前菜单编号
        while (true) {
            System.out.println("彼岸桌面壁纸\n壁纸预览请打开网址http://www.netbian.com/\n壁纸的大小为1920*1080\n请选择你要下载的壁纸类型\t\n1.日历壁纸 2.动漫壁纸 3.风景壁纸 4.美女壁纸 5.游戏"
                    + "\n6.影视 7.动态 8.唯美壁纸 9.设计 10.可爱壁纸 "
                    + "\n11.汽车壁纸 12.花卉 13.动物 14.节日 15.人物 \n16.美食 17.水果 18.建筑 19.体育 20.军事 \n21.非主流 22.其它 23.王者荣耀 24.护眼 25.LOL"
                    + "\n26.下载以上全部类型\n0.退出");
            if (!selectAll) {
                ch = in.nextInt();             // 手动模式:读取用户选择
            }
            if (ch == 26) {                    // 26 = 下载全部:切到自动模式,从类别 1 开始
                selectAll = true;
                ch = 0;
            }
            if (selectAll) {
                ch++;                          // 自动模式:每轮前进一个类别
            }
            if (ch < 1 || ch > CATEGORIES.length) {
                System.exit(0);                // 0、非法输入、或全部类别下载完毕:退出
            }
            String flage = CATEGORIES[ch - 1];
            try {
                downLoadPicture(1, flage);     // 从第 1 页开始抓取该类别
            } catch (Exception e) {
                // 约定:页码超出范围 / 页面元素缺失时抛异常,作为“该类别下载完毕”的信号
                System.out.println("感谢你的使用!总共为你下载了" + sum + "张图;目录在C:/爬虫图片/" + flage + "/");
                System.out.println("下载完毕,返回主菜单!");
            }
        }
    }

    /**
     * 从指定页码开始,循环下载某一类别的全部壁纸,直到列表页取不到
     * 第 21 个条目(或详情页结构缺失)抛出异常为止——调用方将该异常视为结束信号。
     *
     * @param page  起始页码(第 1 页的 URL 没有 index_N 后缀)
     * @param flage 类别路径片段,如 "fengjing" 或 "s/lol"
     */
    public static void downLoadPicture(int page, String flage) throws ClientProtocolException, IOException, InterruptedException {
        // 整个类别复用同一个 HttpClient;try-with-resources 保证连接资源被释放
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            while (true) {                     // 原实现用尾递归翻页,会耗尽调用栈;改为循环
                String adress = (page == 1)
                        ? "http://www.netbian.com/" + flage + "/index.htm"
                        : "http://www.netbian.com/" + flage + "/index_" + page + ".htm";
                // 抓取列表页源码(UTF-8 解码,避免中文乱码)
                String content;
                try (CloseableHttpResponse listResponse = httpClient.execute(new HttpGet(adress))) {
                    content = EntityUtils.toString(listResponse.getEntity(), "UTF-8");
                }
                Document document = Jsoup.parse(content);
                // 选择器直接取自浏览器;命中列表页上的略缩图链接 <a>
                Elements elements = document.select("div.wrap.clearfix #main div.list ul li a ");
                for (int i = 0; i < 21; i++) {
                    // 条目不足 21 个时 get(i) 抛 IndexOutOfBoundsException,由调用方视为下载完毕
                    Element element = elements.get(i);
                    String attr = element.attr("href");
                    if (attr.equals("https://pic.netbian.com/")) {
                        continue;              // 广告位链接,跳过
                    }
                    // 抓取详情页,从中解析出高清原图地址
                    String content2;
                    try (CloseableHttpResponse detailResponse =
                                 httpClient.execute(new HttpGet("http://www.netbian.com" + attr))) {
                        content2 = EntityUtils.toString(detailResponse.getEntity(), "UTF-8");
                    }
                    Document document2 = Jsoup.parse(content2);
                    Element img = document2.select("div.endpage div.pic p a img").get(0);
                    String src = img.attr("src");   // <img src=...> 即高清图地址
                    // 下载原图并写入本地文件;流随响应一起关闭
                    try (CloseableHttpResponse imageResponse = httpClient.execute(new HttpGet(src));
                         InputStream stream = imageResponse.getEntity().getContent()) {
                        FileUtils.copyInputStreamToFile(stream,
                                new File("C://爬虫图片//" + flage + "//" + page + "-" + i + ".png"));
                    }
                    sum++;
                    System.out.println("恭喜第" + page + " 页的图片正在下载......总共下载了" + sum + "张图;目录在C:/爬虫图片/" + flage + "/");
                }
                page++;                        // 下一页
            }
        }
    }
}
运行效果如下
仅供学习和研究使用。
(运行效果截图略)
我是一个 Java 初学者:
由于我对爬虫非常感兴趣,但主学的又不是 Python,于是换一种方式来写爬虫。这个例子是我在 B 站看视频学会的,想学的同学也可以去看看:
(https://www.bilibili.com/video/BV1wJ411J7dm)