刚开始学习 Java 爬虫,使用 Jsoup 和 HttpClient 写一个小 demo 进行入门学习。
1.引入依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2.编写保存图片实体类
/**
 * Value object describing one downloadable picture (its title and image URL).
 * <p>
 * NOTE: Lombok's {@code @Data} alone does not generate the two-argument
 * constructor that the downloader calls ({@code new Picture(title, url)}),
 * which would fail to compile. The accessors, constructors and the
 * equals/hashCode/toString contract {@code @Data} would have produced are
 * therefore written out explicitly.
 */
public class Picture {

    /** File name the image is saved under (page title + file extension). */
    private String title;
    /** Absolute URL of the full-size image. */
    private String url;

    public Picture() {
    }

    public Picture(String title, String url) {
        this.title = title;
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof Picture)) {
            return false;
        }
        Picture other = (Picture) o;
        return Objects.equals(title, other.title) && Objects.equals(url, other.url);
    }

    @Override
    public int hashCode() {
        return Objects.hash(title, url);
    }

    @Override
    public String toString() {
        return "Picture(title=" + title + ", url=" + url + ")";
    }
}
3.实现方法
package com.teset.demo.utils;
import com.teset.demo.entity.Picture;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.stream.Collectors;
/**
 * Wallpaper crawler: walks paginated category listings on the target site,
 * opens each picture's detail page, and streams the full-size image to disk
 * under {@code D:/bizhi/<category>/}.
 *
 * <p>Not thread-safe: {@code referer}, {@code rootPath} and {@code picUrl}
 * are per-category mutable state shared across methods.
 *
 * @Author Administrator
 * @Date 2021/7/30 14:57
 */
public class BizhiDownloader {

    private CloseableHttpClient httpClient;
    /** Category listing URL currently being crawled; sent as the Referer header (anti-hotlinking). */
    private String referer;
    /** Directory the current category's images are written to. */
    private String rootPath;
    /** Site root (scheme + host, no trailing '/'); detail hrefs and img srcs are host-absolute paths. */
    private String picUrl;

    public BizhiDownloader() {
        httpClient = HttpClientUtil.getHttpClient();
    }

    /**
     * Crawls each category URL: creates a local directory for it, then walks
     * the first 10 listing pages and downloads every picture found.
     *
     * @param urlList category listing URLs, each expected to end with '/'
     */
    public void start(List<String> urlList) {
        urlList.forEach(url -> {
            this.referer = url;
            // Directory name = last path segment of the URL. (The previous
            // hard-coded substring(22, len-1) produced names containing '/',
            // e.g. "m/4kmeinv", so mkdir() silently failed.)
            String trimmed = url.endsWith("/") ? url.substring(0, url.length() - 1) : url;
            String dirName = trimmed.substring(trimmed.lastIndexOf('/') + 1);
            File path = new File("D:/bizhi/", dirName);
            if (!path.exists()) {
                // mkdirs: also creates D:/bizhi itself when missing.
                path.mkdirs();
            }
            rootPath = path.toString();
            // Site root (scheme + host): detail hrefs like "/tupian/x.html"
            // are host-absolute, so they must be resolved against the host,
            // not against the category URL (which produced ".../4kmeinv//tupian/...").
            int pathStart = url.indexOf('/', url.indexOf("//") + 2);
            picUrl = pathStart > 0 ? url.substring(0, pathStart) : url;
            // Only fetch the first few listing pages; page 1 has no "index_" suffix.
            for (int i = 1; i <= 10; i++) {
                if (i == 1) {
                    this.page(url);
                } else {
                    this.page(url + "index_" + i + ".html");
                }
            }
        });
    }

    /**
     * Fetches one listing page and downloads every picture it links to.
     *
     * @param url absolute URL of the listing page
     */
    public void page(String url) {
        System.out.println("url:" + url);
        String html = this.getHtml(url);
        Map<String, String> picMap = this.extractTitleUrl(html);
        if (picMap == null) {
            return;
        }
        this.getPictureHtml(picMap);
    }

    /**
     * Visits each picture's detail page and downloads its full-size image,
     * pausing between pictures to throttle the crawl.
     *
     * @param picMap title -> host-absolute detail-page href
     */
    public void getPictureHtml(Map<String, String> picMap) {
        picMap.forEach((title, href) -> {
            String html = this.getHtml(picUrl + href);
            if (html == null) {
                // Fetch failed; skip this picture.
                return;
            }
            Picture picture = this.extractPictureUrl(html);
            if (picture == null) {
                // Page layout not as expected; skip rather than NPE.
                return;
            }
            System.out.println("开始下载");
            this.download(picture);
            try {
                // Be polite to the server: half-second pause per picture.
                Thread.sleep(500);
                System.out.println("爬取完一张图片,休息0.5秒。");
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of swallowing it.
                Thread.currentThread().interrupt();
            }
        });
    }

    /**
     * Parses a detail page into a {@link Picture}: the H1 text plus the
     * image's file extension becomes the file name; the img src (host-absolute)
     * is resolved against the site root.
     *
     * @return the picture, or null when the expected elements are missing
     */
    private Picture extractPictureUrl(String html) {
        // Note: the second argument of Jsoup.parse(String, String) is a base
        // URI, not a charset — the old code passed "GBK" there by mistake.
        // The html String is already decoded, so no charset is needed here.
        Document doc = Jsoup.parse(html);
        Elements h1s = doc.getElementsByTag("H1");
        Elements imgBoxes = doc.getElementsByAttributeValue("id", "img");
        if (h1s.isEmpty() || imgBoxes.isEmpty()) {
            return null;
        }
        String src = imgBoxes.first().getElementsByTag("img").attr("src");
        int dot = src.lastIndexOf('.');
        // Append the image's file extension to the title to form the file name.
        String title = h1s.first().text() + (dot >= 0 ? src.substring(dot) : "");
        return new Picture(title, picUrl + src);
    }

    /**
     * GETs a page and returns its body decoded as GBK.
     *
     * @param url absolute URL to fetch
     * @return the page HTML, or null when the status is not 200 or an I/O error occurs
     */
    private String getHtml(String url) {
        String html = null;
        HttpGet get = new HttpGet(url);
        get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
        get.setHeader("referer", url);
        try (CloseableHttpResponse response = httpClient.execute(get)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // The site serves GBK-encoded pages.
                    html = EntityUtils.toString(entity, "GBK");
                }
            } else {
                System.out.println(statusCode);
            }
        } catch (IOException e) {
            // ClientProtocolException extends IOException; one catch suffices.
            e.printStackTrace();
        }
        return html;
    }

    /**
     * Extracts title -> detail-page href pairs from a listing page.
     *
     * @param html listing page HTML, may be null
     * @return map of picture title (img alt text) to href, or null when html is null
     */
    private Map<String, String> extractTitleUrl(String html) {
        if (html == null) {
            return null;
        }
        // (Old code passed "UTF-8" as the base-URI argument — meaningless here.)
        Document doc = Jsoup.parse(html);
        Elements pictures = doc.select("ul.clearfix > li");
        // First <a> of each <li> carries both the thumbnail and the detail link.
        Elements anchors = pictures.stream()
                .map(pic -> pic.getElementsByTag("a").first())
                .collect(Collectors.toCollection(Elements::new));
        // Merge function keeps the first entry: duplicate alt texts would
        // otherwise make Collectors.toMap throw IllegalStateException.
        return anchors.stream().collect(Collectors.toMap(
                pic -> pic.getElementsByTag("img").first().attr("alt"),
                pic -> pic.attr("href"),
                (first, second) -> first));
    }

    /**
     * Downloads one image to {@code rootPath/title}, rotating the User-Agent
     * and sending the category page as Referer to pass the anti-hotlink check.
     *
     * @param picture title (file name) and absolute image URL
     */
    public void download(Picture picture) {
        HttpGet get = new HttpGet(picture.getUrl());
        Random rand = new Random();
        get.setHeader("User-Agent", HeaderUtil.headers[rand.nextInt(HeaderUtil.headers.length)]);
        get.setHeader("referer", referer);
        System.out.println(referer);
        HttpEntity entity = null;
        try (CloseableHttpResponse response = httpClient.execute(get)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                entity = response.getEntity();
                if (entity != null) {
                    File picFile = new File(rootPath, picture.getTitle());
                    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(picFile))) {
                        entity.writeTo(out);
                        System.out.println("下载完毕:" + picFile.getAbsolutePath());
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                // Fully consume the entity so the pooled connection can be reused.
                EntityUtils.consume(entity);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
4.启动
public static void main(String[] args) {
    // Seed the crawl queue with the category pages to visit.
    List<String> urlList = new ArrayList<>();
    urlList.add("https://pic.netbian.com/");
    urlList.add("https://pic.netbian.com/4kmeinv/");
    urlList.add("https://pic.netbian.com/4kdongman/");
    // Launch the crawler over the initial queue.
    new BizhiDownloader().start(urlList);
}