项目概述
WallpapersHome是一个提供高质量壁纸的网站,包含自然、科技、艺术等多种类别的壁纸。我们的目标是编写一个爬虫程序,自动化地从该网站下载指定类别的高分辨率壁纸到本地文件系统。
关键技术栈
- Jsoup: 一个Java库,用于解析HTML文档,从中提取和操作数据。
- Hutool: 一个Java工具类库,封装了文件操作、网络请求等常用操作,简化Java开发。
- Java并发: 通过Java的并发工具,提高爬虫的下载效率。
实现步骤
- 获取壁纸列表: 使用Jsoup连接到WallpapersHome的特定类别页面,解析该页面的HTML文档,获取所有壁纸的详情页面链接。
- 解析壁纸详情: 针对每个壁纸详情链接,再次使用Jsoup获取高分辨率壁纸的直接下载URL。
- 下载壁纸: 利用Hutool的 HttpUtil.downloadFile 方法根据解析出的下载URL下载壁纸,并使用 FileUtil 保存到本地文件系统。
- 处理并发下载: 虽然示例代码中未使用并发下载,但提到了可以使用Java的并发工具(如 ExecutorService)来提升下载效率。
关键代码解析
获取壁纸列表
public List<String> getForm(Integer pageNum) throws IOException {
String url = formUrl + "?page=" + pageNum;
Connection connect = Jsoup.connect(url);
// 设置请求头...
Document doc = connect.get();
Elements links = doc.select("a[href]");
// 解析并返回所有壁纸详情页链接...
}
解析壁纸下载URL
private String getDownloadUrl(String url) throws IOException {
Connection connect = Jsoup.connect(url);
// 设置请求头...
Document doc = connect.get();
// 解析最高分辨率的壁纸下载链接...
return links.stream().filter(link -> link.contains(maxRatio)).findFirst().orElse("");
}
下载壁纸
private void download(String url) throws IOException {
// 提取文件名...
HttpUtil.downloadFile(url, FileUtil.file(filePath + fileName), new StreamProgress() {
public void start() { /* 开始下载... */ }
public void progress(long total, long progressSize) { /* 下载进度... */ }
public void finish() { /* 完成下载... */ }
});
}
实践应用与注意事项
此爬虫项目为开发者提供了一个实际应用场景,演示了如何结合Jsoup和Hutool库进行Web爬虫开发。在应用该爬虫时,请确保遵守WallpapersHome网站的使用条款,避免违反版权或其他法律法规。
通过合理利用Java并发机制,可以大幅提升爬虫的效率,实现快速下载。不过,合理控制并发数量,防止过度请求导致的IP封禁或对目标网站造成不必要的负载,是开发过程中需要特别注意的问题。
源码如下:
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.StreamProgress;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Crawler that downloads the highest-resolution wallpapers of one WallpapersHome
 * category, page by page, into a local directory.
 *
 * <p>Flow: {@link #getForm(Integer)} collects the detail-page links of a listing page,
 * {@code getDownloadUrl} picks the widest resolution offered on a detail page, and
 * {@code download} stores that file below {@code filePath}.
 *
 * <p>The crawl position ({@code page}, {@code pageCount}) and the download counter are
 * static fields, i.e. shared by every instance of this class.
 */
public class Download {
    /**
     * Category listing URL that is crawled page by page.
     */
    private String formUrl = "https://wallpapershome.com/nature";
    /**
     * Site root; prepended to the relative links found on listing and detail pages.
     */
    private String mainUrl = "https://wallpapershome.com";
    /**
     * Local directory the wallpapers are saved into.
     */
    private String filePath = "F:\\壁纸\\";
    /**
     * Total number of listing pages; raised by {@link #getForm(Integer)} as pagination links are seen.
     */
    private static Integer pageCount = 1;
    /**
     * Listing page currently being processed (1-based).
     */
    private static Integer page = 1;
    /**
     * Number of files whose download reported completion.
     */
    private static AtomicInteger downCount = new AtomicInteger(0);

    /**
     * Query-string prefix of the pagination links on a listing page.
     */
    private static final String PAGE_PREFIX = "?page=";
    /**
     * Download timeout in milliseconds (5 minutes).
     */
    private static final int DOWNLOAD_TIMEOUT_MS = 1000 * 60 * 5;
    private static final String HOST = "wallpapershome.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0";
    // NOTE(review): both cookies embed a captured browser session id; they will expire and
    // should eventually come from configuration instead of source code.
    private static final String LIST_COOKIE = "sid=g4m2ugp3v0sicdutrpqeepeqa1; screen-width=1440; screen-height=900; ratio=2; window-width=1392; col-width=1115; window-height=744; FCNEC=%5B%5B%22AKsRol_46hf2y-AJJMJjAiq0OgWf0wejs54kaJMdEnCoI9yKSQPj6AOgAgEB9oidBjSxbXtAMX58lbRNiBUaM_4v3DLqN2o1hxYouVGtBgYBjoEZIlzHd1hhNhXe_-RKSylBcQnBlysPs5cujvl6p2tJcUIHEgX8zA%3D%3D%22%5D%5D";
    private static final String DETAIL_COOKIE = "sid=g4m2ugp3v0sicdutrpqeepeqa1; screen-width=1440; screen-height=900; ratio=2; FCNEC=%5B%5B%22AKsRol8iocKLTfJ3-7NEGMVingOM01ugKcNMT6KJgW7mBKWkdyKrX4nFYMvemNKhehcHRa_WEev2BEc7SYo0mloS-q1g0HF_AV6RB4n_YPQx9Q23ZHR_F3TuBMUJ0Em3K41uJ1nWPVtn93l4f-MPJnIaHa3b0hJH0w%3D%3D%22%5D%5D; window-width=1396; window-height=750; col-width=1118";

    /**
     * Crawls every listing page of the configured category and downloads each wallpaper
     * sequentially, pausing 1.5 s between pages. The first failure aborts the crawl; the
     * summary line is printed in every case.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Download download = new Download();
        // Optional concurrent variant (kept disabled): create a fixed pool before the loop,
        //     ExecutorService executor = Executors.newFixedThreadPool(5);
        // submit one task per detail page through a CompletionService,
        //     CompletionService<Void> completionService = new ExecutorCompletionService<>(executor);
        //     for (String url : urllist) {
        //         completionService.submit(() -> {
        //             String downloadUrl = download.getDownloadUrl(url);
        //             download.download(downloadUrl);
        //             return null;
        //         });
        //     }
        // wait for all of them (take() yields tasks in completion order),
        //     for (int i = 0; i < urllist.size(); i++) {
        //         completionService.take().get();
        //     }
        // and call executor.shutdown() in the finally block.
        try {
            while (page <= pageCount) {
                // getForm also updates pageCount from the pagination links it sees.
                List<String> urllist = download.getForm(page);
                System.out.println("=========================开始下载第" + page + "页==========================");
                if (page == 1) {
                    System.out.println("============================总共" + pageCount + "页============================");
                }
                for (String url : urllist) {
                    String downloadUrl = download.getDownloadUrl(url);
                    download.download(downloadUrl);
                }
                // Report the page that was just finished BEFORE advancing the counter.
                System.out.println("第" + page + "页下载完成");
                page++;
                Thread.sleep(1500);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers/JVM shutdown logic can observe it.
            Thread.currentThread().interrupt();
            System.out.println("文件下载失败:" + e.getMessage());
        } catch (Exception e) {
            System.out.println("文件下载失败:" + e.getMessage());
            e.printStackTrace();
        } finally {
            System.out.println("============所有文件下载完成,本次任务成功下载" + downCount.get() + "个文件========");
        }
    }

    /**
     * Fetches one listing page and returns the absolute URLs of the wallpaper detail pages
     * linked from it (every {@code href} ending in {@code .html}), without duplicates and in
     * document order. As a side effect, {@code pageCount} is raised to the largest page
     * number found among the {@code ?page=N} pagination links.
     *
     * @param pageNum 1-based page number, or {@code null} to fetch the category URL as is
     * @return detail-page URLs; empty when the page links to none
     * @throws IOException if the page cannot be fetched
     */
    public List<String> getForm(Integer pageNum) throws IOException {
        String url = formUrl;
        if (Objects.nonNull(pageNum)) {
            url = formUrl + PAGE_PREFIX + pageNum;
        }
        Document doc = fetch(url, LIST_COOKIE);
        List<String> result = new ArrayList<>();
        for (Element link : doc.select("a[href]")) {
            String href = link.attr("href");
            if (StrUtil.isBlank(href)) {
                continue;
            }
            if (href.endsWith(".html")) {
                String detailUrl = mainUrl + href;
                // The same wallpaper may be linked more than once; download it only once.
                if (!result.contains(detailUrl)) {
                    result.add(detailUrl);
                }
            } else if (href.startsWith("?page")) {
                int num = parsePageNumber(href);
                if (num > pageCount) {
                    pageCount = num;
                }
            }
        }
        return result;
    }

    /**
     * Extracts N from a {@code ?page=N...} link.
     *
     * @return the page number, or -1 when the link carries no parsable number
     */
    private static int parsePageNumber(String href) {
        if (!href.startsWith(PAGE_PREFIX)) {
            return -1;
        }
        int start = PAGE_PREFIX.length();
        int end = start;
        // Stop at the first non-digit so trailing parameters ("&sort=...") are ignored.
        while (end < href.length() && href.charAt(end) >= '0' && href.charAt(end) <= '9') {
            end++;
        }
        if (end == start) {
            return -1;
        }
        try {
            return Integer.parseInt(href.substring(start, end));
        } catch (NumberFormatException ignored) {
            // Digit run too long for an int; treat as "no page number".
            return -1;
        }
    }

    /**
     * Opens {@code url} with the browser-like headers the crawler sends and parses the response.
     */
    private Document fetch(String url, String cookie) throws IOException {
        Connection connect = Jsoup.connect(url);
        connect.header("Cookie", cookie);
        connect.header("Host", HOST);
        connect.header("User-Agent", USER_AGENT);
        return connect.get();
    }

    /**
     * Determines the download link of the widest resolution offered on a detail page.
     *
     * <p>Every {@code <p>} whose second space-separated token looks like {@code WIDTHxHEIGHT}
     * is a candidate, provided some link on the page contains that token; the candidate with
     * the largest width wins.
     *
     * @param url absolute URL of the wallpaper detail page
     * @return absolute download URL, or an empty string when the page offers no resolution
     * @throws IOException if the page cannot be fetched
     */
    private String getDownloadUrl(String url) throws IOException {
        Document doc = fetch(url, DETAIL_COOKIE);
        List<String> links = new ArrayList<>();
        for (Element link : doc.select("a[href]")) {
            links.add(link.attr("href"));
        }
        int max = 0;
        String maxRatio = "";
        String maxRatio2 = "";
        for (Element paragraph : doc.select("p")) {
            String[] parts = paragraph.text().split(" ");
            if (parts.length < 2 || !parts[1].contains("x")) {
                continue;
            }
            final String resolution = parts[1];
            String[] dims = resolution.split("x");
            if (dims.length < 2 || !dims[0].matches("\\d+")) {
                continue;
            }
            int width;
            try {
                width = Integer.parseInt(dims[0]);
            } catch (NumberFormatException ignored) {
                // Digit run too long for an int; not a real resolution.
                continue;
            }
            if (max < width && links.stream().anyMatch(link -> link.contains(resolution))) {
                max = width;
                maxRatio = resolution;
                maxRatio2 = parts[0];
            }
        }
        System.out.println("最大分辨率:" + maxRatio2 + " " + maxRatio);
        if (StrUtil.isBlank(maxRatio)) {
            // Without this guard contains("") would match every link and an arbitrary
            // page link would be "downloaded" as a wallpaper.
            return "";
        }
        String finalMaxRatio = maxRatio;
        Optional<String> first = links.stream().filter(link -> StrUtil.isNotBlank(link) && link.contains(finalMaxRatio)).findFirst();
        return first.map(s -> mainUrl + s).orElse("");
    }

    /**
     * Downloads {@code url} into {@code filePath}, logging progress, and counts the file once
     * the transfer reports completion. A blank URL is ignored.
     *
     * <p>The local file name is the part of the URL's last path segment after its last
     * {@code '-'} (the whole segment when it has none).
     *
     * @param url absolute URL of the image file; may be blank
     * @throws IOException declared for callers; the Hutool call itself reports failures unchecked
     */
    private void download(String url) throws IOException {
        if (StrUtil.isBlank(url)) {
            return;
        }
        // Look at the last path segment only, so a '-' in the host or in a directory name
        // can never leak a '/' into the local file name.
        String lastSegment = url.substring(url.lastIndexOf("/") + 1);
        String tail = lastSegment.substring(lastSegment.lastIndexOf("-") + 1);
        final String fileName = StrUtil.isBlank(tail) ? lastSegment : tail;
        if (StrUtil.isBlank(fileName)) {
            // URL ends with '/': there is no file name to save under.
            return;
        }
        HttpUtil.downloadFile(url, FileUtil.file(filePath + fileName), DOWNLOAD_TIMEOUT_MS, new StreamProgress() {
            @Override
            public void start() {
                System.out.println(Thread.currentThread().getName() + ": " + fileName + "开始下载。。。。");
            }

            @Override
            public void progress(long total, long progressSize) {
                System.out.println(Thread.currentThread().getName() + ": " + fileName + ",已下载:" + FileUtil.readableFileSize(progressSize));
            }

            @Override
            public void finish() {
                downCount.incrementAndGet();
                System.out.println(Thread.currentThread().getName() + ": " + fileName + " 下载完成!");
            }
        });
    }
}