首先在项目的 pom.xml 中添加 jsoup 依赖:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
爬取的网站:土豪漫画(https://zhwsxx.com)
废话不多说,直接上代码。
package com.cong.paCong;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.swing.filechooser.FileSystemView;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
/**
* 土豪漫画 www.zhwsxx.com
* extends paCongUtils
*/
public class zhwsxxCom {
/**
 * Root output directory for downloaded comics: the "manHua" folder appended to the path
 * returned by {@code FileSystemView.getHomeDirectory()}.
 * NOTE(review): the original comment calls this the desktop path; presumably that holds on
 * Windows only - confirm for other platforms.
 * NOTE(review): the hard-coded "\\" separator is Windows-specific.
 */
public static String top = FileSystemView.getFileSystemView().getHomeDirectory().getPath() + "\\manHua";
/* Base URL of the scraped site; getChapter prepends it to the chapter hrefs it reads. */
public static final String URL = "https://zhwsxx.com";
/* Chapter index page of the comic to download. */
public static final String manhua_url = "https://zhwsxx.com/book/1";
/* Comic title; main passes it to setUrl, which uses it as a folder name. */
public static final String name = "斗破苍穹";
/**
 * Entry point: walks every chapter of {@code manhua_url}, resolves the image list for the
 * chapter and downloads each image, numbering pages from 0 in list order.
 *
 * @param args unused
 */
public static void main(String[] args) {
    for (Map.Entry<String, String> chapter : getChapter(manhua_url).entrySet()) {
        // The value is a comma-separated list of page URLs for this chapter.
        List<String> imageUrls = getPage(chapter.getValue());
        int pageIndex = 0;
        for (String imageUrl : imageUrls) {
            setUrl(imageUrl, name, chapter.getKey(), pageIndex);
            pageIndex++;
        }
    }
}
/**
 * Picks, among several alternative page URLs of one chapter, the image list with the most
 * reachable images.
 *
 * @param dis comma-separated page URLs; {@code null} and empty entries are skipped
 * @return the image URLs scraped from the page with the highest number of reachable images
 *         (when several pages tie, the one listed last wins); an empty list when
 *         {@code dis} contains no usable URL
 */
public static List<String> getPage(String dis) {
    List<String> best = new ArrayList<>();
    if (dis == null) {
        return best;
    }
    long bestCount = -1;
    for (String pageUrl : dis.split(",")) {
        // Compare by content: the original "!= \"\"" reference check was always true.
        if (pageUrl.isEmpty()) {
            continue;
        }
        /* Scrape the image URLs offered by this page. */
        List<String> urlList = getUrlList(pageUrl);
        System.err.println(pageUrl);
        /* Count how many of those images can actually be fetched. */
        long reachable = urlList.stream().filter((e) -> URLWhether(e)).count();
        System.err.println(reachable);
        // ">=" keeps the previous behaviour where a later page with the same count
        // replaced the earlier one.
        if (reachable >= bestCount) {
            bestCount = reachable;
            best = urlList;
        }
    }
    if (bestCount >= 0) {
        System.err.println(bestCount);
    }
    return best;
}
/**
 * Scrapes the image URLs of one chapter page.
 *
 * @param url address of the chapter page
 * @return the {@code data-original} attribute of every image found under
 *         {@code #content .comiclist .comicpage div img}, in document order; an empty list
 *         when the page could not be fetched ({@code jsoupGet} returned {@code null}) or
 *         has no element with id {@code content}
 */
public static List<String> getUrlList(String url) {
    List<String> imageUrls = new ArrayList<>();
    /* Fetch and parse the HTML. */
    Document document = jsoupGet(url);
    if (document == null) {
        return imageUrls;
    }
    /* Container that holds the comic pages. */
    Element content = document.getElementById("content");
    if (content == null) {
        return imageUrls;
    }
    /* The real image address is read from data-original. */
    for (Element img : content.select(".comiclist .comicpage div img")) {
        imageUrls.add(img.attr("data-original"));
    }
    return imageUrls;
}
/**
 * Reads the chapter list of a comic and groups the chapter links.
 *
 * <p>Steps:
 * <ol>
 * <li>Collect every {@code li > a} link inside {@code #detail-list-select}; links whose text
 *     (spaces removed) is identical are merged into one comma-separated URL string.</li>
 * <li>Group titles by the chapter number extracted with {@code getStrTurnNumber}. Titles
 *     without a number, or for which {@code getUseless} returns {@code false}, are stored
 *     under their own title. A numbered title is dropped when its number is smaller than
 *     the largest number grouped so far.</li>
 * <li>Emit one entry per group, keyed by the group's first title (with "话" replaced by
 *     "话 "), whose value is every URL of the group followed by a comma. Emission stops
 *     after the group whose key equals the largest chapter number.</li>
 * </ol>
 *
 * @param url address of the comic's chapter index page
 * @return chapter title mapped to comma-terminated page URLs, in iteration order; an empty
 *         map when the page could not be fetched or has no {@code #detail-list-select}
 */
public static LinkedHashMap<String, String> getChapter(String url) {
    LinkedHashMap<String, String> result = new LinkedHashMap<>();
    /* Fetch the HTML. */
    Document document = jsoupGet(url);
    if (document == null) {
        return result;
    }
    /* Container holding the chapter links. */
    Element chapterBox = document.getElementById("detail-list-select");
    if (chapterBox == null) {
        return result;
    }

    /* Step 1: title -> comma-separated absolute URLs. */
    LinkedHashMap<String, String> urlsByTitle = new LinkedHashMap<>();
    for (Element link : chapterBox.select("li > a")) {
        String href = link.attr("href");
        String title = link.text().replace(" ", "");
        String known = urlsByTitle.get(title);
        if (known != null) {
            urlsByTitle.put(title, known + "," + URL + href);
        } else {
            urlsByTitle.put(title, URL + href);
        }
    }

    /* Step 2: group key -> (title -> URLs). */
    LinkedHashMap<String, LinkedHashMap<String, String>> grouped = new LinkedHashMap<>();
    for (Map.Entry<String, String> entry : urlsByTitle.entrySet()) {
        String title = entry.getKey();
        String links = entry.getValue();
        /* Chapter number as digits, "" when the title has none. */
        String digits = getStrTurnNumber(title);
        /* Normalise through Integer so that e.g. "01" and "1" share a group. */
        String number = digits.isEmpty() ? "" : Integer.valueOf(digits).toString();
        /* Largest numeric group key seen so far, or null. */
        Integer max = getMax(grouped);
        if (max != null && !number.isEmpty() && Integer.parseInt(number) < max) {
            // Numbered title below the current maximum: ignored.
            continue;
        }
        if (number.isEmpty() || !getUseless(title)) {
            // Stored under its own title; an existing group with that key is replaced.
            LinkedHashMap<String, String> single = new LinkedHashMap<>();
            single.put(title, links);
            grouped.put(title.replace(":", "").replace(" ", ""), single);
        } else {
            // Joins the group of its chapter number, creating the group on first use.
            grouped.computeIfAbsent(number, (k) -> new LinkedHashMap<>()).put(title, links);
        }
    }

    /* Step 3: flatten each group into one result entry. */
    Integer finalMax = getMax(grouped);
    for (Map.Entry<String, LinkedHashMap<String, String>> group : grouped.entrySet()) {
        String firstTitle = "";
        boolean first = true;
        StringBuilder joined = new StringBuilder();
        for (Map.Entry<String, String> member : group.getValue().entrySet()) {
            if (first) {
                firstTitle = member.getKey();
                first = false;
            }
            joined.append(member.getValue()).append(",");
        }
        result.put(firstTitle.replace("话", "话 "), joined.toString());
        // Everything after the group of the highest chapter number is not emitted.
        if (finalMax != null && group.getKey().equals(String.valueOf(finalMax))) {
            break;
        }
    }
    return result;
}
/**
 * Finds the largest number among the keys of the given map.
 *
 * @param list map whose keys are inspected; only non-empty keys accepted by
 *             {@code isNumeric} are considered
 * @return the largest such key as an {@code Integer}, or {@code null} when no key qualifies
 */
public static Integer getMax(LinkedHashMap<String, LinkedHashMap<String, String>> list) {
    Integer largest = null;
    for (String key : list.keySet()) {
        if (key.length() == 0 || !isNumeric(key)) {
            continue;
        }
        Integer candidate = Integer.valueOf(key);
        if (largest == null || candidate.compareTo(largest) > 0) {
            largest = candidate;
        }
    }
    return largest;
}
/**
 * Checks a chapter title against a fixed list of filler markers (leave notices, year and
 * new-year specials, "下").
 *
 * <p>Note the direction of the result: despite the method name, {@code true} means the
 * title contains NONE of the markers.
 *
 * @param str chapter title to inspect
 * @return {@code false} when {@code str} contains at least one marker, {@code true} otherwise
 */
public static boolean getUseless(String str) {
    String[] markers = {"请假条", "请假", "2020年", "2021年", "新年", "下"};
    return Arrays.stream(markers).noneMatch(str::contains);
}
/**
 * Extracts the first number that appears in a chapter title.
 *
 * <p>The scan collects the first uninterrupted run of numeral characters. Characters
 * accepted by {@code Character.isDigit} always count; Chinese numerals
 * (一二三四五六七八九十百千万亿零) count only when the title starts with "第". When the
 * collected run starts with a Chinese numeral it is converted with
 * {@code chineseNumberInt}.
 *
 * @param text chapter title, may be {@code null}
 * @return the number as a string (e.g. "45" for both "第45话" and "第四十五话"), or ""
 *         when {@code text} is {@code null}, empty or contains no number
 */
public static String getStrTurnNumber(String text) {
    if (text == null || text.isEmpty()) {
        return "";
    }
    final String chineseNumerals = "一二三四五六七八九十百千万亿零";
    // Loop-invariant, so decided once instead of on every character.
    final boolean chineseAllowed = text.startsWith("第");
    StringBuilder run = new StringBuilder();
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        boolean numeral = Character.isDigit(c)
                || (chineseAllowed && chineseNumerals.indexOf(c) >= 0);
        if (numeral) {
            run.append(c);
        } else if (run.length() > 0) {
            // The first run of numerals has ended.
            break;
        }
    }
    /* At this point the run looks like "45" or "四十五". */
    String number = run.toString().trim();
    if (!number.isEmpty() && chineseNumerals.indexOf(number.charAt(0)) >= 0) {
        number = String.valueOf(chineseNumberInt(number));
    }
    return number;
}
/**
 * Downloads one image and stores it as
 * {@code <top>/<ManHuaName>/<chapter>/第<page>页.jpg}, creating the folders when needed.
 *
 * <p>Download failures are reported with a stack trace and otherwise ignored, so one broken
 * image does not abort the whole run. Both streams are closed on every path.
 *
 * @param url        address of the image
 * @param ManHuaName comic title, used as first-level folder name
 * @param chapter    chapter title, used as second-level folder name
 * @param page       page number within the chapter, used in the file name
 */
public static void setUrl(String url, String ManHuaName, String chapter, int page) {
    System.err.println("下载图片方法" +
            " url=:" + url + "" +
            " chapter=:" + chapter +
            " page=:" + page);
    try {
        /* Open the network connection. */
        URLConnection connection = new URL(url).openConnection();
        try (InputStream in = connection.getInputStream()) {
            /* Target folder; File(parent, child) uses the platform's own separator. */
            File chapterDir = new File(new File(top, ManHuaName), chapter);
            if (!chapterDir.exists()) {
                chapterDir.mkdirs();
            }
            File target = new File(chapterDir, "第" + page + "页" + ".jpg");
            try (OutputStream out = new FileOutputStream(target)) {
                byte[] buffer = new byte[1024];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Closes a resource quietly.
 *
 * <p>A {@code null} argument is ignored. An {@code IOException} raised by {@code close()}
 * is reported on {@code System.err} and not rethrown.
 *
 * @param closeable resource to close, may be {@code null}
 */
public static void getClose(Closeable closeable) {
    // The original tested "== null", which never closed a real stream and threw a
    // NullPointerException for null.
    if (closeable == null) {
        return;
    }
    try {
        closeable.close();
    } catch (IOException e) {
        System.err.println("关闭流出错了");
        e.printStackTrace();
    }
}
/**
 * Tests whether a resource can be opened for reading.
 *
 * <p>The connection's input stream is opened and immediately closed again, so probing many
 * URLs does not leak connections.
 *
 * @param urlName address of the resource
 * @return {@code true} when the input stream could be opened; {@code false} when the URL is
 *         malformed or opening it failed for any reason
 */
public static boolean URLWhether(String urlName) {
    try {
        URLConnection connection = new URL(urlName).openConnection();
        try (InputStream ignored = connection.getInputStream()) {
            return true;
        }
    } catch (IOException | RuntimeException e) {
        // Any failure simply means "not available".
        return false;
    }
}
/**
 * Fetches a page with jsoup and parses it.
 *
 * <p>Wrapped in one method because every scraper step needs it. Requires the
 * {@code org.jsoup:jsoup} Maven dependency. The request uses a 500000 ms timeout and
 * browser-like headers. Each header is set once: repeated {@code header()} calls for the
 * same name replace the previous value, so only the last User-Agent and Accept-Language
 * of the earlier version ever took effect.
 *
 * @param url address of the page
 * @return the parsed document, or {@code null} when the request failed with an
 *         {@code IOException} (the stack trace is printed)
 */
public static Document jsoupGet(String url) {
    try {
        return Jsoup.connect(url)
                .timeout(500000)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate, sdch")
                .header("Accept-Language", "zh-CN,zh;q=0.8")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
                .get();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Tells whether a string consists solely of digits.
 *
 * @param str string to check
 * @return {@code true} when every character satisfies {@code Character.isDigit}; this
 *         includes the empty string
 */
public static boolean isNumeric(String str) {
    return str.chars().allMatch(Character::isDigit);
}
/**
 * Converts a Chinese numeral such as "二十五" or "一千二百万" to an {@code int}.
 *
 * <p>The value is built in three levels: a pending digit (一..九), the current section made
 * of 十/百/千 parts, and the total of completed 万/亿 groups. A 十/百/千 without a preceding
 * digit counts as one unit ("十" is 10). Characters that are neither digit nor unit,
 * including "零", are skipped. The earlier implementation multiplied only the last digit by
 * 万/亿, so "十二万" produced 20010 instead of 120000.
 *
 * @param chineseNumber numeral to convert; values beyond the {@code int} range overflow
 *                      silently
 * @return the numeric value, 0 for an empty string
 */
public static int chineseNumberInt(String chineseNumber) {
    final String digits = "一二三四五六七八九";
    int total = 0;   // completed 万 / 亿 groups
    int section = 0; // sum of the 十 / 百 / 千 parts of the current group
    int digit = 0;   // digit still waiting for its unit
    for (int i = 0; i < chineseNumber.length(); i++) {
        char c = chineseNumber.charAt(i);
        int digitIndex = digits.indexOf(c);
        if (digitIndex >= 0) {
            // Index + 1 is the digit's value.
            digit = digitIndex + 1;
            continue;
        }
        int unit;
        switch (c) {
            case '十':
                unit = 10;
                break;
            case '百':
                unit = 100;
                break;
            case '千':
                unit = 1000;
                break;
            case '万':
                unit = 10000;
                break;
            case '亿':
                unit = 100000000;
                break;
            default:
                // "零" and any foreign character carry no value.
                continue;
        }
        if (unit < 10000) {
            section += (digit == 0 ? 1 : digit) * unit;
        } else {
            int group = section + digit;
            if (group == 0) {
                // Bare unit ("万") or stacked units ("二万万"): scale what is there.
                total = (total == 0 ? 1 : total) * unit;
            } else {
                total += group * unit;
            }
            section = 0;
        }
        digit = 0;
    }
    return total + section + digit;
}
}
运行结果:
下载结果:
其实代码的注释已经讲得比较清楚了,我就不多讲了。修改一下 manhua_url(漫画地址)和 name(漫画名称)就可以爬取这个网站的其他漫画了。