所用 Maven 依赖:
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>4.5.15</version>
</dependency>
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>0.1.1</version>
</dependency>
代码示例:
package com.xiaoyun;
import cn.hutool.http.HttpUtil;
import cn.wanghaomiao.xpath.exception.NoSuchAxisException;
import cn.wanghaomiao.xpath.exception.NoSuchFunctionException;
import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Crawls a fixed number of listing pages under {@code https://www.newcger.com/aemoban/} and
 * prints, for every listing page, a map from each entry title to a comma-separated string built
 * from the links found on the page that entry points to.
 *
 * <p>Pages are processed through parallel streams, so the order of the printed maps is not fixed.
 */
public class TestXpath {
    /** First listing page; {@link #init(String)} reads the pagination label from it. */
    private static String html = "https://www.newcger.com/aemoban/list_1.html";

    /** Digits extracted from the pagination label. Currently only printed, not used as a bound. */
    private static String pageMaxNum = "";

    /** Listing page URLs are {@code LIST_URL_PREFIX + pageNumber + ".html"}. */
    private static final String LIST_URL_PREFIX = "https://www.newcger.com/aemoban/list_";

    /** Number of listing pages to crawl, starting at page 1. */
    private static final int PAGE_COUNT = 5;

    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();
        getPanDown();
        System.out.println("所用时间= " + (System.currentTimeMillis() - startTime));
    }

    /** Prints the pagination digits, then crawls and prints listing pages 1..PAGE_COUNT. */
    private static void getPanDown() {
        init(html);
        List<String> listHtml = new ArrayList<>(PAGE_COUNT);
        for (int i = 1; i <= PAGE_COUNT; i++) {
            // Build each URL from scratch; appending to one shared builder would accumulate
            // the previous page suffixes ("list_1.html2.html...").
            listHtml.add(LIST_URL_PREFIX + i + ".html");
        }
        listHtml.stream().parallel().forEach(TestXpath::printPage);
    }

    /**
     * Downloads one listing page once, resolves every entry it links to and prints the
     * resulting title-to-links map.
     */
    private static void printPage(String pageUrl) {
        String body = fetchBody(pageUrl);
        List<Object> nodeHtmlList = selectFromBody(body, "//a[@class='tit']/@href");
        List<Object> nodeHtmlTitleList = selectFromBody(body, "//a[@class='tit']/text()");
        List<String> list = nodeHtmlList.stream().parallel()
                .map(nodeHtml -> describeDownload(nodeHtml.toString()))
                .collect(Collectors.toList());
        Map<Object, String> map = listToMap(nodeHtmlTitleList, list);
        System.out.println(map);
    }

    /**
     * Downloads the page at {@code detailUrl} once and joins the first link, the first link text
     * and the second link found under {@code div.fd_div} with commas.
     *
     * @return the joined string, or an empty string when the page has fewer than two such links
     */
    private static String describeDownload(String detailUrl) {
        String body = fetchBody(detailUrl);
        List<Object> panHtml = selectFromBody(body, "//div[@class='fd_div']/ul//li/a/@href");
        List<Object> panPassword = selectFromBody(body, "//div[@class='fd_div']/ul//li/a/text()");
        if (panHtml.size() < 2 || panPassword.isEmpty()) {
            // Keep one slot per entry so positions stay aligned with the title list.
            System.err.println("No download links found on " + detailUrl);
            return "";
        }
        return panHtml.get(0) + "," + panPassword.get(0).toString() + "," + panHtml.get(1);
    }

    /**
     * Pairs {@code keys} and {@code values} by position.
     *
     * <p>When a key occurs more than once, the value at its first position is kept. Entries are
     * returned in the order of {@code keys}.
     *
     * @param keys   map keys, in order
     * @param values values; must contain at least as many elements as {@code keys}
     * @return a new map with one entry per distinct key
     * @throws IndexOutOfBoundsException if {@code values} is shorter than {@code keys}
     */
    public static <K, V> Map<K, V> listToMap(List<K> keys, List<V> values) {
        Map<K, V> result = new LinkedHashMap<>();
        for (int i = 0; i < keys.size(); i++) {
            K key = keys.get(i);
            if (!result.containsKey(key)) {
                result.put(key, values.get(i));
            }
        }
        return result;
    }

    /**
     * Reads the pagination label of {@code html}, stores its digits in {@link #pageMaxNum} and
     * prints them. Does nothing except log when the label is not present.
     */
    private static void init(String html) {
        String pageNumXpath = "//div[@class='pageNavByNum']/form/label/text()";
        List<Object> httpFormXpath = getHttpFormXpath(html, pageNumXpath);
        if (httpFormXpath.isEmpty()) {
            System.err.println("Pagination label not found on " + html);
            return;
        }
        pageMaxNum = NON_DIGIT.matcher(httpFormXpath.get(0).toString()).replaceAll("");
        System.out.println(pageMaxNum);
    }

    /** Downloads {@code html} and evaluates {@code xpathName} against the response body. */
    private static List<Object> getHttpFormXpath(String html, String xpathName) {
        return selectFromBody(fetchBody(html), xpathName);
    }

    /** Issues a GET request and returns the response body. */
    private static String fetchBody(String url) {
        return HttpUtil.createGet(url).execute().body();
    }

    /**
     * Evaluates {@code xpathName} against {@code body}.
     *
     * @return the selected values, or an empty list when the expression cannot be evaluated
     */
    private static List<Object> selectFromBody(String body, String xpathName) {
        JXDocument jxDocument = new JXDocument(body);
        try {
            return jxDocument.sel(xpathName);
        } catch (NoSuchAxisException e) {
            reportXpathFailure(xpathName, e);
        } catch (NoSuchFunctionException e) {
            reportXpathFailure(xpathName, e);
        } catch (XpathSyntaxErrorException e) {
            reportXpathFailure(xpathName, e);
        }
        return new ArrayList<>();
    }

    /** Logs a failed XPath evaluation together with the offending expression. */
    private static void reportXpathFailure(String xpathName, Exception e) {
        System.err.println("XPath evaluation failed: " + xpathName);
        e.printStackTrace();
    }
}
同理,抓取另一个视频网站的示例:
package com.xiaoyun;
import cn.hutool.core.codec.Base64;
import cn.hutool.http.HttpUtil;
import cn.wanghaomiao.xpath.exception.NoSuchAxisException;
import cn.wanghaomiao.xpath.exception.NoSuchFunctionException;
import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;
import lombok.Data;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Crawls the index pages of the Base64-encoded seed sites in {@link #list}, collects one
 * {@link AvDownEntity} per linked page and fills it with the first title/link pair found on
 * that page.
 *
 * <p>Downloaded bodies are cached per URL. The caches and the page set are touched from
 * parallel streams, so they are wrapped in synchronized collections.
 */
public class TestXpath02 {
    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    /** Pause before every uncached request, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 200L;

    /**
     * Position, among the {@code //li[last()]/a/@href} matches, of the link whose trailing
     * number is used as the page count. NOTE(review): presumably the pagination's last-page
     * link; verify against the site markup.
     */
    private static final int LAST_PAGE_LINK_INDEX = 3;

    /** URL to downloaded body. */
    private static Map<String, String> cacheHtml =
            Collections.synchronizedMap(new HashMap<String, String>());

    /** Page URL to the entity describing it. */
    private static Map<String, AvDownEntity> cacheAvDown =
            Collections.synchronizedMap(new HashMap<String, AvDownEntity>());

    /** Base64-encoded seed URLs. */
    private static List<String> list = Arrays.asList(
            "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbS9WaWRlb0FWLw=="
    );

    /** Index page URLs produced by {@link #init()}. */
    private static Set<String> newSet = Collections.synchronizedSet(new HashSet<String>());

    /** Base64-encoded prefix prepended to every link taken from an index page. */
    private static String domain = "aHR0cHM6Ly93d3cuMTEyM3dvLmNvbQ==";

    public static void main(String[] args) {
        init();
        System.out.println(newSet);
        runHttp();
        System.out.println(cacheAvDown);
    }

    /** Walks every index page in {@link #newSet} and fills {@link #cacheAvDown}. */
    private static void runHttp() {
        String domainDecode = Base64.decodeStr(domain);
        // Iterate over a snapshot so the loop does not depend on the synchronized wrapper.
        for (String domainName : new ArrayList<>(newSet)) {
            if (domainName != null) {
                collectIndexPage(domainName, domainDecode);
            }
        }
    }

    /**
     * Registers one entity per link on the index page {@code domainName}, then visits the
     * linked pages in parallel to fill in title and download link.
     */
    private static void collectIndexPage(String domainName, String domainDecode) {
        String listHtmlXpath = "//h4[@class='title text-overflow']/a/@href";
        String listHtmlPhotoXpath = "//a[@class='stui-vodlist__thumb lazyload']/@data-original";
        List<Object> httpFormXpathString = getHttpFormXpath(domainName, listHtmlXpath);
        List<Object> httpFormXpathPhotoString = getHttpFormXpath(domainName, listHtmlPhotoXpath);
        List<String> htmlList = httpFormXpathString.stream()
                .map(o -> domainDecode + o.toString())
                .collect(Collectors.toList());
        Map<String, Object> map = listToMap(htmlList, httpFormXpathPhotoString);
        map.forEach((key, value) -> {
            AvDownEntity avDownEntity = new AvDownEntity();
            avDownEntity.setDomainUrl(key);
            // The data-original attribute value is stored in the price field.
            avDownEntity.setPrice(value.toString());
            cacheAvDown.put(key, avDownEntity);
        });
        // The key set holds each URL once, so no two threads update the same entity.
        map.keySet().stream().parallel().forEach(TestXpath02::fillDownloadInfo);
    }

    /**
     * Reads the first playlist title and link from the page at {@code detailUrl}, stores them
     * in the entity registered for that URL and prints the entity. Pages without a playlist
     * entry are logged and left untouched.
     */
    private static void fillDownloadInfo(String detailUrl) {
        String xpathVideo = "//ul[@class='stui-content__playlist clearfix']/li/a/@title";
        String xpathDown = "//ul[@class='stui-content__playlist clearfix']/li/a/@href";
        List<Object> httpFormXpathVideo = getHttpFormXpath(detailUrl, xpathVideo);
        List<Object> httpFormXpathDown = getHttpFormXpath(detailUrl, xpathDown);
        AvDownEntity avDownEntity = cacheAvDown.get(detailUrl);
        if (avDownEntity == null || httpFormXpathVideo.isEmpty() || httpFormXpathDown.isEmpty()) {
            System.err.println("No playlist entry found on " + detailUrl);
            return;
        }
        // The entity is already stored in cacheAvDown; updating it in place is enough.
        avDownEntity.setDownUrl(httpFormXpathDown.get(0).toString());
        avDownEntity.setTitle(httpFormXpathVideo.get(0).toString());
        System.out.println(avDownEntity);
    }

    /**
     * For every seed URL, reads the page count from the seed page and adds
     * {@code seed + "index_" + i + ".html"} for {@code i} in {@code [0, count)} to
     * {@link #newSet}. Seeds whose page count cannot be determined are logged and skipped.
     */
    private static void init() {
        String endNumXpath = "//li[last()]/a/@href";
        list.stream().parallel().forEach(domainHtml -> {
            String domainHtmlDecode = Base64.decodeStr(domainHtml);
            List<Object> listObj = getHttpFormXpath(domainHtmlDecode, endNumXpath);
            if (listObj.size() <= LAST_PAGE_LINK_INDEX) {
                System.err.println("Page count link not found on " + domainHtmlDecode);
                return;
            }
            String[] split = listObj.get(LAST_PAGE_LINK_INDEX).toString().split("/");
            if (split.length == 0) {
                System.err.println("Page count link is empty on " + domainHtmlDecode);
                return;
            }
            String str = NON_DIGIT.matcher(split[split.length - 1]).replaceAll("");
            int num;
            try {
                num = Integer.parseInt(str);
            } catch (NumberFormatException e) {
                System.err.println("Cannot parse page count '" + str + "' on " + domainHtmlDecode);
                return;
            }
            for (int i = 0; i < num; i++) {
                newSet.add(domainHtmlDecode + "index_" + i + ".html");
            }
        });
    }

    /**
     * Pairs {@code keys} and {@code values} by position.
     *
     * <p>When a key occurs more than once, the value at its first position is kept. Entries are
     * returned in the order of {@code keys}.
     *
     * @param keys   map keys, in order
     * @param values values; must contain at least as many elements as {@code keys}
     * @return a new map with one entry per distinct key
     * @throws IndexOutOfBoundsException if {@code values} is shorter than {@code keys}
     */
    public static <K, V> Map<K, V> listToMap(List<K> keys, List<V> values) {
        Map<K, V> result = new LinkedHashMap<>();
        for (int i = 0; i < keys.size(); i++) {
            K key = keys.get(i);
            if (!result.containsKey(key)) {
                result.put(key, values.get(i));
            }
        }
        return result;
    }

    /**
     * Evaluates {@code xpathName} against the body of {@code html}, downloading the body only
     * when it is not cached yet.
     *
     * @return the selected values, or an empty list when the expression cannot be evaluated
     */
    private static List<Object> getHttpFormXpath(String html, String xpathName) {
        String body = cacheHtml.get(html);
        if (body == null) {
            pauseBeforeRequest();
            body = HttpUtil.createGet(html).execute().body();
            cacheHtml.put(html, body);
        }
        JXDocument jxDocument = new JXDocument(body);
        try {
            return jxDocument.sel(xpathName);
        } catch (NoSuchAxisException e) {
            reportXpathFailure(xpathName, e);
        } catch (NoSuchFunctionException e) {
            reportXpathFailure(xpathName, e);
        } catch (XpathSyntaxErrorException e) {
            reportXpathFailure(xpathName, e);
        }
        return new ArrayList<>();
    }

    /** Sleeps for the request delay; an interrupt ends the pause and is re-flagged. */
    private static void pauseBeforeRequest() {
        try {
            Thread.sleep(REQUEST_DELAY_MILLIS);
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still observe the interrupt.
            Thread.currentThread().interrupt();
        }
    }

    /** Logs a failed XPath evaluation together with the offending expression. */
    private static void reportXpathFailure(String xpathName, Exception e) {
        System.err.println("XPath evaluation failed: " + xpathName);
        e.printStackTrace();
    }

    /** Data collected for one linked page. */
    @Data
    static class AvDownEntity {
        private String domainUrl;
        private String title;
        private String downUrl;
        private String price;
    }
}
获取公网 IP 地址的示例(仅列出字段与方法,需放在类中并沿用上文的 import):
/** Maps a page URL to the response body already downloaded for it, so each URL is fetched once. */
private static Map<String, String> cacheMap = new HashMap<>();
/**
 * Evaluates two XPath expressions against https://whatismyipaddress.com and prints both
 * result lists, the {@code div#ipv4} link text first.
 */
public static void main(String[] args) {
    final String pageUrl = "https://whatismyipaddress.com";
    final String ipv4Xpath = "//div[@id='ipv4']/a/text()";
    final String detailsXpath = "//tr/td[@style='font-size:14px;']/text()";

    // Run both queries before printing anything; the second one is served from the body cache.
    final List<Object> ipv4Nodes = getHttpFormXpath(pageUrl, ipv4Xpath);
    final List<Object> detailNodes = getHttpFormXpath(pageUrl, detailsXpath);

    System.out.println(ipv4Nodes);
    System.out.println(detailNodes);
}
/**
 * Evaluates {@code xpathName} against the body of {@code html}.
 *
 * <p>The body is taken from {@code cacheMap} when a non-null entry exists; otherwise it is
 * downloaded with a GET request and stored in the cache first.
 *
 * @param html      URL of the page to query
 * @param xpathName XPath expression to evaluate
 * @return whatever {@code getObjects} returns for the body
 */
private static List<Object> getHttpFormXpath(String html, String xpathName) {
    String cachedBody = cacheMap.get(html);
    if (cachedBody != null) {
        return getObjects(xpathName, cachedBody);
    }
    String fetchedBody = HttpUtil.createGet(html).execute().body();
    cacheMap.put(html, fetchedBody);
    return getObjects(xpathName, fetchedBody);
}
/**
 * Parses {@code body} and evaluates {@code xpathName} against it.
 *
 * @param xpathName XPath expression to evaluate
 * @param body      markup to parse
 * @return the selection result; an empty list when evaluation throws one of the three XPath
 *         exceptions, whose stack trace is printed
 */
static List<Object> getObjects(String xpathName, String body) {
    JXDocument document = new JXDocument(body);
    try {
        return document.sel(xpathName);
    } catch (NoSuchAxisException axisError) {
        axisError.printStackTrace();
    } catch (NoSuchFunctionException functionError) {
        functionError.printStackTrace();
    } catch (XpathSyntaxErrorException syntaxError) {
        syntaxError.printStackTrace();
    }
    // Reached only after a failed evaluation.
    return new ArrayList<>();
}
改进:加入 Map 缓存,并以页面内容的 MD5 作为缓存 key,避免对同一页面重复执行相同的 XPath 查询:
package com.xiaoyun;
import cn.hutool.core.map.MapUtil;
import cn.hutool.crypto.digest.MD5;
import cn.hutool.http.HttpUtil;
import cn.wanghaomiao.xpath.exception.NoSuchAxisException;
import cn.wanghaomiao.xpath.exception.NoSuchFunctionException;
import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;
import java.util.*;
/**
 * Queries https://whatismyipaddress.com with XPath, caching both the downloaded body per URL
 * and the XPath results per body digest so repeated queries do no extra work.
 */
public class IpAddrSelect {
    /** URL to downloaded body. */
    private static Map<String, String> cacheMap = new HashMap<>();

    /** Body digest to (XPath expression to selection result). */
    private static Map<String, Map<String, List<Object>>> cacheSelMap = new HashMap<>();

    public static void main(String[] args) {
        String xpath = "//div[@id='ipv4']/a/text()";
        String xpathInformation = "//tr/td[@style='font-size:14px;']/text()";
        String html = "https://whatismyipaddress.com";
        List<Object> list = getHttpFormXpath(html, xpath);
        List<Object> addr = getHttpFormXpath(html, xpathInformation);
        System.out.println(list);
        System.out.println(addr);
        System.out.println(cacheSelMap);
    }

    /**
     * Evaluates {@code xpathName} against the body of {@code html}, downloading the body only
     * when it is not cached yet.
     *
     * @param html      URL of the page to query
     * @param xpathName XPath expression to evaluate
     * @return the selected values, or an empty list when the expression cannot be evaluated
     */
    public static List<Object> getHttpFormXpath(String html, String xpathName) {
        String body = cacheMap.get(html);
        if (body == null) {
            body = HttpUtil.createGet(html).execute().body();
            cacheMap.put(html, body);
        }
        return getObjects(xpathName, body);
    }

    /**
     * Returns the cached result for {@code xpathName} on {@code body} when a non-empty one
     * exists; otherwise evaluates the expression and caches the outcome.
     */
    private static List<Object> getObjects(String xpathName, String body) {
        String bodyKey = bodyMd5(body);
        Map<String, List<Object>> bodyMap = cacheSelMap.get(bodyKey);
        if (bodyMap == null) {
            // Registered in cacheSelMap only once an evaluation succeeds.
            bodyMap = new HashMap<>();
        } else {
            // Look at the cached entry itself; an empty or missing entry counts as a miss.
            List<Object> cached = bodyMap.get(xpathName);
            if (cached != null && !cached.isEmpty()) {
                return cached;
            }
        }
        return selAndCachePut(xpathName, body, bodyKey, bodyMap);
    }

    /** Returns the 16-character hex MD5 digest of {@code body}, used as the cache key. */
    private static String bodyMd5(String body) {
        MD5 md5 = new MD5();
        return md5.digestHex16(body);
    }

    /**
     * Evaluates {@code xpathName} against {@code body}; on success stores the result in
     * {@code bodyMap} and registers {@code bodyMap} under {@code bodyKey}.
     *
     * @return the selected values, or an empty list when the expression cannot be evaluated
     */
    private static List<Object> selAndCachePut(String xpathName, String body, String bodyKey,
            Map<String, List<Object>> bodyMap) {
        try {
            JXDocument jxDocument = new JXDocument(body);
            List<Object> sel = jxDocument.sel(xpathName);
            bodyMap.put(xpathName, sel);
            cacheSelMap.put(bodyKey, bodyMap);
            return sel;
        } catch (NoSuchAxisException e) {
            reportXpathFailure(xpathName, e);
        } catch (NoSuchFunctionException e) {
            reportXpathFailure(xpathName, e);
        } catch (XpathSyntaxErrorException e) {
            reportXpathFailure(xpathName, e);
        }
        return new ArrayList<>();
    }

    /** Logs a failed XPath evaluation together with the offending expression. */
    private static void reportXpathFailure(String xpathName, Exception e) {
        System.err.println("XPath evaluation failed: " + xpathName);
        e.printStackTrace();
    }
}