JsoupXpath 支持使用 XPath 语法解析 HTML,因此我们选择用它来完成 HTML 的解析。
1、添加 Maven 坐标(下面的示例代码还用到了 Hutool 工具类,需要另外引入对应依赖)
<!--解析html-->
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>2.2</version>
</dependency>
2、编写解析代码
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.IdUtil;
import com.google.common.collect.Maps;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
import java.io.*;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Helper for parsing an HTML document supplied as a stream and evaluating an
 * XPath expression against it with JsoupXpath.
 *
 * <p>The stream is first spooled to a temporary file, the file is re-encoded
 * from {@link #sourceCharset} to {@link #targetCharset}, parsed with Jsoup, and
 * every node matched by {@link #xpath} is printed to standard output.
 *
 * <p>All configuration lives in mutable public static fields, so concurrent
 * callers share the same settings.
 */
public class HtmlUtil {
    /** Charset the incoming HTML is assumed to be written in. */
    public static String sourceCharset = "GB2312";
    /** Charset the temporary file is rewritten in before parsing. */
    public static String targetCharset = "utf8";
    /** XPath expression evaluated against the parsed document. */
    public static String xpath = "//tbody/tr/td//allText()";
    /**
     * Charset handed to Jsoup when reading the temporary file.
     * NOTE(review): this should name the same charset as {@link #targetCharset};
     * the two are configured independently.
     */
    public static String charsetName = "utf-8";

    /**
     * Parses the HTML read from {@code in} and prints every node matched by
     * {@link #xpath} to standard output.
     *
     * <p>Failures are reported with {@code printStackTrace()} and never
     * propagate. The temporary directory created for the stream is removed
     * whether or not parsing succeeds. The stream itself is not closed here.
     *
     * @param in stream providing the HTML document
     */
    public static void readHtml(InputStream in) {
        String path = null;
        try {
            path = getPath(in);
            encoding(sourceCharset, targetCharset, path);
            Element doc = Jsoup.parse(new File(path), charsetName);
            Elements elements = new Elements();
            elements.add(doc);
            JXDocument jxDocument = new JXDocument(elements);
            List<JXNode> jxNodes = jxDocument.selN(xpath);
            for (JXNode jxNode : jxNodes) {
                System.out.println(jxNode);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Clean up on every path; previously the file survived any failure
            // and its enclosing directory was never removed at all.
            deleteTempDir(path);
        }
    }

    /**
     * Removes the directory that {@link #getPath(InputStream)} created for
     * {@code path}. Cleanup failures are reported but not propagated.
     */
    private static void deleteTempDir(String path) {
        if (path == null) {
            return;
        }
        try {
            String parent = new File(path).getParent();
            FileUtil.del(parent != null ? parent : path);
        } catch (RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code inputStream} to a new {@code .html} file inside a freshly
     * created, randomly named directory under {@code java.io.tmpdir}.
     *
     * <p>The caller is responsible for deleting the file and its directory.
     *
     * @param inputStream content to store
     * @return path of the file that was written
     * @throws IllegalStateException if the temporary directory cannot be created
     */
    public static String getPath(InputStream inputStream) {
        File dir = new File(System.getProperty("java.io.tmpdir"), IdUtil.simpleUUID());
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IllegalStateException("Cannot create temporary directory: " + dir);
        }
        String path = new File(dir, IdGenerator.nextId() + ".html").getPath();
        FileUtil.writeFromStream(inputStream, path);
        return path;
    }

    /**
     * Rewrites the file at {@code path} from one charset to another, in place.
     *
     * <p>The file is read line by line, so every line in the result ends with
     * {@code "\r\n"} regardless of the original line terminators.
     *
     * @param charset   charset the file is currently written in
     * @param toCharset charset to rewrite the file in
     * @param path      path of the file to convert
     * @return the converted text, or a fixed notice string when both charset
     *         names are equal ignoring case (the file is left untouched then)
     * @throws Exception if the file cannot be read or written, or a charset
     *                   name is not supported
     */
    public static String encoding(String charset, String toCharset, String path) throws Exception {
        // Same charset name: nothing to convert.
        if (charset.equalsIgnoreCase(toCharset)) {
            return "编码一样,无需转换";
        }
        StringBuilder content = new StringBuilder();
        // Each resource is declared separately so the underlying stream is
        // closed even if constructing a wrapper fails.
        try (InputStream in = new FileInputStream(path);
             Reader decoder = new InputStreamReader(in, charset);
             BufferedReader reader = new BufferedReader(decoder)) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\r\n");
            }
        }
        // The text is already decoded into a String, so it is written straight
        // back out; no URL-encode/decode round trip is needed.
        String result = content.toString();
        // Opening the stream truncates the file, replacing the old content.
        try (OutputStream out = new FileOutputStream(path);
             Writer encoder = new OutputStreamWriter(out, toCharset);
             BufferedWriter writer = new BufferedWriter(encoder)) {
            writer.write(result);
        }
        return result;
    }
}