使用 NekoHTML 解析 HTML（字符串或文件流）

最新推荐文章于 2021-02-19 08:38:02 发布

angx

最新推荐文章于 2021-02-19 08:38:02 发布

阅读量1.9k

点赞数

分类专栏： java

本文链接：https://blog.csdn.net/meandmyself/article/details/42077803

版权

java 专栏收录该内容

34 篇文章 0 订阅

订阅专栏

一、使用 DOM 方式解析 HTML

// Parse an HTML fragment into a DOM tree and extract its plain text.
DOMParser parser = new DOMParser();

// Wrap the fragment returned by sugg.getSuggContent() in a minimal
// <html>/<head>/<body> skeleton so the parser is handed a complete document.
String wrappedHtml = "<html><head></head><body>"
        + sugg.getSuggContent() + "</body></html>";
StringReader htmlReader = new StringReader(wrappedHtml);

// Input source backed by the in-memory reader.
// NOTE(review): presumably the Xerces XNI constructor
// XMLInputSource(publicId, systemId, baseSystemId, charStream, encoding) -
// confirm against the class actually imported by this file.
XMLInputSource source = new XMLInputSource(null, "", null, htmlReader, "utf-8");

try {
    parser.parse(source);
} catch (IOException e) {
    // Failure is only printed; execution continues with whatever the parser holds.
    e.printStackTrace();
} catch (XNIException e) {
    e.printStackTrace();
}

Document document = parser.getDocument();

// Text content of the document's root element (markup removed).
String ss = document.getDocumentElement().getTextContent();

二、去除文本中多余的换行

// Remove blank lines from the extracted text: read `ss` line by line, drop
// every line that is empty or whitespace-only, and concatenate the rest into
// `fstr`.
//
// `ss` is already a Java String, so it is read directly through a
// StringReader. Encoding it to UTF-8 bytes and decoding it again is not
// needed, and reading it directly avoids handing a null reader to
// BufferedReader if that encoding step ever failed.
BufferedReader br = new BufferedReader(new StringReader(ss));
// Purely local accumulator, so the unsynchronized StringBuilder is sufficient.
StringBuilder sb = new StringBuilder();
try {
    for (String line = br.readLine(); line != null; line = br.readLine()) {
        if (!"".equals(line.trim())) {
            // Each kept line is terminated with a bare carriage return ("\r"),
            // exactly as before; the line's own content is not trimmed.
            sb.append(line).append("\r");
        }
    }
} catch (IOException e1) {
    // readLine() declares IOException; whatever was collected so far is kept.
    e1.printStackTrace();
}
String fstr = sb.toString();