今天同事需要去除文本中的 HTML 标签,但使用正则表达式匹配总是会出现一些问题。我协助解决了这个问题,在此记录一下——其实就是找到了一个库 Jsoup。代码如下:
package com.xn.map.tile;
import org.jsoup.Jsoup;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Small demo utility that removes HTML markup from text by delegating to jsoup
 * instead of hand-written regular expressions.
 */
public class RemoveHtmlTag {

    /** Input file used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_FILE_PATH = "D:\\work\\test\\html.txt";

    /**
     * Reads an HTML file, prints its raw content, then prints the content with tags removed.
     *
     * @param args optional; {@code args[0]} is the path of the file to read. When absent,
     *             {@link #DEFAULT_FILE_PATH} is used.
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE_PATH;
        String html = readTxt(filePath);
        System.out.println(html);
        System.out.println(html2text(html));
    }

    /**
     * Converts an HTML string to plain text.
     *
     * @param html the HTML source to convert
     * @return the result of {@code Jsoup.parse(html).text()}
     */
    public static String html2text(String html) {
        return Jsoup.parse(html).text();
    }

    /**
     * Reads a whole text file as UTF-8.
     *
     * @param filePath path of the file to read
     * @return the complete file content, line terminators included
     * @throws IOException if the file cannot be opened or read
     */
    public static String readTxt(String filePath) throws IOException {
        return readTxt(filePath, StandardCharsets.UTF_8);
    }

    /**
     * Reads a whole text file using the given charset.
     *
     * @param filePath path of the file to read
     * @param charset  charset used to decode the file's bytes (for example GBK for legacy files)
     * @return the complete file content, line terminators included
     * @throws IOException if the file cannot be opened or read
     */
    public static String readTxt(String filePath, Charset charset) throws IOException {
        // try-with-resources closes the stream even when a read fails part-way through.
        try (Reader reader = new InputStreamReader(new FileInputStream(filePath), charset)) {
            StringBuilder txt = new StringBuilder();
            // Copy raw character chunks rather than using readLine(): readLine() strips the
            // line terminators, which would glue the last word of one line to the first
            // word of the next.
            char[] buffer = new char[8192];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                txt.append(buffer, 0, count);
            }
            return txt.toString();
        }
    }
}
在 pom.xml 中引入以下依赖:
<dependency>
<!-- jsoup: provides the org.jsoup.Jsoup class imported by RemoveHtmlTag. -->
<!-- NOTE(review): 1.13.1 is not a recent release; check whether a newer jsoup version should be used. -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>