1.获取文件名称路径
/**
 * Builds the storage path for a document item and creates its directory.
 *
 * <p>The directory {@code basePath + documentId + FILE_SPLIT + itemId} is created
 * (including any missing parents) as a side effect. The returned value is that
 * directory path followed by {@code FILE_SPLIT}, plus the file name
 * {@code documentId_itemId.fileType} when {@code fileType} is not empty.
 *
 * <hr>
 * @author hanjidong
 * @date 2020-11-17 15:05:51
 * @since 0.0.1
 * @param basePath   root path prefix; concatenated as-is, no separator is inserted after it
 * @param documentId document identifier; used as a directory name and in the file name
 * @param itemId     item identifier; used as a sub-directory name and in the file name
 * @param fileType   file extension without the leading dot; when
 *                   {@code ValidateUtil.isEmpty} reports it empty, only the
 *                   directory path is returned
 * @return the directory path ending with {@code FILE_SPLIT}, or the full file path
 */
public static String buildFilename(String basePath, String documentId, String itemId, String fileType) {
    // Purely local value: StringBuilder avoids StringBuffer's unnecessary locking.
    StringBuilder path = new StringBuilder();
    path.append(basePath).append(documentId).append(FILE_SPLIT).append(itemId);

    // Best effort: mkdirs() also returns false when the directory already exists,
    // so its result is intentionally not checked.
    new File(path.toString()).mkdirs();

    path.append(FILE_SPLIT);
    if (ValidateUtil.isEmpty(fileType)) {
        return path.toString();
    }
    return path.append(documentId).append('_').append(itemId).append('.').append(fileType).toString();
}
2.读取html文件
/**
 * Reads a text file and returns its content as a single string.
 *
 * <p>Lines are concatenated without their line terminators. Reading is best
 * effort: when {@code FileUtil.isExist} reports the path as missing an empty
 * string is returned, and when reading fails the stack trace is printed and
 * whatever was read so far is returned.
 *
 * @param filepath path of the file to read
 * @return the file content with line breaks removed; never {@code null}
 */
public static String readHTML(String filepath) {
    StringBuilder content = new StringBuilder();
    if (!FileUtil.isExist(filepath)) {
        return content.toString();
    }
    // NOTE(review): no charset is given, so the platform default is used --
    // confirm that the stored HTML files are written in that encoding.
    // try-with-resources closes the reader (and the underlying stream) on every path.
    try (BufferedReader reader =
            new BufferedReader(new InputStreamReader(new FileInputStream(filepath)))) {
        String line;
        // A null from readLine() is the end-of-file signal; ready() only reports
        // whether a read would block and must not be used to detect EOF.
        while ((line = reader.readLine()) != null) {
            content.append(line);
        }
    } catch (IOException | RuntimeException e) {
        // Best effort by design: report the failure and return what was read so far.
        e.printStackTrace();
    }
    return content.toString();
}
3.过滤html标签
/** Matches a whole script element (opening tag, body, closing tag), case-insensitively. */
private static final Pattern SCRIPT_ELEMENT_PATTERN = Pattern.compile(
        "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>", Pattern.CASE_INSENSITIVE);

/** Matches a whole style element (opening tag, body, closing tag), case-insensitively. */
private static final Pattern STYLE_ELEMENT_PATTERN = Pattern.compile(
        "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>", Pattern.CASE_INSENSITIVE);

/** Matches any single tag: a '<', one or more non-'>' characters, then '>'. */
private static final Pattern ANY_TAG_PATTERN = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

/**
 * Strips HTML markup from a string and returns the remaining text.
 *
 * <p>Script and style elements are removed together with their bodies; every
 * other tag is removed while the text between tags is kept. Character entities
 * and whitespace are left untouched.
 *
 * @param inputString text that may contain HTML tags; may be {@code null}
 * @return the input with markup removed, or an empty string when the input is {@code null}
 */
public static String Html2Text(String inputString) {
    if (inputString == null) {
        return "";
    }
    // Order matters: script/style elements must go first so that their bodies are
    // dropped too; the generic tag pattern would only remove the tags themselves.
    String text = SCRIPT_ELEMENT_PATTERN.matcher(inputString).replaceAll("");
    text = STYLE_ELEMENT_PATTERN.matcher(text).replaceAll("");
    return ANY_TAG_PATTERN.matcher(text).replaceAll("");
}
4.调用方式
/**
 * Returns the plain-text content of a stored document item.
 *
 * <p>Resolves the file path with {@code buildFilename}, reads the file with
 * {@code readHTML}, and removes markup with {@code FilterHtmlUtil.Html2Text}.
 *
 * @param basePath   root path prefix passed to {@code buildFilename}
 * @param documentId document identifier passed to {@code buildFilename}
 * @param itemId     item identifier passed to {@code buildFilename}
 * @param fileType   file extension passed to {@code buildFilename}
 * @return the text produced by {@code FilterHtmlUtil.Html2Text} for the file content
 */
public static String getContent(String basePath, String documentId, String itemId, String fileType) {
    String htmlPath = buildFilename(basePath, documentId, itemId, fileType);
    String html = readHTML(htmlPath);
    return FilterHtmlUtil.Html2Text(html);
}