Java 获取 HTML 文件的内容

H愚公移山H

已于 2024-10-20 20:09:00 修改

阅读量4.8k

点赞数 7

分类专栏：工具类 html 文章标签：文件操作 HTML读取 内容过滤 路径构建 字符串处理

于 2020-12-15 11:24:11 首次发布

本文链接：https://blog.csdn.net/HAN_789/article/details/111195109

版权

工具类同时被 2 个专栏收录

10 篇文章 0 订阅

订阅专栏

html

2 篇文章 0 订阅

订阅专栏

1.获取文件名称路径

/**
	 * Builds the storage path for an item of a document and makes sure the
	 * item's directory exists.
	 *
	 * <p>The directory is {@code basePath + documentId + FILE_SPLIT + itemId};
	 * note that nothing is inserted between {@code basePath} and
	 * {@code documentId}, so {@code basePath} has to carry its own trailing
	 * separator if one is wanted. The directory (and any missing parents) is
	 * created via {@link File#mkdirs()}; the result of that call is not checked.
	 *
	 * @author hanjidong
	 * @date 2020-11-17 15:05:51
	 * @since 0.0.1
	 * @param basePath   prefix of the path
	 * @param documentId document identifier, used in the directory and the file name
	 * @param itemId     item identifier, used in the directory and the file name
	 * @param fileType   file extension without the dot; when
	 *                   {@code ValidateUtil.isEmpty(fileType)} is true only the
	 *                   directory path is returned
	 * @return the directory followed by {@code FILE_SPLIT} when no file type is
	 *         given, otherwise the directory followed by {@code FILE_SPLIT} and
	 *         {@code documentId_itemId.fileType}
	 */
	public static String buildFilename(String basePath, String documentId,String itemId,String fileType){
		String itemDir = new StringBuilder()
				.append(basePath)
				.append(documentId)
				.append(FILE_SPLIT)
				.append(itemId)
				.toString();

		// Create the item directory up front so callers can write into it right away.
		new File(itemDir).mkdirs();

		StringBuilder path = new StringBuilder(itemDir).append(FILE_SPLIT);
		if (!ValidateUtil.isEmpty(fileType)) {
			path.append(documentId).append("_").append(itemId).append(".").append(fileType);
		}
		return path.toString();
	}

2.读取html文件

/**
	 * Reads the file at {@code filepath} and returns its text as a single string.
	 *
	 * <p>Lines are concatenated without their line terminators, so the result
	 * contains no line breaks. The file is decoded with the platform default
	 * charset, exactly as before.
	 * NOTE(review): consider passing an explicit charset (e.g. UTF-8) if the HTML
	 * files are known to use one — confirm against the files this is used with.
	 *
	 * <p>Reading is best-effort: if an error occurs, the stack trace is printed
	 * and whatever was read up to that point is returned.
	 *
	 * @param filepath path of the file to read
	 * @return the concatenated lines of the file, or an empty string when
	 *         {@code FileUtil.isExist(filepath)} reports that the file does not exist
	 */
	public static String readHTML(String filepath){
		if (!FileUtil.isExist(filepath)) {
			return "";
		}

		StringBuilder html = new StringBuilder();
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(filepath)))) {
			// readLine() returning null is the only reliable end-of-stream signal.
			// ready() merely reports whether a read would block, so using it as the
			// loop condition can stop early or append the literal "null".
			String line;
			while ((line = reader.readLine()) != null) {
				html.append(line);
			}
		} catch (Exception e) {
			// Deliberately best-effort: report the problem and return what we have.
			e.printStackTrace();
		}
		return html.toString();
	}

3.过滤html标签

/** Matches a complete {@code <script ...> ... </script>} element, including its content. */
private static final Pattern HTML2TEXT_SCRIPT_PATTERN = Pattern.compile(
        "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
        Pattern.CASE_INSENSITIVE);

/** Matches a complete {@code <style ...> ... </style>} element, including its content. */
private static final Pattern HTML2TEXT_STYLE_PATTERN = Pattern.compile(
        "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
        Pattern.CASE_INSENSITIVE);

/** Matches any single tag: a '<', one or more non-'>' characters, then '>'. */
private static final Pattern HTML2TEXT_TAG_PATTERN = Pattern.compile(
        "<[^>]+>",
        Pattern.CASE_INSENSITIVE);

/**
 * Strips HTML markup from a string.
 *
 * <p>Script elements and style elements are removed together with their
 * content first, so that their bodies do not end up in the output; after that
 * every remaining tag is removed. Text between tags is left untouched (no
 * entity decoding, no whitespace normalization).
 *
 * <p>The patterns are compiled once and shared, instead of being recompiled on
 * every call.
 *
 * @param inputString text that may contain HTML tags; may be {@code null}
 * @return the input with script/style elements and all tags removed, or an
 *         empty string when {@code inputString} is {@code null}
 */
public static String Html2Text(String inputString){
    // A null input yields "" (same result as before, but without relying on a
    // caught NullPointerException and a printed stack trace).
    if (inputString == null) {
        return "";
    }
    String text = HTML2TEXT_SCRIPT_PATTERN.matcher(inputString).replaceAll("");
    text = HTML2TEXT_STYLE_PATTERN.matcher(text).replaceAll("");
    return HTML2TEXT_TAG_PATTERN.matcher(text).replaceAll("");
}

4.调用方式

	/**
	 * Returns the tag-free text of the HTML file belonging to the given document item.
	 *
	 * <p>The file path is obtained from {@code buildHtmlFilename}, the file is
	 * read with {@code readHTML}, and the markup is removed with
	 * {@code FilterHtmlUtil.Html2Text}.
	 * NOTE(review): {@code buildHtmlFilename} is not shown alongside this method;
	 * only a {@code buildFilename} with the same parameter list is — confirm
	 * which helper is intended.
	 *
	 * @param basePath   passed through to {@code buildHtmlFilename}
	 * @param documentId passed through to {@code buildHtmlFilename}
	 * @param itemId     passed through to {@code buildHtmlFilename}
	 * @param fileType   passed through to {@code buildHtmlFilename}
	 * @return the result of {@code FilterHtmlUtil.Html2Text} applied to the file's content
	 */
	public static String getContent(String basePath, String documentId, String itemId, String fileType) {
		final String rawHtml = readHTML(buildHtmlFilename(basePath, documentId, itemId, fileType));
		return FilterHtmlUtil.Html2Text(rawHtml);
	}