在后台去除HTML的样式（例如富文本编辑器生成的内容）

最新推荐文章于 2022-08-21 21:32:19 发布

宝贝笑

最新推荐文章于 2022-08-21 21:32:19 发布

阅读量2.5k

点赞数

package com.chendaojun.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.regex.Pattern;

/**
 * Strips HTML markup from rich-text content (for example the output of an online
 * editor) so that the remaining plain text can be used as an article summary.
 *
 * <p>The input may be incomplete HTML: a tag that was cut off in the middle (which
 * happens when the HTML is truncated to a fixed length) is removed together with
 * everything that follows it.
 */
public class ParseHtml {

    /** A complete tag, from '<' up to the nearest '>'. DOTALL lets a tag span line breaks. */
    private static final Pattern COMPLETE_TAG = Pattern.compile("<.*?>", Pattern.DOTALL);

    /**
     * A dangling '<' and everything after it. The quantifier is greedy on purpose:
     * a reluctant quantifier at the end of a pattern matches nothing, so only the
     * '<' itself would be removed and the broken tag's text would stay in the output.
     */
    private static final Pattern DANGLING_TAG = Pattern.compile("<.*", Pattern.DOTALL);

    /** The non-breaking-space entity and the literal non-breaking space (U+00A0). */
    private static final Pattern NBSP = Pattern.compile("&nbsp;|\u00A0");

    /**
     * Small demo. The commented-out variants can be enabled one at a time to try the
     * other input sources.
     */
    public static void main(String[] args){
        ParseHtml ph = new ParseHtml();
        String html="";

        // Enable the next lines to load the HTML from MySQL and parse it.
        //html=ph.getHtmlFromMysql();
        //System.out.println(ph.parseHtml(html));
        //System.out.println(ph.parseHtml(html,300));

        // Enable the next lines to load the HTML from a file and parse it
        // (adjust the path to your environment).
        //html=ph.getHtml("E:\\1478300.html");
        //System.out.println(ph.parseHtml(html));
        //System.out.println(ph.parseHtml(html,300));

        // Parse with an explicit length limit.
        //html=ph.parseHtml("<p>sdfsdf</p><br><div>sdfsdfsdf</div>",10);
        //System.out.println(html);

        // Parse directly; the sample ends with a broken tag on purpose.
        html=ph.parseHtml("<p>sdfsdf</p><br><div>sdfsdfsdf</div>sdflksdflksdjfk<dkf");
        System.out.println(html);
    }

    /**
     * Loads the HTML article stored by the online editor from MySQL.
     *
     * <p>Runs {@code select text from blog where id=5} against the {@code blog}
     * database on localhost with the hard-coded credentials below. If the query
     * yields several rows, the value of the last row is returned.
     *
     * @return the content of the {@code text} column, or an empty string when no row
     *         matched or any error occurred (the error is printed to stderr)
     */
    public String getHtmlFromMysql(){
        String url="jdbc:mysql://localhost:3306/blog";
        String userName="root";
        String passWord="root";
        String className="com.mysql.jdbc.Driver";
        String sql="select text from blog where id=5";
        String html="";
        try{
            Class.forName(className);
            // try-with-resources closes rs, stmt and conn in reverse order, and still
            // closes the remaining ones when one close() call fails.
            try(Connection conn=DriverManager.getConnection(url,userName,passWord);
                Statement stmt=conn.createStatement();
                ResultSet rs=stmt.executeQuery(sql)){
                while(rs.next()){
                    html=rs.getString("text");
                }
            }
        }catch(Exception e){
            // Best effort: callers receive whatever was read so far (possibly "").
            e.printStackTrace();
        }
        return html;
    }

    /**
     * Reads an HTML file into a single string.
     *
     * <p>The file is read line by line and the lines are concatenated without their
     * line terminators. The platform default charset is used for decoding.
     *
     * @param filePath path of the file to read
     * @return the file content without line terminators; whatever was read before a
     *         failure (possibly an empty string) when the file cannot be read
     */
    public String getHtml(String filePath) {
        StringBuilder html = new StringBuilder();
        try (FileInputStream in = new FileInputStream(filePath);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
        } catch (Exception e) {
            // Best effort: keep what was read so far and report the problem on stderr.
            e.printStackTrace();
        }
        return html.toString();
    }

    /**
     * Removes all markup from the given HTML, which may be incomplete.
     *
     * <ol>
     *   <li>every complete tag is replaced by two spaces;</li>
     *   <li>every {@code &nbsp;} entity or non-breaking space becomes a regular space;</li>
     *   <li>a remaining {@code <} (the start of a cut-off tag) is removed together
     *       with everything after it.</li>
     * </ol>
     *
     * @param html the HTML to clean; must not be {@code null}
     * @return the plain text followed by {@code "..."}
     */
    public String parseHtml(String html) {
        String text = COMPLETE_TAG.matcher(html).replaceAll("  ");
        text = NBSP.matcher(text).replaceAll(" ");
        text = DANGLING_TAG.matcher(text).replaceFirst("");
        return text + "...";
    }

    /**
     * Truncates the raw HTML to {@code length} characters and then removes the markup.
     *
     * <p>The limit applies to the raw HTML, so tag characters count towards it; a tag
     * cut in half by the truncation is dropped by {@link #parseHtml(String)}.
     *
     * @param html   the HTML to clean; must not be {@code null}
     * @param length number of leading characters of {@code html} to keep before cleaning
     * @return the cleaned text followed by {@code "..."}, or a fixed notice when
     *         {@code html} is shorter than {@code length}
     */
    public String parseHtml(String html,int length) {
        if(html.length()<length){
            return "截取长度超过文件内容总长";
        }
        return parseHtml(html.substring(0, length));
    }
}

上面的代码转载自他人的博客，原文地址：https://www.cnblogs.com/cnsevennight/p/4468055.html

前言：本人在实现业务逻辑的时候，需要在后台把带HTML标签（富文本）的文章内容截取成文章摘要，这就涉及如何处理带HTML标签的数据。在网上一共找到了两种解决方法：

1、调用HtmlParser插件

HtmlParser 简介

HtmlParser 是一个纯 Java 编写的 HTML 解析库，主要用于改造或提取 HTML 内容。用它来分析抓取到的网页信息是个不错的选择，遗憾的是参考文档太少。
项目主页：http://htmlparser.sourceforge.net/
API文档： http://htmlparser.sourceforge.net/javadoc/index.html

参考博客：http://blog.csdn.net/fancy3013/article/details/50965112

2、引用一个方法直接去掉HTML标签

这也是我在项目中所用的方法，是别人写好的。

public String parseHtml(String html,int length) {            	
        if(html == null || html == "") {
    		return html = "空";
		}else {
			if(html.length()<length){
	            return html;
	        }else {
	            /*
	             * <.*?>为正则表达式，其中的.表示任意字符，*?表示出现0次或0次以上，此方法可以去掉    双头标签(双头针对于残缺的标签)	
             * "<.*?"表示<尖括号后的所有字符，此方法可以去掉残缺的标签，及后面的内容
	             * " "，若有多种此种字符，可用同一方法去除
	             */
	            html = html.replaceAll("<.*?>", " ").replaceAll("", "");
	            html = html.replaceAll("<.*?", "");
	            return (html.substring(0, length) + "...");
        		        }
		}

参考自:https://blog.csdn.net/lq13457309725/article/details/79578326?utm_source=copy

宝贝笑

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
在后台去除HTML的样式;例如富文本编辑器的

package com.chendaojun.util;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import java.sql.Connection;import java.sql.DriverM...
复制链接

扫一扫