从网页中获取有用的信息 html2text

最新推荐文章于 2024-08-10 07:06:49 发布

shuchao_522

最新推荐文章于 2024-08-10 07:06:49 发布

阅读量989

点赞数

分类专栏： jdk 文章标签： html 正则表达式 regex 互联网工具测试

本文链接：https://blog.csdn.net/shuchao_522/article/details/3325825

版权

jdk 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

  这个工具很好用：如果你想从其他网页中抓取一些有用的信息（比如文章、新闻之类的），可以用它去掉 HTML 标签，只保留文本内容。

 
 import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * 来自互联网  过滤html css标签测试通过
 * @author shuchao
 *
 */
public class Html2Text {
    protected static Logger log = Logger.getLogger(Html2Text.class);
    public  static String html2Text(String inputString) {    
          String htmlStr = inputString; //含html标签的字符串     
          String textStr ="";    
          java.util.regex.Pattern p_script;    
          java.util.regex.Matcher m_script;    
          java.util.regex.Pattern p_style;    
          java.util.regex.Matcher m_style;    
          java.util.regex.Pattern p_html;    
          java.util.regex.Matcher m_html;    
              
          try {    
           String regEx_script = "<[//s]*?script[^>]*?>[//s//S]*?<[//s]*?///[//s]*?script[//s]*?>"; //定义script的正则表达式
//{<script[^>]*?>[//s//S]*?<///script> }     
           String regEx_style = "<[//s]*?style[^>]*?>[//s//S]*?<[//s]*?///[//s]*?style[//s]*?>"; //定义style的正则表达式
//{或<style[^>]*?>[//s//S]*?<///style> }     
              String regEx_html = "<[^>]+>"; //定义HTML标签的正则表达式     
               
              p_script = Pattern.compile(regEx_script,Pattern.CASE_INSENSITIVE);    
              m_script = p_script.matcher(htmlStr);    
              htmlStr = m_script.replaceAll(""); //过滤script标签     
       
              p_style = Pattern.compile(regEx_style,Pattern.CASE_INSENSITIVE);    
              m_style = p_style.matcher(htmlStr);    
              htmlStr = m_style.replaceAll(""); //过滤style标签     
               
              p_html = Pattern.compile(regEx_html,Pattern.CASE_INSENSITIVE);    
              m_html = p_html.matcher(htmlStr);    
              htmlStr = m_html.replaceAll(""); //过滤html标签     
               
           textStr = htmlStr;    
               
          }catch(Exception e) {    
              log.error("Html2Text: " + e.getMessage());    
          }    
              
          return textStr;//返回文本字符串     
        }  
}