抓取新闻

/**
     * html内容
     * @param url html地址
     * @return null
     */
    public String parser(String url)
    {

           url="http://news.baidu.com/n?cmd=1&class=civilnews&tn=rss&sub=0";

        String parse = null;
        try
        {
            String content;
            // String title ;
            content = getOneHtml(url);
            // title = getTitle(content);
            parse = "\n" + getTab(getLink(getScript(getCSS(content))));

            parse = getText(parse);

        }
        catch (IOException e)
        {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return parse;
    }

/*    *//**
     * @param args
     *//*

    public static void main(String[] args)
    {
        ParserHtml ph = new ParserHtml();
       
         * String content = ""; String title = ""; String parse=""; try {
         * content = parser(); title = getTitle(content);
         * parse=getTab(getLink(getScript(getCSS(content))));
         * System.out.println(parse); parserText(parse); //
         * System.out.println(getCSS(content)); //System.out.println("title:" +
         * title); } catch (IOException e) { // TODO Auto-generated catch block
         * e.printStackTrace(); }
        
        ph.parser("D:\\aa.html");

    }*/
   

    /**
     * 取得标题
     * @param s 内容
     * @return null
     */
    public static String getTitle(final String s)
    {
        String regex;
        String title = "";
        final List<String> list = new ArrayList<String>();
        regex = "<title>.*?</title>";// 取得标题的正则表达式
        final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
        final Matcher ma = pa.matcher(s);
        StringBuffer sBuffer = new StringBuffer();
        while (ma.find())
        {
            list.add(ma.group());
        }
        for (int i = 0; i < list.size(); i++)
        {
            // title = title + list.get(i);
            sBuffer.append(list.get(i));
        }
        title = sBuffer.toString();
        return outTag(title);
    }

    /**
     * 去掉所有的html标记
     * @param s 内容
     * @return null
     */
    public static String outTag(final String s)
    {
        return s.replaceAll("<.*?>", "");// 去掉所有的html标记
    }

    /**
     * 去掉所有的html样式
     * @param s 内容
     * @return null
     */
    public static String getCSS(final String s)
    {
        String regex;
        String outCss = s;

        regex = "(<style|<STYLE).*?(</style>|<STYLE>)";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outCss = outCss.replace(ma.group(), "");
        }
        return outCss;

    }

    /**
     * 去掉所有的script脚本
     * @param s 内容
     * @return null
     */
    public static String getScript(final String s)
    {
        String outScript = s;
        String regex;

        // regex = "<script.*?</script>";
        regex = "(<script|<SCRIPT).*?(</script>|</SCRIPT>)";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outScript = outScript.replace(ma.group(), "");
        }
        return outScript;
    }

    /**
     * 去掉所有的html标记
     * @param s 内容
     * @return null
     */
    public static String getTab(final String s)
    {
        String outScript = s;
        String regex;

        regex = "<.*?>";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outScript = outScript.replace(ma.group(), "");
            // outScript=outScript.replace(" ", "");
        }
        return outScript;
    }

    /**
     * 去掉所有的<a>标签
     * @param s 内容
     * @return null
     */
    public static String getLink(final String s)
    {
        String outScript = s;
        String regex;

        regex = "(<a|<A).*?(</a>|</A>)";
        final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        while (ma.find())
        {
            outScript = outScript.replace(ma.group(), "");
            // outScript=outScript.replace(" ", "");
        }
        return outScript;
    }

    /**
     * 读取一个网页全部内容
     * @param htmlurl htmlurl
     * @return null
     * @throws IOException IOException
     */
    public String getOneHtml(final String htmlurl) throws IOException
    {
        URL url;
        String temp;
        final StringBuffer sb = new StringBuffer();
        try
        {
            url = new URL(htmlurl);
            final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));// 读取网页全部内容
            while ((temp = in.readLine()) != null)
            {
                sb.append(temp);
            }
            in.close();
        }
        catch (final MalformedURLException me)
        {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            me.getMessage();
            throw me;
        }
        catch (final IOException e)
        {
            e.printStackTrace();
            throw e;
        }
        return sb.toString();
    }

    /**
     * 根据空格和文字长度,过滤非正文部分的文字
     * @param s 内容
     * @return null
     */
    public String getText(String s)
    {
        String[] array = s.split(" ");
        String str = "";
        String str2 = "";
        StringBuffer sBuffer = new StringBuffer();
        for (int i = 0; i < array.length; i++)
        {
            str2 = array[i].trim();
            if (str2.length() > Com.NUM_40)
            {
                // str += Array[i];
                sBuffer.append(array[i]);
                // System.out.println(str);
            }

        }
        str = sBuffer.toString();
        return str;
    }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值