利用正则表达式抽取网页信息

最新推荐文章于 2024-04-02 05:19:32 发布
wlchn
最新推荐文章于 2024-04-02 05:19:32 发布
阅读量1k
点赞数
分类专栏： Java 网络 算法
本文链接：https://blog.csdn.net/wlchn/article/details/47415573
版权
算法（同时被 3 个专栏收录）
22 篇文章 0 订阅
订阅专栏
网络
19 篇文章 0 订阅
订阅专栏
Java
17 篇文章 0 订阅
订阅专栏
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Extracts the fields of a news article page with regular expressions and
 * appends them, one tab-separated row per page, to a text file.
 *
 * <p>All extractors expect the page in the form produced by
 * {@link #getHtml(String)}: the whole document on a single line (line breaks
 * dropped). The patterns use {@code .} without DOTALL and hard-coded runs of
 * spaces, so they only match on that flattened form.
 *
 * <p>The patterns were written against article pages of www.reuters.com.
 */
public class ObtainNews {

    /** File that {@link #main(String[])} appends to when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_FILE = "D://News.txt";

    /** Prefix put in front of the (site-relative) video link found on the page. */
    private static final String SITE_BASE_URL = "http://www.reuters.com";

    /*
     * Patterns are compiled once instead of on every call. The CANON_EQ flag is
     * kept exactly as in the original code so matching behaviour is unchanged.
     * NOTE(review): every pattern is plain ASCII, so CANON_EQ is probably
     * unnecessary here -- confirm before removing it.
     */
    private static final Pattern TITLE_PATTERN =
        Pattern.compile("<h1>.*?</h1>", Pattern.CANON_EQ);
    private static final Pattern CONTENT_PATTERN =
        Pattern.compile("<span id=\"midArticle_start\"></span>.*?</span></span>", Pattern.CANON_EQ);
    private static final Pattern TIME_PATTERN =
        Pattern.compile("<span class=\"timestamp\">.*?</span>        </p>", Pattern.CANON_EQ);
    private static final Pattern REPORTER_PATTERN =
        Pattern.compile("<p class=\"byline\">.*?</p>        <p>", Pattern.CANON_EQ);
    private static final Pattern CHANNEL_PATTERN =
        Pattern.compile("<div class=\"actionButton\">.*?</a></div>", Pattern.CANON_EQ);
    private static final Pattern IMG_PATTERN =
        Pattern.compile("<img        src=\".*?\"        border", Pattern.CANON_EQ);
    private static final Pattern VIDEO_PATTERN =
        Pattern.compile("<div class=\"photo\">.*?<img", Pattern.CANON_EQ);

    /** Matches one markup tag; used by {@link #outTag(String)}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Downloads a page and returns its whole content as one string.
     *
     * <p>The page is decoded as UTF-8 and its lines are concatenated
     * <em>without</em> separators; the extractors in this class depend on that.
     *
     * @param htmlurl address of the page to read
     * @return the page content with all line breaks removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL (a hint
     *         is also printed to standard output)
     * @throws IOException if the page cannot be opened or read
     */
    public String getHtml(final String htmlurl) throws IOException {
        final StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            in = new BufferedReader(new InputStreamReader(
                new URL(htmlurl).openStream(), "utf-8"));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题！请仔细输入");
            throw me;
        } finally {
            // Close on every path so a failed read does not leak the connection.
            if (in != null) {
                in.close();
            }
        }
        return sb.toString();
    }

    /**
     * Concatenates every match of {@code pattern} in {@code s}, each one
     * preceded by {@code prefix}.
     *
     * @return the joined matches, or the empty string when nothing matches
     */
    private static String joinMatches(final Pattern pattern, final String s, final String prefix) {
        final StringBuilder joined = new StringBuilder();
        final Matcher ma = pattern.matcher(s);
        while (ma.find()) {
            joined.append(prefix).append(ma.group());
        }
        return joined.toString();
    }

    /*
     * Field extractors. Each returns the raw matched markup (tags included);
     * getFromWeb strips the markup afterwards.
     */

    /**
     * @param s flattened page markup
     * @return all {@code <h1>...</h1>} elements concatenated, or "" if none
     */
    public String getTitle(final String s) {
        return joinMatches(TITLE_PATTERN, s, "");
    }

    /**
     * @param s flattened page markup
     * @return all spans starting at the {@code midArticle_start} marker and ending at the
     *         first following {@code </span></span>}, concatenated, or "" if none
     */
    public String getContent(final String s) {
        return joinMatches(CONTENT_PATTERN, s, "");
    }

    /**
     * @param s flattened page markup
     * @return all {@code timestamp} spans (up to the closing {@code </p>}) concatenated,
     *         or "" if none
     */
    public String getTime(final String s) {
        return joinMatches(TIME_PATTERN, s, "");
    }

    /**
     * @param s flattened page markup
     * @return all {@code byline} paragraphs, each preceded by a single space, or "" if none
     */
    public String getReporter(final String s) {
        return joinMatches(REPORTER_PATTERN, s, " ");
    }

    /**
     * @param s flattened page markup
     * @return all {@code actionButton} divs, each preceded by a single space, or "" if none
     */
    public String getChannel(final String s) {
        return joinMatches(CHANNEL_PATTERN, s, " ");
    }

    /**
     * @param s flattened page markup
     * @return all {@code <img        src="..."        border} fragments concatenated,
     *         or "" if none
     */
    public String getImgsrc(final String s) {
        return joinMatches(IMG_PATTERN, s, "");
    }

    /**
     * @param s flattened page markup
     * @return all fragments from {@code <div class="photo">} to the next {@code <img}
     *         concatenated, or "" if none
     */
    public String getVideosrc(final String s) {
        return joinMatches(VIDEO_PATTERN, s, "");
    }

    /**
     * Removes markup tags.
     *
     * @param s text possibly containing {@code <...>} tags
     * @return {@code s} with every {@code <...>} sequence removed
     */
    public String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads a page and extracts its fields.
     *
     * <p>This is best effort: if the page cannot be read the failure is reported
     * on standard error and every field comes back as the empty string.
     *
     * @param s address of the article page
     * @return map with the keys {@code title}, {@code content}, {@code time},
     *         {@code reporter}, {@code channel}, {@code imgsrc} and {@code videosrc}
     */
    public HashMap < String, String > getFromWeb(final String s) {
        final HashMap < String, String > hm = new HashMap < String, String > ();
        String html = "";
        System.out.println("\n开始读取网页(" + s + ")");
        try {
            html = getHtml(s);
        } catch (final Exception e) {
            // Deliberately broad: one unreadable page must not abort a batch run,
            // but the failure is reported instead of being silently dropped.
            System.err.println("读取网页失败(" + s + "): " + e);
        }
        System.out.println(html);
        System.out.println("分析(" + s + ")结果\n");

        final String title = outTag(getTitle(html));
        final String content = outTag(getContent(html));
        final String time = outTag(getTime(html));
        final String reporter = outTag(getReporter(html)).replace("By ", "");
        final String channel = outTag(getChannel(html));
        // Peel the fixed markup off both ends of the match, leaving only the URL.
        final String imgsrc = getImgsrc(html)
            .replace("<img        src=\"", "").replace("\"        border", "").replace(" ", "");
        final String videosrc = getVideosrc(html)
            .replace("<div class=\"photo\"><a href=\"", "").replace("\"><img", "");

        hm.put("title", title);
        hm.put("content", content);
        hm.put("time", time);
        hm.put("reporter", reporter);
        hm.put("channel", channel);
        hm.put("imgsrc", imgsrc);
        hm.put("videosrc", videosrc);
        return hm;
    }

    /** Maps the empty string (field not found on the page) to {@code null}. */
    private static String emptyToNull(final String value) {
        return (value == null || value.isEmpty()) ? null : value;
    }

    /**
     * Appends {@code row} to the file at {@code path}, encoded as UTF-8.
     *
     * @throws IOException if the file cannot be opened or written
     */
    private static void appendRow(final String path, final String row) throws IOException {
        final FileOutputStream fos = new FileOutputStream(path, true);
        try {
            final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "utf-8"));
            bw.write(row);
            // Flush inside the try so a write error surfaces here, not in close().
            bw.flush();
        } finally {
            fos.close();
        }
    }

    /**
     * Reads article URLs from standard input, one per line, until a line
     * containing exactly {@code run} (or end of input), then scrapes each page,
     * prints the extracted fields and appends them as a tab-separated row to the
     * output file.
     *
     * <p>The patterns were tested against www.reuters.com article pages.
     *
     * @param args optional; {@code args[0]} overrides the output file path
     *             (default {@code D://News.txt})
     */
    public static void main(final String[] args) {
        final String outputFile = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_FILE;

        final List < String > list = new ArrayList < String > ();
        System.out.print("输入新闻页面网址，换行输入run\n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(
            System.in));
        //http://www.reuters.com/article/2014/04/04/us-usa-cia-interrogation-idUSBREA321UC20140404
        //http://www.reuters.com/article/2014/04/04/us-congress-justice-highspeed-idUSBREA3310O20140404
        try {
            String line;
            // readLine() returns null at end of input; treat that like "run".
            while ((line = br.readLine()) != null && !line.equals("run")) {
                list.add(line);
            }
        } catch (final IOException e) {
            // Keep going with whatever URLs were read before the failure.
            System.err.println("读取输入失败: " + e);
        }

        final ObtainNews on = new ObtainNews();
        for (final String pageUrl : list) {
            final HashMap < String, String > hm = on.getFromWeb(pageUrl);
            final String title = hm.get("title");
            final String content = hm.get("content");
            final String time = hm.get("time");
            final String publisher = null; // not available on the page
            final String site = "reuters";
            final String reporter = emptyToNull(hm.get("reporter"));
            final String channel = emptyToNull(hm.get("channel"));
            final String subject = null; // not available on the page
            final String imgsrc = emptyToNull(hm.get("imgsrc"));
            String videosrc = emptyToNull(hm.get("videosrc"));
            if (videosrc != null) {
                // The page carries a site-relative link; make it absolute.
                videosrc = (SITE_BASE_URL + videosrc).replace(" ", "");
            }

            // Missing fields are written as the literal text "null".
            final String str = pageUrl + "\t" + title + "\t" + content + "\t" + time + "\t" + publisher + "\t" +
                site + "\t" + reporter + "\t" + channel + "\t" + subject + "\t" + imgsrc + "\t" + videosrc + "\n";

            System.out.println("URL： " + pageUrl);
            System.out.println("标题： " + title);
            System.out.println("正文： " + content);
            System.out.println("发布时间： " + time);
            System.out.println("发布者：" + publisher);
            System.out.println("来源站点：" + site);
            System.out.println("记者：" + reporter);
            System.out.println("分类频道：" + channel);
            System.out.println("主题：" + subject);
            System.out.println("图片链接：" + imgsrc);
            System.out.println("视频链接：" + videosrc);
            System.out.println(str);

            try {
                appendRow(outputFile, str);
            } catch (final IOException e) {
                // A failed write for one page should not stop the remaining pages.
                e.printStackTrace();
            }
        }
    }
}
wlchn
关注
0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
利用正则表达式抽取网页信息

import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;
复制链接

扫一扫