Java编程应用（六）：新浪新闻爬虫程序

最新推荐文章于 2024-08-03 15:58:00 发布
Wind Lu
最新推荐文章于 2024-08-03 15:58:00 发布
阅读量3.3k
点赞数 1
分类专栏：java　文章标签：爬虫、新浪新闻
本文链接：https://blog.csdn.net/lxf_44944/article/details/43794607
版权
java 专栏收录该内容
16 篇文章 0 订阅
订阅专栏
下面是该爬虫的关键代码，完整源代码请见 https://github.com/lxf44944/sinaNews_crawler/
package com.lxf.crawler;
import java.io.File;
import java.io.FileWriter;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import com.lxf.dao.bean.NewsBean;
import com.lxf.dao.imp.NewsDao;
import com.lxf.dao.inf.NewsDaoInf;
/**
 * Crawler that collects headlines and article bodies from one Sina News
 * channel (http://news.sina.com.cn/&lt;type&gt;/) and stores them through
 * {@link NewsDaoInf}.
 * <p>
 * Requires htmlparser.jar, available from
 * https://github.com/lxf44944/sinaNews_crawler/
 *
 * @author 刘向峰
 */
public class SinaNews {
    /**
     * Test entry point: crawls a single hard-coded channel.
     *
     * @param args
     *            unused
     */
    public static void main(String args[]) {
        // Known channel names: china, world, society, media, opinion
        String type = "world";
        File file = new File(type);
        if (!file.exists()) {
            // Output directory named after the channel
            file.mkdirs();
        }
        SinaNews gn = new SinaNews();
        // The returned HTML index can be saved with
        // gn.writefile(html, "SinaNews.html", type) if desired.
        gn.getNews(type);
    }

    /**
     * Crawls the channel index page, stores every new article through the DAO
     * and builds an HTML table linking to the processed headlines.
     * <p>
     * List items that do not look like a headline link are skipped instead of
     * aborting the whole crawl. A failure while processing one article page is
     * printed and the crawl continues with the next headline.
     *
     * @param type
     *            channel path segment appended to http://news.sina.com.cn/
     * @return an HTML document containing one table row per processed
     *         headline, or an empty string if the index page itself could not
     *         be fetched or parsed
     */
    public String getNews(String type) {
        NewsDaoInf dao = new NewsDao();
        try {
            NodeFilter filter = new TagNameFilter("ul");
            // Matches the article container <div id="artibody">; it does not
            // depend on the article, so it is built once.
            NodeFilter bodyfilter = new AndFilter(new TagNameFilter("div"),
                    new HasAttributeFilter("id", "artibody"));
            Parser parser = new Parser();
            Parser bodyparser = new Parser();
            parser.setURL("http://news.sina.com.cn/" + type + "/");
            parser.setEncoding("gb2312");
            NodeList list = parser.extractAllNodesThatMatch(filter);
            StringBuilder newsStr = new StringBuilder(
                    "<!DOCTYPE html><html><head></head><body><table>");
            // NOTE(review): the last <ul> on the page and the first child of
            // every <ul> are deliberately left out, exactly as before;
            // presumably they are not headline entries - confirm.
            for (int i = 0; i < list.size() - 1; i++) {
                Tag node = (Tag) list.elementAt(i);
                NodeList children = node.getChildren();
                if (children == null) {
                    // A <ul> without children has no child list at all.
                    continue;
                }
                for (int j = 1; j < children.size(); j++) {
                    String textstr = children.elementAt(j).toHtml().trim();
                    if (textstr.length() == 0) {
                        continue;
                    }
                    // Locate href="...">title</a>. Each search starts after
                    // the previous marker so that an earlier "> (for example
                    // on the enclosing <li>) cannot produce an inverted range.
                    int linkbegin = textstr.indexOf("href=");
                    int linkend = linkbegin < 0 ? -1 : textstr.indexOf("\">",
                            linkbegin + 6);
                    int titleend = linkend < 0 ? -1 : textstr.indexOf("</a>",
                            linkend + 2);
                    if (titleend < 0) {
                        // Not a well-formed headline link; skip this child.
                        continue;
                    }
                    // Text between href=" and the tag end; it may still carry
                    // trailing attributes such as " target="_blank.
                    String sublink = textstr.substring(linkbegin + 6, linkend);
                    int quote = sublink.indexOf("\"");
                    String link = quote >= 0 ? sublink.substring(0, quote)
                            : sublink;
                    String title = textstr.substring(linkend + 2, titleend)
                            .trim();
                    System.out.println("正在抓取: " + title);
                    // Strip the picture marker before the duplicate check so
                    // the lookup uses the same title that gets stored.
                    title = title.replace("(图)", "");
                    if (dao.hasNews(title)) {
                        System.out.println("【该记录已经存在】");
                        continue;
                    }
                    if (title.contains("视频:") || title.contains("视频：")) {
                        System.out.println("【无法获得视频新闻】");
                        continue;
                    }
                    try {
                        bodyparser.setURL(link);
                        bodyparser.setEncoding("gb2312");
                        NodeList bodylist = bodyparser
                                .extractAllNodesThatMatch(bodyfilter);
                        if (bodylist.size() == 0
                                || bodylist.elementAt(0) == null) {
                            System.out.println("【新闻无内容】");
                            continue;
                        }
                        String newstextstr = bodylist.elementAt(0).toHtml()
                                .trim();
                        // Keep only the paragraphs (first <p> to last </p>) so
                        // the layout survives; fall back to the whole
                        // container when no complete paragraph range exists.
                        int bodybegin = newstextstr.indexOf("<p>");
                        int bodyend = newstextstr.lastIndexOf("</p>") + 4;
                        String body;
                        if (bodybegin >= 0 && bodyend > bodybegin) {
                            body = newstextstr.substring(bodybegin, bodyend);
                        } else {
                            body = newstextstr;
                        }
                        // Prepend the image wrapper(s), cut off before the
                        // last caption span and closed with </div>.
                        int bodyimgbegin = newstextstr
                                .indexOf("<div class=\"img_wrapper\">");
                        int bodyimgend = newstextstr
                                .lastIndexOf("<span class=\"img_descr\">");
                        if (bodyimgbegin >= 0 && bodyimgend > bodyimgbegin) {
                            body = newstextstr.substring(bodyimgbegin,
                                    bodyimgend) + "</div>" + body;
                        }
                        // The 10 characters before the last '/' of the link
                        // are stored as a separate field.
                        // NOTE(review): presumably a yyyy-MM-dd date segment
                        // - confirm against NewsBean.
                        int lastSlash = link.lastIndexOf("/");
                        NewsBean newsBean = new NewsBean(0, title, body, link,
                                link.substring(lastSlash - 10, lastSlash),
                                type);
                        dao.add(newsBean);
                    } catch (Exception e) {
                        // One broken article must not stop the crawl.
                        System.out.println("抓取信息子页面出错，出错信息为：");
                        e.printStackTrace();
                    }
                    // Add the headline to the index table.
                    newsStr.append("<tr><td><a target=\"_blank\" href=\""
                            + link + "\">");
                    newsStr.append(title);
                    newsStr.append("</a></td></tr>");
                }
            }
            newsStr.append("</table></body></html>");
            return newsStr.toString();
        } catch (Exception e) {
            System.out.println("抓取信息出错，出错信息为：");
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Writes a string to a file below the directory named by {@code type}.
     * <p>
     * If {@code filename} contains ".cn/", everything after that marker is
     * used as the file name with every '/' replaced by '_'; otherwise
     * {@code filename} is used as given. Missing parent directories are
     * created. Errors are printed, not thrown.
     *
     * @param str
     *            content to write
     * @param filename
     *            article URL or plain file name
     * @param type
     *            directory the file is placed in
     */
    public void writefile(String str, String filename, String type) {
        if (filename.contains(".cn/")) {
            filename = type
                    + File.separator
                    + filename.substring(filename.indexOf(".cn/") + 4).replace(
                            "/", "_");
        } else {
            filename = type + File.separator + filename;
        }
        File file = new File(filename);
        // Create the containing directory, never the target path itself;
        // otherwise a directory would occupy the file's name.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        try {
            FileWriter writer = new FileWriter(file);
            try {
                writer.write(str);
            } finally {
                // Release the file handle even when the write fails.
                writer.close();
            }
            System.out.println("成功生成新闻页面" + filename);
        } catch (Exception e) {
            System.out.println("将信息写入文件" + filename + "发生错误，错误信息为：");
            e.printStackTrace();
        }
    }
}