做毕设(九)——爬新闻

学习爬虫
http://blog.csdn.net/pleasecallmewhy/article/details/17594303
参考上面这篇教程,依样画葫芦,去爬新闻网站。

public class Spider {
    /**
     * Sends a GET request to {@code url} and returns the response body as a single string.
     *
     * <p>The body is decoded as UTF-8 and read line by line; the lines are concatenated
     * WITHOUT their line terminators, so the result is one long line. Any failure is
     * printed to standard output / the stack trace and is not propagated: whatever was
     * read before the failure (possibly the empty string) is returned.
     *
     * @param url absolute URL of the page to download
     * @return the concatenated lines of the response, or {@code ""} if nothing could be read
     */
    public static String SendGet(String url) {
        // URLConnection defaults to infinite timeouts; bound both phases so that one
        // unresponsive server cannot block the calling thread forever.
        final int connectTimeoutMillis = 10000;
        final int readTimeoutMillis = 10000;

        // StringBuilder avoids re-copying the whole page on every appended line,
        // which is what repeated String concatenation in a loop does.
        StringBuilder result = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(connectTimeoutMillis);
            connection.setReadTimeout(readTimeoutMillis);
            connection.connect();

            // try-with-resources closes the reader (and the underlying stream) on every path.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    // Line terminators are intentionally dropped, as before.
                    result.append(line);
                }
            }
        } catch (Exception e) {
            // Best-effort download: report the problem and fall through to return what we have.
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Extracts news entries from the HTML of a listing page.
     *
     * <p>Headline titles and headline links are matched independently against
     * {@code content} and paired up in order of appearance. For every pair the linked
     * article page is downloaded with {@link Spider#SendGet(String)} and the author and
     * body text are extracted from it. An entry is added for every title/link pair, even
     * when the author or body could not be found on the article page (those fields are
     * then simply not set).
     *
     * <p>NOTE(review): because titles and links use two separate matchers, a headline
     * that matches only one of the two patterns would shift the pairing for all following
     * headlines — confirm against the real page markup.
     *
     * @param content HTML of the listing page; {@code null} or empty yields an empty list
     * @return the extracted news entries, in page order (never {@code null})
     */
    public static ArrayList<News> GetNews(String content) {
        ArrayList<News> results = new ArrayList<News>();
        // Nothing to parse (e.g. the download failed and produced no content).
        if (content == null || content.isEmpty()) {
            return results;
        }

        // All patterns are compiled once per call rather than once per article.
        // Headline title: the title attribute inside an <h3> element.
        Pattern titlePattern = Pattern.compile("<h3.+?title=\"(.+?)\"\\st");
        // Headline link: a protocol-relative href ("//host/path") inside an <h3> element.
        Pattern urlPattern = Pattern.compile("<h3.+?href=\"//(.+?)\"");
        // Author name on the article page.
        Pattern authorPattern = Pattern.compile("•\\s<span>(.+?)<");
        // Article body on the article page.
        Pattern contextPattern = Pattern.compile("hb_content\">\\s+(.+?)</div");
        // The news type is deliberately not extracted here; it stays unset on the entry.

        Matcher titleMatcher = titlePattern.matcher(content);
        Matcher urlMatcher = urlPattern.matcher(content);

        // Continue only while BOTH a title and a link can still be found.
        while (titleMatcher.find() && urlMatcher.find()) {
            News news = new News();
            news.setTitle(titleMatcher.group(1));

            // The captured link has no scheme, so prepend one before downloading.
            String context = Spider.SendGet("http://" + urlMatcher.group(1));

            Matcher authorMatcher = authorPattern.matcher(context);
            Matcher contextMatcher = contextPattern.matcher(context);
            // Kept as a loop to preserve existing behaviour: if the page contains several
            // author/body pairs, the last pair found is the one stored.
            while (authorMatcher.find() && contextMatcher.find()) {
                news.setAuthor(authorMatcher.group(1));
                news.setContext(contextMatcher.group(1));
            }

            results.add(news);
        }

        return results;
    }

类型其实也是可以抓取的,不过我特意让它空着,用"类型是否为空"来代替审核功能:类型为空的新闻表示尚未审核(待编辑)。
因此,把前面的"获取新闻"改成只获取类型不为空的新闻,并另外增加一个获取类型为空的新闻的方法:

/**
 * Returns the news entries that already have a type.
 *
 * <p>Delegates directly to {@code newsRepository.findAllByTypeIsNotNull()}; presumably a
 * repository query selecting entries whose type is not null — confirm against the repository.
 *
 * @return the list produced by the repository call
 */
public List<News> getAllNewsByTypeIsNotNull() {
    final List<News> typedNews = newsRepository.findAllByTypeIsNotNull();
    return typedNews;
}
/**
 * Returns the news entries that do not have a type yet.
 *
 * <p>Delegates directly to {@code newsRepository.findAllByTypeIsNull()}; presumably a
 * repository query selecting entries whose type is null — confirm against the repository.
 *
 * @return the list produced by the repository call
 */
public List<News> getAllNewsByTypeIsNull() {
    final List<News> untypedNews = newsRepository.findAllByTypeIsNull();
    return untypedNews;
}

edit.html

<!-- Edit page fragment: a button that triggers crawling, two summary lines, and a table
     of news entries with one link per type plus a delete link. -->
<!-- No method attribute is given, so the form is submitted with GET to /edit/getnews. -->
<form action="/edit/getnews">
        <input type="submit" value="获取新闻">
    </form>
    <!-- "count" and "editNews" are read from the model supplied when this view is rendered. -->
    <p th:text="'爬取了'+${count}+'条新新闻'"></p>
    <p th:text="'有'+${#lists.size(editNews)}+'条新闻待编辑'"></p>
    <table border="1">
        <tr>
            <th>标题</th>
            <th>作者</th>
            <th>内容</th>
            <th width="10%">操作</th>
        </tr>
        <!-- One table row per element of editNews. -->
        <tr th:each="news:${editNews}">
            <td th:text="${news.title}"></td>
            <td th:text="${news.author}"></td>
            <td th:text="${news.context}"></td>
            <td>
                <!-- Each link targets /edit/{id} with the chosen value in the "type" query
                     parameter; the value 删除 is the one used for the delete link. -->
                <a th:href="@{'/edit/'+${news.id}+'?type=明星'}">明星</a>
                <a th:href="@{'/edit/'+${news.id}+'?type=时尚'}">时尚</a>
                <a th:href="@{'/edit/'+${news.id}+'?type=影视'}">影视</a>
                <a th:href="@{'/edit/'+${news.id}+'?type=宠物'}">宠物</a>
                <a th:href="@{'/edit/'+${news.id}+'?type=生活'}">生活</a>
                <a th:href="@{'/edit/'+${news.id}+'?type=删除'}">删除</a>
            </td>
        </tr>
    </table>
/**
 * Controller for the {@code /edit} pages, where crawled news entries that have no type
 * yet are listed so that a type can be assigned to them or they can be deleted.
 *
 * <p>Every handler first checks {@link #isAdmin(HttpSession)} and returns the
 * {@code "error"} view when the check fails.
 */
@Controller
@RequestMapping(value = "/edit")
public class EditController {
    /** Front page of the news site that is crawled by {@link #getNews}. */
    private static final String CRAWL_START_URL = "http://www.huabian.com/";

    @Autowired
    NewsService newsService;

    /**
     * Tells whether the session belongs to the administrator.
     *
     * <p>Only a session whose {@code "user"} attribute is a {@code User} with the
     * username {@code "admin"} qualifies. Every other case (no attribute, a different
     * user, a user without a username) is rejected; the previous unconditional
     * {@code return true} was a testing shortcut that let anyone edit news.
     *
     * @param httpSession the current HTTP session
     * @return {@code true} only for the {@code admin} user
     */
    public boolean isAdmin(HttpSession httpSession){
        Object sessionUser = httpSession.getAttribute("user");
        // instanceof also covers the "attribute missing" (null) case.
        if (!(sessionUser instanceof User)) {
            return false;
        }
        // Constant-first equals() tolerates a null username.
        return "admin".equals(((User) sessionUser).getUsername());
    }

    /**
     * Shows the edit page without crawling anything; the reported crawl count is 0.
     *
     * @param httpSession the current HTTP session, used for the admin check
     * @param map         model that receives {@code count} and {@code editNews}
     * @return {@code "edit"} for the admin, {@code "error"} otherwise
     */
    @RequestMapping(value = "",method = RequestMethod.GET)
    public String toEditPage(HttpSession httpSession,ModelMap map){
        if (!isAdmin(httpSession)) {
            return "error";
        }
        return showEditPage(map, 0);
    }

    /**
     * Crawls the news site, stores the crawled entries through
     * {@code newsService.saveDifferentNews}, and shows the edit page with the sum of
     * the values returned by those calls as the crawl count.
     *
     * @param httpSession the current HTTP session, used for the admin check
     * @param map         model that receives {@code count} and {@code editNews}
     * @return {@code "edit"} for the admin, {@code "error"} otherwise
     */
    @RequestMapping(value = "/getnews",method = RequestMethod.GET)
    public String getNews(HttpSession httpSession,ModelMap map){
        if (!isAdmin(httpSession)) {
            return "error";
        }
        // Download the front page and extract the news entries it links to.
        String content = Spider.SendGet(CRAWL_START_URL);
        ArrayList<News> newNews = Spider.GetNews(content);

        // saveDifferentNews reports, per entry, how much it contributes to the count.
        int count = 0;
        for (News crawled : newNews) {
            count += newsService.saveDifferentNews(crawled);
        }
        return showEditPage(map, count);
    }

    /**
     * Applies an editing decision to one news entry: the value {@code 删除} deletes the
     * entry, any other non-empty value is stored as the entry's type.
     *
     * <p>A missing or empty {@code type}, or an id for which no entry is returned, leaves
     * the data untouched and just redirects back to the edit page (a missing {@code type}
     * previously caused a {@code NullPointerException}).
     *
     * <p>NOTE(review): this handler changes data in response to a GET request; consider
     * moving it to POST together with the links in the edit page.
     *
     * @param id          id of the news entry to change
     * @param type        the type to assign, or {@code 删除} to delete the entry
     * @param httpSession the current HTTP session, used for the admin check
     * @return a redirect to the edit page for the admin, {@code "error"} otherwise
     */
    @RequestMapping(value = "/{id}",method = RequestMethod.GET)
    public String checkNews(@PathVariable("id") Integer id,String type, HttpSession httpSession){
        if (!isAdmin(httpSession)) {
            return "error";
        }
        if (type != null && !type.isEmpty()) {
            News news = newsService.getNewsById(id);
            // Guard in case the lookup yields nothing for this id.
            if (news != null) {
                if ("删除".equals(type)) {
                    newsService.delNews(news);
                } else {
                    news.setType(type);
                    newsService.updateNews(news);
                }
            }
        }
        return "redirect:/edit/";
    }

    /** Fills the model with the crawl count and the entries without a type, then selects the edit view. */
    private String showEditPage(ModelMap map, int count) {
        map.addAttribute("count", count);
        List<News> editNews = newsService.getAllNewsByTypeIsNull();
        map.addAttribute("editNews", editNews);
        return "edit";
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值