抓取腾讯新闻评论


package com.orange.qqnews;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.json.JSONObject;

public class Test {
	public static void main(String[] args) {
		//新闻正文正则
		String regex1 = "<div id=\"Cnt-Main-Article-QQ\" bossZone=\"content\">([\\d\\D]*?)</div>";
		//评论ID正则
		String regex2 = "cmt_id = ([\\d]*?);";
		//获取网页源代码
		String html = openUrl("http://news.qq.com/a/20150825/004734.htm","gb2312");
		//获取新闻正则
		String content = getContent(regex1,html);
		System.out.println(content);
		//获取评论ID
		String cmtId = getContent(regex2,html);
		System.out.println(cmtId);
		
		//拼接评论地址
		String cmtUrl = "http://coral.qq.com/article/"+cmtId+"/comment?commentid=0&reqnum=20";
		String cmt = openUrl(cmtUrl,"gb2312");

		JSONObject jsonMap = new JSONObject();
		Map map = jsonMap.fromObject(cmt);
		Map<String,List> data = (Map)map.get("data");
		List<Map<String,String>> comments = data.get("commentid");
		
		for(Map<String,String> m : comments){
			String cmtContent = m.get("content"); //评论
			
			//其他信息略过(回复人,回复时间,赞等)
			
			System.out.println(cmtContent);
		}
		

	}

	/**
	 * 访问url返回url的html代码
	 */
	public static String openUrl(String currentUrl,String charset) {
		InputStream is = null;
		BufferedReader br = null;
		URL url;
		StringBuffer html = new StringBuffer();
		try {
			url = new URL(currentUrl);
			URLConnection conn = url.openConnection();
			conn.setReadTimeout(5000);
			conn.connect();
			is = conn.getInputStream();
			br = new BufferedReader(new InputStreamReader(is,charset));
			String str;
			while (null != (str = br.readLine())) {
				html.append(str).append("\n");
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (is != null) {
				try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}

		}
		return html.toString();
	}
	
	private static String getContent(String regex,String text) {
		String content = "";
		Pattern pattern = Pattern.compile(regex);
		Matcher matcher = pattern.matcher(text);
		while(matcher.find()) {
			content = matcher.group(1).toString();
		}
		return content;
	}
}




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值
>