抓取新浪数据

5 篇文章 0 订阅
4 篇文章 0 订阅
package cn.com.sample;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;
import com.sun.org.apache.xml.internal.serialize.OutputFormat.DTD;

public class ExtractSearchResult {
	private static String blockRegex = "<script>STK\\s&&\\sSTK\\.pageletM\\s&&\\sSTK\\.pageletM\\.view\\(.*\\)";
	private static Pattern pattern = Pattern.compile(blockRegex);
	private static Whitelist whitelist = new Whitelist();
	static {
		// 只保留em标签的文本
		whitelist.addTags("em");
	}

	protected static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");

	public static void getWeiboContent(Document inDocument) {
		String source = inDocument.html();
		Document pageDocument = null;
		System.out.println(source);
		// 匹配文本块
		Matcher m = pattern.matcher(source);
		while (m.find()) {

			String jsonStr = m.group();

			jsonStr = jsonStr.substring(jsonStr.indexOf("{"),
					jsonStr.lastIndexOf(")"));
			// 解析json,转换为实体类
			WeiboBlock block = new WeiboBlock();
			block = JSON.parseObject(jsonStr, WeiboBlock.class);
			System.out.println("SSS:::" + JSON.parse(jsonStr));
			//System.out.println("SSS:::" + block.getHtml());

			if (block.getHtml().trim()
					.startsWith("<div class=\"search_feed\">")) {
				inDocument = Jsoup.parse(block.getHtml());
			}
			
			if (block.getHtml().trim().startsWith("<div class=\"topcon_num\">")){
				pageDocument = Jsoup.parse(block.getHtml());
			}
		}
		
		//结果数
		String pageContent = "0";
		if (pageDocument != null){
		Element pageElement = pageDocument.select("[node-type=totalNum]").first();
		pageContent = pageElement.text();
		pageContent = pageContent.trim().replace("找到 ", "").replace(" 条结果", "");
		}
		//System.out.println("SSS:::" + pageContent);//貌似出不来了

		List<Element> elements = getAllElement(inDocument);

		if (elements == null || elements.size() == 0) {

			System.out.println("No more urls to fetch with current keyword.");
			return;
		}
		//System.out.println("indoc:");
		//System.out.println(inDocument.html());
		//System.out.println("\n");

		for (Element elem : elements) {
			//System.out.println(elem.html());
			String url = elem.select(".date").last().attr("href");
			String dateS = elem.select(".date").last().attr("date");
			String content = null;
			Date date = null;
			String content_text = null;
			String title = null;
			String userName = null;
			String uid = null;
			String mid = null;
			String dateString = null;
			String reUserName = null;
			String reDateString = null;
			String reContent = null;
			String reMid = null;
			String reUid = null;
			double lon = 0.0;
			double lat = 0.0;
			if (url != null) {

				/*
				 * if (dateS != null && !"".equals(dateS)) { try { date =
				 * sdf.parse(dateS); } catch (ParseException e) {
				 * e.printStackTrace(); } }
				 */

				if (dateS != null) {
					mid = elem.attr("mid");//mid值
					
					//elem.getElementsByClass("info W_linkb W_textb").remove();
					userName = elem.select(".content").select("a[nick-name]")
							.attr("title");
					
					uid = elem.select(".content").select("a[nick-name]")
							.attr("suda-data");
					int startIndex  =uid.indexOf("weibo_nologin_name:");
					uid = uid.substring(startIndex+19);//正则或改进
					//System.out.println("XXX:"+uid);
					
					dateString = elem.select(".content").select("a[date]")
							.attr("title");
					content = Jsoup.clean(
							elem.select(".content")
									.select("p[node-type=feed_list_content]")
									.select("em").html(), Whitelist.none())
							.replaceAll(""", "\"");
					// content = content.substring(content.indexOf(":")+1,
					// content.indexOf(" 转发")).replaceAll(""", "\"");
					
					String actionData = elem.select(".content").select(".map_data").select("a[action-data]")
							.attr("action-data");
					if(actionData.contains("geo")){
						String[] geos = actionData.split("&");
						String[] cards = geos[0].replace("geo=", "").split(",");
						lon = Double.valueOf(cards[0]);
						lat = Double.valueOf(cards[1]);
						System.out.println(userName + "\t" + dateString + "\t"
								+ content + "\t" + lon + "\t" + lat);
					}
					//title = this.parseTitle(content);

					Element reElem = elem
							.select(".content")
							.select("dl[class=comment W_textc W_linecolor W_bgcolor]")
							.first();
					if (reElem != null) {
						if(reElem.html().indexOf("此微博已被作者删除") > 0){
							continue;
						}
						reUserName = reElem.select("a[nick-name]")
								.attr("title");
						reDateString = reElem.select("a[date]").html();
						reContent = Jsoup
								.clean(reElem
										.select("dt[node-type=feed_list_forwardContent]")
										.select("em").html(), Whitelist.none())
								.replaceAll(""", "\"");
						
						
						System.out.println("\tRe:" + reUserName + "\t"
								+ reDateString + "\t" + reContent);
						
						String metaString = elem.select("p[class=info W_linkb W_textb]")
								.select("a").attr("action-data");
						//改进正则表达式
						int remidStartIndex = metaString.indexOf("rootmid=");
						int remidEndIndex = metaString.indexOf("&rootname");
						int reuidStartTndex = metaString.indexOf("rootuid=");
						int reuidEndTndex = metaString.indexOf("&rooturl=");
						
						reMid = metaString.substring(remidStartIndex+"rootmid=".length(),
								remidEndIndex);
						
						reUid = metaString.substring(reuidStartTndex+"rootuid=".length(),
								reuidEndTndex);
						
						System.out.println("XXX:" + reMid + "-" + reUid);
					}
					

					url = elem.select(".date").last().attr("href");
				}
			} else {
				System.out.println("current Url: ---------null------------");
			}
		}

	}

	public static List<String> getWeiboContentWithGeo(Document inDocument) {
		List<String> resultList = new ArrayList<>();
		String source = inDocument.html();
		Document pageDocument = null;
		//System.out.println(source);
		// 匹配文本块
		Matcher m = pattern.matcher(source);
		while (m.find()) {

			String jsonStr = m.group();

			jsonStr = jsonStr.substring(jsonStr.indexOf("{"),
					jsonStr.lastIndexOf(")"));
			// 解析json,转换为实体类
			WeiboBlock block = new WeiboBlock();
			block = JSON.parseObject(jsonStr, WeiboBlock.class);
			//System.out.println("SSS:::" + JSON.parse(jsonStr));
			//System.out.println("SSS:::" + block.getHtml());

			if (block.getHtml().trim()
					.startsWith("<div class=\"search_feed\">")) {
				inDocument = Jsoup.parse(block.getHtml());
			}
			
			if (block.getHtml().trim().startsWith("<div class=\"topcon_num\">")){
				pageDocument = Jsoup.parse(block.getHtml());
			}
		}
		
		//结果数
		String pageContent = "0";
		if (pageDocument != null){
		Element pageElement = pageDocument.select("[node-type=totalNum]").first();
		pageContent = pageElement.text();
		pageContent = pageContent.trim().replace("找到 ", "").replace(" 条结果", "");
		}
		//System.out.println("SSS:::" + pageContent);//貌似出不来了

		List<Element> elements = getAllElement(inDocument);

		if (elements == null || elements.size() == 0) {

			System.out.println("No more urls to fetch with current keyword.");
			return null;
		}
		//System.out.println("indoc:");
		//System.out.println(inDocument.html());
		//System.out.println("\n");

		for (Element elem : elements) {
			//System.out.println(elem.html());
			String url = elem.select(".date").last().attr("href");
			String dateS = elem.select(".date").last().attr("date");
			String content = null;
			Date date = null;
			String content_text = null;
			String title = null;
			String userName = null;
			String uid = null;
			String mid = null;
			String dateString = null;
			String reUserName = null;
			String reDateString = null;
			String reContent = null;
			String reMid = null;
			String reUid = null;
			double lon = 0.0;
			double lat = 0.0;
			if (url != null) {

				/*
				 * if (dateS != null && !"".equals(dateS)) { try { date =
				 * sdf.parse(dateS); } catch (ParseException e) {
				 * e.printStackTrace(); } }
				 */

				if (dateS != null) {
					mid = elem.attr("mid");//mid值
					
					//elem.getElementsByClass("info W_linkb W_textb").remove();
					userName = elem.select(".content").select("a[nick-name]")
							.attr("title");
					
					uid = elem.select(".content").select("a[nick-name]")
							.attr("suda-data");
					int startIndex  =uid.indexOf("weibo_nologin_name:");
					uid = uid.substring(startIndex+19);//正则或改进
					//System.out.println("XXX:"+uid);
					
					dateString = elem.select(".content").select("a[date]")
							.attr("title");
					content = Jsoup.clean(
							elem.select(".content")
									.select("p[node-type=feed_list_content]")
									.select("em").html(), Whitelist.none())
							.replaceAll(""", "\"");
					// content = content.substring(content.indexOf(":")+1,
					// content.indexOf(" 转发")).replaceAll(""", "\"");
					
					String actionData = elem.select(".content").select(".map_data").select("a[action-data]")
							.attr("action-data");
					if(actionData.contains("geo")){
						String[] geos = actionData.split("&");
						String[] cards = geos[0].replace("geo=", "").split(",");
						lon = Double.valueOf(cards[0]);
						lat = Double.valueOf(cards[1]);
						System.out.println(mid + "\t" + userName + "\t" + dateString + "\t"
								+ content + "\t" + lon + "\t" + lat);
						resultList.add(mid + userName + "\t" + dateString + "\t"
								+ content + "\t" + lon + "\t" + lat);
					}
				}
			} else {
				System.out.println("current Url: ---------null------------");
			}
		}
		return resultList;
	}
	
	/**
	 * 生成标题
	 * 
	 * @param htmlContent
	 * @return
	 */
	private static String parseTitle(String htmlContent) {
		if (htmlContent == null || htmlContent.trim().equals(""))
			return null;
		String title = htmlContent;
		title = title.trim();
		for (int i = 0; i < title.length(); i++) {
			if (String.valueOf((title.charAt(i))).matches("[,.\\?\\!\\.,]")) {
				title = title.substring(0, i);
				break;
			}
		}
		return title;
	}

	/**
	 * 获取所有的结果正文节点
	 * 
	 * @param doc
	 * @return
	 */
	private static List<Element> getAllElement(Document doc) {

		List<Element> resultList = new ArrayList<Element>();

		Elements elems = doc.select(".search_feed .feed_list");

		for (Element element : elems) {
			resultList.add(element);
		}

		return resultList;
	}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值