package cn.com.sample;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSON;
import com.sun.org.apache.xml.internal.serialize.OutputFormat.DTD;
public class ExtractSearchResult {
private static String blockRegex = "<script>STK\\s&&\\sSTK\\.pageletM\\s&&\\sSTK\\.pageletM\\.view\\(.*\\)";
private static Pattern pattern = Pattern.compile(blockRegex);
private static Whitelist whitelist = new Whitelist();
static {
// 只保留em标签的文本
whitelist.addTags("em");
}
protected static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
public static void getWeiboContent(Document inDocument) {
String source = inDocument.html();
Document pageDocument = null;
System.out.println(source);
// 匹配文本块
Matcher m = pattern.matcher(source);
while (m.find()) {
String jsonStr = m.group();
jsonStr = jsonStr.substring(jsonStr.indexOf("{"),
jsonStr.lastIndexOf(")"));
// 解析json,转换为实体类
WeiboBlock block = new WeiboBlock();
block = JSON.parseObject(jsonStr, WeiboBlock.class);
System.out.println("SSS:::" + JSON.parse(jsonStr));
//System.out.println("SSS:::" + block.getHtml());
if (block.getHtml().trim()
.startsWith("<div class=\"search_feed\">")) {
inDocument = Jsoup.parse(block.getHtml());
}
if (block.getHtml().trim().startsWith("<div class=\"topcon_num\">")){
pageDocument = Jsoup.parse(block.getHtml());
}
}
//结果数
String pageContent = "0";
if (pageDocument != null){
Element pageElement = pageDocument.select("[node-type=totalNum]").first();
pageContent = pageElement.text();
pageContent = pageContent.trim().replace("找到 ", "").replace(" 条结果", "");
}
//System.out.println("SSS:::" + pageContent);//貌似出不来了
List<Element> elements = getAllElement(inDocument);
if (elements == null || elements.size() == 0) {
System.out.println("No more urls to fetch with current keyword.");
return;
}
//System.out.println("indoc:");
//System.out.println(inDocument.html());
//System.out.println("\n");
for (Element elem : elements) {
//System.out.println(elem.html());
String url = elem.select(".date").last().attr("href");
String dateS = elem.select(".date").last().attr("date");
String content = null;
Date date = null;
String content_text = null;
String title = null;
String userName = null;
String uid = null;
String mid = null;
String dateString = null;
String reUserName = null;
String reDateString = null;
String reContent = null;
String reMid = null;
String reUid = null;
double lon = 0.0;
double lat = 0.0;
if (url != null) {
/*
* if (dateS != null && !"".equals(dateS)) { try { date =
* sdf.parse(dateS); } catch (ParseException e) {
* e.printStackTrace(); } }
*/
if (dateS != null) {
mid = elem.attr("mid");//mid值
//elem.getElementsByClass("info W_linkb W_textb").remove();
userName = elem.select(".content").select("a[nick-name]")
.attr("title");
uid = elem.select(".content").select("a[nick-name]")
.attr("suda-data");
int startIndex =uid.indexOf("weibo_nologin_name:");
uid = uid.substring(startIndex+19);//正则或改进
//System.out.println("XXX:"+uid);
dateString = elem.select(".content").select("a[date]")
.attr("title");
content = Jsoup.clean(
elem.select(".content")
.select("p[node-type=feed_list_content]")
.select("em").html(), Whitelist.none())
.replaceAll(""", "\"");
// content = content.substring(content.indexOf(":")+1,
// content.indexOf(" 转发")).replaceAll(""", "\"");
String actionData = elem.select(".content").select(".map_data").select("a[action-data]")
.attr("action-data");
if(actionData.contains("geo")){
String[] geos = actionData.split("&");
String[] cards = geos[0].replace("geo=", "").split(",");
lon = Double.valueOf(cards[0]);
lat = Double.valueOf(cards[1]);
System.out.println(userName + "\t" + dateString + "\t"
+ content + "\t" + lon + "\t" + lat);
}
//title = this.parseTitle(content);
Element reElem = elem
.select(".content")
.select("dl[class=comment W_textc W_linecolor W_bgcolor]")
.first();
if (reElem != null) {
if(reElem.html().indexOf("此微博已被作者删除") > 0){
continue;
}
reUserName = reElem.select("a[nick-name]")
.attr("title");
reDateString = reElem.select("a[date]").html();
reContent = Jsoup
.clean(reElem
.select("dt[node-type=feed_list_forwardContent]")
.select("em").html(), Whitelist.none())
.replaceAll(""", "\"");
System.out.println("\tRe:" + reUserName + "\t"
+ reDateString + "\t" + reContent);
String metaString = elem.select("p[class=info W_linkb W_textb]")
.select("a").attr("action-data");
//改进正则表达式
int remidStartIndex = metaString.indexOf("rootmid=");
int remidEndIndex = metaString.indexOf("&rootname");
int reuidStartTndex = metaString.indexOf("rootuid=");
int reuidEndTndex = metaString.indexOf("&rooturl=");
reMid = metaString.substring(remidStartIndex+"rootmid=".length(),
remidEndIndex);
reUid = metaString.substring(reuidStartTndex+"rootuid=".length(),
reuidEndTndex);
System.out.println("XXX:" + reMid + "-" + reUid);
}
url = elem.select(".date").last().attr("href");
}
} else {
System.out.println("current Url: ---------null------------");
}
}
}
public static List<String> getWeiboContentWithGeo(Document inDocument) {
List<String> resultList = new ArrayList<>();
String source = inDocument.html();
Document pageDocument = null;
//System.out.println(source);
// 匹配文本块
Matcher m = pattern.matcher(source);
while (m.find()) {
String jsonStr = m.group();
jsonStr = jsonStr.substring(jsonStr.indexOf("{"),
jsonStr.lastIndexOf(")"));
// 解析json,转换为实体类
WeiboBlock block = new WeiboBlock();
block = JSON.parseObject(jsonStr, WeiboBlock.class);
//System.out.println("SSS:::" + JSON.parse(jsonStr));
//System.out.println("SSS:::" + block.getHtml());
if (block.getHtml().trim()
.startsWith("<div class=\"search_feed\">")) {
inDocument = Jsoup.parse(block.getHtml());
}
if (block.getHtml().trim().startsWith("<div class=\"topcon_num\">")){
pageDocument = Jsoup.parse(block.getHtml());
}
}
//结果数
String pageContent = "0";
if (pageDocument != null){
Element pageElement = pageDocument.select("[node-type=totalNum]").first();
pageContent = pageElement.text();
pageContent = pageContent.trim().replace("找到 ", "").replace(" 条结果", "");
}
//System.out.println("SSS:::" + pageContent);//貌似出不来了
List<Element> elements = getAllElement(inDocument);
if (elements == null || elements.size() == 0) {
System.out.println("No more urls to fetch with current keyword.");
return null;
}
//System.out.println("indoc:");
//System.out.println(inDocument.html());
//System.out.println("\n");
for (Element elem : elements) {
//System.out.println(elem.html());
String url = elem.select(".date").last().attr("href");
String dateS = elem.select(".date").last().attr("date");
String content = null;
Date date = null;
String content_text = null;
String title = null;
String userName = null;
String uid = null;
String mid = null;
String dateString = null;
String reUserName = null;
String reDateString = null;
String reContent = null;
String reMid = null;
String reUid = null;
double lon = 0.0;
double lat = 0.0;
if (url != null) {
/*
* if (dateS != null && !"".equals(dateS)) { try { date =
* sdf.parse(dateS); } catch (ParseException e) {
* e.printStackTrace(); } }
*/
if (dateS != null) {
mid = elem.attr("mid");//mid值
//elem.getElementsByClass("info W_linkb W_textb").remove();
userName = elem.select(".content").select("a[nick-name]")
.attr("title");
uid = elem.select(".content").select("a[nick-name]")
.attr("suda-data");
int startIndex =uid.indexOf("weibo_nologin_name:");
uid = uid.substring(startIndex+19);//正则或改进
//System.out.println("XXX:"+uid);
dateString = elem.select(".content").select("a[date]")
.attr("title");
content = Jsoup.clean(
elem.select(".content")
.select("p[node-type=feed_list_content]")
.select("em").html(), Whitelist.none())
.replaceAll(""", "\"");
// content = content.substring(content.indexOf(":")+1,
// content.indexOf(" 转发")).replaceAll(""", "\"");
String actionData = elem.select(".content").select(".map_data").select("a[action-data]")
.attr("action-data");
if(actionData.contains("geo")){
String[] geos = actionData.split("&");
String[] cards = geos[0].replace("geo=", "").split(",");
lon = Double.valueOf(cards[0]);
lat = Double.valueOf(cards[1]);
System.out.println(mid + "\t" + userName + "\t" + dateString + "\t"
+ content + "\t" + lon + "\t" + lat);
resultList.add(mid + userName + "\t" + dateString + "\t"
+ content + "\t" + lon + "\t" + lat);
}
}
} else {
System.out.println("current Url: ---------null------------");
}
}
return resultList;
}
/**
* 生成标题
*
* @param htmlContent
* @return
*/
private static String parseTitle(String htmlContent) {
if (htmlContent == null || htmlContent.trim().equals(""))
return null;
String title = htmlContent;
title = title.trim();
for (int i = 0; i < title.length(); i++) {
if (String.valueOf((title.charAt(i))).matches("[,.\\?\\!\\.,]")) {
title = title.substring(0, i);
break;
}
}
return title;
}
/**
* 获取所有的结果正文节点
*
* @param doc
* @return
*/
private static List<Element> getAllElement(Document doc) {
List<Element> resultList = new ArrayList<Element>();
Elements elems = doc.select(".search_feed .feed_list");
for (Element element : elems) {
resultList.add(element);
}
return resultList;
}
}
// Article title: "Scraping Sina data".
// Page footer residue: "Latest recommended article published on 2021-03-20 04:20:51".