package com.laudandjolynn.test;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.exception.TikaException;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class WeiboCnUtils {
private final static Logger logger = LoggerFactory
.getLogger(WeiboCnUtils.class);
private final static Pattern PATTERN_SID = Pattern.compile("uid=(\d+)");
private final static Pattern PATTERN_WB_POST_DATE_TIME1 = Pattern
.compile("(\d{2}):(\d{2})");
private final static Pattern PATTERN_WB_POST_DATE_TIME2 = Pattern
.compile("(\d{2})月(\d{2})日\s+(\d{2}:\d{2})");
private final static Pattern PATTERN_WB_POST_DATE_TIME3 = Pattern
.compile("(\d{1,4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})");
private final static String SINA_IMG_HREF_PREFIX = "http://ww1.sinaimg.cn/thumbnail/";
/**
* 解析weibo.cn页面
*
* @param html
* @param userService
* @return
* @throws ParserConfigurationException
* @throws SAXException
* @throws IOException
* @throws XPathExpressionException
*/
public static void parsePageAtWeibocn(String html) throws ParserConfigurationException,
SAXException, IOException, XPathExpressionException {
DocumentBuilderFactory domFactory = DocumentBuilderFactory
.newInstance();
domFactory.setIgnoringComments(true);
domFactory.setValidating(false);
// 需要注意处理html不规范的问题,因此这里使用jsoup得到可以正常解析的html
DocumentBuilder domBuilder = domFactory.newDocumentBuilder();
Document doc = domBuilder.parse(new InputSource(new StringReader(Jsoup
.parse(html).html())));
XPathFactory factory = XPathFactory.newInstance();
XPath xpath = factory.newXPath();
NodeList nodes = (NodeList) xpath.evaluate(
"html/body//div[@class='c' and @id]", doc,
XPathConstants.NODESET);
for (int i = 0; i < nodes.getLength(); i++) {
Node node = nodes.item(i);
try {
// 微博id
String weiboid = ((String) xpath.evaluate("@id", node,
XPathConstants.STRING)).substring(2);
logger.info("weibo.cn - (" + i + "), weiboid: " + weiboid);
NodeList children = (NodeList) xpath.evaluate("child::*", node,
XPathConstants.NODESET);
Node feed1 = children.item(0);
// uid
String uid = null;
String weibo = null;
// 博主昵称
String nickName = ((String) xpath.evaluate(
"a[@class='nk']/text()", feed1, XPathConstants.STRING))
.trim();
logger.debug("weibo.cn - (" + i + "), nickname: " + nickName);
boolean v = false;
String pic = null;
long attitudeCount = 0;
long repostsCount = 0;
long commentsCount = 0;
// #########################
// 被转发用户uid
String fuid = null;
String fweibo = null;
// 被转发用户昵称
String fnickName = null;
// 被转发用户身份
boolean fv = false;
String fpic = null;
long fattitudeCount = 0;
long frepostsCount = 0;
long fcommentsCount = 0;
long fcreatedAt = 0;
// 身份:加V,达人等
if (xpath.evaluate("img[@alt][1]", feed1, XPathConstants.NODE) != null) {
v = true;
}
logger.debug("weibo.cn - (" + i + "), vip: " + v);
// 是否转发
boolean isForward = false;
Node forwardNode = (Node) xpath.evaluate("span[@class='cmt']",
feed1, XPathConstants.NODE);
if (forwardNode != null) {
fnickName = ((String) xpath.evaluate("a[@href]/text()",
forwardNode, XPathConstants.STRING)).trim();
logger.debug("weibo.cn - (" + i + "), forward nickname: "
+ fnickName);
if (xpath.evaluate("img[@alt and @src]", forwardNode,
XPathConstants.NODE) != null) {
fv = true;
}
isForward = true;
logger.debug("weibo.cn - (" + i + "), forward: "
+ isForward);
}
// 微博内容
String tweibo = ((String) xpath.evaluate(
"span[@class='ctt']/text()", feed1,
XPathConstants.STRING)).trim();
logger.debug("weibo.cn - (" + i + "), content: " + tweibo);
weibo = isForward ? null : tweibo;
fweibo = isForward ? tweibo : null;
int childCount = children.getLength();
if (childCount == 1) {
uid = getUid(feed1, xpath);
logger.debug("weibo.cn - (" + i + "), uid: " + uid);
getFeedStatistic(feed1, xpath, i);
continue;
}
if (childCount >= 2) {
// 是否单图还是组图
boolean hasGroupPic = false;
String groupPicText = (String) xpath.evaluate(
"a[contains(@href,'picAll')]/text()", feed1,
XPathConstants.STRING);
if (!StringUtils.isEmpty(groupPicText)) {
int picCount = Integer.valueOf(groupPicText.substring(
3, groupPicText.length() - 1));
hasGroupPic = true;
logger.debug("weibo.cn - (" + i
+ "), group picture count: " + picCount);
}
// 微博图片、原微博统计数据
Node feed2 = children.item(1);
String tmpPicSrc = null;
// 取出微博图片链接
if (hasGroupPic) {
String imageHref = (String) xpath.evaluate(
"a[contains(@href,'oripic')]/@href", feed2,
XPathConstants.STRING);
String imageName = imageHref.substring(imageHref
.indexOf("u=") + 2);
int index = imageName.indexOf("&");
if (index != -1) {
imageName = imageName.substring(0, index) + ".jpg";
} else {
imageName += ".jpg";
}
tmpPicSrc = SINA_IMG_HREF_PREFIX + imageName;
} else {
String src = (String) xpath.evaluate(
"a/img[@class='ib']/@src", feed2,
XPathConstants.STRING);
tmpPicSrc = SINA_IMG_HREF_PREFIX
+ src.substring(src.lastIndexOf("/") + 1);
}
logger.debug("weibo.cn - (" + i + "), picture: "
+ tmpPicSrc);
pic = isForward ? null : tmpPicSrc;
fpic = isForward ? tmpPicSrc : null;
if (!isForward) {
uid = getUid(feed2, xpath);
logger.debug("weibo.cn - (" + i + "), uid: " + uid);
getFeedStatistic(feed2, xpath, i);
} else {
fattitudeCount = ((Number) xpath
.evaluate(
"substring-after(substring-before(span[@class='cmt'][1]/text(),']'),'[')",
feed2, XPathConstants.NUMBER))
.longValue();
frepostsCount = ((Number) xpath
.evaluate(
"substring-after(substring-before(span[@class='cmt'][2]/text(),']'),'[')",
feed2, XPathConstants.NUMBER))
.longValue();
fcommentsCount = ((Number) xpath
.evaluate(
"substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(), ']'),'[')",
feed2, XPathConstants.NUMBER))
.longValue();
fuid = getUid(feed2, xpath);
logger.debug("weibo.cn - (" + i + "), forward uid:"
+ fuid + " 赞: " + fattitudeCount + ", 转发: "
+ frepostsCount + ", 评论: " + fcommentsCount);
}
}
if (childCount == 3) {
Node feed3 = children.item(2);
// 转发理由
weibo = ((String) xpath.evaluate("./text()", feed3,
XPathConstants.STRING)).trim();
attitudeCount = ((Number) xpath
.evaluate(
"substring-after(substring-before((a[contains(@href,'attitude')]|span[@class='cmt'][2])/text(),']'),'[')",
feed3, XPathConstants.NUMBER)).longValue();
repostsCount = ((Number) xpath
.evaluate(
"substring-after(substring-before(a[contains(@href,'repost')]/text(),']'),'[')",
feed3, XPathConstants.NUMBER)).longValue();
commentsCount = ((Number) xpath
.evaluate(
"substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(),']'),'[')",
feed3, XPathConstants.NUMBER)).longValue();
uid = getUid(feed3, xpath);
// 发表时间
String postDateTime = ((String) xpath.evaluate(
"span[@class='ct']/text()", feed3,
XPathConstants.STRING)).trim();
fcreatedAt = getCreatedAt(postDateTime);
logger.debug("weibo.cn - (" + i + "), uid: " + uid + " 赞: "
+ attitudeCount + ", 转发: " + repostsCount
+ ", 评论: " + commentsCount + ", 发表时间: "
+ postDateTime);
}
} catch (Exception e) {
continue;
}
}
}
private static void getFeedStatistic(Node feed, XPath xpath, int index)
throws XPathExpressionException {
// 微博赞数
long attitudeCount = ((Number) xpath
.evaluate(
"substring-after(substring-before((a[contains(@href,'attitude')]|span[@class='cmt'])/text(),']'),'[')",
feed, XPathConstants.NUMBER)).longValue();
// 转发数
long repostsCount = ((Number) xpath
.evaluate(
"substring-after(substring-before(a[contains(@href,'repost')]/text(),']'),'[')",
feed, XPathConstants.NUMBER)).longValue();
// 评论数
long commentsCount = ((Number) xpath
.evaluate(
"substring-after(substring-before(a[contains(@href,'comment') and @class='cc']/text(),']'),'[')",
feed, XPathConstants.NUMBER)).longValue();
// 发表时间
String postDateTime = ((String) xpath.evaluate(
"span[@class='ct']/text()", feed, XPathConstants.STRING))
.trim();
logger.debug("weibo.cn - (" + index + "), 赞: " + attitudeCount
+ ", 转发: " + repostsCount + ", 评论: " + commentsCount
+ ", 发表时间: " + postDateTime);
}
private static long getCreatedAt(String postDateTime)
throws XPathExpressionException {
// 发表时间,1分钟前/11:3/06月01日 12:30/2013-03-01 11:30:10
Calendar calendar = Calendar.getInstance();
Matcher timeMatcher = null;
if ((timeMatcher = PATTERN_WB_POST_DATE_TIME1.matcher(postDateTime))
.find()) {
// 11:00
calendar.set(Calendar.HOUR_OF_DAY,
Integer.valueOf(timeMatcher.group(1)));
calendar.set(Calendar.MINUTE, Integer.valueOf(timeMatcher.group(2)));
} else if ((timeMatcher = PATTERN_WB_POST_DATE_TIME3
.matcher(postDateTime)).find()) {
// 06月01日 00:00
calendar.set(Calendar.MONTH, Integer.valueOf(timeMatcher.group(1)));
calendar.set(Calendar.DAY_OF_MONTH,
Integer.valueOf(timeMatcher.group(2)));
calendar.set(Calendar.HOUR_OF_DAY,
Integer.valueOf(timeMatcher.group(3)));
calendar.set(Calendar.MINUTE, Integer.valueOf(timeMatcher.group(4)));
} else if ((timeMatcher = PATTERN_WB_POST_DATE_TIME2
.matcher(postDateTime)).find()) {
// 2013-03-01 11:30:10
calendar.set(Integer.valueOf(timeMatcher.group(1)),
Integer.valueOf(timeMatcher.group(2)),
Integer.valueOf(timeMatcher.group(3)),
Integer.valueOf(timeMatcher.group(4)),
Integer.valueOf(timeMatcher.group(5)),
Integer.valueOf(timeMatcher.group(6)));
} else {
// n分钟前 or 刚刚
// do nothing, user current time instead.
}
return calendar.getTimeInMillis();
}
private static String getUid(Node node, XPath xpath)
throws XPathExpressionException {
String href = (String) xpath.evaluate(
"a[contains(@href,'comment') and @class='cc']/@href", node,
XPathConstants.STRING);
// 用户sid
Matcher matcher = PATTERN_SID.matcher(href);
if (matcher.find()) {
return matcher.group(1);
}
return null;
}
private final static String WEIBO_CN_LOGIN_URL = "http://login.weibo.cn/login/?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt=";
private final static Pattern PATTERN_RAND_VALUE = Pattern
.compile("rand=(\\d+)");
/**
* 模拟登录weibo.cn
*
* @param loginName
* 微博帐号
* @param password
* 明文密码
* @return map, 包含cookie, cookie_expire, uid and success(true, false)
*/
public static Map<String, String> getCookieAndUidAtWeibocn(String loginName,
String password) {
Map<String, String> result = new HashMap<String, String>();
result.put("success", "false");
try {
Connection conn = Jsoup.connect(WEIBO_CN_LOGIN_URL);
conn.header("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
conn.header("Accept-Encoding", "gzip, deflate, sdch");
conn.header("Accept-Language",
"en-GB,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2,zh-TW;q=0.2");
conn.header("Cache-Control", "no-cache");
conn.header("Connection", "Keep-Alive");
conn.header("Content-Type", "application/x-www-form-urlencoded");
conn.header("Host", "login.weibo.cn");
conn.header("Pragma", "no-cache");
conn.header("Referer", "http://weibo.cn/pub/");
conn.header(
"User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36");
Response getResponse = conn.method(Method.GET).execute();
org.jsoup.nodes.Document doc = getResponse.parse();
Element form = doc.select("form[method=post]").get(0);
String action = form.attr("action");
Matcher matcher = PATTERN_RAND_VALUE.matcher(action);
Map<String, String> dataMap = new HashMap<String, String>();
String rand = null;
if (matcher.find()) {
rand = matcher.group(1);
} else {
return result;
}
dataMap.put("backURL", form.select("div input[name=backURL]")
.get(0).attr("value"));
dataMap.put("backTitle", form.select("div input[name=backTitle]")
.get(0).attr("value"));
dataMap.put("mobile", loginName);
dataMap.put(
form.select("div input[type=password]").get(0).attr("name"),
password);
dataMap.put("remember", "on");
dataMap.put("tryCount", form.select("div input[name=tryCount]")
.get(0).attr("value"));
dataMap.put("vk",
form.select("div input[name=vk]").get(0).attr("value"));
dataMap.put("submit", form.select("div input[name=submit]").get(0)
.attr("value"));
String postUrl = "http://login.weibo.cn/login/?rand="
+ rand
+ "&backURL=http%3A%2F%2Fweibo.cn%2F%3Fs2w%3Dlogin&backTitle=%E5%BE%AE%E5%8D%9A&vt=4&revalid=2&ns=1";
Map<String, String> header = new HashMap<String, String>();
header.put("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
header.put("Accept-Encoding", "gzip, deflate");
header.put("Accept-Language",
"en-GB,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2,zh-TW;q=0.2");
header.put("Cache-Control", "no-cache");
header.put("Connection", "keep-alive");
header.put("Content-Type", "application/x-www-form-urlencoded");
header.put("Host", "login.weibo.cn");
header.put("Origin", "http://login.weibo.cn");
header.put("Pragma", "no-cache");
header.put(
"Referer",
"http://login.weibo.cn/login/?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F%3Fs2w%3Dlogin&backTitle=%CE%A2%B2%A9&vt=");
header.put(
"User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36");
Map<String, List<String>> responseHeader = postThenGetHeader(
postUrl, header, dataMap);
if (responseHeader.containsKey("X-Log-Uid")) {
result.put("uid", responseHeader.get("X-Log-Uid").get(0));
}
if (responseHeader.containsKey("Set-Cookie")) {
StringBuffer cookieBuffer = new StringBuffer();
for (String v : responseHeader.get("Set-Cookie")) {
String[] cookies = v.split(";");
for (String c : cookies) {
if (c.contains("gsid") || c.contains("_T_WM")
|| c.contains("SUB") || c.contains("PHPSESSID")) {
cookieBuffer.append(c).append(";");
} else if (c.contains("expires")) {
result.put("cookie_expire", c.split("=")[1]);
}
}
}
result.put("cookie", cookieBuffer.toString());
}
if (!result.containsKey("cookie")
|| result.get("cookie").contains("=deleted")) {
result.put("success", "false");
} else {
CACHE_WEIBO_CN.put(loginName, result);
result.put("success", "true");
}
return result;
} catch (IOException e) {
logger.error(e.getMessage(), e);
}
return result;
}
private static Map<String, List<String>> postThenGetHeader(String url,
Map<String, String> header, Map<String, String> data) {
HttpURLConnection conn = null;
Map<String, List<String>> resultHeader = new HashMap<String, List<String>>();
try {
URL _url = new URL(url);
conn = (HttpURLConnection) _url.openConnection();
conn.setRequestMethod("POST");
conn.setInstanceFollowRedirects(false);
conn.setDoOutput(true);
conn.setDoInput(true);
for (String key : header.keySet()) {
conn.addRequestProperty(key, header.get(key));
}
conn.connect();
writePost(data, conn.getOutputStream());
resultHeader.putAll(conn.getHeaderFields());
return resultHeader;
} catch (Exception e) {
logger.error(e.getMessage(), e);
} finally {
if (conn != null) {
conn.disconnect();
}
}
return resultHeader;
}
private static void writePost(Map<String, String> data,
OutputStream outputStream) throws IOException {
OutputStreamWriter w = new OutputStreamWriter(outputStream, "UTF-8");
boolean first = true;
for (String key : data.keySet()) {
if (!(first))
w.append('&');
else {
first = false;
}
w.write(URLEncoder.encode(key, "UTF-8"));
w.write(61);
w.write(URLEncoder.encode(data.get(key), "UTF-8"));
}
w.close();
}
}