0、背景
原来对新浪微博的采集主要靠对weibo.com这个入口进行,但是最近发现有人使用weibo.cn这个入口操作。因为weibo.cn对应的是微博的3G版本,基本没有广告,页面小,这样下载的数据量会小很多,并且3G版本的采集也比较简单,于是就有了这个3G版本的采集程序。
写出来,分享给大家,希望对有需要的朋友有所帮助。
使用到的类库:httpclient、htmlcleaner
httpclient负责处理http的get和post请求,下载页面;htmlcleaner负责将下载的页面转化为规范的xml,之后用xpath匹配所需内容。
1、基础的http请求类
package cn.mingyuan.weibo.commons;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.DefaultHttpClient;
/**
 * Base class for objects that issue HTTP requests.
 *
 * <p>It owns a single {@link HttpClient} instance, created when the object is
 * constructed, and offers small helpers for putting headers and cookies on a
 * request.
 *
 * @author mingyuan
 */
public abstract class RequestCommons {

    /** Name of the request header that carries cookies. */
    private static final String COOKIE_HEADER = "Cookie";

    /**
     * Headers applied by {@link #setHeader(HttpRequestBase)}, as {name, value}
     * pairs, in the order they are set on the request.
     */
    private static final String[][] DEFAULT_HEADERS = {
        { "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" },
        { "Accept-Language", "en-us,en;q=0.5" },
        { "Connection", "keep-alive" },
        { "User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:13.0) Gecko/20100101 Firefox/13.0.1" },
    };

    /** The client used for every request made through this object. */
    protected HttpClient httpclient;

    /**
     * Creates the object and initialises its HTTP client through
     * {@link #initHttpClient()}.
     */
    public RequestCommons() {
        initHttpClient();
    }

    /**
     * Initialises {@link #httpclient} with a new {@link DefaultHttpClient}.
     */
    protected void initHttpClient() {
        this.httpclient = new DefaultHttpClient();
    }

    /**
     * @return the HTTP client held by this object
     */
    protected HttpClient getHttpClient() {
        return this.httpclient;
    }

    /**
     * Adds a header to the request; headers already present under the same
     * name are kept.
     *
     * @param request
     *            the GET or POST request to modify
     * @param key
     *            header name
     * @param value
     *            header value
     */
    protected void addHeader(HttpRequestBase request, String key, String value) {
        request.addHeader(key, value);
    }

    /**
     * Adds a {@code Cookie} header to the request via
     * {@link #addHeader(HttpRequestBase, String, String)}.
     *
     * @param request
     *            the GET or POST request to modify
     * @param cookie
     *            value of the {@code Cookie} header
     */
    protected void addCookie(HttpRequestBase request, String cookie) {
        addHeader(request, COOKIE_HEADER, cookie);
    }

    /**
     * Sets the {@code Cookie} header of the request, replacing any existing
     * one.
     *
     * @param request
     *            the GET or POST request to modify
     * @param cookie
     *            value of the {@code Cookie} header
     */
    protected void setCookie(HttpRequestBase request, String cookie) {
        request.setHeader(COOKIE_HEADER, cookie);
    }

    /**
     * Sets the default request headers ({@code Accept},
     * {@code Accept-Language}, {@code Connection} and {@code User-Agent}).
     *
     * @param request
     *            the GET or POST request to modify
     */
    protected void setHeader(HttpRequestBase request) {
        for (String[] header : DEFAULT_HEADERS) {
            request.setHeader(header[0], header[1]);
        }
    }
}
2、weibo.cn登陆
package cn.mingyuan.weibo.login;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.until.Constants;
/**
 * Logs in through the weibo.cn (3G) entry point and collects the cookies
 * returned along the way.
 *
 * <p>The flow implemented by {@link #doLogin(String, String)} is:
 * <ol>
 * <li>download the login page and read the form parameters from it;</li>
 * <li>POST the account name and password to the form's action URL;</li>
 * <li>fetch the page named by the {@code Location} header of that response
 * and read a cookie and a link from it;</li>
 * <li>request that link with the cookie and read a second cookie.</li>
 * </ol>
 *
 * @author mingyuan
 */
public class Login extends RequestCommons {

    /** Prefix that the form's relative {@code action} value is appended to. */
    private static final String SSO_BASE_URL = "http://3g.sina.com.cn/prog/wapsite/sso/";

    /**
     * Returns the string form of the first element of an XPath result.
     *
     * @param values
     *            result of an XPath evaluation, possibly empty or {@code null}
     * @return the first element as a string, or {@code null} if there is none
     */
    private static String firstOrNull(Object[] values) {
        if (values == null || values.length == 0 || values[0] == null) {
            return null;
        }
        return values[0].toString();
    }

    /**
     * Returns the value of the first header in the array.
     *
     * @param headers
     *            headers of a response, possibly empty or {@code null}
     * @return the first header's value, or {@code null} if there is none
     */
    private static String firstHeaderValue(Header[] headers) {
        if (headers == null || headers.length == 0) {
            return null;
        }
        return headers[0].getValue();
    }

    /**
     * Finds the {@code name=value} pair of a named cookie among
     * {@code Set-Cookie} headers.
     *
     * <p>Every header is split on {@code ';'} and each segment is trimmed
     * before being compared, so the cookie is found even if it is not in the
     * first header or is preceded by whitespace.
     *
     * @param headers
     *            the {@code Set-Cookie} headers of a response
     * @param namePrefix
     *            the beginning of the wanted segment (the cookie name)
     * @return the matching segment; the full value of the first header if no
     *         segment matches; {@code null} if there are no headers at all
     */
    private static String pickCookie(Header[] headers, String namePrefix) {
        if (headers == null || headers.length == 0) {
            return null;
        }
        for (Header header : headers) {
            String value = header.getValue();
            if (value == null) {
                continue;
            }
            for (String part : value.split(";")) {
                String segment = part.trim();
                if (segment.startsWith(namePrefix)) {
                    return segment;
                }
            }
        }
        // Same fallback as before: keep the raw first header when the named
        // cookie is absent.
        return headers[0].getValue();
    }

    /**
     * Downloads the login page and extracts the parameters needed to submit
     * the login form.
     *
     * @return a three-element array: the form's {@code action} attribute, the
     *         {@code name} of the password input, and the {@code value} of the
     *         input named {@code vk}. An element is {@code null} when it could
     *         not be found or the request failed.
     */
    private String[] getLoginParameters() {
        String location = "http://3g.sina.com.cn/prog/wapsite/sso/login.php?backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt=4&revalid=2&ns=1";
        HttpGet get = new HttpGet(location);
        setHeader(get);
        String retAction = null;
        String retPassword = null;
        String retVk = null;
        try {
            HttpResponse response = getHttpClient().execute(get);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // HtmlCleaner turns the page into a tree that XPath can query.
                TagNode tagNode = new HtmlCleaner().clean(entity.getContent(), "utf-8");
                retAction = firstOrNull(tagNode.evaluateXPath("//form/@action"));
                retPassword = firstOrNull(tagNode.evaluateXPath("//form//input[@type='password']/@name"));
                retVk = firstOrNull(tagNode.evaluateXPath("//form//input[@name='vk']/@value"));
                EntityUtils.consume(entity);
            }
        } catch (ClientProtocolException e) {
            System.out.println("获取登陆页面失败,location=" + location);
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("获取页面内容流失败");
            e.printStackTrace();
        } catch (XPatherException e) {
            System.out.println("解析登陆参数失败");
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        System.out.println("请求页面:" + location);
        System.out.println("提交地址:" + retAction);
        System.out.println("密码输入框名称:" + retPassword);
        System.out.println("vk值:" + retVk);
        return new String[] { retAction, retPassword, retVk };
    }

    /**
     * Submits the account name and password to the login form.
     *
     * @param postAction
     *            the form's {@code action} value, appended to the SSO base URL
     * @param userNameValue
     *            Weibo account name
     * @param passwordValue
     *            Weibo password
     * @param passwordKey
     *            {@code name} of the password input field
     * @param vkValue
     *            value of the {@code vk} form field
     * @return a two-element array: the value of the first {@code Set-Cookie}
     *         header and the value of the first {@code Location} header of
     *         the response. An element is {@code null} when the header is
     *         absent or the request failed.
     */
    private String[] submitPassword(String postAction, String userNameValue, String passwordValue, String passwordKey, String vkValue) {
        String url = SSO_BASE_URL + postAction;
        System.out.println("开始提交账号密码:" + url);
        HttpPost post = new HttpPost(url);
        setHeader(post);
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("mobile", userNameValue));
        nvps.add(new BasicNameValuePair(passwordKey, passwordValue));
        nvps.add(new BasicNameValuePair("remember", "on"));
        nvps.add(new BasicNameValuePair("vk", vkValue));
        nvps.add(new BasicNameValuePair("backURL", "http://weibo.cn/"));
        nvps.add(new BasicNameValuePair("backTitle", "新浪微博"));
        nvps.add(new BasicNameValuePair("submit", "登录"));
        String cookie = null;
        String location = null;
        try {
            // The one-argument constructor encodes with ISO-8859-1, which
            // cannot represent the Chinese values above, so name the charset.
            // NOTE(review): UTF-8 matches how the login page is decoded in
            // getLoginParameters(); confirm the server accepts it.
            post.setEntity(new UrlEncodedFormEntity(nvps, "UTF-8"));
            HttpResponse response = getHttpClient().execute(post);
            HttpEntity entity = response.getEntity();
            // getHeaders() yields an empty array when the header is absent,
            // so check for a value instead of indexing blindly.
            cookie = firstHeaderValue(response.getHeaders("Set-Cookie"));
            if (cookie != null) {
                System.out.println("获取到Cookie:" + cookie);
            }
            location = firstHeaderValue(response.getHeaders("Location"));
            if (location != null) {
                System.out.println("获取到跳转链接:" + location);
            }
            EntityUtils.consume(entity);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            post.releaseConnection();
        }
        return new String[] { cookie, location };
    }

    /**
     * Fetches the page the login response redirected to.
     *
     * @param redirectUrl
     *            address of the redirect page
     * @return a two-element array: the {@code gsid_CTandWM} cookie pair taken
     *         from the {@code Set-Cookie} headers (or the raw first header if
     *         that cookie is not present), and the {@code href} of the first
     *         {@code //div/a} link in the page. An element is {@code null}
     *         when it could not be found or the request failed.
     */
    private String[] getRedirectPageInfo(String redirectUrl) {
        System.out.println("开始获取跳转链接页面");
        HttpGet get = new HttpGet(redirectUrl);
        setHeader(get);
        String cookie = null;
        String clickHref = null;
        try {
            HttpResponse redirectResponse = getHttpClient().execute(get);
            cookie = pickCookie(redirectResponse.getHeaders("Set-Cookie"), "gsid_CTandWM");
            HttpEntity entity = redirectResponse.getEntity();
            if (entity != null) {
                TagNode tagNode = new HtmlCleaner().clean(entity.getContent(), "utf-8");
                clickHref = firstOrNull(tagNode.evaluateXPath("//div/a/@href"));
                if (clickHref != null) {
                    System.out.println("获取到跳转链接地址:" + clickHref);
                }
                EntityUtils.consume(entity);
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XPatherException e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        return new String[] { cookie, clickHref };
    }

    /**
     * Requests the link found on the redirect page, sending the cookie
     * obtained so far.
     *
     * @param cookie
     *            cookie obtained from the previous request
     * @param redirectUrl
     *            address to request
     * @return the {@code _WEIBO_UID} cookie pair taken from the response's
     *         {@code Set-Cookie} headers (or the raw first header if that
     *         cookie is not present); the {@code cookie} argument unchanged
     *         when the response sets no cookie or the request failed
     */
    private String doRedirection(String cookie, String redirectUrl) {
        HttpGet get = new HttpGet(redirectUrl);
        setHeader(get);
        get.setHeader("Cookie", cookie);
        try {
            HttpResponse response = getHttpClient().execute(get);
            HttpEntity entity = response.getEntity();
            Header[] setCookies = response.getHeaders("Set-Cookie");
            if (setCookies != null && setCookies.length > 0) {
                System.out.println("跳转页面取回的cookie:" + setCookies[0].getValue());
                cookie = pickCookie(setCookies, "_WEIBO_UID");
            }
            EntityUtils.consume(entity);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the connection like every other request in this class.
            get.releaseConnection();
        }
        return cookie;
    }

    /**
     * Logs in with the given account and returns the resulting cookie string.
     *
     * @param userNameValue
     *            Weibo account name
     * @param passwordValue
     *            Weibo password
     * @return the cookie from the redirect page and the cookie from the final
     *         redirect, joined with {@code ';'}
     * @throws IllegalStateException
     *             if a step of the login flow did not yield the data the next
     *             step needs (form parameters, redirect address, or cookie)
     */
    public String doLogin(String userNameValue, String passwordValue) {
        // Step 1: read the form parameters from the login page.
        String[] loginParameters = getLoginParameters();
        String postAction = loginParameters[0];
        String passwordKey = loginParameters[1];
        String vkValue = loginParameters[2];
        if (postAction == null || passwordKey == null) {
            throw new IllegalStateException("Could not read the login form (action=" + postAction + ", password field=" + passwordKey + ")");
        }

        // Step 2: submit the credentials; only the redirect address is used
        // from this response.
        String[] cookieRedirectLocation = submitPassword(postAction, userNameValue, passwordValue, passwordKey, vkValue);
        String redirectUrl = cookieRedirectLocation[1];
        if (redirectUrl == null) {
            throw new IllegalStateException("Login request returned no Location header");
        }

        // Step 3: load the redirect page for its cookie and onward link.
        String[] redirectInfo = getRedirectPageInfo(redirectUrl);
        String cookie = redirectInfo[0];
        redirectUrl = redirectInfo[1];
        if (cookie == null || redirectUrl == null) {
            throw new IllegalStateException("Redirect page did not provide a cookie and a link (cookie=" + cookie + ", link=" + redirectUrl + ")");
        }

        System.out.println("准备跳转");
        try {
            TimeUnit.SECONDS.sleep(3);
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Keep the interrupt visible to callers instead of swallowing it.
            Thread.currentThread().interrupt();
        }
        System.out.println("开始跳转");

        // Step 4: follow the link and combine both cookies.
        String cookieOfRedirect = doRedirection(cookie, redirectUrl);
        StringBuilder sb = new StringBuilder(cookie);
        sb.append(';').append(cookieOfRedirect);
        System.out.println("登陆成功,最终cookie为:" + sb.toString());
        return sb.toString();
    }

    /**
     * Logs in with the account name and password configured in
     * {@link Constants}.
     *
     * @return the login cookie string, as returned by
     *         {@link #doLogin(String, String)}
     */
    public String doLogin() {
        return this.doLogin(Constants.LOGIN_USERNAME, Constants.LOGIN_PASSWORD);
    }
}
3、测试
步骤:先登录获取cookie,再把cookie填到请求头里,抓取页面内容。
package cn.mingyuan.weibo.test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;
import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.login.Login;
/**
 * Manual test: logs in, then fetches one Weibo page with the login cookie and
 * prints its content.
 *
 * @author mingyuan
 */
public class WeiboTest extends RequestCommons {

    /**
     * Prints a stream to standard output line by line, decoding it as UTF-8.
     *
     * <p>The stream is intentionally not closed here; the caller releases it
     * when it consumes the response entity.
     *
     * @param in
     *            the stream to print
     */
    private void printContent(InputStream in) {
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } catch (IOException e) {
            // Only I/O failures are expected here; programming errors should
            // surface instead of being swallowed.
            e.printStackTrace();
        }
    }

    /**
     * Fetches a fixed Weibo page using the given cookie and prints the page
     * content.
     *
     * @param finalCookie
     *            cookie string to send in the {@code Cookie} header
     * @throws IllegalArgumentException
     *             if {@code finalCookie} is {@code null}
     */
    private void test(String finalCookie) {
        if (finalCookie == null) {
            throw new IllegalArgumentException("finalCookie must not be null");
        }
        HttpGet get = new HttpGet("http://weibo.cn/irlucene");
        setHeader(get);
        get.setHeader("Cookie", finalCookie);
        try {
            HttpResponse response = httpclient.execute(get);
            HttpEntity entity = response.getEntity();
            // A response is not guaranteed to carry a body.
            if (entity != null) {
                printContent(entity.getContent());
                EntityUtils.consume(entity);
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
    }

    /**
     * Logs in with placeholder credentials, prints the resulting cookie and
     * runs the page-fetch test with it.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        Login login = new Login();
        String userNameValue = "username";
        String passwordValue = "password";
        String cookie = login.doLogin(userNameValue, passwordValue);
        System.out.println("final Cookie=" + cookie);
        new WeiboTest().test(cookie);
    }
}