目录
模拟登陆的原因
很多网站的数据无法直接从服务器获取,需要先输入用户名和密码登录之后才能看到。以人人网(http://www.renren.com/)为例,如图所示,打开后看到的是一个登录界面,必须输入用户名和密码才能看到返回的数据。
登录之后,便可以通过爬虫把网站返回的 HTML 或 JSON 数据抓取下来并进行解析。
如何模拟登陆
模拟登录之前,需要先抓包,获取数据请求的地址、相关参数以及 Cookie 信息,因此抓包这一步很重要。如果不会抓包,请先阅读前面的博客学习抓包。本文将采用 Java 的方法获取响应数据。
实战(demo)
package renren.renren;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
/**
 * Demo of a simulated login to renren.com with Apache HttpClient.
 *
 * <p>Flow: POST the login form to {@code PLogin.do}, read the {@code Location}
 * header from the login response, then GET that URL with the same client and
 * print the returned body.
 *
 * <p>An instance is single-use: {@link #printText()} shuts the underlying
 * connection manager down before returning.
 */
public class RenRen {
    /* Fill in your own user name and password here. */
    private static final String USER_NAME = "";
    private static final String PASSWORD = "";

    /* Page requested as the post-login target (sent as the "origURL" form field). */
    private static final String REDIRECT_URL =
            "http://www.renren.com/465530468/profile?v=info_timeline";

    // Don't change the following URL
    private static final String RENREN_LOGIN_URL = "http://www.renren.com/PLogin.do";

    // Response of the login POST; stays null until login() has executed the request.
    private HttpResponse response;

    // The same client instance is used for the login POST and the follow-up GET
    // so that both requests belong to one session.
    private final DefaultHttpClient httpclient = new DefaultHttpClient();

    /**
     * Posts the login form (the parameters observed via packet capture).
     *
     * @return {@code true} if the request was executed and a response stored,
     *         {@code false} if building or executing the request threw
     */
    private boolean login() {
        HttpPost httpost = new HttpPost(RENREN_LOGIN_URL);

        // All the parameters posted to the web site.
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("origURL", REDIRECT_URL));
        nvps.add(new BasicNameValuePair("domain", "renren.com"));
        nvps.add(new BasicNameValuePair("isplogin", "true"));
        nvps.add(new BasicNameValuePair("email", USER_NAME));
        nvps.add(new BasicNameValuePair("password", PASSWORD));

        try {
            httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
            response = httpclient.execute(httpost);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            // Only the response headers are needed afterwards, so the request
            // is aborted instead of reading the body.
            httpost.abort();
        }
    }

    /**
     * Reads the redirect target from the stored login response.
     *
     * @return the value of the first {@code Location} header, or {@code null}
     *         if there is no stored response or it has no such header
     */
    private String getRedirectLocation() {
        if (response == null) {
            return null;
        }
        Header locationHeader = response.getFirstHeader("Location");
        return locationHeader == null ? null : locationHeader.getValue();
    }

    /**
     * Fetches the given URL with the shared client and returns the body as text.
     *
     * @param redirectLocation URL to request
     * @return the response body, or {@code null} if the request could not be
     *         built or executed (the exception is printed to stderr)
     */
    private String getText(String redirectLocation) {
        HttpGet httpget = null;
        try {
            // Constructed inside the try so that a malformed URL is reported
            // like any other request failure instead of escaping as an
            // unchecked exception.
            httpget = new HttpGet(redirectLocation);
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            return httpclient.execute(httpget, responseHandler);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (httpget != null) {
                httpget.abort();
            }
        }
    }

    /**
     * Logs in and, if the login response carries a redirect, prints the HTML
     * of the redirect target to standard output.
     *
     * <p>The connection manager is shut down on every path, including a failed
     * login or a missing redirect.
     */
    public void printText() {
        try {
            if (!login()) {
                return;
            }
            String redirectLocation = getRedirectLocation();
            if (redirectLocation == null) {
                System.err.println(
                        "Login response has no Location header; nothing to fetch.");
                return;
            }
            String text = getText(redirectLocation);
            if (text != null) {
                System.out.println(text);
            }
        } finally {
            httpclient.getConnectionManager().shutdown();
        }
    }

    /* Entry point. */
    public static void main(String[] args) {
        RenRen renRen = new RenRen();
        renRen.printText();
    }
}
这里说明一下为什么使用“http://www.renren.com/PLogin.do”这个地址(评论区有很多人问),见如下截图:
如下图所示,程序运行后便获得了 HTML 文件,接下来只要对这个 HTML 文件进行解析就行了。
有什么不明白的,请发邮件至1563178220@qq.com 合肥工业大学管理学院 钱洋