java爬虫绕过登录页面

最新推荐文章于 2024-08-16 10:35:54 发布

一身气质范

最新推荐文章于 2024-08-16 10:35:54 发布

阅读量4k

点赞数 3

分类专栏：爬虫 java

爬虫同时被 2 个专栏收录

3 篇文章 0 订阅

订阅专栏

java

3 篇文章 0 订阅

订阅专栏

转自：http://blog.csdn.net/jiangsanfeng1111/article/details/51282966

当我们在写爬虫的时候，往往会遇到很多反爬的问题。

比如：在登录页面设置验证码、扫描二维码登录、滑动鼠标登录、手机短信验证码登录等等。这里介绍一种个人已经实现的方法——“绕过”登录页面。这里的“绕过”并不是说真的可以跳过登录认证——除非这个系统本身存在严重的漏洞。这里说的绕过登录，是指先手动登录一次并记住 cookie 信息，之后爬虫发请求时带上这个 cookie，就可以直接跳过登录页面（cookie 过期后需要重新登录获取）。废话不多说，直接上代码：

[java] view plain copy

package com.xiaojiang.spidertest;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.HttpCookie;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.xiaojiang.exception.DataTaskException;
import com.xiaojiang.httpclient.HttpUserAgent;
public class CookieTest {
public static void main(String[] args) throws Exception, IOException {
DefaultHttpClient client = new DefaultHttpClient();
HttpResponse response = null;
String newUrl = "http://www.dajie.com/home";
HttpGet httpGet = new HttpGet(newUrl);
//在页面控制台执行document.cookie
String cookie = "DJ_RF=empty; DJ_EU=http%3A%2F%2Fwww.dajie.com%2Fhome; DJ_UVID=MTQ2MTkwNzk3NDU4MTg1NjQ2; dj_cap=0564c054acc1ce12402998471ae0af54; regSucceedType=email; dj_auth_v3=MrZrP3TGNRNXCNiOpQY7Ggscf4kjfEEsJzFPDzu3iwi5XtG9tS3Sw-WgChC2DVKL; uchome_loginuser=35375099; USER_ACTION=\"request^AProfessional^AREG^Aregm:crt0^A-\"; send_verify_mail=961254858%40qq.com; login_email=961254858%40qq.com; inbound_tag=true";
httpGet.addHeader(new BasicHeader("Cookie", cookie));
httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
httpGet.setHeader("Accept-Encoding", "gzip, deflate");
httpGet.setHeader("User-Agent", HttpUserAgent.get());
response = client.execute(httpGet);
String html = formatResponse(response);
Document doc = Jsoup.parse(html);
String text = doc.select(".feed-header").get(0).text();
System.out.println(text);
httpGet.releaseConnection();
}
private static String formatResponse(HttpResponse response) throws Exception {
ByteArrayInputStream bis = null;
Header contentEncoding = response.getFirstHeader("Content-Encoding");
if(contentEncoding == null){
return EntityUtils.toString(response.getEntity(),"UTF-8");
} else {
String charset = "utf-8";
Header contentType = response.getFirstHeader("Content-Type");
if(contentType != null){
String contentTypeStr = contentType.getValue();
if(contentTypeStr != null && !"".equals(contentTypeStr)){
charset = contentTypeStr.substring(contentTypeStr.indexOf("=") + 1,contentTypeStr.length());
}
}
String contentEncodingType = contentEncoding.getValue();
if(contentEncodingType.equalsIgnoreCase("gzip")){
if(response.toString().contains("soufun"))
charset = "gb2312";
byte[] bytes = IOUtils.toByteArray(response.getEntity().getContent());
bis = new ByteArrayInputStream(bytes);
return uncompress(bis ,charset);
}
}
return null;
}
/**
* GZIP解压
*/
private static String uncompress(ByteArrayInputStream in, String charset) {
ByteArrayOutputStream out = new ByteArrayOutputStream();
try {
GZIPInputStream gunzip = new GZIPInputStream(in);
byte[] buffer = new byte[256];
int n;
while((n = gunzip.read(buffer)) >=0 ){
out.write(buffer, 0, n);
}
return out.toString(charset);
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
}