爬虫模拟登录获取需要的数据

最新推荐文章于 2024-05-02 12:00:51 发布

贤和兄

最新推荐文章于 2024-05-02 12:00:51 发布

阅读量1.4k

点赞数

分类专栏：我的经验总结文章标签：爬虫模拟登陆

本文链接：https://blog.csdn.net/u010598111/article/details/79175790

版权

我的经验总结专栏收录该内容

107 篇文章 0 订阅

订阅专栏

爬虫模拟登录的原理，就是利用登录后得到的同一份 cookie（或 session）去访问需要获取数据的链接，然后解析返回的数据为我所用。现在这类框架有很多，比如 WebMagic 等开源框架。本人在开发时也参考过网上的很多方法，例如模拟谷歌浏览器、调用浏览器驱动爬取数据等，网上的许多代码都大同小异。值得一提的是，有些网站虽然做了反爬虫处理，爬取数据有一定的难度，但是只要研究透，一样可以爬；只要你够细心、有信心，没啥不能爬的。话不多说，上代码：

1、准备工具：使用 Fiddler.exe 监听所爬网站的所有请求，找出需要的链接

2、准备 jar 包：jsoup 用于解析 HTML 代码，其他均为辅助 jar 包

3、直接运行代码即可

准备工具类 HttpUtils.java

package util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * Small HTTP helper built on Apache HttpClient.
 *
 * <p>Every request is executed with the same client and the same context, so state
 * carried by the context (such as cookies received from an earlier response) is reused
 * by later requests.
 */
public class HttpUtils {
    /** Proxy that every request is routed through. */
    public static HttpHost proxy = new HttpHost("127.0.0.1", 8080);
    /** Shared client, so that all requests use the same cookies. */
    public static CloseableHttpClient httpClient = HttpClients.custom().setProxy(proxy).build();
    /** Shared execution context, so that all requests see the same client-side state. */
    public static HttpClientContext context = new HttpClientContext();

    /**
     * Sends a GET request and returns the response body as text.
     *
     * @param url address to request
     * @return the response body, or {@code null} if the request failed or the response
     *         carried no body
     * @author pangxianhe
     * @date 2018-01-26
     */
    public static String sendGet(String url) {
        CloseableHttpResponse response = null;
        String content = null;
        try {
            HttpGet get = new HttpGet(url);
            response = httpClient.execute(get, context);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Fall back to UTF-8 (same as sendPost) when the response does not declare
                // a charset; the library default would otherwise be ISO-8859-1.
                content = EntityUtils.toString(entity, "utf-8");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on success as well as on failure so the connection is always released.
            closeQuietly(response);
        }
        return content;
    }

    /**
     * Sends a POST request with URL-encoded form fields and returns the response body as text.
     *
     * @param url     address to request
     * @param map     form fields to send; {@code null} or empty sends a request without a body
     * @param charset charset used to encode the form fields
     * @return the response body, or {@code null} if the request failed or the response
     *         carried no body
     * @author pangxianhe
     * @date 2018-01-26
     */
    public static String sendPost(String url, Map<String, String> map, String charset) {
        CloseableHttpResponse response = null;
        String content = null;
        try {
            HttpPost httpPost = new HttpPost(url);
            List<NameValuePair> list = new ArrayList<NameValuePair>();
            if (map != null) {
                for (Map.Entry<String, String> elem : map.entrySet()) {
                    list.add(new BasicNameValuePair(elem.getKey(), elem.getValue()));
                }
            }
            if (list.size() > 0) {
                httpPost.setEntity(new UrlEncodedFormEntity(list, charset));
            }
            // The shared context carries the client-side state along with the request.
            response = httpClient.execute(httpPost, context);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                content = EntityUtils.toString(entity, "utf-8");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(response);
        }
        return content;
    }

    /** Closes the response if there is one, reporting (but not propagating) a close failure. */
    private static void closeQuietly(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

调用main方法

package com;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import net.sf.json.JSONObject;
import util.HttpUtils;

/**
 * Demo that logs in to a site protected by a captcha and then requests pages
 * through the same {@link HttpUtils} client and context.
 */
public class SimulateLoginPolicy1 {
    static boolean result = false;
    static String token = "";

    public static void main(String[] args) throws Exception {
        // Request the login page and simulate the login.
        boolean loginFlag = loginCsdnPager("http://baidu.com", "123", "456");
        if (loginFlag) {
            // Once logged in, pages behind the login can be requested.
            String htmls = HttpUtils.sendGet("http://baidu.com/xx/Index");
            token = preaseHtml(htmls);
            Map<String, String> map = new HashMap<String, String>();
            map.put("CityCode", "11");
            // Anti-crawler handling: this token is taken from the page and is not the
            // same as the one stored in the cookies.
            map.put("__RequestVerificationToken", token);

            String dd = HttpUtils.sendPost("http://baidu.com/xx/CheckRenewal", map, "utf-8");
            System.out.println(dd);
        }
    }

    /**
     * Extracts the verification token from a page: the {@code value} attribute of the
     * first {@code input} found in (or being) an element with class {@code **inline}.
     *
     * @param htmls HTML of the page that contains the token
     * @return the token value
     * @throws IllegalStateException if there is no HTML, no matching input, or the input
     *                               has no {@code value} attribute
     * @author pangxianhe
     * @date 2018-01-26
     */
    private static String preaseHtml(String htmls) {
        if (htmls == null) {
            throw new IllegalStateException("No HTML to parse: the page request returned nothing");
        }
        Document doc = Jsoup.parse(htmls);
        // NOTE(review): "**inline" is kept exactly as published; confirm the real class name.
        Elements inputs = doc.getElementsByClass("**inline").select("input");
        if (inputs.isEmpty()) {
            throw new IllegalStateException("No input element found for the verification token");
        }
        if (!inputs.get(0).hasAttr("value")) {
            throw new IllegalStateException("Token input element has no value attribute");
        }
        // Read the attribute through the parser instead of splitting the element's HTML
        // text, which breaks when "value" is not the last attribute.
        return inputs.get(0).attr("value");
    }

    /**
     * Requests the login page, downloads the captcha image, asks for the captcha on
     * standard input and then posts the login form.
     *
     * @param url      login page address; the form is posted to the same address
     * @param UserName value sent as the {@code UserName} field
     * @param Password value sent as the {@code Password} field
     * @return {@code true} only when the response reports {@code BusinessStatus} 1;
     *         {@code false} for any other answer or any failure
     * @throws Exception declared for callers; failures inside are reported and yield {@code false}
     * @author pangxianhe
     * @date 2018-01-26
     */
    private static boolean loginCsdnPager(String url, String UserName, String Password) throws Exception {
        try {
            // Fetch the login page first; without it there is nothing to log in to.
            String html = HttpUtils.sendGet(url);
            if (html == null) {
                return false;
            }
            // Download the captcha image so it can be read and typed in.
            getImgID();
            String captcha_solution = "";
            System.out.println("请输入验证码：");
            BufferedReader buff = new BufferedReader(new InputStreamReader(System.in));
            try {
                captcha_solution = buff.readLine();
            } catch (IOException e) {
                e.printStackTrace();
            }
            // Build the login form.
            Map<String, String> map = new HashMap<String, String>();
            map.put("UserName", UserName);
            map.put("Password", Password);
            map.put("yzm", captcha_solution);
            String ret = HttpUtils.sendPost(url, map, "utf-8");
            if (ret == null) {
                return false;
            }
            // Presumably the body is JSON wrapped in a quoted, escaped string (TODO confirm
            // against the real site): drop the outer quotes, then the escape sequences.
            int lastQuote = ret.lastIndexOf("\"");
            if (lastQuote < 1) {
                return false;
            }
            ret = ret.substring(1, lastQuote).replaceAll ("\\\\r\\\\n", "").replaceAll(" ", "").replaceAll("/", "").replaceAll("\\\\", "");
            if ("code_error".equals(ret)) {
                return false;
            }
            JSONObject json = JSONObject.fromObject(ret);
            // Compare through String.valueOf so a numeric status is accepted as well as a string.
            return "1".equals(String.valueOf(json.get("BusinessStatus")));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Downloads the captcha image and saves it as {@code secretCode.png} under {@code c://}.
     *
     * @throws Exception if the response has no body or the image cannot be saved for a
     *                   reason other than an I/O error (I/O errors are only reported)
     * @author pangxianhe
     * @date 2018-01-26
     */
    private static void getImgID() throws Exception {
        // The current time is appended as a query parameter.
        String src = "http://baidu.com/xx/xx?time=" + System.currentTimeMillis();
        HttpGet httpGet = new HttpGet(src);
        try {
            HttpResponse response = HttpUtils.httpClient.execute(httpGet, HttpUtils.context);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IllegalStateException("Captcha response has no body");
            }
            // Save the captcha image; getSecret closes the content stream.
            getSecret(entity.getContent(), "secretCode.png", "c://");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes a stream to a file on disk and closes the stream.
     *
     * @param is       data to write; always closed before this method returns
     * @param filename name of the file to create
     * @param savePath directory to create the file in; created if it does not exist
     * @throws Exception if {@code is} is {@code null} or the data cannot be read or written
     * @author pangxianhe
     * @date 2018-01-26
     */
    public static void getSecret(InputStream is, String filename, String savePath) throws Exception {
        if (is == null) {
            throw new IllegalArgumentException("is must not be null");
        }
        try {
            File dir = new File(savePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            // Let File join directory and name so the separator suits the platform.
            OutputStream os = new FileOutputStream(new File(dir, filename));
            try {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    os.write(buffer, 0, len);
                }
            } finally {
                os.close();
            }
        } finally {
            // Close the input even when writing fails.
            is.close();
        }
    }
}

贤和兄

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬虫模拟登陆获取需要的数据

爬虫的原理就是利用同一样的cookie或者是session去访问需要获取数据的链接，然后解析数据为我所用，现在这种框架有很多，比如WebMagic等开源的框架，本人在开发时候也参考过网上很多方法，有模拟谷歌浏览器，调用浏览器驱动爬取数据等，看了网上许多代码都大同小异；值得一提的是，有些网站虽然做了反爬虫的处理，爬数据有一定的难度，但是只要研究透，一样可以爬，只要你够细心，有信心，没啥不能爬的。话不
复制链接

扫一扫