Java 爬虫模拟知乎登陆

Java 爬虫模拟知乎登陆

一、知乎登录页面分析

1、我用的是 Chrome 浏览器:按 F12 打开开发者工具,切换到 Network 面板,然后登录一次,观察登录过程中请求了哪些 URL,以及 POST 提交了哪些参数。

上传的参数如下(与下文代码中提交的表单字段一致):_xsrf、email、password、captcha。

2、模拟登录知乎分为以下几步:

     1)获取_xsrf

     2)获取验证码 captcha

     3)登录知乎,拿到cookie

    4)使用cookie进一步访问登陆后的其他页面

二、Java 模拟知乎登录

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import javax.imageio.ImageIO;
import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * Scripted Zhihu login built on Apache HttpClient.
 *
 * <p>The flow is: fetch the index page to extract the {@code _xsrf} token, download the
 * captcha image and ask the user to type it on the console, POST the login form, then
 * request further pages with the same client.
 *
 * <p>All requests go through the single {@link #httpClient} instance, so cookies the
 * server sets during login are kept by that client and sent on later requests.
 */
public class LogIn {

	private static final String DEFAULT_INDEX_URL = "https://www.zhihu.com/";
	private static final String DEFAULT_LOGIN_URL = "https://www.zhihu.com/login/email";
	private static final String DEFAULT_CAPTCHA_URL = "https://www.zhihu.com/captcha.gif?type=login";

	/** Markup that immediately precedes the token value in the index page. */
	private static final String XSRF_PREFIX = "<input type=\"hidden\" name=\"_xsrf\" value=\"";
	/** Markup that immediately follows the token value in the index page. */
	private static final String XSRF_SUFFIX = "\"/>";

	private final String indexURL;
	private final String loginURL;
	private final String captchaURL;
	protected RequestConfig requestConfig = null;
	protected CloseableHttpClient httpClient = null;

	/**
	 * Creates a login helper that talks to the given endpoints.
	 *
	 * @param indexURL page whose HTML contains the hidden {@code _xsrf} input
	 * @param loginURL endpoint the login form is POSTed to
	 * @param captchaURL endpoint that returns the captcha image
	 */
	public LogIn(String indexURL, String loginURL, String captchaURL) {
		super();
		this.indexURL = indexURL;
		this.loginURL = loginURL;
		this.captchaURL = captchaURL;
		requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build();
		httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
	}

	/** Creates a login helper that uses the default Zhihu endpoints. */
	public LogIn() {
		this(DEFAULT_INDEX_URL, DEFAULT_LOGIN_URL, DEFAULT_CAPTCHA_URL);
	}

	/**
	 * Fetches the index page and extracts the value of the hidden {@code _xsrf} input.
	 *
	 * @return the token value
	 * @throws ClientProtocolException on an HTTP protocol error
	 * @throws IOException on an I/O failure, or when the page does not contain the token
	 */
	public String getXSRF() throws ClientProtocolException, IOException {
		HttpGet get = new HttpGet(indexURL);
		get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
		get.setHeader("Accept", "*/*");
		// "br" is deliberately not advertised: the client cannot decode brotli, and a
		// brotli-compressed body would make the token search below fail.
		get.setHeader("Accept-Encoding", "gzip,deflate");
		get.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
		get.setHeader("Origin", "https://www.zhihu.com");
		get.setHeader("Referer", "https://www.zhihu.com/");

		String responseHtml;
		CloseableHttpResponse response = httpClient.execute(get);
		try {
			// UTF-8 is only the fallback used when the response declares no charset.
			responseHtml = EntityUtils.toString(response.getEntity(), Consts.UTF_8);
		} finally {
			response.close();
		}

		int start = responseHtml.indexOf(XSRF_PREFIX);
		if (start < 0) {
			throw new IOException("No _xsrf hidden input found in the page at " + indexURL);
		}
		start += XSRF_PREFIX.length();
		int end = responseHtml.indexOf(XSRF_SUFFIX, start);
		if (end < 0) {
			throw new IOException("Unterminated _xsrf hidden input in the page at " + indexURL);
		}
		return responseHtml.substring(start, end);
	}

	/**
	 * Downloads the captcha image, saves it to {@code src/QQ.jpg}, and reads the text the
	 * user types on standard input after looking at that file.
	 *
	 * @return the line entered by the user
	 * @throws ClientProtocolException on an HTTP protocol error
	 * @throws IOException on an I/O failure, or when the image cannot be decoded or saved
	 */
	public String getCaptcha() throws ClientProtocolException, IOException {
		CloseableHttpResponse response = httpClient.execute(new HttpGet(captchaURL));
		try {
			InputStream input = response.getEntity().getContent();
			try {
				BufferedImage image = ImageIO.read(input);
				if (image == null) {
					// ImageIO.read returns null when no registered reader understands the data.
					throw new IOException("Captcha response from " + captchaURL + " is not a decodable image");
				}
				File target = new File("src/QQ.jpg");
				if (!ImageIO.write(image, "jpg", target)) {
					// ImageIO.write returns false when no writer accepts this image type.
					throw new IOException("No image writer could save the captcha to " + target);
				}
			} finally {
				input.close();
			}
		} finally {
			response.close();
		}

		// The Scanner is intentionally left open: closing it would close System.in for
		// the whole process and break any later console read.
		Scanner console = new Scanner(System.in);
		System.out.print("captcha:");
		return console.nextLine();
	}

	/**
	 * Submits the login form and returns the raw response (status line, headers, body).
	 *
	 * <p>The caller is responsible for consuming the returned response's entity.
	 *
	 * @param email account e-mail
	 * @param password account password
	 * @return the response to the login POST
	 * @throws ClientProtocolException on an HTTP protocol error
	 * @throws IOException on an I/O failure while fetching the token, the captcha or logging in
	 */
	public HttpResponse logIn(String email, String password) throws ClientProtocolException, IOException {
		List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
		valuePairs.add(new BasicNameValuePair("_xsrf", getXSRF()));
		valuePairs.add(new BasicNameValuePair("email", email));
		valuePairs.add(new BasicNameValuePair("password", password));
		valuePairs.add(new BasicNameValuePair("captcha", getCaptcha()));
		UrlEncodedFormEntity entity = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8);
		HttpPost post = new HttpPost(loginURL);
		post.setEntity(entity);
		return httpClient.execute(post);
	}

	/**
	 * Fetches {@code url} with the logged-in client, prints the body and returns it.
	 *
	 * <p>Session cookies are supplied by the client itself, which recorded them when it
	 * executed the login request. {@code Set-Cookie} is a response-only header, so it is
	 * not copied onto the request.
	 *
	 * @param httpResponse the login response; kept for interface compatibility and not read
	 * @param url the page to fetch
	 * @return the body of the fetched page
	 * @throws ClientProtocolException on an HTTP protocol error
	 * @throws IOException on an I/O failure
	 */
	public String visitURL(HttpResponse httpResponse, String url) throws ClientProtocolException, IOException {
		CloseableHttpResponse pageResponse = httpClient.execute(new HttpGet(url));
		try {
			String content = EntityUtils.toString(pageResponse.getEntity(), Consts.UTF_8);
			System.out.println(content);
			return content;
		} finally {
			pageResponse.close();
		}
	}

	/**
	 * Logs in with placeholder credentials, dumps the login response, then fetches a page
	 * that requires the session.
	 *
	 * @param args unused
	 * @throws IOException on an I/O failure
	 * @throws ClientProtocolException on an HTTP protocol error
	 */
	public static void main(String[] args) throws ClientProtocolException, IOException {
		LogIn login = new LogIn();
		HttpResponse httpResponse = login.logIn("xxxxxx@xxx.com", "xxxxxxx");
		StatusLine responseState = httpResponse.getStatusLine();
		System.out.println(responseState.toString());
		Header[] headers = httpResponse.getAllHeaders();
		for (int i = 0; i < headers.length; i++) {
			System.out.println(headers[i].getName() + ": " + headers[i].getValue());
		}
		HttpEntity httpEntity = httpResponse.getEntity();
		// Reading the body also releases the connection back to the client.
		String responseString = EntityUtils.toString(httpEntity, Consts.UTF_8);
		System.out.println(responseString);
		// Body seen on a successful login:
		//   {"r":0, "msg": "\u767b\u5f55\u6210\u529f"}

		login.visitURL(httpResponse, "http://www.zhihu.com/question/following");
	}

}

运行效果



  • 0
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值