java爬虫

以前用C#写过爬虫,那是我第一次写,当时几乎是照着别人的代码写的,所以慢慢才懂了一点爬虫技术。最近需要用Java写爬虫,发现Java写爬虫也挺好用的。都说Python好,我还没用过,以后慢慢学^-^。好啦,开始正题:

我写爬虫时习惯用的抓包工具是Fiddler,感觉挺好用的。Fiddler的功能据说比较强大,不过我只了解一点。用Java写爬虫时,HttpClient和Jsoup搭配起来非常好用,堪称绝配。下面贴上代码:

package com.pachong;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;


public class T3 {
public static void main(String[] args) {
   String user_name = "xxxx";    //用户名
    String password = "xxx";   //密码      
    // 全局请求设置
    RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build();
    // 创建cookie store的本地实例
    CookieStore cookieStore = new BasicCookieStore();
    // 创建HttpClient上下文
    HttpClientContext context = HttpClientContext.create();
    context.setCookieStore(cookieStore);
 
    // 创建一个HttpClient
CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(globalConfig).setConnectionTimeToLive(50L, TimeUnit.MILLISECONDS)
        .setDefaultCookieStore(cookieStore).build();
 
    CloseableHttpResponse res = null;
    // 创建本地的HTTP内容
    try {
      try {
        // 创建一个get请求用来获取必要的Cookie,如_xsrf信息 (不好意思,我爬了汇博^-^)
        HttpGet get = new HttpGet("http://person.huibo.com/login");
 
        res = httpClient.execute(get, context);         
        // 获取常用Cookie,包括_xsrf信息
        System.out.println("第一步 访问http://person.huibo.com/login 得到的结果");
       // System.out.println(res);
        HttpEntity entity = res.getEntity();
        String strResult = EntityUtils.toString(entity,"UTF-8");  
           // System.out.println(strResult);  
            EntityUtils.consume(entity); 
        res.close();
        Document doc = Jsoup.parse(strResult);
        Element loginSeed = doc.getElementById("login_seed");
        System.out.println("loginSeed =" +loginSeed.val());
 
//         // 构造post数据
        List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
        valuePairs.add(new BasicNameValuePair("user_name", user_name));
        valuePairs.add(new BasicNameValuePair("password", password));
        valuePairs.add(new BasicNameValuePair("catcha", ""));
        valuePairs.add(new BasicNameValuePair("seed", loginSeed.val()));
        valuePairs.add(new BasicNameValuePair("chkSave", "false"));
        UrlEncodedFormEntity entityUrl = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8);
        
       // entityUrl.setContentType("application/x-www-form-urlencoded");
//  
//         // 创建一个post请求
        HttpPost post = new HttpPost("http://person.huibo.com/login/LoginDo");
//         // 注入post数据
        post.setEntity(entityUrl);
        res = httpClient.execute(post, context);
//  
//         for (Cookie c : cookieStore.getCookies()) {
//           System.out.println(c.getName() + ": " + c.getValue());
//         }
//         res.close();
//  
        System.out.println("登陆成功后,新的Cookie:===============");
        for (Cookie c : context.getCookieStore().getCookies()) {
          System.out.println(c.getName() + ": " + c.getValue());
        }
        res.close();
//  
        // 构造一个新的get请求,用来测试登录是否成功
        HttpGet newGet = new HttpGet("http://person.huibo.com/");
        res = httpClient.execute(newGet, context);
        String content = EntityUtils.toString(res.getEntity(),"UTF-8");
        System.out.println("登陆成功后访问的页面===============");
        //System.out.println(content);
        String patenName= "<a\\shref=\\\"http:\\/\\/person.huibo.com\\/\\\">(.*?)</a>";
String Name = reg(content,patenName,1);
        System.out.println(Name);
        res.close();
        
        HttpGet newGetMyResume = new HttpGet("http://person.huibo.com/resume/manage/");
        res = httpClient.execute(newGetMyResume, context);
        String contentMyResume = EntityUtils.toString(res.getEntity(),"UTF-8");
        System.out.println("我的简历页面===============");
        //System.out.println(contentMyResume);
        res.close();
        
        //匹配找到 resume_id
        String paten= "href=\\\"\\/resume\\/update\\/resume_id-(.*?)\\\"\\starget=";
String resumeId = reg(contentMyResume,paten,1);
        System.out.println(resumeId);
        
        System.out.println("下载world文档页面===============");
        HttpGet newGetDown = new HttpGet("http://person.huibo.com/resume/worddown/resumeid-"+resumeId);
        res = httpClient.execute(newGetDown, context);
        
        FileOutputStream out = (new FileOutputStream(new java.io.File("D:\\huibo\\"+Name+"简历.doc")));
        InputStream in = res.getEntity().getContent();
        byte[] buff = new byte[1024];//创建字节缓冲大小
            int bytesRead;
            while (-1 != (bytesRead = in.read(buff, 0, buff.length))) {
                out.write(buff, 0, bytesRead);
            }
        out.close();
        res.close();
        
 
      } finally {
        httpClient.close();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

public static String reg(String string, String reg, int i) {
        String s = "";
        Pattern pattern = Pattern.compile(reg);
        Matcher matcher = pattern.matcher(string);
        while (matcher.find()) {
            s = matcher.group(i);
        }
        return s;
    }

}

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值