A while back I was asked to write a crawler in C#. It was my first one, and I mostly wrote it by following other people's code, so I picked up crawler techniques bit by bit. Recently I needed to write a crawler in Java, and Java turns out to work nicely for this too. Everyone says Python is the best for crawling, but I haven't used it yet; I'll get to it eventually ^-^. OK, on to the main topic:
My go-to tool for crawler work is Fiddler, and I find it very handy. Fiddler is said to be quite powerful, though I only know a small corner of it. For a Java crawler, HttpClient and Jsoup used together are simply a perfect match.
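Before the full program, here is the core pattern in miniature: HttpClient fetches the raw HTML, and Jsoup parses it into a DOM you can query. (A minimal sketch against the same login page; any public URL would do, and the class name is just for illustration.)

package com.pachong;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class FetchAndParse {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse res = client.execute(new HttpGet("http://person.huibo.com/login"))) {
            // HttpClient gets the bytes, Jsoup turns them into a queryable document
            String html = EntityUtils.toString(res.getEntity(), "UTF-8");
            Document doc = Jsoup.parse(html);
            System.out.println(doc.title());
        }
    }
}

And now the full code: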
package com.pachong;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class T3 {

    public static void main(String[] args) {
        String user_name = "xxxx"; // username
        String password = "xxx";   // password

        // Global request config: use the standard cookie specification
        RequestConfig globalConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.STANDARD).build();
        // Create a local cookie store instance
        CookieStore cookieStore = new BasicCookieStore();
        // Create an HttpClient context and attach the cookie store to it
        HttpClientContext context = HttpClientContext.create();
        context.setCookieStore(cookieStore);
        // Build the HttpClient; keep pooled connections alive for up to 50 seconds
        CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultRequestConfig(globalConfig)
                .setConnectionTimeToLive(50L, TimeUnit.SECONDS)
                .setDefaultCookieStore(cookieStore)
                .build();
        CloseableHttpResponse res = null;
        try {
            try {
                // First, GET the login page to pick up the required cookies and
                // the hidden login seed, i.e. an anti-CSRF token like _xsrf
                // (sorry, Huibo, for crawling you ^-^)
                HttpGet get = new HttpGet("http://person.huibo.com/login");
                res = httpClient.execute(get, context);
                System.out.println("Step 1: result of fetching http://person.huibo.com/login");
                HttpEntity entity = res.getEntity();
                String strResult = EntityUtils.toString(entity, "UTF-8");
                EntityUtils.consume(entity);
                res.close();
                // Parse the login page with Jsoup and read the hidden seed field
                Document doc = Jsoup.parse(strResult);
                Element loginSeed = doc.getElementById("login_seed");
                System.out.println("loginSeed = " + loginSeed.val());
                // Build the POST form data for the login request
                List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
                valuePairs.add(new BasicNameValuePair("user_name", user_name));
                valuePairs.add(new BasicNameValuePair("password", password));
                valuePairs.add(new BasicNameValuePair("catcha", "")); // the form's (empty) captcha field
                valuePairs.add(new BasicNameValuePair("seed", loginSeed.val()));
                valuePairs.add(new BasicNameValuePair("chkSave", "false"));
                UrlEncodedFormEntity entityUrl = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8);
                // Create the POST request for the login endpoint and attach the form entity
                HttpPost post = new HttpPost("http://person.huibo.com/login/LoginDo");
                post.setEntity(entityUrl);
                res = httpClient.execute(post, context);
                System.out.println("New cookies after logging in: ===============");
                for (Cookie c : context.getCookieStore().getCookies()) {
                    System.out.println(c.getName() + ": " + c.getValue());
                }
                res.close();
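                // Seeing a session cookie here suggests the login worked, but the
                // more reliable check is fetching a logged-in page, as done below.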
                // Issue a fresh GET to test whether the login really succeeded
                HttpGet newGet = new HttpGet("http://person.huibo.com/");
                res = httpClient.execute(newGet, context);
                String content = EntityUtils.toString(res.getEntity(), "UTF-8");
                System.out.println("Page fetched after login ===============");
                // Pull the logged-in user's name out of the home-page link
                String namePattern = "<a\\shref=\\\"http:\\/\\/person.huibo.com\\/\\\">(.*?)</a>";
                String name = reg(content, namePattern, 1);
                System.out.println(name);
                res.close();
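                // The same value could also be pulled with Jsoup instead of a regex, e.g.
                //   Jsoup.parse(content).select("a[href=http://person.huibo.com/]").first().text()
                // (the selector is a guess from the pattern above; adjust it to the real markup).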
                // Fetch the resume-management page
                HttpGet newGetMyResume = new HttpGet("http://person.huibo.com/resume/manage/");
                res = httpClient.execute(newGetMyResume, context);
                String contentMyResume = EntityUtils.toString(res.getEntity(), "UTF-8");
                System.out.println("My resume page ===============");
                res.close();
                // Match the resume_id out of the management page
                String resumeIdPattern = "href=\\\"\\/resume\\/update\\/resume_id-(.*?)\\\"\\starget=";
                String resumeId = reg(contentMyResume, resumeIdPattern, 1);
                System.out.println(resumeId);
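                // This resume_id parameterizes the download URL below. Note that
                // reg() returns the LAST match, so if the page listed several
                // resumes, it would pick the final one.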
System.out.println("下载world文档页面===============");
HttpGet newGetDown = new HttpGet("http://person.huibo.com/resume/worddown/resumeid-"+resumeId);
res = httpClient.execute(newGetDown, context);
FileOutputStream out = (new FileOutputStream(new java.io.File("D:\\huibo\\"+Name+"简历.doc")));
InputStream in = res.getEntity().getContent();
byte[] buff = new byte[1024];//创建字节缓冲大小
int bytesRead;
while (-1 != (bytesRead = in.read(buff, 0, buff.length))) {
out.write(buff, 0, bytesRead);
}
out.close();
res.close();
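                // A try-with-resources around the streams and the response would
                // guarantee cleanup even if the download throws midway.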
            } finally {
                httpClient.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Run the regex against the input and return capture group i of the
    // last match found (or "" when nothing matches).
    public static String reg(String string, String reg, int i) {
        String s = "";
        Pattern pattern = Pattern.compile(reg);
        Matcher matcher = pattern.matcher(string);
        while (matcher.find()) {
            s = matcher.group(i);
        }
        return s;
    }
}
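One closing note: the reg() helper works, but regexes over HTML are brittle, and Jsoup is already on the classpath. Here is a minimal sketch of the same resume_id extraction done with a CSS selector instead; the sample HTML and the selector are my assumptions based on the regex above, so adjust them to the page's real markup.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ExtractResumeId {
    public static void main(String[] args) {
        // Stand-in for the HTML fetched from /resume/manage/ (hypothetical markup)
        String html = "<a href=\"/resume/update/resume_id-12345\" target=\"_blank\">edit</a>";
        Document doc = Jsoup.parse(html);
        // First link whose href starts with the resume-update path
        Element link = doc.select("a[href^=/resume/update/resume_id-]").first();
        if (link != null) {
            String resumeId = link.attr("href").substring("/resume/update/resume_id-".length());
            System.out.println(resumeId); // prints 12345
        }
    }
}

Selecting by attribute prefix survives small changes in the surrounding markup (extra attributes, reordered tags) that would silently break the regex.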