A while back I was asked to write a crawler in C#. It was my first one, and I mostly wrote it by following other people's code, so I picked up crawler techniques bit by bit. Recently I needed to write a crawler in Java, and Java turns out to work nicely for this too. Everyone says Python is the best for crawling, but I haven't used it yet; I'll get to it eventually ^-^. OK, on to the main topic:
My go-to tool for crawler work is Fiddler, and I find it very handy. Fiddler is said to be quite powerful, though I only know a small corner of it. For a Java crawler, HttpClient and Jsoup used together are simply a perfect match.
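Before the full program, here is the core pattern in miniature: HttpClient fetches the raw HTML, and Jsoup parses it into a DOM you can query. (A minimal sketch against the same login page; any public URL would do, and the class name is just for illustration.)

package com.pachong;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class FetchAndParse {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse res = client.execute(new HttpGet("http://person.huibo.com/login"))) {
            // HttpClient gets the bytes, Jsoup turns them into a queryable document
            String html = EntityUtils.toString(res.getEntity(), "UTF-8");
            Document doc = Jsoup.parse(html);
            System.out.println(doc.title());
        }
    }
}

And now the full code: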
package com.pachong;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class T3 {

    public static void main(String[] args) {
        String user_name = "xxxx"; // username
        String password = "xxx";   // password

        // Global request config: use the standard cookie specification
        RequestConfig globalConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.STANDARD).build();
        // Create a local cookie store instance
        CookieStore cookieStore = new BasicCookieStore();
        // Create an HttpClient context and attach the cookie store to it
        HttpClientContext context = HttpClientContext.create();
        context.setCookieStore(cookieStore);
        // Build the HttpClient; keep pooled connections alive for up to 50 seconds
        CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultRequestConfig(globalConfig)
                .setConnectionTimeToLive(50L, TimeUnit.SECONDS)
                .setDefaultCookieStore(cookieStore)
                .build();
        CloseableHttpResponse res = null;
        try {
            try {
                // First, GET the login page to pick up the required cookies and
                // the hidden login seed, i.e. an anti-CSRF token like _xsrf
                // (sorry, Huibo, for crawling you ^-^)
                HttpGet get = new HttpGet("http://person.huibo.com/login");
                res = httpClient.execute(get, context);
                System.out.println("Step 1: result of fetching http://person.huibo.com/login");
                HttpEntity entity = res.getEntity();
                String strResult = EntityUtils.toString(entity, "UTF-8");
                EntityUtils.consume(entity);
                res.close();
                // Parse the login page with Jsoup and read the hidden seed field
                Document doc = Jsoup.parse(strResult);
                Element loginSeed = doc.getElementById("login_seed");
                System.out.println("loginSeed = " + loginSeed.val());
                // Build the POST form data for the login request
                List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
                valuePairs.add(new BasicNameValuePair("user_name", user_name));
                valuePairs.add(new BasicNameValuePair("password", password));
                valuePairs.add(new BasicNameValuePair("catcha", "")); // the form's (empty) captcha field
                valuePairs.add(new BasicNameValuePair("seed", loginSeed.val()));
                valuePairs.add(new BasicNameValuePair("chkSave", "false"));
                UrlEncodedFormEntity entityUrl = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8);
                // Create the POST request for the login endpoint and attach the form entity
                HttpPost post = new HttpPost("http://person.huibo.com/login/LoginDo");
                post.setEntity(entityUrl);
                res = httpClient.execute(post, context);
                System.out.println("New cookies after logging in: ===============");
                for (Cookie c : context.getCookieStore().getCookies()) {
                    System.out.println(c.getName() + ": " + c.getValue());
                }
                res.close();
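                // Seeing a session cookie here suggests the login worked, but the
                // more reliable check is fetching a logged-in page, as done below.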
                // Issue a fresh GET to test whether the login really succeeded
                HttpGet newGet = new HttpGet("http://person.huibo.com/");
                res = httpClient.execute(newGet, context);
                String content = EntityUtils.toString(res.getEntity(), "UTF-8");
                System.out.println("Page fetched after login ===============");
                // Pull the logged-in user's name out of the home-page link
                String namePattern = "<a\\shref=\\\"http:\\/\\/person.huibo.com\\/\\\">(.*?)</a>";
                String name = reg(content, namePattern, 1);
                System.out.println(name);
                res.close();
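                // The same value could also be pulled with Jsoup instead of a regex, e.g.
                //   Jsoup.parse(content).select("a[href=http://person.huibo.com/]").first().text()
                // (the selector is a guess from the pattern above; adjust it to the real markup).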
                // Fetch the resume-management page
                HttpGet newGetMyResume = new HttpGet("http://person.huibo.com/resume/manage/");
                res = httpClient.execute(newGetMyResume, context);
                String contentMyResume = EntityUtils.toString(res.getEntity(), "UTF-8");
                System.out.println("My resume page ===============");
                res.close();
                // Match the resume_id out of the management page
                String resumeIdPattern = "href=\\\"\\/resume\\/update\\/resume_id-(.*?)\\\"\\starget=";
                String resumeId = reg(contentMyResume, resumeIdPattern, 1);
                System.out.println(resumeId);
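                // This resume_id parameterizes the download URL below. Note that
                // reg() returns the LAST match, so if the page listed several
                // resumes, it would pick the final one.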
System.out.println("下载world文档页面===============");
HttpGet newGetDown = new HttpGet("http://person.huibo.com/resume/worddown/resumeid-"+resumeId);
res = httpClient.execute(newGetDown, context);
FileOutputStream out = (new FileOutputStream(new java.io.File("D:\\huibo\\"+Name+"简历.doc")));
InputStream in = res.getEntity().getContent();
byte[] buff = new byte[1024];//创建字节缓冲大小
int bytesRead;
while (-1 != (bytesRead = in.read(buff, 0, buff.length))) {
out.write(buff, 0, bytesRead);
}
out.close();
res.close();
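                // A try-with-resources around the streams and the response would
                // guarantee cleanup even if the download throws midway.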
            } finally {
                httpClient.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Run the regex against the input and return capture group i of the
    // last match found (or "" when nothing matches).
    public static String reg(String string, String reg, int i) {
        String s = "";
        Pattern pattern = Pattern.compile(reg);
        Matcher matcher = pattern.matcher(string);
        while (matcher.find()) {
            s = matcher.group(i);
        }
        return s;
    }
}
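One closing note: the reg() helper works, but regexes over HTML are brittle, and Jsoup is already on the classpath. Here is a minimal sketch of the same resume_id extraction done with a CSS selector instead; the sample HTML and the selector are my assumptions based on the regex above, so adjust them to the page's real markup.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ExtractResumeId {
    public static void main(String[] args) {
        // Stand-in for the HTML fetched from /resume/manage/ (hypothetical markup)
        String html = "<a href=\"/resume/update/resume_id-12345\" target=\"_blank\">edit</a>";
        Document doc = Jsoup.parse(html);
        // First link whose href starts with the resume-update path
        Element link = doc.select("a[href^=/resume/update/resume_id-]").first();
        if (link != null) {
            String resumeId = link.attr("href").substring("/resume/update/resume_id-".length());
            System.out.println(resumeId); // prints 12345
        }
    }
}

Selecting by attribute prefix survives small changes in the surrounding markup (extra attributes, reordered tags) that would silently break the regex.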