前段时间,按照上级的要求,需要做一个职场黑名单项目,我负责的部分是数据采集:针对各大招聘网站,按照地区或其它维度划分,采集 HR 的邮箱信息并入库。由于需要采集的网站较多,所以把部分公用的方法放在一个类中,方便调用。下面是对 51job 的采集,代码如下:
package org.hr.integrity.crawl;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.httpclient.NameValuePair;
import org.hr.util.ConnectionUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls 51job search-result pages and collects the e-mail addresses found on
 * the job-posting pages they link to.
 *
 * <p>All collected state ({@link #col}, {@link #list}) is static and shared by
 * every instance.
 *
 * @author 72414
 */
public class JobsHref {
    /** Literal prefix of the 51job search-result (paging) URLs that are followed. */
    private static final String LIST_URL_PREFIX = "https://search.51job.com/list/";

    /** Loose e-mail matcher, compiled once instead of on every call. */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

    /** Address that is skipped when it is the match found on a page. */
    private static final String IGNORED_EMAIL = "club@51job.com";

    NameValuePair[] data = null;
    /** Job-posting URLs that have already been visited. */
    static List<String> col = new ArrayList<String>();
    static Example ex = new Example();
    /** Distinct e-mail addresses collected so far. */
    static Set<String> list = new HashSet<String>();

    /**
     * Looks for an e-mail address in the given page and records it.
     *
     * <p>Only the first match in {@code body} is considered. It is added to
     * {@link #list} unless it equals {@code club@51job.com}.
     *
     * @param body page HTML; {@code null} is treated as "no e-mail found"
     * @return {@code true} if an address was found and recorded
     */
    public boolean getEmail(String body) {
        if (body == null) {
            return false;
        }
        Matcher m = EMAIL_PATTERN.matcher(body);
        if (!m.find()) {
            return false;
        }
        String email = m.group();
        if (IGNORED_EMAIL.equals(email)) {
            return false;
        }
        System.out.println("email:" + email);
        list.add(email);
        return true;
    }

    /**
     * Extracts the job-posting links from one search-result page, fetches each
     * posting that has not been visited yet and scans it for an e-mail address.
     *
     * @param body  HTML of a search-result page; {@code null} or empty is a no-op
     * @param data1 not used by this method; kept so existing callers still compile
     * @return the shared list {@link #col} of all posting URLs visited so far
     * @throws Exception declared for backward compatibility with existing callers
     */
    public List<String> getHref(String body, NameValuePair[] data1) throws Exception {
        if (body == null || "".equals(body)) {
            return col;
        }
        Document doc = Jsoup.parse(body);
        for (Element row : doc.select("[class=el]")) {
            // Elements with class "t1" hold the job title and its link.
            for (Element titleCell : row.getElementsByClass("t1")) {
                Element link = titleCell.getElementsByTag("a").first();
                if (link == null) {
                    // A "t1" element without an anchor previously caused a NullPointerException.
                    continue;
                }
                String href = link.attr("href");
                if (href.indexOf("https://") < 0 || col.contains(href)) {
                    continue;
                }
                col.add(href);
                // Fetch the posting page itself: level 2 of the breadth-first crawl.
                String context = Example.getPostResponseWithHttpClient(href, "GBK");
                getEmail(context);
            }
        }
        return col;
    }

    /**
     * Validates that a link points at a 51job search-result page.
     *
     * @param url candidate link, may be {@code null} or padded with whitespace
     * @return the trimmed URL if it starts with
     *         {@code https://search.51job.com/list/}, otherwise an empty string
     */
    public static String getURLValidate2(String url) {
        if (url == null) {
            return "";
        }
        // A plain prefix test: the former regex treated '.' as a wildcard, and
        // taking substring(0, 30) after trim() could run past the end of the string.
        String trimmed = url.trim();
        return trimmed.startsWith(LIST_URL_PREFIX) ? trimmed : "";
    }

    /**
     * Collects the distinct paging links of a search-result page.
     *
     * @param body  HTML of a search-result page; {@code null} or empty yields an empty list
     * @param data1 not used by this method; kept so existing callers still compile
     * @return the paging URLs in document order, without duplicates
     * @throws Exception declared for backward compatibility with existing callers
     */
    public List<String> getHref1(String body, NameValuePair[] data1) throws Exception {
        List<String> nowpageHref = new ArrayList<String>();
        if (body == null || "".equals(body)) {
            return nowpageHref;
        }
        Document doc = Jsoup.parse(body);
        // Anchors of the pager.
        for (Element ele : doc.select("div.p_in>ul>li>a")) {
            String href = getURLValidate2(ele.attr("href"));
            if (!"".equals(href) && !nowpageHref.contains(href)) {
                nowpageHref.add(href);
            }
        }
        return nowpageHref;
    }

    /**
     * Crawls the first search-result page and every paging link found on it,
     * then stores each collected e-mail address through {@code ConnectionUtil}.
     */
    public static void main(String[] args) throws Exception {
        JobsHref js = new JobsHref();
        // NOTE(review): getHref/getHref1 ignore this argument, so these credentials are
        // never sent; they should not live in source code — consider removing them.
        NameValuePair data1[] = {
                new NameValuePair("loginname", "2066989394@qq.com"),
                new NameValuePair("password", "dir13652") };
        // The "&degreefrom" parameter had been corrupted into "°reefrom" by HTML-entity decoding.
        String body = Example.getGetResponseWithHttpClient(
                "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&district=000000&funtype=0000&industrytype=00&issuedate=3&providesalary=99&keywordtype=2&curr_page=1&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=01&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14",
                "GBK"); // search-result page listing the jobs published for the region
        js.getHref(body, data1); // postings linked from the first page
        for (String pageUrl : js.getHref1(body, data1)) { // remaining result pages
            String result = Example.getGetResponseWithHttpClient(pageUrl, "GBK");
            js.getHref(result, data1);
        }
        // NOTE(review): this writes Example's own list, which the calls above do not appear
        // to fill (addresses found here go to JobsHref.list) — confirm this is intended.
        ex.printEmialList();
        ConnectionUtil cu = new ConnectionUtil();
        for (String str : list) {
            cu.addEmail(str.trim()); // strip surrounding whitespace
        }
        System.out.println("运行完成!");
    }
}
下面是公用的代码部分(Example.java):
package org.hr.integrity.crawl;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
/**
 * Shared HTTP helpers used by the crawlers: fetch a page via GET or POST with
 * Commons HttpClient, re-decode the body, and collect e-mail addresses into
 * text files.
 */
public class Example {
    // Shared connection manager and its settings.
    private static MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
    private static int connectionTimeOut = 20000;
    private static int socketTimeOut = 10000;
    private static int maxConnectionPerHost = 5;
    private static int maxTotalConnections = 40;
    // Becomes true once SetPara() has configured the manager.
    private static boolean initialed = false;
    /** All e-mail addresses recorded by {@link #getEmail(String)}. */
    static List<String> list = new LinkedList<String>();

    /** Loose e-mail matcher, compiled once instead of on every call. */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

    /** Applies the timeout and connection-limit settings to the shared connection manager. */
    public static void SetPara() {
        manager.getParams().setConnectionTimeout(connectionTimeOut);
        manager.getParams().setSoTimeout(socketTimeOut);
        manager.getParams().setDefaultMaxConnectionsPerHost(
                maxConnectionPerHost);
        manager.getParams().setMaxTotalConnections(maxTotalConnections);
        initialed = true;
    }

    /** Configures the connection manager the first time a request is made. */
    private static void ensureConfigured() {
        if (!initialed) {
            SetPara();
        }
    }

    /**
     * Reads the reader to the end, normalising every line terminator to
     * {@code "\n"}, and always closes it.
     */
    private static String readAll(BufferedReader reader) throws IOException {
        StringBuilder text = new StringBuilder();
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\n");
            }
        } finally {
            reader.close();
        }
        return text.toString();
    }

    /**
     * Fetches a page with HTTP GET, following redirects.
     *
     * @param url    address to fetch
     * @param encode charset the body should be re-decoded with
     * @return the page text, or an empty string if the request or decoding failed
     */
    public static String getGetResponseWithHttpClient(String url, String encode) {
        ensureConfigured();
        HttpClient client = new HttpClient(manager);
        GetMethod get = new GetMethod(url);
        // Suppresses cookie warnings.
        get.getParams().setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY);
        get.setFollowRedirects(true);
        String result;
        try {
            client.executeMethod(get);
            // Stream the body; getResponseBodyAsString() is not recommended when
            // nothing is known about the target page.
            String raw = readAll(new BufferedReader(new InputStreamReader(
                    get.getResponseBodyAsStream(), get.getResponseCharSet())));
            // The body was read with the response charset; re-decode it as requested.
            result = ConverterStringCode(raw, get.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            get.releaseConnection();
        }
        return result;
    }

    /**
     * Appends one e-mail address to {@code 1_1email.txt}.
     *
     * <p>NOTE(review): no separator is written between addresses — confirm
     * whether that is intended before relying on the file's contents.
     *
     * @param email address to append
     * @throws Exception if the file cannot be opened or written
     */
    public static void addEmail(String email) throws Exception {
        FileOutputStream fos = new FileOutputStream(new File("1_1email.txt"), true);
        try {
            fos.write(email.getBytes());
        } finally {
            // The stream used to be left open on every call.
            fos.close();
        }
    }

    /**
     * Appends every address in {@link #list}, each followed by a comma, to
     * {@code email.txt}.
     *
     * @throws IOException if the file cannot be opened or written
     */
    void printEmialList() throws IOException {
        FileOutputStream fos = new FileOutputStream(new File("email.txt"), true);
        try {
            System.out.println("生成email");
            for (String address : list) {
                String ema = address + ",";
                fos.write(ema.getBytes());
            }
        } finally {
            fos.close();
        }
    }

    /**
     * Fetches a page with HTTP POST and no request body; redirects are not followed.
     *
     * @param url    address to post to
     * @param encode charset the body should be re-decoded with
     * @return the page text, or an empty string if the request or decoding failed
     */
    public static String getPostResponseWithHttpClient(String url, String encode) {
        ensureConfigured();
        HttpClient client = new HttpClient(manager);
        PostMethod post = new PostMethod(url);
        // Suppresses cookie warnings.
        post.getParams().setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY);
        post.setFollowRedirects(false);
        String result;
        try {
            client.executeMethod(post);
            String raw = readAll(new BufferedReader(new InputStreamReader(
                    post.getResponseBodyAsStream(), post.getResponseCharSet())));
            result = ConverterStringCode(raw, post.getResponseCharSet(), encode);
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            post.releaseConnection();
        }
        return result;
    }

    /**
     * Looks for an e-mail address in the given page. Only the first match is
     * considered; a new address is added to {@link #list} and appended to the
     * text file via {@link #addEmail(String)}.
     *
     * @param body page text; {@code null} is treated as "no e-mail found"
     * @return {@code true} if the page contains an address, whether new or already known
     */
    public static boolean getEmail(String body) {
        if (body == null) {
            return false;
        }
        boolean flag = false;
        try {
            Matcher m = EMAIL_PATTERN.matcher(body);
            if (m.find()) {
                flag = true;
                String email = m.group();
                if (!list.contains(email)) {
                    list.add(email);
                    // Eventually destined for the database; written to a text file for now.
                    addEmail(email);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return flag;
    }

    /**
     * Fetches a page with HTTP POST, sending the given form fields, and scans
     * the response for an e-mail address. Redirects are not followed.
     *
     * @param url           address to post to
     * @param encode        charset the body should be re-decoded with
     * @param nameValuePair form fields used as the request body
     * @return the page text, or an empty string if the request or decoding failed
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static String getPostResponseWithHttpClient(String url,
            String encode, NameValuePair[] nameValuePair) throws Exception {
        ensureConfigured();
        HttpClient client = new HttpClient(manager);
        PostMethod post = new PostMethod(url);
        post.setRequestBody(nameValuePair);
        // Suppresses cookie warnings.
        post.getParams().setParameter(
                "http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY);
        post.setFollowRedirects(false);
        String result;
        try {
            client.executeMethod(post);
            String raw = readAll(new BufferedReader(new InputStreamReader(
                    post.getResponseBodyAsStream(), post.getResponseCharSet())));
            result = ConverterStringCode(raw, post.getResponseCharSet(), encode);
            if (getEmail(result)) {
                System.out.println("hasemailurl:" + url);
            }
        } catch (Exception e) {
            e.printStackTrace();
            result = "";
        } finally {
            post.releaseConnection();
        }
        return result;
    }

    /**
     * Re-decodes text: encodes {@code source} with {@code srcEncode} and
     * decodes the resulting bytes with {@code destEncode}.
     *
     * @return the re-decoded text, or an empty string if {@code source} is
     *         {@code null} or either charset name is unsupported
     */
    private static String ConverterStringCode(String source, String srcEncode,
            String destEncode) {
        if (source == null) {
            return "";
        }
        try {
            return new String(source.getBytes(srcEncode), destEncode);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return "";
        }
    }
}
上面的代码先爬取所有能获取到的页面,把爬到的邮箱先放入一个 list 中,全部爬完之后再写入数据库。下面是 ConnectionUtil.java 中向数据库插入数据的代码片段:
/**
 * Inserts one e-mail address into the {@code hrintegrity.email} table.
 *
 * <p>A new connection is opened for the call and closed again before returning.
 * Failures are printed and reported as {@code false} rather than thrown.
 *
 * @param em the e-mail address to insert
 * @return {@code true} if exactly one row was inserted
 * @author yuyu
 */
public boolean addEmail(String em){
    boolean result = false;
    PreparedStatement stmts = null;
    try {
        conn = DriverManager.getConnection(connStr);
        String sqlInset = "insert into hrintegrity.email(email) values(?)";
        stmts = conn.prepareStatement(sqlInset);
        stmts.setString(1, em);
        // TODO: check whether the e-mail already exists in the table before inserting.
        int i = stmts.executeUpdate(); // number of affected rows
        if (i == 1) {
            result = true;
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the statement explicitly; it used to be left to the connection.
        if (stmts != null) {
            try {
                stmts.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        // conn may be null when getConnection() failed; skipping it avoids a second stack trace.
        if (conn != null) {
            try {
                conn.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}
以上就是获取 51job 邮箱的完整代码。除了 51job 之外,其它招聘网站(如智联)的获取方式大同小异,不同点在于调用的 Example.java 中的方法不同,而且采集数据时 select 的标签也不一样,需要自己逐个尝试。
有问题可以在留言中一起交流。