获取招聘网站下的HR-Email信息

最新推荐文章于 2021-02-04 04:09:29 发布

Yuruiyu

最新推荐文章于 2021-02-04 04:09:29 发布

阅读量6.1k

点赞数

分类专栏：随笔 文章标签：采集 select

本文链接：https://blog.csdn.net/Yuruiyu/article/details/81285451

版权

随笔专栏收录该内容

13 篇文章 1 订阅

订阅专栏

前段时间，按照上级的要求，需要做一个职场黑名单的项目。我负责的部分是数据采集，也就是针对各大招聘网站，按照地区或者其它维度划分，采集HR的邮箱信息并入库。由于采集的网站较多，所以把部分公用的方法放在一个类中，方便调用。下面是对51job的采集，代码如下：

package org.hr.integrity.crawl;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.httpclient.NameValuePair;
import org.hr.util.ConnectionUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * 爬取51job
 * @author 72414
 *
 */
public class JobsHref {
	
	// Form-parameter holder; no method visible in this class reads it.
	NameValuePair[] data = null;

	// Job-posting URLs that have already been fetched; getHref checks it to avoid re-downloading a page.
	static List<String> col = new ArrayList<String>();
	
	// Shared HTTP helper used to download pages.
	static Example ex = new Example();
	
	// Distinct e-mail addresses collected during the crawl (a Set, despite the name).
	static Set<String> list = new HashSet<String>();
	
	/** Matches e-mail-like tokens; compiled once because the expression never changes. */
	private static final Pattern EMAIL_PATTERN =
			Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

	/**
	 * Scans a page for e-mail addresses and stores every one found in {@link #list}.
	 *
	 * <p>All matches are examined, not just the first one, so that the site's own
	 * contact address {@code club@51job.com} appearing early in the page cannot hide
	 * the addresses that follow it.
	 *
	 * @param body page text to scan; {@code null} is treated as "nothing found"
	 * @return {@code true} if at least one address other than {@code club@51job.com}
	 *         was found
	 */
	public boolean getEmail(String body) {
		if (body == null) {
			return false;
		}

		boolean found = false;
		Matcher m = EMAIL_PATTERN.matcher(body);
		while (m.find()) {
			String email = m.group();
			// The portal's own address is not an HR contact; skip it.
			if ("club@51job.com".equals(email)) {
				continue;
			}
			found = true;
			System.out.println("email:" + email);
			list.add(email);
		}
		return found;
	}
	
	
	/**
	 * Follows every job-posting link found on a search-result page and harvests
	 * e-mail addresses from each posting.
	 *
	 * <p>Only links containing {@code https://} that are not yet in {@link #col} are
	 * fetched; each fetched link is recorded in {@link #col} first so it is never
	 * downloaded twice.
	 *
	 * @param body  HTML of a search-result page; {@code null} or empty is a no-op
	 * @param data1 not read by this method; kept so existing callers still compile
	 * @return the shared list of posting URLs visited so far ({@link #col})
	 * @throws Exception declared for compatibility with existing callers
	 */
	public List<String> getHref(String body, NameValuePair[] data1) throws Exception {
		if (body == null || "".equals(body)) {
			return col;
		}

		Document doc = Jsoup.parse(body);

		// Rows whose class attribute is exactly "el"; inside them, elements of class "t1" (job title).
		for (Element row : doc.select("[class=el]")) {
			for (Element titleCell : row.getElementsByClass("t1")) {
				Element link = titleCell.getElementsByTag("a").first();
				if (link == null) {
					// first() yields null when the cell holds no anchor; nothing to follow.
					continue;
				}

				String href = link.attr("href");
				if (href.indexOf("https://") < 0 || col.contains(href)) {
					continue;
				}
				col.add(href);

				// Download the posting page (second level of the breadth-first crawl) and scan it.
				String context = Example.getPostResponseWithHttpClient(href, "GBK");
				getEmail(context);
			}
		}
		return col;
	}
	
	
	/**
	 * Accepts a link only when it points into the 51job search-result listing.
	 *
	 * <p>The prefix is compared literally, so the dots in the host name match only
	 * dots. Input that is {@code null}, too short, or made of whitespace is rejected
	 * with the empty string rather than an exception or a {@code null} result.
	 *
	 * @param url candidate link; may be {@code null}
	 * @return {@code url} unchanged when, ignoring surrounding whitespace, it starts
	 *         with {@code https://search.51job.com/list/}; otherwise the empty string
	 */
	public static String getURLValidate2(String url) {
		if (url == null) {
			return "";
		}
		return url.trim().startsWith("https://search.51job.com/list/") ? url : "";
	}

	/**
	 * Extracts the pagination links of a search-result page.
	 *
	 * <p>Anchors are taken from {@code div.p_in>ul>li>a}; each {@code href} is passed
	 * through {@link #getURLValidate2(String)} and kept once, in document order.
	 *
	 * @param body  HTML of a search-result page; {@code null} or empty yields an empty list
	 * @param data1 not read by this method; kept so existing callers still compile
	 * @return the distinct pagination URLs that passed validation
	 * @throws Exception declared for compatibility with existing callers
	 */
	public List<String> getHref1(String body, NameValuePair[] data1) throws Exception {
		LinkedList<String> nowpageHref = new LinkedList<String>();
		if (body == null || "".equals(body)) {
			return nowpageHref;
		}

		Document doc = Jsoup.parse(body);
		for (Element ele : doc.select("div.p_in>ul>li>a")) {
			String href = getURLValidate2(ele.attr("href"));
			// A rejected link comes back without "https://", so this one test filters it out.
			if (href != null && href.indexOf("https://") >= 0 && !nowpageHref.contains(href)) {
				nowpageHref.add(href);
			}
		}
		return nowpageHref;
	}
	

	/**
	 * Entry point: crawls the first search-result page and every pagination page
	 * linked from it, then stores the collected addresses.
	 *
	 * @param args unused
	 * @throws Exception if the crawl or the export fails
	 */
	public static void main(String[] args) throws Exception {
		JobsHref js = new JobsHref();

		// getHref/getHref1 never read this argument, so no account data is kept in the source.
		NameValuePair[] data1 = new NameValuePair[0];

		// Search-result page listing the jobs published for the selected area.
		String body = Example.getGetResponseWithHttpClient(
				"http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&district=000000&funtype=0000&industrytype=00&issuedate=3&providesalary=99&keywordtype=2&curr_page=1&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=01&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14",
				"GBK");

		// Postings on the first page.
		js.getHref(body, data1);

		// Postings on every pagination page linked from the first page.
		for (String pageUrl : js.getHref1(body, data1)) {
			String result = Example.getGetResponseWithHttpClient(pageUrl, "GBK");
			js.getHref(result, data1);
		}

		// NOTE(review): this writes Example's own list, which JobsHref.getEmail does not
		// fill; confirm whether the file export is still wanted here.
		ex.printEmialList();

		ConnectionUtil cu = new ConnectionUtil();
		for (String str : list) {
			cu.addEmail(str.trim());// strip surrounding whitespace before storing
		}

		System.out.println("运行完成！");
	}
}

下面是公用的代码部分：

package org.hr.integrity.crawl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

public class Example {
	

	// Shared connection manager and the settings that SetPara() applies to it.
	private static MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
	// Value handed to setConnectionTimeout (milliseconds per the HttpClient API).
	private static int connectionTimeOut = 20000;
	// Value handed to setSoTimeout (milliseconds per the HttpClient API).
	private static int socketTimeOut = 10000;
	private static int maxConnectionPerHost = 5;
	private static int maxTotalConnections = 40;
	// Flag meant to record whether initialisation has been done.
	// NOTE(review): it starts as true (changed on 2018-06-07 10:28:09) and SetPara() also sets it to
	// true, while the request methods call SetPara() when it is true, so SetPara() runs on every request.
	private static boolean initialed = true;
	// Every e-mail address recorded by getEmail in this class.
	static List<String> list=new LinkedList<String>();
	
	/**
	 * Applies the configured pool limits and timeouts to the shared connection
	 * manager and marks initialisation as done.
	 */
	public static void SetPara() {
		// Pool limits.
		manager.getParams().setMaxTotalConnections(maxTotalConnections);
		manager.getParams().setDefaultMaxConnectionsPerHost(maxConnectionPerHost);

		// Timeouts.
		manager.getParams().setSoTimeout(socketTimeOut);
		manager.getParams().setConnectionTimeout(connectionTimeOut);

		initialed = true;
	}

	/**
	 * Downloads a page with an HTTP GET (redirects are followed) and returns its text.
	 *
	 * <p>The body is read as a stream using the charset reported by the response and
	 * then re-decoded to {@code encode} via {@code ConverterStringCode}. Line
	 * terminators are normalised to {@code \n}.
	 *
	 * @param url    address to fetch
	 * @param encode charset name the page text should be decoded with
	 * @return the page text, or the empty string if the request or the read failed
	 */
	public static String getGetResponseWithHttpClient(String url, String encode) {
		HttpClient client = new HttpClient(manager);
		if (initialed) {
			Example.SetPara();
		}
		GetMethod get = new GetMethod(url);
		// Browser-compatible cookie handling silences cookie-rejection warnings.
		get.getParams().setParameter("http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);
		get.setFollowRedirects(true);

		BufferedReader in = null;
		try {
			client.executeMethod(get);
			// The body is streamed because its size is unknown in advance.
			in = new BufferedReader(new InputStreamReader(
					get.getResponseBodyAsStream(), get.getResponseCharSet()));
			StringBuilder page = new StringBuilder();
			String inputLine;
			while ((inputLine = in.readLine()) != null) {
				page.append(inputLine).append("\n");
			}
			// Re-decode from the charset the response reported to the one the caller asked for.
			return Example.ConverterStringCode(page.toString(), get.getResponseCharSet(), encode);
		} catch (Exception e) {
			e.printStackTrace();
			return "";
		} finally {
			// Close the reader on every path, including a failed read.
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// The connection is released below; nothing more can be done here.
				}
			}
			get.releaseConnection();
		}
	}
	
	
	/**
	 * Appends one address to {@code 1_1email.txt}, followed by a comma.
	 *
	 * <p>The comma keeps consecutive addresses separable and matches the delimiter
	 * written by {@code printEmialList}. The stream is closed on every path.
	 *
	 * @param email address to append
	 * @throws Exception if the file cannot be opened or written
	 */
	public static void addEmail(String email) throws Exception{
		FileOutputStream fos = new FileOutputStream(new File("1_1email.txt"),true);
		try {
			fos.write(email.getBytes());
			fos.write(',');
		} finally {
			fos.close();
		}
	}
	
	
	/**
	 * Appends every address in {@code list} to {@code email.txt}, each followed by a comma.
	 *
	 * @throws IOException if the file cannot be opened or written; the stream is
	 *         closed even when a write fails
	 */
	void printEmialList()throws IOException{
		FileOutputStream fos = new FileOutputStream(new File("email.txt"),true);
		try {
			System.out.println("生成email");
			for (String address : list) {
				String ema = address + ",";
				fos.write(ema.getBytes());
			}
		} finally {
			fos.close();
		}
	}
	
	
	/**
	 * Downloads a page with an HTTP POST that carries no form data (redirects are
	 * not followed) and returns its text.
	 *
	 * <p>The body is read as a stream using the charset reported by the response and
	 * then re-decoded to {@code encode} via {@code ConverterStringCode}. Line
	 * terminators are normalised to {@code \n}.
	 *
	 * @param url    address to post to
	 * @param encode charset name the page text should be decoded with
	 * @return the page text, or the empty string if the request or the read failed
	 */
	public static String getPostResponseWithHttpClient(String url, String encode) {
		HttpClient client = new HttpClient(manager);
		if (initialed) {
			// Configure this class's own connection manager, as the GET variant does.
			Example.SetPara();
		}
		PostMethod post = new PostMethod(url);
		// Browser-compatible cookie handling silences cookie-rejection warnings.
		post.getParams().setParameter("http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);
		post.setFollowRedirects(false);

		BufferedReader in = null;
		try {
			client.executeMethod(post);
			in = new BufferedReader(new InputStreamReader(
					post.getResponseBodyAsStream(), post.getResponseCharSet()));
			StringBuilder page = new StringBuilder();
			String inputLine;
			while ((inputLine = in.readLine()) != null) {
				page.append(inputLine).append("\n");
			}
			// Re-decode from the charset the response reported to the one the caller asked for.
			return Example.ConverterStringCode(page.toString(), post.getResponseCharSet(), encode);
		} catch (Exception e) {
			e.printStackTrace();
			return "";
		} finally {
			// Close the reader on every path, including a failed read.
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// The connection is released below; nothing more can be done here.
				}
			}
			post.releaseConnection();
		}
	}
	
	/** Matches e-mail-like tokens; compiled once because the expression never changes. */
	private static final Pattern EMAIL_PATTERN =
			Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

	/**
	 * Scans a page for e-mail addresses and records each one not seen before.
	 *
	 * <p>Every match in the page is processed, not only the first. A new address is
	 * added to {@code list} and appended to the text file through {@code addEmail};
	 * a failure to write one address is reported and does not stop the scan.
	 *
	 * @param body page text to scan; {@code null} is treated as "nothing found"
	 * @return {@code true} if the page contains at least one address, new or not
	 */
	public static  boolean getEmail(String body){
		if (body == null) {
			return false;
		}

		boolean flag = false;
		Matcher m = EMAIL_PATTERN.matcher(body);
		while (m.find()) {
			flag = true;
			String email = m.group();
			if (list.contains(email)) {
				continue;
			}
			list.add(email);
			try {
				// Destined for the database eventually; for now the address goes to a text file.
				addEmail(email);
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
		return flag;
	}
	
	
	/**
	 * Submits a form with an HTTP POST (redirects are not followed), returns the
	 * response text and scans it for e-mail addresses.
	 *
	 * <p>The body is read as a stream using the charset reported by the response and
	 * then re-decoded to {@code encode} via {@code ConverterStringCode}. When
	 * {@code getEmail} finds an address in the text, the URL is printed.
	 *
	 * @param url           address to post to
	 * @param encode        charset name the page text should be decoded with
	 * @param nameValuePair form fields sent as the request body
	 * @return the page text, or the empty string if the request or the read failed
	 * @throws Exception declared for compatibility with existing callers
	 */
	public static String getPostResponseWithHttpClient (String url,
			String encode, NameValuePair[] nameValuePair) throws Exception {
		HttpClient client = new HttpClient(manager);
		if (initialed) {
			// Configure this class's own connection manager, as the GET variant does.
			Example.SetPara();
		}
		PostMethod post = new PostMethod(url);
		// Put all form values into the request.
		post.setRequestBody(nameValuePair);
		// Browser-compatible cookie handling silences cookie-rejection warnings.
		post.getParams().setParameter(
				"http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);
		// Do not follow HTTP redirects automatically.
		post.setFollowRedirects(false);

		BufferedReader in = null;
		try {
			client.executeMethod(post);
			in = new BufferedReader(new InputStreamReader(
					post.getResponseBodyAsStream(), post.getResponseCharSet()));
			StringBuilder page = new StringBuilder();
			String inputLine;
			while ((inputLine = in.readLine()) != null) {
				page.append(inputLine).append("\n");
			}
			// Re-decode from the charset the response reported to the one the caller asked for.
			String result = Example.ConverterStringCode(page.toString(), post.getResponseCharSet(), encode);
			if (getEmail(result)) {
				System.out.println("hasemailurl："+url);
			}
			return result;
		} catch (Exception e) {
			e.printStackTrace();
			return "";
		} finally {
			// Close the reader on every path, including a failed read.
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// The connection is released below; nothing more can be done here.
				}
			}
			post.releaseConnection();
		}
	}

	/**
	 * Re-decodes text: encodes {@code source} to bytes with {@code srcEncode} and
	 * decodes those bytes with {@code destEncode}.
	 *
	 * @param source     text to convert; {@code null} yields the empty string
	 * @param srcEncode  charset name used to turn the text back into bytes
	 * @param destEncode charset name used to decode those bytes
	 * @return the converted text, or the empty string when either charset name is
	 *         not supported
	 */
	private static String ConverterStringCode(String source, String srcEncode,
			String destEncode) {
		if (source == null) {
			return "";
		}

		String converted;
		try {
			byte[] raw = source.getBytes(srcEncode);
			converted = new String(raw, destEncode);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
			converted = "";
		}
		return converted;
	}
	
}

上面的代码是先爬取能获取到的页面，爬到的邮箱先放入一个list里面，爬完之后再放入到数据库中，下面是ConnectionUtil.java中插入到数据库的片段代码：

/**
	 * Inserts one e-mail address into the {@code hrintegrity.email} table.
	 *
	 * <p>A connection is opened for the call and closed again before returning; the
	 * statement is closed as well. Failures are reported on standard error and
	 * signalled through the return value.
	 *
	 * @param em address to store
	 * @return {@code true} if exactly one row was inserted, {@code false} otherwise
	 * @author yuyu
	 */
	public boolean addEmail(String em){
		
		boolean result = false;
		PreparedStatement stmts = null;

		try {
			
			conn = DriverManager.getConnection(connStr);
			
			String sqlInset = "insert into hrintegrity.email(email) values(?)";
			
			stmts = conn.prepareStatement(sqlInset);
			
			stmts.setString(1, em);
			
			// TODO: check whether the address already exists in the table before inserting.
			
			// executeUpdate returns the number of affected rows.
			result = stmts.executeUpdate() == 1;
			
		} catch (Exception e) {
			
			e.printStackTrace();
			
		}finally{
			
			// Close the statement before its connection.
			if (stmts != null) {
				try {
					stmts.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
			
			// conn is still null if the connection could never be opened.
			if (conn != null) {
				try {
					conn.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
			
		}
		
		return result;
	}

上面就是一个获取51job的邮箱的完整代码，除了51job外，其它招聘网站的获取方式大同小异，如智联，不同点就是在Example.java中调用的方法不同，而且在采集数据的时候select的标签不一样，需要自己一个一个去尝试。

有问题可以在留言中一起交流。