本文使用Jsoup采集拉勾网招聘数据并写入CSV文件中,实现非常简单,在此不做多余的解释,如有问题可留言交流。
数据模型:Job.java
package xyz.baal.jsoup;
/**
 * Data model for one job posting scraped from lagou.com.
 * Plain mutable bean: no-arg constructor plus getters/setters for each field.
 */
public class Job {
    private String jobname;     // position title
    private String salary;      // salary range
    private String place;       // work location
    private String experience;  // required experience
    private String educational; // required education level
    private String business;    // company's line of business
    private String stage;       // company funding/development stage
    private String company;     // company name

    /** Creates an empty posting; fields are filled in via the setters. */
    public Job() {
    }

    /** Creates a fully populated posting. */
    public Job(String jobname, String salary, String place, String experience,
               String educational, String business, String stage, String company) {
        this.jobname = jobname;
        this.salary = salary;
        this.place = place;
        this.experience = experience;
        this.educational = educational;
        this.business = business;
        this.stage = stage;
        this.company = company;
    }

    public String getJobname() {
        return jobname;
    }

    public void setJobname(String jobname) {
        this.jobname = jobname;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getPlace() {
        return place;
    }

    public void setPlace(String place) {
        this.place = place;
    }

    public String getExperience() {
        return experience;
    }

    public void setExperience(String experience) {
        this.experience = experience;
    }

    public String getEducational() {
        return educational;
    }

    public void setEducational(String educational) {
        this.educational = educational;
    }

    public String getBusiness() {
        return business;
    }

    public void setBusiness(String business) {
        this.business = business;
    }

    public String getStage() {
        return stage;
    }

    public void setStage(String stage) {
        this.stage = stage;
    }

    public String getCompany() {
        return company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    @Override
    public String toString() {
        // Same rendering as the classic concatenated form (null fields print as "null").
        return String.format(
                "Job [jobname=%s, salary=%s, place=%s, experience=%s, educational=%s, business=%s, stage=%s, company=%s]",
                jobname, salary, place, experience, educational, business, stage, company);
    }
}
写入CSV使用的是javacsv库的CsvWriter,由于源文件中就CsvReader、CsvWriter两个文件,在这里直接引用了CsvWriter源文件(附:API文档)。
获取各个招聘职位首页链接,如java招聘链接为//www.lagou.com/zhaopin/Java/
package xyz.baal.jsoup;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 根据拉勾网首页获取各个招聘职位首页链接
*
* @author
*
*/
public class GetZPURL {

    /** Protocol-relative prefix shared by every job-category link on the lagou homepage. */
    private static final String ZP_PREFIX = "//www.lagou.com/zhaopin/";

    /** Collected front-page URLs of the individual job categories. */
    private List<String> zpURLlist = new ArrayList<String>();

    public GetZPURL() {
    }

    /**
     * Fetches the homepage over HTTP and collects all job-category links found
     * inside the #container element. On network failure a message is printed
     * and the list is left unchanged.
     *
     * @param url homepage URL to fetch
     */
    public void loadInternet(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36")
                    .timeout(5000)
                    .get();
        } catch (IOException e) {
            System.out.println("获取招聘URL失败。");
            return;
        }
        collectZpLinks(doc);
    }

    /**
     * Parses a locally saved copy of the homepage and collects job-category links.
     *
     * @param path    path of the HTML file
     * @param charset character set of the file
     * @param baseURL base URL used to resolve relative links in the document
     * @throws IOException if the file does not exist or cannot be read
     */
    public void loadLocal(String path, String charset, String baseURL) throws IOException {
        File input = new File(path);
        Document doc = Jsoup.parse(input, charset, baseURL);
        collectZpLinks(doc);
    }

    /** Scans every anchor inside #container and keeps those that look like category links. */
    private void collectZpLinks(Document doc) {
        Element content = doc.getElementById("container");
        if (content == null) {
            // Page layout changed or the download was incomplete; nothing to collect.
            return;
        }
        for (Element link : content.getElementsByTag("a")) {
            String linkHref = link.attr("href");
            if (isZp(linkHref)) {
                zpURLlist.add(linkHref);
            }
        }
    }

    /**
     * Returns true when the href contains the job-category prefix and has a
     * non-empty category path after it.
     */
    public boolean isZp(String url) {
        return url.contains(ZP_PREFIX) && url.length() > ZP_PREFIX.length();
    }

    public List<String> getZpURLlist() {
        return zpURLlist;
    }
}
获取某一招聘职位的30x15(共450)条数据,并写入CSV文件。
package xyz.baal.jsoup;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.csvreader.CsvWriter;
/**
* 获取拉勾网某一个职位的30x15条招聘信息
*
* @author
*
*/
public class GetJob implements Runnable {

    private String zpUrl;                                      // front-page URL of one job category
    private List<String> zpUrlList = new ArrayList<String>();  // URL of each paginated result page
    private List<String> jobUrlList = new ArrayList<String>(); // URL of each individual posting
    private List<Job> joblist = new ArrayList<Job>();          // the collected 30x15 postings

    // Posting href format: //www.lagou.com/jobs/2350451.html
    // Compiled once (not per link) and with the dots escaped so "." only matches a literal dot.
    private static final Pattern A_HREF_PATTERN =
            Pattern.compile("//www\\.lagou\\.com/jobs/\\d+\\.html", Pattern.CASE_INSENSITIVE);

    private static final String PATH = "D:/"; // output directory

    private String jobName = ""; // category name, taken from the first result page's <title>

    /**
     * @param url front-page URL of one job category, e.g. the java or hadoop listing
     */
    public GetJob(String url) {
        zpUrl = url;
    }

    /**
     * Scrapes all 30 result pages of this category and then every posting they
     * link to, filling the list returned by {@link #getJoblist()}.
     */
    public void init() {
        buildPageUrls();
        collectJobUrls();
        fetchJobDetails();
    }

    /** Builds the 30 paginated result-page URLs (page 1 carries no page number). */
    private void buildPageUrls() {
        zpUrlList.add(zpUrl + "?filterOption=3");
        for (int i = 2; i <= 30; i++) {
            zpUrlList.add(zpUrl + i + "/?filterOption=3");
        }
    }

    /** Downloads each result page and extracts the posting URLs it contains. */
    private void collectJobUrls() {
        for (String pageUrl : zpUrlList) {
            Document doc = download("http:" + pageUrl);
            if (doc == null) {
                continue; // network failure on this page; skip it
            }
            Element content = doc.getElementById("s_position_list");
            if (content == null) {
                continue;
            }
            for (Element link : content.getElementsByTag("a")) {
                String linkHref = link.attr("href");
                if (A_HREF_PATTERN.matcher(linkHref).find()) {
                    jobUrlList.add("http:" + linkHref);
                }
            }
            // Remember the category name once; use isEmpty(), not the fragile == "" identity check.
            if (jobName.isEmpty()) {
                Element title = doc.select("title").first();
                if (title != null) {
                    jobName = title.text().split("-")[0];
                }
            }
        }
    }

    /** Downloads each posting page and parses it into a Job record. */
    private void fetchJobDetails() {
        for (String jobUrl : jobUrlList) {
            Document doc = download(jobUrl);
            if (doc == null) {
                continue;
            }
            Job job = parseJob(doc);
            if (job != null) {
                joblist.add(job);
            }
        }
    }

    /** Fetches one URL, returning null instead of throwing on network failure. */
    private Document download(String url) {
        try {
            return Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36")
                    .timeout(5000)
                    .get();
        } catch (IOException e) {
            return null;
        }
    }

    /** Parses one posting page; returns null when the expected layout is absent. */
    private Job parseJob(Document doc) {
        Element content = doc.getElementById("container");
        if (content == null) {
            return null;
        }
        Element jobRequest = content.select(".job_request p").first();
        // Element.child(i) throws IndexOutOfBoundsException rather than returning null,
        // so guard with the actual child count instead of a null check on child(0).
        if (jobRequest == null || jobRequest.children().size() < 4) {
            return null;
        }
        Job job = new Job();
        job.setJobname(jobName);
        job.setSalary(jobRequest.child(0).text());
        job.setPlace(jobRequest.child(1).text());
        job.setExperience(jobRequest.child(2).text());
        job.setEducational(jobRequest.child(3).text());
        Element cpy = doc.getElementById("job_company");
        if (cpy == null || cpy.childNodeSize() < 2) {
            return null;
        }
        job.setCompany(cpy.child(0).child(0).child(0).attr("alt"));
        job.setBusiness(cpy.child(1).child(0).child(0).ownText());
        job.setStage(cpy.child(1).child(2).child(0).ownText());
        return job;
    }

    public List<Job> getJoblist() {
        return joblist;
    }

    /**
     * Writes the collected postings to PATH/&lt;jobname&gt;.txt, one
     * {@code toString()} per line. No-op when nothing was collected.
     */
    public void writeTxtFile() {
        if (joblist.isEmpty()) { // field is always initialized, so no null check needed
            return;
        }
        File file = new File(PATH + joblist.get(0).getJobname() + ".txt");
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
            for (Job job : joblist) {
                bw.write(job.toString());
                bw.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the collected postings to PATH/&lt;jobname&gt;.csv, GBK-encoded so
     * the Chinese header renders correctly in Excel on Windows. No-op when
     * nothing was collected.
     */
    public void writeCSVFile() {
        if (joblist.isEmpty()) {
            return;
        }
        CsvWriter wr = null;
        try {
            String csvFilePath = PATH + joblist.get(0).getJobname() + ".csv";
            wr = new CsvWriter(csvFilePath, ',', Charset.forName("GBK"));
            String[] header = { "职位名称", "薪水", "工作地点", "工作经验", "学历", "公司名称", "公司业务", "发展阶段"};
            wr.writeRecord(header);
            for (Job job : joblist) {
                String[] jobstr = { job.getJobname(), job.getSalary(), job.getPlace(), job.getExperience(),
                        job.getEducational(), job.getCompany(), job.getBusiness(), job.getStage() };
                wr.writeRecord(jobstr);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (wr != null) {
                wr.close();
            }
        }
    }

    @Override
    public void run() {
        init();
        writeCSVFile();
        System.out.println(jobName + "--End");
    }
}
采集测试:
package xyz.baal.jsoup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
public class Test {
public static List<String> zpURLlist = new ArrayList<String>();
public static void main(String[] args) throws IOException {
//创建等待任务队列
BlockingQueue<Runnable> bqueue = new ArrayBlockingQueue<Runnable>(20);
//创建线程池,池中保存的线程数为3,池中允许的最大线程数为4
ThreadPoolExecutor pool = new ThreadPoolExecutor(3,4,50,TimeUnit.MILLISECONDS,bqueue);
Runnable job1 = new GetJob("//www.lagou.com/zhaopin/iOS/");
Runnable job2 = new GetJob("//www.lagou.com/zhaopin/C/");
Runnable job3 = new GetJob("//www.lagou.com/zhaopin/C++/");
Runnable job4 = new GetJob("//www.lagou.com/zhaopin/Python/");
Runnable job5 = new GetJob("//www.lagou.com/zhaopin/HTML5/");
Runnable job6 = new GetJob("//www.lagou.com/zhaopin/webqianduan/");
pool.execute(job1);
pool.execute(job2);
pool.execute(job3);
pool.execute(job4);
pool.execute(job5);
pool.execute(job6);
//关闭线程池
pool.shutdown();
}
}
如需IP代理可在此网站寻找代理资源:http://www.xicidaili.com/
GitHub:点这里