本文使用Jsoup采集拉勾网招聘数据并写入CSV文件中,实现非常简单,在此不做多余的解释,如有问题可留言交流。
数据模型:Job.java
package xyz.baal.jsoup;
/**
 * Data model for one job posting scraped from lagou.com.
 * Plain mutable bean: no-arg constructor plus getters/setters for each field.
 */
public class Job {
    private String jobname;     // position title
    private String salary;      // salary range
    private String place;       // work location
    private String experience;  // required experience
    private String educational; // required education level
    private String business;    // company's line of business
    private String stage;       // company funding/development stage
    private String company;     // company name

    /** Creates an empty posting; fields are filled in via the setters. */
    public Job() {
    }

    /** Creates a fully populated posting. */
    public Job(String jobname, String salary, String place, String experience,
               String educational, String business, String stage, String company) {
        this.jobname = jobname;
        this.salary = salary;
        this.place = place;
        this.experience = experience;
        this.educational = educational;
        this.business = business;
        this.stage = stage;
        this.company = company;
    }

    public String getJobname() {
        return jobname;
    }

    public void setJobname(String jobname) {
        this.jobname = jobname;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getPlace() {
        return place;
    }

    public void setPlace(String place) {
        this.place = place;
    }

    public String getExperience() {
        return experience;
    }

    public void setExperience(String experience) {
        this.experience = experience;
    }

    public String getEducational() {
        return educational;
    }

    public void setEducational(String educational) {
        this.educational = educational;
    }

    public String getBusiness() {
        return business;
    }

    public void setBusiness(String business) {
        this.business = business;
    }

    public String getStage() {
        return stage;
    }

    public void setStage(String stage) {
        this.stage = stage;
    }

    public String getCompany() {
        return company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    @Override
    public String toString() {
        // Same rendering as the classic concatenated form (null fields print as "null").
        return String.format(
                "Job [jobname=%s, salary=%s, place=%s, experience=%s, educational=%s, business=%s, stage=%s, company=%s]",
                jobname, salary, place, experience, educational, business, stage, company);
    }
}
写入CSV使用的是javacsv库的CsvWriter,由于源文件中就CsvReader、CsvWriter两个文件,在这里直接引用了CsvWriter源文件(附:API文档)。
获取各个招聘职位首页链接,如java招聘链接为//www.lagou.com/zhaopin/Java/
package xyz.baal.jsoup;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* 根据拉勾网首页获取各个招聘职位首页链接
*
* @author
*
*/
public class GetZPURL {

    /** Protocol-relative prefix shared by every job-category link on the lagou homepage. */
    private static final String ZP_PREFIX = "//www.lagou.com/zhaopin/";

    /** Collected front-page URLs of the individual job categories. */
    private List<String> zpURLlist = new ArrayList<String>();

    public GetZPURL() {
    }

    /**
     * Fetches the homepage over HTTP and collects all job-category links found
     * inside the #container element. On network failure a message is printed
     * and the list is left unchanged.
     *
     * @param url homepage URL to fetch
     */
    public void loadInternet(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36")
                    .timeout(5000)
                    .get();
        } catch (IOException e) {
            System.out.println("获取招聘URL失败。");
            return;
        }
        collectZpLinks(doc);
    }

    /**
     * Parses a locally saved copy of the homepage and collects job-category links.
     *
     * @param path    path of the HTML file
     * @param charset character set of the file
     * @param baseURL base URL used to resolve relative links in the document
     * @throws IOException if the file does not exist or cannot be read
     */
    public void loadLocal(String path, String charset, String baseURL) throws IOException {
        File input = new File(path);
        Document doc = Jsoup.parse(input, charset, baseURL);
        collectZpLinks(doc);
    }

    /** Scans every anchor inside #container and keeps those that look like category links. */
    private void collectZpLinks(Document doc) {
        Element content = doc.getElementById("container");
        if (content == null) {
            // Page layout changed or the download was incomplete; nothing to collect.
            return;
        }
        for (Element link : content.getElementsByTag("a")) {
            String linkHref = link.attr("href");
            if (isZp(linkHref)) {
                zpURLlist.add(linkHref);
            }
        }
    }

    /**
     * Returns true when the href contains the job-category prefix and has a
     * non-empty category path after it.
     */
    public boolean isZp(String url) {
        return url.contains(ZP_PREFIX) && url.length() > ZP_PREFIX.length();
    }

    public List<String> getZpURLlist() {
        return zpURLlist;
    }
}
获取某一招聘职位的30x15(共450)条数据,并写入CSV文件。
package xyz.baal.jsoup;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.csvreader.CsvWriter;
/**
* 获取拉勾网某一个职位的30x15条招聘信息
*
* @author
*
*/
public class GetJob implements Runnable {

    private String zpUrl;                                      // front-page URL of one job category
    private List<String> zpUrlList = new ArrayList<String>();  // URL of each paginated result page
    private List<String> jobUrlList = new ArrayList<String>(); // URL of each individual posting
    private List<Job> joblist = new ArrayList<Job>();          // the collected 30x15 postings

    // Posting href format: //www.lagou.com/jobs/2350451.html
    // Compiled once (not per link) and with the dots escaped so "." only matches a literal dot.
    private static final Pattern A_HREF_PATTERN =
            Pattern.compile("//www\\.lagou\\.com/jobs/\\d+\\.html", Pattern.CASE_INSENSITIVE);

    private static final String PATH = "D:/"; // output directory

    private String jobName = ""; // category name, taken from the first result page's <title>

    /**
     * @param url front-page URL of one job category, e.g. the java or hadoop listing
     */
    public GetJob(String url) {
        zpUrl = url;
    }

    /**
     * Scrapes all 30 result pages of this category and then every posting they
     * link to, filling the list returned by {@link #getJoblist()}.
     */
    public void init() {
        buildPageUrls();
        collectJobUrls();
        fetchJobDetails();
    }

    /** Builds the 30 paginated result-page URLs (page 1 carries no page number). */
    private void buildPageUrls() {
        zpUrlList.add(zpUrl + "?filterOption=3");
        for (int i = 2; i <= 30; i++) {
            zpUrlList.add(zpUrl + i + "/?filterOption=3");
        }
    }

    /** Downloads each result page and extracts the posting URLs it contains. */
    private void collectJobUrls() {
        for (String pageUrl : zpUrlList) {
            Document doc = download("http:" + pageUrl);
            if (doc == null) {
                continue; // network failure on this page; skip it
            }
            Element content = doc.getElementById("s_position_list");
            if (content == null) {
                continue;
            }
            for (Element link : content.getElementsByTag("a")) {
                String linkHref = link.attr("href");
                if (A_HREF_PATTERN.matcher(linkHref).find()) {
                    jobUrlList.add("http:" + linkHref);
                }
            }
            // Remember the category name once; use isEmpty(), not the fragile == "" identity check.
            if (jobName.isEmpty()) {
                Element title = doc.select("title").first();
                if (title != null) {
                    jobName = title.text().split("-")[0];
                }
            }
        }
    }

    /** Downloads each posting page and parses it into a Job record. */
    private void fetchJobDetails() {
        for (String jobUrl : jobUrlList) {
            Document doc = download(jobUrl);
            if (doc == null) {
                continue;
            }
            Job job = parseJob(doc);
            if (job != null) {
                joblist.add(job);
            }
        }
    }

    /** Fetches one URL, returning null instead of throwing on network failure. */
    private Document download(String url) {
        try {
            return Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36")
                    .timeout(5000)
                    .get();
        } catch (IOException e) {
            return null;
        }
    }

    /** Parses one posting page; returns null when the expected layout is absent. */
    private Job parseJob(Document doc) {
        Element content = doc.getElementById("container");
        if (content == null) {
            return null;
        }
        Element jobRequest = content.select(".job_request p").first();
        // Element.child(i) throws IndexOutOfBoundsException rather than returning null,
        // so guard with the actual child count instead of a null check on child(0).
        if (jobRequest == null || jobRequest.children().size() < 4) {
            return null;
        }
        Job job = new Job();
        job.setJobname(jobName);
        job.setSalary(jobRequest.child(0).text());
        job.setPlace(jobRequest.child(1).text());
        job.setExperience(jobRequest.child(2).text());
        job.setEducational(jobRequest.child(3).text());
        Element cpy = doc.getElementById("job_company");
        if (cpy == null || cpy.childNodeSize() < 2) {
            return null;
        }
        job.setCompany(cpy.child(0).child(0).child(0).attr("alt"));
        job.setBusiness(cpy.child(1).child(0).child(0).ownText());
        job.setStage(cpy.child(1).child(2).child(0).ownText());
        return job;
    }

    public List<Job> getJoblist() {
        return joblist;
    }

    /**
     * Writes the collected postings to PATH/&lt;jobname&gt;.txt, one
     * {@code toString()} per line. No-op when nothing was collected.
     */
    public void writeTxtFile() {
        if (joblist.isEmpty()) { // field is always initialized, so no null check needed
            return;
        }
        File file = new File(PATH + joblist.get(0).getJobname() + ".txt");
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
            for (Job job : joblist) {
                bw.write(job.toString());
                bw.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the collected postings to PATH/&lt;jobname&gt;.csv, GBK-encoded so
     * the Chinese header renders correctly in Excel on Windows. No-op when
     * nothing was collected.
     */
    public void writeCSVFile() {
        if (joblist.isEmpty()) {
            return;
        }
        CsvWriter wr = null;
        try {
            String csvFilePath = PATH + joblist.get(0).getJobname() + ".csv";
            wr = new CsvWriter(csvFilePath, ',', Charset.forName("GBK"));
            String[] header = { "职位名称", "薪水", "工作地点", "工作经验", "学历", "公司名称", "公司业务", "发展阶段"};
            wr.writeRecord(header);
            for (Job job : joblist) {
                String[] jobstr = { job.getJobname(), job.getSalary(), job.getPlace(), job.getExperience(),
                        job.getEducational(), job.getCompany(), job.getBusiness(), job.getStage() };
                wr.writeRecord(jobstr);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (wr != null) {
                wr.close();
            }
        }
    }

    @Override
    public void run() {
        init();
        writeCSVFile();
        System.out.println(jobName + "--End");
    }
}
采集测试:
package xyz.baal.jsoup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
public class Test {
public static List<String> zpURLlist = new ArrayList<String>();
public static void main(String[] args) throws IOException {
//创建等待任务队列
BlockingQueue<Runnable> bqueue = new ArrayBlockingQueue<Runnable>(20);
//创建线程池,池中保存的线程数为3,池中允许的最大线程数为4
ThreadPoolExecutor pool = new ThreadPoolExecutor(3,4,50,TimeUnit.MILLISECONDS,bqueue);
Runnable job1 = new GetJob("//www.lagou.com/zhaopin/iOS/");
Runnable job2 = new GetJob("//www.lagou.com/zhaopin/C/");
Runnable job3 = new GetJob("//www.lagou.com/zhaopin/C++/");
Runnable job4 = new GetJob("//www.lagou.com/zhaopin/Python/");
Runnable job5 = new GetJob("//www.lagou.com/zhaopin/HTML5/");
Runnable job6 = new GetJob("//www.lagou.com/zhaopin/webqianduan/");
pool.execute(job1);
pool.execute(job2);
pool.execute(job3);
pool.execute(job4);
pool.execute(job5);
pool.execute(job6);
//关闭线程池
pool.shutdown();
}
}
如需IP代理可在此网站寻找代理资源:http://www.xicidaili.com/
GitHub:点这里