目标:51job
不要问我为什么爬这个,因为技术能力有限。很有限。非常有限。
1.先看看这个网站有啥有用的信息吧。
地区,肯定是需要的,要不去哪找工作啊。
找工作为了啥,你说呢。直接爬5万以上的,手动滑稽。
总页数和总招聘信息数量。(一开始忽略了)
每条招聘信息有用的信息。
2.分析一下源码吧
最先看到的,就是这个东西。随便点开一个瞧一瞧,发现一下找到了有用的信息。
这行数字一看就不简单,男人的第六感。
返回看下网址果然不简单,聪明如我。。
现在确定了这个位置的数字就代表地区码。接着往下找可能有用的信息。
直呼好家伙这么草率吗,想爬的信息就这么摆在这里。
太乱了,抛出一部分来分析一下。
都是很固定的标识,这为我们筛选信息容易多了。
最下面的这个page就是一开始我忽略的信息,其实就是检索到的总页数。
那这个网站是怎么实现翻页的呢,点开第二页看看网址的变化。
很明显,这个位置就是页数。至于工作的关键词,就不多说了。
总结:不难,多观察分析就好。
3.上代码开爬。
所需要的类。(excel处理,工作信息对象提取,网页分析,线程,测试类)
excel处理类
public class ExcelTools {
    /** Header labels written to row 0, in column order. */
    private static final String[] HEADERS = {
        "工作名字", "福利待遇", "公司名字", "工资", "地区", "发布时间", "公司性质", "招聘要求"
    };

    /**
     * Writes Job objects to an Excel workbook with a single sheet named {@code "51job-" + key}.
     * Row 0 holds the column headers; every Job found in {@code biglist} then gets one row,
     * in iteration order.
     *
     * <p>The workbook is closed in a {@code finally} block so the underlying file is released
     * even when adding a cell or writing the workbook fails.
     *
     * @param biglist  list of pages, each page being a list whose elements are {@code Job} objects
     * @param filePath path of the workbook file to create
     * @param key      search keyword, used only to build the sheet name
     * @throws IOException           if the workbook cannot be created, written or closed
     * @throws RowsExceededException if a cell is added beyond the sheet's row capacity
     * @throws WriteException        if a cell cannot be added or the workbook cannot be closed
     * @throws InterruptedException  kept in the signature for backward compatibility with callers
     */
    public void writeExcel(List<List> biglist, String filePath, String key)
            throws IOException, RowsExceededException, WriteException, InterruptedException {
        WritableWorkbook excel = Workbook.createWorkbook(new File(filePath));
        try {
            WritableSheet sheet = excel.createSheet("51job-" + key, 0);

            // Header row.
            for (int col = 0; col < HEADERS.length; col++) {
                sheet.addCell(new Label(col, 0, HEADERS[col]));
            }

            // Data rows: pages are flattened into one continuous run of rows starting at row 1.
            int row = 1;
            for (List page : biglist) {
                for (Object item : page) {
                    Job job = (Job) item;
                    sheet.addCell(new Label(0, row, job.getJob_name()));
                    sheet.addCell(new Label(1, row, job.getJob_welf()));
                    sheet.addCell(new Label(2, row, job.getCompany_name()));
                    sheet.addCell(new Label(3, row, job.getProvidesalary_text()));
                    sheet.addCell(new Label(4, row, job.getWorkarea_text()));
                    sheet.addCell(new Label(5, row, job.getUpdatedate()));
                    sheet.addCell(new Label(6, row, job.getCompanytype_text()));
                    sheet.addCell(new Label(7, row, job.getAttribute_text()));
                    row++;
                }
            }

            excel.write();
        } finally {
            // Always release the output file, including on the failure paths above.
            excel.close();
        }
    }
}
job类
public class Job {
    /** Job title. */
    public String job_name;
    /** Benefits offered with the position. */
    public String job_welf;
    /** Name of the hiring company. */
    public String company_name;
    /** Salary text. */
    public String providesalary_text;
    /** Work location text. */
    public String workarea_text;
    /** Date the posting was published/updated. */
    public String updatedate;
    /** Company type text. */
    public String companytype_text;
    /** Requirements listed for the position. */
    public String attribute_text;

    /** @return the job title */
    public String getJob_name() {
        return this.job_name;
    }

    /** @param job_name the job title */
    public void setJob_name(String job_name) {
        this.job_name = job_name;
    }

    /** @return the benefits text */
    public String getJob_welf() {
        return this.job_welf;
    }

    /** @param job_welf the benefits text */
    public void setJob_welf(String job_welf) {
        this.job_welf = job_welf;
    }

    /** @return the company name */
    public String getCompany_name() {
        return this.company_name;
    }

    /** @param company_name the company name */
    public void setCompany_name(String company_name) {
        this.company_name = company_name;
    }

    /** @return the salary text */
    public String getProvidesalary_text() {
        return this.providesalary_text;
    }

    /** @param providesalary_text the salary text */
    public void setProvidesalary_text(String providesalary_text) {
        this.providesalary_text = providesalary_text;
    }

    /** @return the work location text */
    public String getWorkarea_text() {
        return this.workarea_text;
    }

    /** @param workarea_text the work location text */
    public void setWorkarea_text(String workarea_text) {
        this.workarea_text = workarea_text;
    }

    /** @return the publication/update date */
    public String getUpdatedate() {
        return this.updatedate;
    }

    /** @param updatedate the publication/update date */
    public void setUpdatedate(String updatedate) {
        this.updatedate = updatedate;
    }

    /** @return the company type text */
    public String getCompanytype_text() {
        return this.companytype_text;
    }

    /** @param companytype_text the company type text */
    public void setCompanytype_text(String companytype_text) {
        this.companytype_text = companytype_text;
    }

    /** @return the requirements text */
    public String getAttribute_text() {
        return this.attribute_text;
    }

    /** @param attribute_text the requirements text */
    public void setAttribute_text(String attribute_text) {
        this.attribute_text = attribute_text;
    }

    /**
     * Renders all eight fields as {@code Job [name=value, ...]}; unset fields print as
     * {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("Job [job_name=").append(job_name);
        text.append(", job_welf=").append(job_welf);
        text.append(", company_name=").append(company_name);
        text.append(", providesalary_text=").append(providesalary_text);
        text.append(", workarea_text=").append(workarea_text);
        text.append(", updatedate=").append(updatedate);
        text.append(", companytype_text=").append(companytype_text);
        text.append(", attribute_text=").append(attribute_text);
        text.append("]");
        return text.toString();
    }
}
网页分析类
public class JsoupHtml {
    /**
     * Query string appended to every search-result URL. The original text contained
     * "cotype=99°reefrom=99": the "&deg" of "&degreefrom" had been decoded as an HTML entity.
     */
    private static final String SEARCH_QUERY =
            "lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    /** Index of the text/javascript element that carries the search-result data. */
    private static final int RESULT_SCRIPT_INDEX = 2;

    /** Index of the meta tag whose content attribute carries the area id. */
    private static final int AREA_META_INDEX = 4;

    /**
     * Number of characters that separate a key from its value, and a value from the next key,
     * in the embedded data (the original offsets were all "key length + 3" / "next key - 3").
     */
    private static final int SEPARATOR_LENGTH = 3;

    String city; // city name in pinyin, used to build the city landing-page URL

    /** Creates an instance without a city; {@link #getCity()} needs the other constructor. */
    public JsoupHtml() {
    }

    /**
     * @param city city name in pinyin
     */
    public JsoupHtml(String city) {
        this.city = city;
    }

    /**
     * Fetches and parses the page at the given URL.
     *
     * @param html URL of the page to fetch
     * @return the parsed document
     * @throws IOException if the page cannot be fetched
     */
    public Document getDocument(String html) throws IOException {
        return Jsoup.connect(html).get();
    }

    /**
     * Looks up the area code of {@code city} from its landing page. The code is the text that
     * follows {@code areaid=} in the content attribute of the fifth meta tag, up to the next
     * {@code &} (or to the end of the attribute when no {@code &} follows).
     *
     * @return the area code
     * @throws IOException if the page cannot be fetched or does not have the expected layout
     */
    public String getCity() throws IOException {
        String cityHtml = "https://www.51job.com/" + city + "/";
        Document doc = getDocument(cityHtml);
        Elements metas = doc.getElementsByTag("meta");
        if (metas.size() <= AREA_META_INDEX) {
            throw new IOException("Unexpected page layout at " + cityHtml + ": only " + metas.size() + " meta tags");
        }
        String content = metas.get(AREA_META_INDEX).attr("content");
        String marker = "areaid=";
        int markerAt = content.indexOf(marker);
        if (markerAt < 0) {
            throw new IOException("No areaid found in meta content of " + cityHtml);
        }
        int from = markerAt + marker.length();
        // Search for the terminator only after the marker; an earlier '&' must not be used.
        int to = content.indexOf('&', from);
        if (to < 0) {
            to = content.length();
        }
        return content.substring(from, to);
    }

    /**
     * Reads the total page count and total posting count from the first search-result page.
     *
     * @param city     area code as returned by {@link #getCity()}
     * @param key      search keyword
     * @param moneyNum salary-range code
     * @return a line of the form {@code total_page : N页数据<TAB>jobid_count : M}
     * @throws IOException if the page cannot be fetched or does not contain the counters
     */
    public String getPages(String city, String key, String moneyNum) throws IOException {
        Document doc = getDocument("https://search.51job.com/list/" + city + ",000000,0000,00,9," + moneyNum + "," + key
                + ",2,1.html?" + SEARCH_QUERY);
        String str = resultScript(doc);
        int pageAt = str.indexOf("total_page");
        int countAt = str.indexOf("jobid_count");
        if (pageAt < 0 || countAt < 0) {
            throw new IOException("Search result data does not contain total_page/jobid_count");
        }
        // Both slices deliberately start at the key itself so the key name is part of the output.
        String pages = sliceUntil(str, pageAt, "keyword_ads").replace('\"', ' ');
        String count = sliceUntil(str, countAt, "banner_ads").replace('\"', ' ');
        return pages + "页数据" + "\t" + count;
    }

    /**
     * Extracts one {@code Job} per posting from a search-result page. The embedded data is split
     * on {@code job_name}; the part before the first occurrence is discarded and every following
     * chunk is parsed as one posting. A field whose markers are missing is stored as an empty
     * string instead of aborting the whole page.
     *
     * @param doc a search-result document
     * @return the postings found, possibly empty
     * @throws IOException if the document does not contain the expected script element
     */
    public List<Job> parseHtml(Document doc) throws IOException {
        String[] chunks = resultScript(doc).split("job_name");
        List<Job> jobList = new ArrayList<Job>();
        // chunks[0] is the text before the first posting, so start at 1.
        for (int i = 1; i < chunks.length; i++) {
            String chunk = chunks[i];
            Job job = new Job();
            // The chunk starts right after "job_name", so the title begins after the separator.
            job.setJob_name(sliceUntil(chunk, SEPARATOR_LENGTH, "job_title"));
            job.setJob_welf(field(chunk, "jobwelf", "jobwelf_list"));
            job.setCompany_name(field(chunk, "company_name", "providesalary_text"));
            // Anchor on the full key: a bare "text" would also match inside names.
            job.setProvidesalary_text(field(chunk, "providesalary_text", "workarea").replace('/', ' '));
            job.setWorkarea_text(field(chunk, "workarea_text", "updatedate"));
            job.setUpdatedate(field(chunk, "updatedate", "iscommunicate"));
            job.setCompanytype_text(field(chunk, "companytype_text", "degreefrom"));
            job.setAttribute_text(field(chunk, "attribute_text", "companysize_text"));
            jobList.add(job);
        }
        return jobList;
    }

    /** Returns the markup of the script element that carries the search-result data. */
    private static String resultScript(Document doc) throws IOException {
        Elements scripts = doc.getElementsByAttributeValue("type", "text/javascript");
        if (scripts.size() <= RESULT_SCRIPT_INDEX) {
            throw new IOException("Expected more than " + RESULT_SCRIPT_INDEX
                    + " text/javascript elements but found " + scripts.size());
        }
        return scripts.get(RESULT_SCRIPT_INDEX).toString();
    }

    /** Returns the value stored under {@code startKey}, ending before {@code endKey}; "" if absent. */
    private static String field(String chunk, String startKey, String endKey) {
        int keyAt = chunk.indexOf(startKey);
        if (keyAt < 0) {
            return "";
        }
        return sliceUntil(chunk, keyAt + startKey.length() + SEPARATOR_LENGTH, endKey);
    }

    /**
     * Returns {@code text} from {@code from} up to the separator that precedes the first
     * {@code endKey} found at or after {@code from}; "" when the range does not exist.
     */
    private static String sliceUntil(String text, int from, String endKey) {
        if (from < 0 || from > text.length()) {
            return "";
        }
        int endAt = text.indexOf(endKey, from);
        int to = endAt - SEPARATOR_LENGTH;
        if (endAt < 0 || to < from) {
            return "";
        }
        return text.substring(from, to);
    }
}
线程类
/**
 * Task that downloads up to {@code pages} search-result pages for one keyword and writes all
 * postings found to {@code D:/学习/爬虫/<key>.xls}.
 */
public class MyThread implements Runnable {
    int pages; // number of result pages the user asked for
    String key; // search keyword
    String moneyNum; // salary-range code
    String city; // area code of the city

    /**
     * @param city     area code of the city
     * @param pages    number of result pages to fetch
     * @param key      search keyword
     * @param moneyNum salary-range code
     */
    public MyThread(String city, int pages, String key, String moneyNum) {
        this.pages = pages;
        this.key = key;
        this.moneyNum = moneyNum;
        this.city = city;
    }

    JsoupHtml jsouphtml = new JsoupHtml();
    ExcelTools excel = new ExcelTools();

    /**
     * Fetches pages 1..{@code pages}, stopping early at the first page without postings, then
     * writes everything collected to the workbook once. A page that fails to download or parse
     * is reported on standard error and skipped; the remaining pages are still attempted.
     */
    public void run() {
        List<List> big = new ArrayList<List>();
        long start = System.currentTimeMillis();
        String threadName = Thread.currentThread().getName();
        System.out.println(threadName + "开始写入excel.....");
        for (int i = 1; i <= pages; i++) {
            try {
                // "&degreefrom" was garbled to "°reefrom" in the original URL.
                String html = "https://search.51job.com/list/" + city + ",000000,0000,00,9," + moneyNum + "," + key
                        + ",2," + i
                        + ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
                Document document = jsouphtml.getDocument(html);
                // Postings found on this page.
                List list = jsouphtml.parseHtml(document);
                if (list.size() == 0) {
                    System.out.println("没有符合的工作");
                    break;
                }
                big.add(list);
            } catch (Exception e) {
                // Report instead of silently discarding the failure, then continue with the next page.
                System.err.println(threadName + " 第" + i + "页抓取失败(" + key + "): " + e);
                e.printStackTrace();
            }
        }
        // Write once after all pages are collected; rewriting the whole workbook after every
        // page produced the same final file with quadratic work.
        if (!big.isEmpty()) {
            try {
                excel.writeExcel(big, "D:/学习/爬虫/" + key + ".xls", key);
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Preserve the interrupt for whoever owns this thread.
                    Thread.currentThread().interrupt();
                }
                System.err.println(threadName + " 写入excel失败(" + key + "): " + e);
                e.printStackTrace();
            }
        }
        System.out.println(threadName + "执行完毕!");
        System.out.println(System.currentTimeMillis() - start + "/ms");
    }
}
测试类
public class Test {
public static void main(String[] args) throws IOException, RowsExceededException, WriteException, InterruptedException {
System.out.println("请输入城市名称:");
String city=new Scanner(System.in).next();
JsoupHtml js=new JsoupHtml(city);
String city1=js.getCity();
System.out.println("输入查询工作个数:");
int jobNum=new Scanner(System.in).nextInt();
System.out.println("------------选择月薪范围-----------");
System.out.println("99-----------所有");
System.out.print("01-----------2k以下"+"\t");
System.out.println("02-----------2-3k");
System.out.print("03-----------3-4.5k"+"\t");
System.out.println("04-----------4.5-6k");
System.out.print("05-----------6-8k"+"\t");
System.out.println("06-----------0.8-10k");
System.out.print("07-----------10-15k"+"\t");
System.out.println("08-----------15-20k");
System.out.print("09-----------20-30k"+"\t");
System.out.println("10-----------30-40k");
System.out.print("11-----------40-50k"+"\t");
System.out.println("12-----------50k以上"+"\t");
String moneyNum=new Scanner(System.in).next();
JsoupHtml jsouphtml=new JsoupHtml();
ExecutorService executorService=Executors.newCachedThreadPool();
switch (jobNum) {
case 1: {
System.out.println("输入职业1:");
String key1=new Scanner(System.in).next();
System.out.println(jsouphtml.getPages(city1,key1,moneyNum));
System.out.println("请输入你打印的页数");
int pages1=new Scanner(System.in).nextInt();
executorService.execute(new MyThread(city1,pages1, key1,moneyNum));
break;
}
case 2:{
System.out.println("输入职业1:");
String key1=new Scanner(System.in).next();
System.out.println(jsouphtml.getPages(city1,key1,moneyNum));
System.out.println("请输入你打印的页数:");
int pages1=new Scanner(System.in).nextInt();
System.out.println("输入职业2:");
String key2=new Scanner(System.in).next();
System.out.println(jsouphtml.getPages(city1,key2,moneyNum));
System.out.println("请输入你打印的页数:");
int pages2=new Scanner(System.in).nextInt();
executorService.execute(new MyThread(city1, pages1, key1,moneyNum));
executorService.execute(new MyThread(city1, pages2, key2,moneyNum));
break;
}
case 3:{
System.out.println("输入职业1:");
String key1=new Scanner(System.in).next();
System.out.println(jsouphtml.getPages(city1,key1,moneyNum));
System.out.println("请输入你打印的页数:");
int pages1=new Scanner(System.in).nextInt();
System.out.println("输入职业2:");
String key2=new Scanner(System.in).next();
System.out.println(jsouphtml.getPages(city1,key2,moneyNum));
System.out.println("请输入你打印的页数:");
int pages2=new Scanner(System.in).nextInt();
System.out.println("输入职业3:");
String key3=new Scanner(System.in).next();
System.out.println(jsouphtml.getPages(city1,key3,moneyNum));
System.out.println("请输入你打印的页数:");
int pages3=new Scanner(System.in).nextInt();
executorService.execute(new MyThread( city1,pages1, key1,moneyNum));
executorService.execute(new MyThread( city1,pages2, key2,moneyNum));
executorService.execute(new MyThread( city1, pages3, key3,moneyNum));
}
}
4.爬取效果
请输入城市名称:
beijing
输入查询工作个数:
3
------------选择月薪范围-----------
99-----------所有
01-----------2k以下 02-----------2-3k
03-----------3-4.5k 04-----------4.5-6k
05-----------6-8k 06-----------0.8-10k
07-----------10-15k 08-----------15-20k
09-----------20-30k 10-----------30-40k
11-----------40-50k 12-----------50k以上
99
输入职业1:
java
total_page : 156页数据 jobid_count : 7775
请输入你打印的页数:
10
输入职业2:
python
total_page : 106页数据 jobid_count : 5288
请输入你打印的页数:
10
输入职业3:
前端
total_page : 85页数据 jobid_count : 4231
请输入你打印的页数:
10
pool-1-thread-1开始写入excel.....
pool-1-thread-3开始写入excel.....
pool-1-thread-2开始写入excel.....
pool-1-thread-3执行完毕!
1710/ms
pool-1-thread-1执行完毕!
2321/ms
pool-1-thread-2执行完毕!
3520/ms
爬取到的文件
5.总结
jsoup学的时间很短,技术不到位,能力有限,只能写个差不多,不过数据总算爬下来了。线程池还不太会用,后续可能会大量修改代码。有啥问题,欢迎指正!
不过要温柔啊!!!!!!