引言
前途无忧提供了大量的岗位招聘信息。对于就业选择提供一种解决方案。首先是对前途无忧的 robots.txt进行检查,返回结果是404表明允许爬取该网页的所有内容。
目标内容
如爬取python为关键字的岗位内容。
技术流程
1.指定URL
#如前50页的URL,可在开发者工具中寻在,通过访问不同的页面发现URL的规律。
for i in range(1,50):
pages_url.append('https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25AE%25B6%25E6%2595%2599,2,{}.html'.format(i))
#UA伪装,产生随机的UA
ua=UserAgent()
headers = {
'User-Agent': ua.random
}
param = {
'lang': 'c',
'postchannel': 0000,
'workyear': 99,
'cotype': 99,
'degreefrom': 99,
'jobterm': 99,
'companysize': 99,
'ord_field': 0,
'dibiaoid': 0,
'line': '',
'welfare': ''
}
2.发起页面请求
dic_page_text = []
def get_dic_page_text(url):
response = rq.get(url = url,headers = headers,params = param).text
#得到的response与开发者工具显示不一致,可通过Beautifulsoap进行查看。使用正则表达对目标进行提取。
techer_page_text = re.compile('window.__SEARCH_RESULT__ = (.*?)</script>').findall(response)[0]
#对获取的内容进行json转码。
dic_page_text.append(json.loads(techer_page_text)['engine_search_result'])
#停滞1秒,避免请求的网页服务器崩溃。
time.sleep(1)
3.获得响应数据
#将获取的数据保存至列表,便于存储到数据框。
for dic in dic_page_text:
job_href.append(dic['job_href'])
job_name.append(dic['job_name'])
company_name.append(dic['company_name'])
company_href.append(dic['company_href'])
providesalary_text.append(dic['providesalary_text'])
workarea_text.append(dic['workarea_text'])
updatedate.append(dic['updatedate'])
iscommunicate.append(dic['iscommunicate'])
companytype_text.append(dic['companytype_text'])
degreefrom.append(dic['degreefrom'])
workyear.append(dic['workyear'])
issuedate.append(dic['issuedate'])
jobwelf.append(dic['jobwelf'])
attribute_text.append(dic['attribute_text'])
companysize_text.append(dic['companysize_text'])
companyind_text.append(dic['companyind_text'])
url = dic['job_href']
response = rq.get(url = url,headers = headers).content
tree = etree.HTML(response)
page_init = tree.xpath('string(//div[@class = "tBorderTop_box"]/div)')
#格式的初步处理。
page = re.sub(r'\r\n','',page_init).replace(' ', '')
job_information.append(page[:-4])
time.sleep(1)
4.数据持久化
init_data = pd.DataFrame({'job_href':job_href,
'job_name':job_name,
'company_name':company_name,
'company_href':company_href,
'providesalary_text':providesalary_text,
'workarea_text':workarea_text,
'updatedate':updatedate,
'iscommunicate':iscommunicate,
'companytype_text':companytype_text,
'degreefrom':degreefrom,
'workyear':workyear,
'issuedate':issuedate,
'jobwelf':jobwelf,
'attribute_text':attribute_text,
'companysize_text':companysize_text,
'companyind_text':companyind_text,
'job_information':job_information})
#当然也可存入数据库,具体看要求。
init_data.to_csv('./init_data0.csv')
代码实现
#导入模块
import re
import requests as rq
from lxml import etree
from time import sleep
from fake_useragent import UserAgent
import json
from multiprocessing.dummy import Pool as ThreadPool
import time
import modin.pandas as pd
import numpy as npbea
pages_url = []
for i in range(1,10):
pages_url.append('https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25AE%25B6%25E6%2595%2599,2,{}.html'.format(i))
ua=UserAgent()
headers = {
'User-Agent': ua.random
}
param = {
'lang': 'c',
'postchannel': 0000,
'workyear': 99,
'cotype': 99,
'degreefrom': 99,
'jobterm': 99,
'companysize': 99,
'ord_field': 0,
'dibiaoid': 0,
'line': '',
'welfare': ''
}
dic_page_text = []
def get_dic_page_text(url):
response = rq.get(url = url,headers = headers,params = param).text
techer_page_text = re.compile('window.__SEARCH_RESULT__ = (.*?)</script>').findall(response)[0]
dic_page_text.append(json.loads(techer_page_text)['engine_search_result'])
time.sleep(1)
job_href = []
job_name =[]
company_name = []
company_href = []
providesalary_text = []
workarea_text = []
updatedate = []
iscommunicate =[]
companytype_text = []
degreefrom = []
workyear = []
issuedate = []
jobwelf = []
attribute_text = []
companysize_text = []
companyind_text = []
job_information = []
def get_page_text(dic_page_text):
for dic in dic_page_text:
job_href.append(dic['job_href'])
job_name.append(dic['job_name'])
company_name.append(dic['company_name'])
company_href.append(dic['company_href'])
providesalary_text.append(dic['providesalary_text'])
workarea_text.append(dic['workarea_text'])
updatedate.append(dic['updatedate'])
iscommunicate.append(dic['iscommunicate'])
companytype_text.append(dic['companytype_text'])
degreefrom.append(dic['degreefrom'])
workyear.append(dic['workyear'])
issuedate.append(dic['issuedate'])
jobwelf.append(dic['jobwelf'])
attribute_text.append(dic['attribute_text'])
companysize_text.append(dic['companysize_text'])
companyind_text.append(dic['companyind_text'])
url = dic['job_href']
response = rq.get(url = url,headers = headers).content
tree = etree.HTML(response)
page_init = tree.xpath('string(//div[@class = "tBorderTop_box"]/div)')
page = re.sub(r'\r\n','',page_init).replace(' ', '')
job_information.append(page[:-4])
#此处为了实现高并发。
time.sleep(1)
if __name__ == '__main__' :
start = time.time()
ThreadPool_1 = ThreadPool(10)
ThreadPool_1.map(get_dic_page_text,pages_url)
ThreadPool_1.close()
ThreadPool_1.join()
ThreadPool_2 = ThreadPool(12)
ThreadPool_2.map(get_page_text,dic_page_text)
ThreadPool_2.close()
ThreadPool_2.join()
end = time.time()
print(end-start)
init_data = pd.DataFrame({'job_href':job_href,
'job_name':job_name,
'company_name':company_name,
'company_href':company_href,
'providesalary_text':providesalary_text,
'workarea_text':workarea_text,
'updatedate':updatedate,
'iscommunicate':iscommunicate,
'companytype_text':companytype_text,
'degreefrom':degreefrom,
'workyear':workyear,
'issuedate':issuedate,
'jobwelf':jobwelf,
'attribute_text':attribute_text,
'companysize_text':companysize_text,
'companyind_text':companyind_text,
'job_information':job_information})
init_data.to_csv('./init_data0.csv')
致谢
首先,声明该文章非商业用途!
若有疑问可留言,也希望各位提出纠正,恳请宁不吝啬教授,谢谢!!!
同时也感谢小破站提供的网络爬虫的学习,如:
[1]: https://book.apeland.cn/details/69/