使用的IDE工具:PyCharm
首先创建一个python项目,
然后创建一个python Package,我的Package名是learn_crawl_51job,你们可以自己命名哈
然后在此目录下新建一个python文件,我命名为crawl_51job.py
然后代码如下:
# 导入要用的包
import requests
import multiprocessing
from lxml import etree
# 定义一个类
class Crawl_51job(object):
    """Scrape "python" job listings from 51job.com, page by page.

    Workflow: read the total page count from the first search-result page,
    build one URL per page, then scrape every page in a 5-worker process pool.
    """

    def __init__(self):
        # Browser-like User-Agent so the site serves normal (non-blocked) pages.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3902.4 Safari/537.36'
        }
        # Filled by get_page(): one URL per search-result page.
        self.url = []

    def request(self, url):
        """GET *url* and return the response body as text.

        The site serves GBK-encoded pages; without re-setting the encoding
        the decoded text is garbled (mojibake).
        """
        # timeout added so a stalled connection cannot hang a pool worker forever
        response = requests.get(url=url, headers=self.header, timeout=10)
        response.encoding = 'gbk'
        return response.text

    def get_page(self):
        """Read the total page count from page 1 and populate self.url."""
        # First results page: keyword "python", region code 000000 = nationwide.
        first_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html'
        response = self.request(first_url)
        html = etree.HTML(response)
        # A hidden <input id="hidTotalPage"> on the page carries the page total.
        page = html.xpath('//input[@id="hidTotalPage"]/@value')[0]
        # The page number is the last path segment before ".html".
        for i in range(1, int(page) + 1):
            self.url.append('https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,' + str(i) + '.html')

    @staticmethod
    def _first(node, path, default='无数据'):
        """Return the first xpath match of *path* under *node* as str.

        Falls back to *default* when the expression matches nothing, so a
        single malformed row cannot crash the whole page's scrape.
        """
        result = node.xpath(path)
        return str(result[0]) if result else default

    def crwal_info(self, url):
        """Scrape the job rows from one result page and print them.

        NOTE(review): the misspelled name is kept for backward compatibility;
        use the `crawl_info` alias in new code.
        """
        response = self.request(url)
        html = etree.HTML(response)
        all_div = html.xpath('//div[@id="resultList"]/div[@class="el"]')
        job_info_list = []
        for item in all_div:
            # Every field is now guarded (originally only 薪资 was wrapped in
            # try/except IndexError; any other missing cell crashed the page).
            job_info = {
                '职位': self._first(item, './p/span/a/@title'),
                '公司': self._first(item, './span/a/@title'),
                '工作地点': self._first(item, './span[@class="t3"]/text()'),
                '薪资': self._first(item, './span[@class="t4"]/text()'),
                '发布时间': self._first(item, './span[@class="t5"]/text()'),
            }
            job_info_list.append(job_info)
        print(job_info_list)

    # Backward-compatible, correctly spelled alias.
    crawl_info = crwal_info

    def run(self):
        """Build all page URLs, then scrape them with a 5-process pool."""
        self.get_page()
        pool = multiprocessing.Pool(5)
        for url in self.url:
            pool.apply_async(self.crwal_info, args=(url,))
        # close()/join() must stay OUTSIDE the loop: submitting to a closed
        # pool raises ValueError.
        pool.close()
        pool.join()
if __name__ == '__main__':
    # Build the crawler and launch the multi-process scrape.
    crawler = Crawl_51job()
    crawler.run()
运行效果如下: