分析如下:
1.(深圳)搜索页: https://search.51job.com/list/040000,000000,0000,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
2.输入python岗位搜索后: https://search.51job.com/list/040000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
3.翻页时,URL中只有",2,"后面的数字(即页码)会变化,其余部分保持不变。
代码如下:
from urllib.request import *
import re
from lxml import etree
class Job(object):
    """Scrape Python job postings from the 51job search results.

    For every result page in ``[start_page, end_page]`` the scraper
    downloads the listing, follows each posting link found on it, extracts
    the title, company, experience line and salary from the posting page,
    and appends them to a text file as one record per posting.
    """

    # Search-result URL. Only the page number after ",2," varies between
    # pages. The original literal contained "cotype=99°reefrom=99", which is
    # "&degreefrom" with "&deg" mangled into a degree sign; the non-ASCII
    # character makes the HTTP request line unencodable.
    SEARCH_URL = (
        'https://search.51job.com/list/040000,000000,0000,00,9,99,'
        'python,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99'
        '&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
        '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
        '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    )

    # Seconds to wait on a single HTTP request before giving up, so one
    # stalled connection cannot hang the whole crawl.
    TIMEOUT = 30

    # Capture the quoted attribute value directly. The previous
    # "match up to the first space" approach truncated company names that
    # contain spaces.
    _TITLE_RE = re.compile(r'<h1 title="(.*?)"')
    _COMPANY_RE = re.compile(r'target="_blank" title="(.*?)"')
    _EXPERIENCE_RE = re.compile(r'<p class="msg ltype" title="(.*?)"')
    _STRONG_RE = re.compile(r'<strong>(.*?)</strong>')
    # Padding removed from the experience line. The original pattern removed
    # a single whitespace character; NOTE(review): that character was
    # presumably an "&nbsp;" entity before the listing was extracted, so the
    # entity and the non-breaking space are stripped as well -- confirm
    # against the live markup.
    _SPACING_RE = re.compile(r'&nbsp;|[ \xa0]')

    def __init__(self, start_page=None, end_page=None, output_path='51job.txt'):
        """Configure the page range, request headers and output file.

        Args:
            start_page: First result page to crawl. Prompted for on stdin
                when omitted.
            end_page: Last result page to crawl (inclusive). Prompted for on
                stdin when omitted.
            output_path: File the extracted records are appended to.

        Raises:
            ValueError: If a page value cannot be converted to ``int``.
        """
        # self.position = input("请输入你想要查询的岗位:")
        if start_page is None:
            start_page = input("请输入你要爬的起始页:")
        self.start_page = int(start_page)
        if end_page is None:
            end_page = input("请输入你要爬的结束页:")
        self.end_page = int(end_page)
        self.output_path = output_path
        # Request headers sent with every HTTP request.
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

    def _build_search_url(self, page):
        """Return the search-result URL for result page number *page*."""
        return self.SEARCH_URL.format(page=page)

    def _fetch(self, url):
        """Download *url* and return the body decoded as GBK text.

        Undecodable bytes are replaced instead of raising, so a single bad
        byte does not discard the whole page. The response is closed as soon
        as the body has been read.
        """
        request = Request(url, headers=self.headers)
        with urlopen(request, timeout=self.TIMEOUT) as response:
            return response.read().decode('gbk', errors='replace')

    def get_page_html(self):
        """Crawl every result page from ``start_page`` to ``end_page``.

        A failure on one page is reported and the crawl continues with the
        next page (e.g. when the requested page number is past the last
        result page).
        """
        print('开始爬取数据')
        for page in range(self.start_page, self.end_page + 1):
            print("正在爬取第%d页..." % page)
            try:
                html = self._fetch(self._build_search_url(page))
                self.get_positon_html(html, page)
            except Exception as err:
                # Page-level best-effort boundary: report instead of
                # silently swallowing, and let KeyboardInterrupt through so
                # the crawl can still be cancelled.
                print("第%d页爬取失败,已跳过: %r" % (page, err))

    def get_positon_html(self, html, page):
        """Follow every posting link on a result page and save its data.

        Args:
            html: Source of one search-result page.
            page: Number of that page; used only for progress output.
        """
        selector = etree.HTML(html)
        # Link of each individual posting on the result page.
        position_urls = selector.xpath('//p[@class="t1 "][1]//a/@href')
        print('正在保存第%d页的数据...' % page)
        for position_url in position_urls:
            try:
                detail_html = self._fetch(position_url)
            except (OSError, ValueError) as err:
                # OSError covers network/HTTP failures, ValueError covers
                # malformed links. Skip this posting only, not the rest of
                # the page.
                print('职位页面获取失败,已跳过: %s (%r)' % (position_url, err))
                continue
            self.get_position_info(detail_html)

    @staticmethod
    def _first_group(pattern, html):
        """Return group 1 of the first *pattern* match in *html*, or ''."""
        match = pattern.search(html)
        return match.group(1) if match else ''

    def _parse_position_info(self, html):
        """Extract the fields of one posting page into an ordered dict.

        Fields that cannot be found are returned as empty strings so that a
        posting with an unexpected layout still yields a complete record.
        """
        strong_texts = self._STRONG_RE.findall(html)
        experience = self._first_group(self._EXPERIENCE_RE, html)
        return {
            '职位': self._first_group(self._TITLE_RE, html),
            '公司': self._first_group(self._COMPANY_RE, html),
            '工作年限': self._SPACING_RE.sub('', experience),
            # The salary is taken from the second <strong> element.
            '薪水': strong_texts[1] if len(strong_texts) > 1 else '',
            '分割线': '-' * 30,
        }

    def get_position_info(self, html):
        """Parse one posting page and append its record to the output file.

        The record is written as five lines: title, company, experience
        line, salary and a separator of 30 dashes. The file is opened once
        per record and written as UTF-8 so the result does not depend on the
        platform's default encoding.

        Args:
            html: Source of one posting page.
        """
        item = self._parse_position_info(html)
        with open(self.output_path, 'a', encoding='utf-8') as f:
            f.writelines(value + '\n' for value in item.values())
def main():
    """Create a Job scraper (prompting for the page range) and run it."""
    job = Job()
    job.get_page_html()
# Start the crawl only when this file is executed as a script, so importing
# the module does not trigger the interactive prompts.
if __name__ == '__main__':
    main()
爬取结果如下:
如果你和我有共同爱好,我们可以加个好友一起交流!