import os

import lxml
import requests
from lxml import etree
# Browser-like User-Agent so 51job does not reject the scraper's requests.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
def getPage(url):
    '''
    Fetch a city's job-listing page and return the total number of
    result pages, read from the hidden <input id="hidTotalPage"> field.

    :param url: city listing URL (e.g. a value from getCity())
    :return: total page count as an int
    '''
    response = requests.get(url, headers=headers)
    # BUG FIX: the original passed the Response object itself to
    # etree.HTML(); decode the gbk payload first, consistent with
    # getCity/getJob ('ignore' skips undecodable bytes).
    html = response.content.decode('gbk', 'ignore')
    mytree = etree.HTML(html)
    # The page embeds its total page count in a hidden form field.
    totalPage = mytree.xpath('//*[@id="hidTotalPage"]/@value')[0]
    return int(totalPage)
# Scrape the list of cities from the 51job front page.
def getCity(url):
    '''
    Fetch the city index page and return a mapping of city name to
    that city's job-listing URL.

    :param url: the 51job jobs front page (http://jobs.51job.com/)
    :return: dict of {cityName: cityHref}
    '''
    response = requests.get(url, headers=headers)
    # gb2312 covers common simplified characters; gbk covers both
    # simplified and traditional, so decode as gbk.
    # CONSISTENCY FIX: use errors='ignore' like getJob so stray bytes
    # on the page cannot raise UnicodeDecodeError.
    html = response.content.decode('gbk', 'ignore')
    mytree = etree.HTML(html)
    # City dictionary: cityDict = {'cityName': 'cityHref'}
    cityDict = {}
    # Anchor elements of the city list block (position-based XPath,
    # fragile if the page layout changes).
    cityList = mytree.xpath('/html/body/div[2]/div[2]/div[2]/div[1]/a')
    for city in cityList:
        # City display name
        cityName = city.xpath('./text()')[0]
        # City listing URL
        cityHref = city.xpath('./@href')[0]
        cityDict[cityName] = cityHref
    return cityDict
# Scrape job postings from one listing page.
def getJob(url):
    '''
    Fetch one job-listing page and yield one dict per posting with the
    keys: jobName, jobHref, company, address, money, order, jobRes.

    :param url: listing page URL (cityHref + "pN")
    :return: generator of job-info dicts
    '''
    html = requests.get(url, headers=headers).content.decode('gbk', 'ignore')
    mytree = etree.HTML(html)
    # Each posting lives in a child <div> of the "detlist gbox" container.
    jobList = mytree.xpath('//div[@class="detlist gbox"]/div')
    for job in jobList:
        # Job title (attribute-based XPath; the positional variant
        # './p[1]/span[1]/a/@title' selects the same node).
        jobName = job.xpath('.//p[@class="info"]/span/a/@title')[0]
        # BUG FIX: original had 'job.xoath' (AttributeError) and read
        # @title, which duplicated `company`. The variable name says
        # href, so take the link target of the job-title anchor.
        # NOTE(review): assumes the title anchor's @href is the posting
        # URL — confirm against the live page markup.
        jobHref = job.xpath('.//p[@class="info"]/span/a/@href')[0]
        # Company name
        company = job.xpath('.//p[@class="info"]/a/@title')[0]
        # Work location
        address = job.xpath('.//p[@class="info"]/span[2]/text()')[0]
        # Salary — may be absent on some postings.
        money = job.xpath('.//p[@class="info"]/span[3]/text()')
        money = money[0] if money else '面议'
        # Requirements.
        # BUG FIX: original used the absolute path '//p[@class="order"]',
        # which matches the whole document on every iteration; make it
        # relative to this posting.
        orderList = job.xpath('.//p[@class="order"]/text()')
        order = "".join(o.strip() for o in orderList)
        # Job description.
        # BUG FIX: original XPath './/p[@class="text"/@title]' is
        # syntactically invalid (bracket misplaced); guard [0] in case
        # the node is missing.
        jobResList = job.xpath('.//p[@class="text"]/@title')
        jobRes = jobResList[0] if jobResList else ''
        # Yield one record per posting so callers can stream results.
        yield {
            "jobName": jobName,
            "jobHref": jobHref,
            "company": company,
            "address": address,
            "money": money,
            "order": order,
            "jobRes": jobRes,
        }
if __name__ == '__main__':
    # Entry URL for the city index.
    cityUrl = 'http://jobs.51job.com/'
    # Map of city name -> city listing URL.
    cityDict = getCity(cityUrl)
    # BUG FIX: the output directory was never created, so the first
    # open() below raised FileNotFoundError.
    os.makedirs('./job', exist_ok=True)
    for cityName, cityHref in cityDict.items():
        # BUG FIX: this line was a comment missing its '#' (SyntaxError).
        # Total number of listing pages for this city.
        totalPage = getPage(cityHref)
        for i in range(1, totalPage + 1):
            # Listing pages are paginated as <cityHref>pN.
            url = cityHref + "p%d" % i
            # BUG FIX: original 'with open(...)' lacked 'as f:' and the
            # colon, and misspelled errors= as erros= (SyntaxError/TypeError).
            with open('./job/' + cityName + '.txt', 'a+',
                      encoding='utf-8', errors='ignore') as f:
                for data in getJob(url):
                    # One tuple repr per line; the newline terminates the
                    # line instead of being glued inside the last field.
                    f.write(str((data['jobName'], data['jobHref'],
                                 data['company'], data['address'],
                                 data['money'], data['order'],
                                 data['jobRes'])) + '\n')
                    f.flush()