# coding:utf-8
import os
import re
import requests
import lxml
from lxml import etree
# Request headers sent with the HTTP requests in this file: a desktop
# Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/49.0.2623.112 Safari/537.36"
    ),
}
def getCityList(url):
    """Fetch the page at *url* and map each listed city name to its link.

    The response body is decoded as GBK (undecodable bytes are dropped) and
    parsed with lxml. City links are the ``<a>`` elements matched by
    ``//div[@class="e e4"][1]//div[@class="lkst"]/a``.

    Args:
        url: Address of the page that lists the cities.

    Returns:
        dict mapping the anchor text (city name) to the anchor's ``href``.
        Anchors that have no text node or no ``href`` are skipped. If two
        anchors share the same text, the later one wins.

    Raises:
        Whatever ``requests.get`` raises on a connection failure or when the
        timeout expires.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    # 'ignore' matches the other fetchers in this file: a single malformed
    # byte should not abort the whole crawl.
    html = response.content.decode('gbk', 'ignore')
    mytree = lxml.etree.HTML(html)

    cityDict = {}
    for city in mytree.xpath('//div[@class="e e4"][1]//div[@class="lkst"]/a'):
        names = city.xpath('./text()')
        hrefs = city.xpath('./@href')
        if not names or not hrefs:
            # Anchor without visible text or without a target: nothing to record.
            continue
        cityDict[names[0]] = hrefs[0]
    return cityDict
def getPageNum(url):
    """Return the number of result pages advertised on the page at *url*.

    The count is read from the text of the first ``<span>`` under the element
    with id ``cppageno``, which is expected to contain ``共<N>页``. The
    extracted number is also printed.

    Args:
        url: Address of a listing page that carries the pager element.

    Returns:
        The page count as an ``int``.

    Raises:
        IndexError: if the pager span is absent or its text does not contain
            the ``共<N>页`` pattern.
        Whatever ``requests.get`` raises on a connection failure or when the
        timeout expires.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    # GBK is a superset of GB2312 and is what the other fetchers in this file
    # use, so fewer characters are silently dropped by 'ignore'.
    html = response.content.decode('gbk', 'ignore')
    mytree = lxml.etree.HTML(html)

    pagerText = mytree.xpath('//*[@id="cppageno"]/span[1]/text()')[0]
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    pageNum = re.findall(r'共(\d+)页', pagerText)[0]
    print(pageNum)
    return int(pageNum)
def _firstOr(values, default=''):
    """Return the first item of an XPath result list, or *default* if empty."""
    return values[0] if values else default


def _parseJob(job):
    """Extract one posting's fields from its list-entry element.

    Returns the tuple ``(jobName, companyName, jobAddress, jobMoney,
    jobOrder, jobContent, jobUrl)``, or ``None`` when the element has no
    title link and therefore is not a job entry.
    """
    titles = job.xpath('.//span[@class="title"]/a/@title')
    if not titles:
        return None
    jobName = titles[0]
    jobUrl = _firstOr(job.xpath('.//span[@class="title"]/a/@href'))
    companyName = _firstOr(job.xpath('.//p[@class="info"]/a/@title'))
    jobAddress = _firstOr(job.xpath('.//span[@class="location name"]/text()'))
    # A posting without a salary span is recorded as '面谈'.
    jobMoney = _firstOr(job.xpath('.//span[@class="location"]/text()'), '面谈')
    # The requirement text is spread over several text nodes; strip each
    # fragment and glue them together.
    jobOrder = ''.join(
        order.strip() for order in job.xpath('.//p[@class="order"]/text()')
    )
    jobContent = _firstOr(job.xpath('.//p[@class="text"]/@title'))
    return (jobName, companyName, jobAddress, jobMoney, jobOrder, jobContent, jobUrl)


def getJobInfo(cityurl, pageNum, cityName):
    """Scrape every listing page of one city and append the postings to a file.

    For each page ``1..pageNum`` the URL ``cityurl + 'p<page>'`` is fetched,
    decoded as GBK (undecodable bytes dropped) and every child ``div`` of
    ``div[@class="detlist gbox"]`` is parsed as a posting. Each posting is
    printed and appended, as the ``str()`` of its field tuple, to
    ``./data/<cityName>.txt`` (UTF-8).

    Args:
        cityurl: Base URL of the city's listing; the page suffix is appended.
        pageNum: Number of pages to fetch.
        cityName: Used as the output file name (without extension).

    Returns:
        None.

    Raises:
        Whatever ``requests.get`` raises on a connection failure or when the
        timeout expires; ``OSError`` if the output file cannot be written.
    """
    # Create the data directory: open() below fails if it does not exist.
    os.makedirs('./data', exist_ok=True)
    outPath = './data/' + cityName + '.txt'

    for page in range(1, pageNum + 1):
        print('第' + str(page) + '页')
        # Use the parameter, not a same-named global leaked by the caller.
        pageUrl = cityurl + 'p%d' % page
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(pageUrl, headers=headers, timeout=10)
        html = response.content.decode('gbk', 'ignore')
        mytree = lxml.etree.HTML(html)

        records = []
        for job in mytree.xpath('//div[@class="detlist gbox"]/div'):
            fields = _parseJob(job)
            if fields is None:
                # Child div that is not a job entry (no title link).
                continue
            content = str(fields)
            print(content)
            records.append(content)

        if records:
            # One open/write per page instead of reopening the file per record.
            # NOTE(review): records are written back to back with no separator,
            # exactly as before; confirm whether a newline per record is wanted.
            with open(outPath, 'a', encoding='utf-8', errors='ignore') as f:
                f.write(''.join(records))
if __name__ == '__main__':
    # Entry point: walk every city linked from the index page and scrape all
    # of its listing pages.
    url = 'http://jobs.51job.com/'
    for cityName, cityUrl in getCityList(url).items():
        # For a quick test run, pass a fixed page count (e.g. 20) instead of
        # the value returned by getPageNum.
        getJobInfo(cityUrl, getPageNum(cityUrl), cityName)
# Crawling 51job job postings (an exercise in using XPath).
# (Blog-page residue: "latest recommended article published 2021-05-30 19:16:12".)