#从 文件内 导入 函数
from getHtml import getHtml
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time
#获取职位信息的函数
def getData(url):
    """Scrape one 51job search-result page and collect job postings.

    Fetches *url* via getHtml, parses the ``#resultList`` container, and
    appends one ``[job_name, company, address, salary, publish_date]`` row
    per posting to the module-level ``dataList``.

    Args:
        url: Full 51job search-result page URL.

    Returns:
        The list of rows appended during this call (empty when the page
        could not be parsed).
    """
    html = getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    # Work top-down: anchor on the parent container first so that
    # class="el" divs elsewhere on the page are never picked up.
    parent = soup.find(id='resultList')
    if parent is None:
        # Layout changed or the request was blocked: skip this page
        # gracefully instead of crashing with AttributeError.
        print('resultList not found, skipping: ' + url)
        return []
    divs = parent.find_all('div', class_='el')
    rows = []
    for each in divs:
        # The table-header row is also class="el" but carries no <p> tag
        # (the original code popped index 0 for this reason); checking for
        # the <p> directly is robust even if the header is not first.
        title_p = each.find('p')
        if title_p is None:
            continue
        jobName = title_p.get_text().strip()
        # Direct-child spans hold, in order: company, location, salary,
        # publish date. Query them once instead of four times per row.
        spans = each.find_all('span', recursive=False)
        if len(spans) < 4:
            # Malformed row (e.g. advert block) -- skip it.
            continue
        company = spans[0].get_text().strip()
        address = spans[1].get_text().strip()
        salary = spans[2].get_text().strip()
        lunch_time = spans[3].get_text().strip()
        rows.append([jobName, company, address, salary, lunch_time])
    dataList.extend(rows)
    return rows
dataList = []
def saveToExcel(excelname):
    """Write every row collected in ``dataList`` to ``<excelname>.xlsx``.

    Args:
        excelname: Base name used for both the sheet title and the
            output file name.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = excelname + '职位信息'
    # Header row first, then one spreadsheet row per collected posting.
    sheet.append(['职位名称', '公司名称', '工作地点', '薪资', '发布时间'])
    for row in dataList:
        sheet.append(row)
    workbook.save(excelname + '.xlsx')
# Entry point: crawl several result pages in a row.
def main(jobname, n, excelname):
    """Crawl pages 1..n of 51job results for *jobname*, then save to Excel.

    Args:
        jobname: URL-embedded job keyword (already percent-encoded).
        n: Number of result pages to fetch.
        excelname: Base name for the output workbook.
    """
    url_prefix = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    url_suffix = ('.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
                  '°reefrom=99&jobterm=99&companysize=99&providesalary=99'
                  '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9'
                  '&fromType=&dibiaoid=0&address=&line=&specialarea=00'
                  '&from=&welfare=')
    page = 1
    while page <= n:
        print("正在爬取第" + str(page) + "页")
        getData(url_prefix + jobname + ',2,' + str(page) + url_suffix)
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
        page += 1
    saveToExcel(excelname)
main('数据分析师',3,'数据分析师')
# Blog-page residue from the original paste (kept as a comment so the file
# stays valid Python): "Python crawler: scraping 51job postings with requests"
# Originally published 2021-10-01 22:31:53.