# 最近想要爬取招聘网站的信息看看,偷懒撸了下现成的代码,但是发现只能用一次。又查了下,有大神说是header的问题,所以精简了下header,同时进一步将职位信息和公司信息细化爬取,免得后期处理起来更加麻烦。
import csv
import random
import ssl
import time
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup

# The target site's certificate chain fails verification in some environments;
# disable HTTPS verification globally (acceptable for a one-off scraper).
ssl._create_default_https_context = ssl._create_unverified_context
# Minimal request headers: per the author's note, a trimmed header set with just
# a realistic desktop User-Agent avoids the "works only once" anti-bot problem.
page_headers={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
}
# Job title to search for (Chinese: "data analysis")
key_words = "数据分析"
# Percent-encode the keyword so it can be embedded in the query string
key = urllib.parse.quote(key_words)
#url='https://www.zhipin.com/c101010100/?query='+key+'&page=1&ka=page-1'
def get_data(url):
    """Fetch *url* and parse the HTML body.

    Returns:
        (soup, status): a ``BeautifulSoup`` tree and the HTTP status code,
        or ``(None, 0)`` on any network/HTTP failure so callers can simply
        test ``status == 200`` before touching ``soup``.
    """
    try:
        # A timeout keeps the crawler from hanging forever on a dead host
        # (the original call had none).
        res = requests.get(url, headers=page_headers, timeout=10)
        status = res.status_code
        print(status)
        soup = BeautifulSoup(res.text, 'lxml')
        #print(soup.prettify())  # dump formatted HTML when debugging
        return soup, status
    except requests.RequestException as e:
        # Only network/HTTP errors are expected here; programming errors
        # should surface instead of being swallowed.
        print(str(e))
        return None, 0
def get_job(url):
    """Scrape one search-result page and append the listings to job.csv.

    Each listing yields: company, job title, salary, address, required
    experience, required education, industry, funding stage, company size,
    and the detail-page URL. A listing is skipped (with the error printed)
    only when a required element is missing from its HTML.
    """
    soup, status = get_data(url)
    if status != 200:
        return
    rows = []
    for job in soup.find_all('div', class_="job-primary"):
        try:
            job_title = job.find('div', class_="job-title").string
            job_salary = job.find('span', class_="red").string
            # Address | experience | education share one <p>; split once and reuse
            # instead of re-running the same find() chain three times.
            primary = job.find('div', class_="info-primary").p.get_text("|").split('|')
            company_address = primary[0]
            work_time = primary[1]
            education = primary[2]
            company_div = job.find('div', class_="company-text")
            job_company = company_div.a.text
            job_url = company_div.a.attrs['href']  # relative detail-page link
            # Company <p> carries 1-3 fields (industry[, stage[, size]]).
            # Default the missing ones to "" — the original left the variables
            # undefined, hit NameError on the write, and silently dropped the row.
            company_info = job.find('div', class_="info-company").find(
                'div', class_='company-text').p.get_text("|").split('|')
            trade = company_info[0] if len(company_info) > 0 else ""
            company_stage = company_info[1] if len(company_info) > 1 else ""
            company_scale = company_info[2] if len(company_info) > 2 else ""
            rows.append([job_company, job_title, job_salary, company_address,
                         work_time, education, trade, company_stage, company_scale,
                         "https://www.zhipin.com" + job_url])
        except Exception as e:
            # Best-effort per listing: report and continue with the rest.
            print(str(e))
    # Open the file once per page and let the csv module quote fields, so
    # commas inside company names can no longer corrupt the output.
    with open('job.csv', 'a+', encoding='utf-8', newline='') as fh:
        csv.writer(fh).writerows(rows)
if __name__ == '__main__':
    # Start job.csv fresh with a header row, then crawl result pages 1-10.
    with open('job.csv', 'w', encoding='utf-8') as out:
        out.write("公司,职位名,薪资,地址,工作经验,学历要求,公司行业,公司发展阶段,公司规模,招聘链接\n")
    for page in range(1, 11):
        print("正在爬取第 %s 页..." % page)
        page_url = ('https://www.zhipin.com/c101010100/?query=' + key
                    + '&page=' + str(page) + '&ka=page-' + str(page))
        get_job(page_url)
        # Pause a random 0-6 s (one decimal place) between pages to
        # look less like a bot.
        pause = round(random.random() * 6, 1)
        time.sleep(pause)
# 感谢: Jepson2017 的文章 https://blog.csdn.net/d1240673769/article/details/91409878