# 最新拉钩网站数据爬取 (scrape the latest job-posting data from lagou.com)
import requests
import pandas as pd
def get_page(num, name):
    """Fetch one page of job postings from Lagou's AJAX search endpoint.

    Lagou's anti-crawler check rejects a direct POST, so we first GET the
    search page inside a Session to pick up the required cookies, then make
    the POST through the *same* session so those cookies are reused.

    Args:
        num: 1-based page number to fetch (sent as the 'pn' form field).
        name: position keyword to search for (sent as the 'kd' form field).

    Returns:
        The decoded JSON response as a dict.

    Raises:
        requests.HTTPError: if either request returns a non-2xx status.
        requests.Timeout: if a request exceeds the 3-second timeout.
    """
    print('正在爬取第{}页'.format(num))
    # NOTE: 'needAddtionalResult' is misspelled in Lagou's own API — keep as-is.
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    referer_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    }
    data = {
        'first': 'false',
        'pn': num,
        'kd': name,
        'sid': '7fd39764bafa4e92a0e99571f703693d',
    }
    # Use the session as a context manager so its connections are released,
    # and POST through it so the anti-crawler cookies from the warm-up GET
    # are sent automatically (the original leaked the session and copied
    # cookies to a separate requests.post call by hand).
    with requests.Session() as session:
        session.get(url=referer_url, headers=headers, timeout=3)
        res = session.post(url=ajax_url, headers=headers, data=data, timeout=3)
    res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    res.encoding = 'utf-8'
    return res.json()
def get_page_info(page_data):
    """Extract the fields of interest from one page of Lagou JSON.

    Args:
        page_data: parsed JSON dict as returned by get_page().

    Returns:
        A list of rows, one per position; each row is a list of 13 values in
        the column order expected by write_csv().

    Raises:
        KeyError: if the response does not contain
            content.positionResult.result or a position lacks an expected key.
    """
    positions = page_data['content']['positionResult']['result']
    # Keys listed in the exact order of write_csv()'s columns.
    # NOTE: 'positionLables' is misspelled in Lagou's own API — keep as-is.
    field_keys = (
        'positionName', 'companyFullName', 'companyShortName', 'companySize',
        'companyLogo', 'firstType', 'secondType', 'thirdType',
        'positionLables', 'city', 'salary', 'workYear', 'education',
    )
    # Data-driven comprehension replaces 13 temporaries + 13 appends into a
    # variable that shadowed the builtin `list`.
    return [[position[key] for key in field_keys] for position in positions]
def url_page(name, max_pages=30):
    """Crawl pages 1..max_pages for ``name`` and collect every parsed row.

    The page count was hard-coded at 30; it is now a parameter with the same
    default, so existing callers are unaffected.

    Args:
        name: position keyword to search for.
        max_pages: number of result pages to fetch (default 30).

    Returns:
        Flat list of rows from get_page_info() across all fetched pages.
    """
    results = []
    # A for-over-range replaces the original manual counter while/else loop.
    for num in range(1, max_pages + 1):
        page_data = get_page(num, name)
        results.extend(get_page_info(page_data))
    print('已经爬取所有的页数')
    return results
def write_csv(results, name):
    """Write crawled rows to ``<name>.csv`` in the current directory.

    Args:
        results: list of 13-element rows as produced by get_page_info().
        name: search keyword; used as the CSV file's base name.
    """
    columns = ['职位名称', '公司名称', '公司小名', '公司人数', '公司照片',
               '公司种类1', '公司种类2', '专业', '职位', '所在城市',
               '年薪', '工作经验', '学位']
    df = pd.DataFrame(data=results, columns=columns)
    # utf-8-sig adds a BOM so Excel recognises the Chinese headers correctly
    # (plain utf-8 renders them as mojibake when double-clicked on Windows).
    df.to_csv(name + '.csv', index=False, encoding='utf-8-sig')
def main(name):
    """Crawl every result page for ``name`` and dump the rows to '<name>.csv'."""
    write_csv(url_page(name), name)
if __name__ == '__main__':
    # Script entry point: ask the user which position keyword to crawl.
    keyword = input('请输入你要查询的职位:')
    main(keyword)