import requests
import jsonpath
import json
import xlwt
def saveData(data_list):
    """Write scraped job postings to 'tengxun2.xls'.

    data_list: list of lists — each inner list holds dicts produced by
    spider_page, carrying the keys written below. One spreadsheet row is
    written per dict.
    """
    # Original used curly "smart" quotes (‘ ’ “ ”), which are syntax
    # errors in Python — normalized to ASCII quotes throughout.
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('tengxun2')
    # Header row.
    col = ("postid", "职位名", "工作地点", "工作职责", "工作要求", "发布时间", "url")
    for c, title in enumerate(col):
        sheet.write(0, c, title)
    # Fix: the original wrote every dict of one sublist to the same row
    # (i+1), silently overwriting when a sublist held more than one
    # posting. A running row counter handles any sublist length.
    row = 1
    keys = ('id', '职位', '工作地点', '工作职责', '工作要求', '发布时间', 'url')
    for data in data_list:
        for posting in data:
            for c, key in enumerate(keys):
                sheet.write(row, c, posting[key])
            row += 1
    book.save('tengxun2.xls')
def spider_page(j):
    """Fetch the detail record for one job posting by its post id.

    j: the PostId (string or int) returned by spider_url.
    Returns a one-element list containing a dict with the fields
    consumed by saveData.
    """
    # Original used curly quotes around the URL literal — a syntax
    # error. Normalized to ASCII quotes; URL content unchanged.
    url = ('https://careers.tencent.com/tencentcareer/api/post/ByPostId'
           '?timestamp=&postId=' + str(j) + '&language=zh-cn')
    response = requests.get(url)
    jsonobj = json.loads(response.text)
    # '$.Data' yields a one-element list wrapping the detail object.
    items = jsonpath.jsonpath(jsonobj, '$.Data')
    result = {
        'id': items[0]['PostId'],
        '职位': items[0]['RecruitPostName'],
        '工作地点': items[0]['LocationName'],
        '工作职责': items[0]['Responsibility'],
        '工作要求': items[0]['Requirement'],
        '发布时间': items[0]['LastUpdateTime'],
        'url': items[0]['PostURL'],
    }
    return [result]
def spider_url(post, num):
    """Return the PostIds listed on search-result page `num` for keyword `post`.

    post: job-title keyword to search for.
    num: 1-based page index (10 results per page).
    """
    url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
           '?timestamp=1591689554351&countryId=&cityId=&bgIds=&productId='
           '&categoryId=&parentCategoryId=&attrId=&keyword=' + post +
           '&pageIndex=' + str(num) + '&pageSize=10&language=zh-cn&area=cn')
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    response = requests.get(url, headers=headers)
    jsonobj = json.loads(response.text)
    items = jsonpath.jsonpath(jsonobj, '$.Data.Posts[*]')
    # Comprehension instead of the original append loop, which also
    # shadowed the builtin name `list`.
    return [post_item['PostId'] for post_item in items]
if __name__ == '__main__':
    # Fix: the original guard read `if name==‘main’:` — missing the
    # dunder underscores and using curly quotes (both syntax/logic
    # errors from a copy-paste mangling).
    data_list = []
    post = str(input('输入要查找的职位:'))
    num = int(input('输入要爬取的页数:'))
    # Crawl pages 1..num; each page yields PostIds, each id is fetched
    # in detail and collected for the spreadsheet export.
    for i in range(1, num + 1):
        for j in spider_url(post, i):
            data_list.append(spider_page(j))
    saveData(data_list)
    print('爬取完毕')