Scraping the Tencent Careers Site and Writing the Results to Excel

import requests
import jsonpath
import json
import xlwt
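
Of the four imports above, requests, jsonpath, and xlwt are third-party packages and need to be installed first, e.g. with pip install requests jsonpath xlwt; json is in the standard library.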

def saveData(data_list):
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('tengxun2')
    col = ('postid', '职位名', '工作地点', '工作职责', '工作要求', '发布时间', 'url')
    # Write the header row
    for i in range(7):
        sheet.write(0, i, col[i])
    # data_list is a list of one-element lists, one per job posting
    for i in range(len(data_list)):
        data = data_list[i]
        for j in data:
            sheet.write(i + 1, 0, j['id'])
            sheet.write(i + 1, 1, j['职位'])
            sheet.write(i + 1, 2, j['工作地点'])
            sheet.write(i + 1, 3, j['工作职责'])
            sheet.write(i + 1, 4, j['工作要求'])
            sheet.write(i + 1, 5, j['发布时间'])
            sheet.write(i + 1, 6, j['url'])

    book.save('tengxun2.xls')
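
To check the Excel writer in isolation, you can feed it a single fabricated record; the values below are placeholders, not real data:

# Minimal smoke test for saveData with a made-up record
sample = [[{
    'id': '0', '职位': 'test', '工作地点': 'Shenzhen',
    '工作职责': '...', '工作要求': '...',
    '发布时间': '2020-06-09', 'url': 'https://careers.tencent.com/',
}]]
saveData(sample)   # produces tengxun2.xls with one data row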

def spider_page(j):
    # Fetch the detail page of a single posting by its PostId
    url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=&postId=' + str(j) + '&language=zh-cn'
    response = requests.get(url)
    jsonobj = json.loads(response.text)
    items = jsonpath.jsonpath(jsonobj, '$.Data')

    list1 = []
    result = {}
    result['id'] = items[0]['PostId']
    result['职位'] = items[0]['RecruitPostName']
    result['工作地点'] = items[0]['LocationName']
    result['工作职责'] = items[0]['Responsibility']
    result['工作要求'] = items[0]['Requirement']
    result['发布时间'] = items[0]['LastUpdateTime']
    result['url'] = items[0]['PostURL']
    list1.append(result)
    return list1
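
For reference, jsonpath.jsonpath(jsonobj, '$.Data') returns a one-element list, and the fields read above imply a response shaped roughly like this (inferred from the code, not from any API documentation; the real response may carry more keys):

# Inferred shape of the ByPostId response (only the keys used above):
# {
#     "Data": {
#         "PostId": "...",
#         "RecruitPostName": "...",
#         "LocationName": "...",
#         "Responsibility": "...",
#         "Requirement": "...",
#         "LastUpdateTime": "...",
#         "PostURL": "..."
#     }
# }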

def spider_url(post, num):
    # Query one page of search results and return the PostIds on it
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1591689554351&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=' + post + '&pageIndex=' + str(num) + '&pageSize=10&language=zh-cn&area=cn'
    # Alternative query that filters by category instead of keyword:
    # url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1591681767428&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex=' + str(num) + '&pageSize=10&language=zh-cn&area=cn'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    response = requests.get(url, headers=headers)

    jsonobj = json.loads(response.text)
    items = jsonpath.jsonpath(jsonobj, '$.Data.Posts[*]')
    post_ids = []
    for i in items:
        post_ids.append(i['PostId'])
    return post_ids
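
The long query string is easy to mistype; an equivalent construction with the standard library's urllib.parse.urlencode keeps it readable. This is just a sketch reusing the same parameters and the post/num arguments from above:

from urllib.parse import urlencode

params = {
    'timestamp': '1591689554351', 'countryId': '', 'cityId': '', 'bgIds': '',
    'productId': '', 'categoryId': '', 'parentCategoryId': '', 'attrId': '',
    'keyword': post, 'pageIndex': num, 'pageSize': 10,
    'language': 'zh-cn', 'area': 'cn',
}
url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' + urlencode(params)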

if __name__ == '__main__':
    data_list = []
    post = input('Job keyword to search for: ')
    num = int(input('Number of pages to crawl: '))
    # Each results page holds up to 10 postings (pageSize=10)
    for i in range(1, num + 1):
        for j in spider_url(post, i):
            data_list.append(spider_page(j))

    saveData(data_list)
    print('Crawl finished')
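
If the API throttles rapid requests, a short pause between detail fetches is a simple mitigation. A sketch of the crawl loop with a delay added; the 0.5 s value is an arbitrary choice, not something the API documents:

import time

for i in range(1, num + 1):
    for j in spider_url(post, i):
        data_list.append(spider_page(j))
        time.sleep(0.5)  # arbitrary politeness delay between requests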