Scraping the Tencent Careers Site and Writing the Results to Excel

import requests
import jsonpath
import json
import xlwt
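
Of the four imports above, requests, jsonpath, and xlwt are third-party packages and need to be installed first, e.g. with pip install requests jsonpath xlwt; json is in the standard library.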

def saveData(data_list):
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('tengxun2')
    col = ('postid', '职位名', '工作地点', '工作职责', '工作要求', '发布时间', 'url')
    # Write the header row
    for i in range(7):
        sheet.write(0, i, col[i])
    # data_list is a list of one-element lists, one per job posting
    for i in range(len(data_list)):
        data = data_list[i]
        for j in data:
            sheet.write(i + 1, 0, j['id'])
            sheet.write(i + 1, 1, j['职位'])
            sheet.write(i + 1, 2, j['工作地点'])
            sheet.write(i + 1, 3, j['工作职责'])
            sheet.write(i + 1, 4, j['工作要求'])
            sheet.write(i + 1, 5, j['发布时间'])
            sheet.write(i + 1, 6, j['url'])

    book.save('tengxun2.xls')
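
To check the Excel writer in isolation, you can feed it a single fabricated record; the values below are placeholders, not real data:

# Minimal smoke test for saveData with a made-up record
sample = [[{
    'id': '0', '职位': 'test', '工作地点': 'Shenzhen',
    '工作职责': '...', '工作要求': '...',
    '发布时间': '2020-06-09', 'url': 'https://careers.tencent.com/',
}]]
saveData(sample)   # produces tengxun2.xls with one data row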

def spider_page(j):
    # Fetch the detail page of a single posting by its PostId
    url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=&postId=' + str(j) + '&language=zh-cn'
    response = requests.get(url)
    jsonobj = json.loads(response.text)
    items = jsonpath.jsonpath(jsonobj, '$.Data')

    list1 = []
    result = {}
    result['id'] = items[0]['PostId']
    result['职位'] = items[0]['RecruitPostName']
    result['工作地点'] = items[0]['LocationName']
    result['工作职责'] = items[0]['Responsibility']
    result['工作要求'] = items[0]['Requirement']
    result['发布时间'] = items[0]['LastUpdateTime']
    result['url'] = items[0]['PostURL']
    list1.append(result)
    return list1
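
For reference, jsonpath.jsonpath(jsonobj, '$.Data') returns a one-element list, and the fields read above imply a response shaped roughly like this (inferred from the code, not from any API documentation; the real response may carry more keys):

# Inferred shape of the ByPostId response (only the keys used above):
# {
#     "Data": {
#         "PostId": "...",
#         "RecruitPostName": "...",
#         "LocationName": "...",
#         "Responsibility": "...",
#         "Requirement": "...",
#         "LastUpdateTime": "...",
#         "PostURL": "..."
#     }
# }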

def spider_url(post, num):
    # Query one page of search results and return the PostIds on it
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1591689554351&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=' + post + '&pageIndex=' + str(num) + '&pageSize=10&language=zh-cn&area=cn'
    # Alternative query that filters by category instead of keyword:
    # url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1591681767428&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex=' + str(num) + '&pageSize=10&language=zh-cn&area=cn'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    response = requests.get(url, headers=headers)

    jsonobj = json.loads(response.text)
    items = jsonpath.jsonpath(jsonobj, '$.Data.Posts[*]')
    post_ids = []
    for i in items:
        post_ids.append(i['PostId'])
    return post_ids
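
The long query string is easy to mistype; an equivalent construction with the standard library's urllib.parse.urlencode keeps it readable. This is just a sketch reusing the same parameters and the post/num arguments from above:

from urllib.parse import urlencode

params = {
    'timestamp': '1591689554351', 'countryId': '', 'cityId': '', 'bgIds': '',
    'productId': '', 'categoryId': '', 'parentCategoryId': '', 'attrId': '',
    'keyword': post, 'pageIndex': num, 'pageSize': 10,
    'language': 'zh-cn', 'area': 'cn',
}
url = 'https://careers.tencent.com/tencentcareer/api/post/Query?' + urlencode(params)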

if __name__ == '__main__':
    data_list = []
    post = input('Job keyword to search for: ')
    num = int(input('Number of pages to crawl: '))
    # Each results page holds up to 10 postings (pageSize=10)
    for i in range(1, num + 1):
        for j in spider_url(post, i):
            data_list.append(spider_page(j))

    saveData(data_list)
    print('Crawl finished')
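
If the API throttles rapid requests, a short pause between detail fetches is a simple mitigation. A sketch of the crawl loop with a delay added; the 0.5 s value is an arbitrary choice, not something the API documents:

import time

for i in range(1, num + 1):
    for j in spider_url(post, i):
        data_list.append(spider_page(j))
        time.sleep(0.5)  # arbitrary politeness delay between requests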