# A simple crawler for Tencent job postings and requirements across most major
# cities. The site has no real anti-scraping measures, so browser spoofing
# (custom User-Agent headers) is omitted.
import json
import os
import time

import requests
class TxJob():
    """Crawl Tencent career postings for one city/keyword and append them to a text file.

    Uses the public careers.tencent.com JSON API: one "Query" call per result
    page (10 postings each), then one "ByPostId" call per posting for details.
    """

    def __init__(self, page, direction, city):
        """
        :param page: number of result pages to fetch (10 postings per page)
        :param direction: search keyword for the job direction, e.g. 'python'
        :param city: Chinese city name; must be a key of ``self.cityid``
        """
        self.page = page
        self.direction = direction
        self.city = city
        # Mapping from Chinese city name to Tencent's internal cityId
        # query parameter (values observed from the careers site).
        self.cityid = {
            '北京': 2,
            '清远': 2601,
            '深圳': 1,
            '张家口': 2602,
            '广州': 5,
            '上海': 3,
            '成都': 8,
            '武汉': 6,
            '香港': 37,
            '台湾': 46,
            '长沙': 18,
            '重庆': 14,
            '长春': 25,
            '天津': 31,
            '大连': 39,
            '福州': 4,
            '贵阳': 17,
            '哈尔滨': 20,
            '杭州': 7,
            '合肥': 30,
            '呼和浩特': 23,
            '济南': 24,
            '昆明': 16,
            '兰州': 21,
            '南宁': 27,
            '南昌': 19,
            '南京': 11,
            '沈阳': 10,
            '石家庄': 28,
            '太原': 26,
            '乌鲁木齐': 22,
            '西安': 9,
            '西宁': 15,
            '厦门': 29,
            '郑州': 12,
            '青岛': 47,
            '无锡': 48,
            '烟台': 49,
            '苏州': 50,
            '海口': 53,
            '淄博': 54,
            '宁波': 55,
            '银川': 56,
            '扬州': 57,
            '汕尾': 58,
            '顺德': 84,
            '桂林': 86,
            '澳门': 87,
            '珠海': 89,
            '贵安': 91,
            '佛山': 93,
            '保定': 94,
            '雄安新区': 95,
            '南通': 96,
            '拉萨': 97,
        }

    def get_data(self):
        """Fetch every posting on the requested pages and save each one to disk."""
        for index in range(1, self.page + 1):
            # BUG FIX: the original formatted the module-level global
            # ``direction`` here instead of ``self.direction``, so the class
            # only worked when driven by this exact script.  Also use a fresh
            # timestamp, consistent with the detail request below (the
            # original hard-coded a stale one).
            url1 = ('https://careers.tencent.com/tencentcareer/api/post/Query'
                    '?timestamp={}&cityId={}&keyword={}&pageIndex={}'
                    '&pageSize=10&language=zh-cn&area=cn').format(
                str(int(time.time())), self.cityid[self.city],
                self.direction, index)
            resp1 = requests.get(url1)
            data1 = json.loads(resp1.text)
            for post in data1['Data']['Posts']:
                postid = post['PostId']
                url2 = ('https://careers.tencent.com/tencentcareer/api/post/'
                        'ByPostId?timestamp={}&postId={}&language=zh-cn').format(
                    str(int(time.time())), postid)
                resp2 = requests.get(url2)
                data2 = json.loads(resp2.text)
                self.save_data(data2['Data']['RecruitPostName'],
                               data2['Data']['Responsibility'],
                               data2['Data']['Requirement'])

    def save_data(self, RecruitPostName, Responsibility, Requirement):
        """Append one posting (title, responsibilities, requirements) to the output file."""
        try:
            # Create the output directory on first use; the original open()
            # failed (and merely printed the error) whenever it was missing.
            os.makedirs('./tx岗位招聘', exist_ok=True)
            # Explicit UTF-8 so the Chinese text is written correctly on every
            # platform (the default encoding is locale-dependent, e.g. GBK).
            with open('./tx岗位招聘/python岗位.txt', 'a', encoding='utf-8') as f:
                f.write(RecruitPostName)
                f.write('\n')
                f.write('岗位职责:\n')
                f.write(Responsibility)
                f.write('\n\n')
                f.write('任职要求:\n')
                f.write(Requirement)
                f.write('\n\n')
        except OSError as e:
            # Narrowed from ``except Exception``: only filesystem errors are
            # expected here; anything else should surface, not be swallowed.
            print(e)

    def main(self):
        """Entry point: run the crawl with the configured parameters."""
        self.get_data()
if __name__ == '__main__':
    # Gather crawl parameters interactively.  NOTE: these must stay
    # module-level globals with these exact names — the class reads the
    # global ``direction`` from get_data().
    page = int(input('要爬取的页数:'))
    direction = input('要从事什么方向:')
    city = input('工作城市:')
    # Build the crawler and kick it off in one expression.
    TxJob(page, direction, city).main()