```python
from threading import Thread
import json
import requests

# Zhaopin job-search API, one request per city, keyword "Python".
# The at/rt/userCode/x-zp-* parameters are session tokens copied from the
# browser's DevTools; they expire, so refresh them before running.
API = ('https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId={}'
       '&salary=0,0&workExperience=-1&education=-1&companyType=-1'
       '&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0'
       '&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40'
       '&_v=0.56963230&userCode=1041847897'
       '&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677'
       '&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a')
cities = ['天津', '北京', '上海', '广州', '杭州', '浙江', '深圳']
urls = [API.format(city) for city in cities]

# Free proxy from the Xici (西刺) free-proxy list. requests expects
# {'scheme': 'host:port'}; the same proxy is used for http and https.
proxy = {'http': 'http://114.225.218.68:8118',
         'https': 'http://114.225.218.68:8118'}


class Grasp(Thread):
    """Worker thread: repeatedly takes one URL from the shared list and scrapes it."""

    def run(self):
        while True:
            try:
                url = urls.pop()  # list.pop() is atomic under the GIL, so no URL is fetched twice
            except IndexError:    # list exhausted, this worker is done
                break
            try:  # swallow network/JSON errors so one bad request doesn't kill the thread
                print(self.name, 'start')
                data = json.loads(requests.get(url, proxies=proxy).text)['data']
                for job in data['results']:  # one dict per job posting
                    item = {
                        'workName': job['jobName'],
                        'workPay': job['salary'],
                        'workPosition': job['city']['display'],
                        'degree': job['eduLevel']['name'],
                        'Company': job['company']['name'],
                    }
                    # append one JSON object per line; ensure_ascii=False keeps
                    # the Chinese text readable in the file
                    with open('./jobs.json', 'a', encoding='utf-8') as f:
                        f.write(json.dumps(item, ensure_ascii=False) + '\n')
                    print(item)
                print(self.name, 'success')
            except Exception:
                print('loading......')


# one worker per URL; join them all so the file is complete before reading it back
threads = [Grasp() for _ in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()


def read():  # print the stored records back out
    with open('./jobs.json', 'r', encoding='utf-8') as f:
        for line in f:
            try:
                print(json.loads(line))
            except json.JSONDecodeError:  # skip blank or partial lines
                continue


read()
```
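A note on the design: the worker threads coordinate through nothing more than the shared `urls` list, relying on `list.pop()` being atomic. The standard-library `queue.Queue` expresses the same one-consumer-per-item hand-off with explicit blocking semantics; here is a minimal sketch of that variant, where `scrape()` is a hypothetical stand-in for the body of `Grasp.run()` above:

```python
from queue import Queue, Empty
from threading import Thread

q = Queue()
for u in urls:  # `urls` as built above
    q.put(u)

def worker():
    while True:
        try:
            url = q.get_nowait()  # raises Empty once every URL has been taken
        except Empty:
            return
        scrape(url)      # hypothetical: fetch, parse, and store one page
        q.task_done()    # mark the item finished so q.join() can return

for _ in range(4):       # 4 workers, an arbitrary choice
    Thread(target=worker).start()

q.join()                 # block until every URL has been processed
```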
Day 14 of my spider practice. Here is a screenshot of the scraped data:

[screenshot: sample records from jobs.json]
All you need to scrape this site is to find the JSON data stream it transmits. Open Chrome, open the developer tools, and patiently look through the network requests, as in the check below. If nothing shows up, refresh the page and the transmitted data stream will appear.
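Once the fe-api.zhaopin.com request shows up in the Network tab, it is worth copying the URL out of the browser and confirming it really returns the JSON stream before writing any threading code. A minimal sanity check, assuming the copied URL is still valid and, as in the script above, carries the postings under `data` → `results`:

```python
import requests

# URL copied from Chrome DevTools (Network tab -> Copy link address).
# The at/rt/x-zp-* session tokens are omitted here; whether the endpoint
# answers without them is an assumption worth testing.
url = ('https://fe-api.zhaopin.com/c/i/sou'
       '?start=0&pageSize=90&cityId=天津&kw=Python&kt=3')

payload = requests.get(url).json()
print(payload.keys())                    # look for the 'data' key used by the script
print(len(payload['data']['results']))   # how many postings came back in this page
```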
With the endpoint in hand, we can go ahead and collect the data described above.
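One natural extension: the captured URL fixes start=90 and pageSize=90, which suggests the API serves results 90 postings per page. If one page per city is not enough, the pages can presumably be walked by bumping `start` in steps of `pageSize`; a sketch built on the `API` template from the script above, assuming the endpoint keeps accepting larger offsets:

```python
PAGE = 90  # matches pageSize in the captured URL

def page_urls(city, pages=3):
    """Yield API URLs for the first `pages` result pages of one city."""
    for n in range(pages):
        # the template hard-codes start=90, so swap in the wanted offset
        yield API.format(city).replace('start=90', f'start={n * PAGE}', 1)

for u in page_urls('北京'):
    print(u)  # these could be fed to the Grasp workers instead of one URL per city
```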