多线程爬取招聘网站

from threading import Thread
import json
import requests
# urls = [
#     'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId=天津&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40&_v=0.56963230&userCode=1041847897&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a',
#     'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId=北京&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40&_v=0.56963230&userCode=1041847897&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a',
#     'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId=上海&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40&_v=0.56963230&userCode=1041847897&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a',
#     'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId=广州&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40&_v=0.56963230&userCode=1041847897&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a',
#     'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId=杭州&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40&_v=0.56963230&userCode=1041847897&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a',
#     'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId=浙江&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40&_v=0.56963230&userCode=1041847897&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a',
#     'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&cityId=深圳&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Python&kt=3&=0&at=081a4411244a4e9c80d393212650f005&rt=6cc8df0863c944a88cbc303fa5d7dd40&_v=0.56963230&userCode=1041847897&x-zp-page-request-id=1dba50fde35b475b99fc09aa009dbee1-1568818384291-188677&x-zp-client-id=412ece5e-7595-4148-8838-3b957ac4202a'
# ] #智联招聘的api,我选的是python工作
#
# proxy = {'114.225.218.68':'8118'} #代理的地址,我选的是西刺代理的免费api
# class Grasp(Thread):
#     def __init__(self):
#         super(Grasp,self).__init__() #继承多线程方法
#         self.i =int(str(self.name).split('-')[-1]) #将线程的数字与urls的下标对应
#
#     def run(self):
#         try: #此处为了将异常处理掉
#             while len(urls) > 0:
#                 rs = json.loads(requests.get(urls[self.i-1],proxies=proxy).text)['data']
#                 urls.remove(urls[self.i-1]) #为了防止urls中的url被重复使用,这里将使用过的url删除掉
#                 res = dict(rs)
#                 lis = res['results'] #将url中的数据存储起来,变成列表
#                 item = {}
#                 print(self.name,'start')
#                 for i in range(0,len(lis)): #循环遍历列表中的数据
#                     item['workName'] = lis[i]['jobName']   #获取目标数据,将数据以字典格式存储起来,为了将数据转为json格式作前提
#                     item['workPay'] = lis[i]['salary']
#                     item['workPosition'] = lis[i]['city']['display']
#                     item['degree'] = lis[i]['eduLevel']['name']
#                     item['Company'] = lis[i]['company']['name']
#                     with open('./jobs.json','a',encoding='utf-8') as f: #将线程获得的数据写入json文件
#                         print(self.name,'storing')
#                         f.write(json.dumps(item,ensure_ascii=False)+'\n') #为了让数据好看点
#                         print(item)
#                         print(self.name,'stored')
#                 print(self.name,'success')
#         except:
#             print('loading......')
#
#
# for i in range(len(urls)):  # 循环执行线程(原文 range(len(urls)+1) 会多启动一个线程,导致下标越界被 except 吞掉)
#     g= Grasp()
#     g.start()
#
#
# def read(): #读取数据
#     with open('./jobs.json', 'r',encoding='utf-8') as f:
#         l = f.read().split('\n')
#         for i in range(0,len(l)):
#             try:
#                 print(json.loads(l[i]))
#             except:
#                 p=2
# read()

 爬虫14天小练手,这是数据截图:

 

 只需要找到网站中传输的json数据流就可以获取该网站的数据,所以打开谷歌浏览器,耐心查看网络传输的包即可。例如:

 

 如果没有出现,刷新一下网页即可看到传输的数据流。

接下来我们就可以拿到上面所要的数据了。

 

转载于:https://www.cnblogs.com/superSmall/p/11569231.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值