思路
✅创建多个队列,每一个队列存放不同级别页面的url
✅分别从不同的队列中获取url地址,并找到对应的解析函数解析提取数据
✅二级队列及以上,队列中获取url地址时需要使用timeout参数
测试网址:腾讯招聘搜索页(careers.tencent.com)
可以看到网站是动态加载的,进行抓包,找到当前网页的数据包
进入详情页进行分析
内容在data里面
这些都是我们要爬的
导入库
import time,requests
from threading import Thread,Lock
from queue import Queue
from fake_useragent import UserAgent
from urllib import parse
创建准备、请求响应
def __init__(self):
    """Set up API URL templates, the two URL queues, locks, and a counter."""
    # Search (list) API. Placeholders: keyword, pageIndex. pageSize is fixed
    # at 10 because get_total computes the page count as count/10.
    # (The original template hard-coded pageIndex=1 and formatted the page
    # number into pageSize, so every "page" re-fetched page 1.)
    self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1644370587575&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    # Job-detail API. Placeholder: postId.
    self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1644370614815&postId={}&language=zh-cn'
    # Level-1 queue (result-page URLs) and level-2 queue (job-detail URLs).
    self.one_q = Queue()
    self.two_q = Queue()
    # One lock per queue to guard the empty-check + get pair as a unit.
    self.lock1 = Lock()
    self.lock2 = Lock()
    # Count of detail pages successfully processed.
    self.number = 0
def get_html(self, url):
    """Fetch *url* with a randomized User-Agent and return the parsed JSON body."""
    response = requests.get(
        url=url,
        headers={'User-Agent': UserAgent().random},
    )
    return response.json()
大页面的数据请求和职业总数
def url_in(self):
    """Prompt for a job keyword and enqueue every result-page URL.

    Renamed from the original ``parse_one_page``: that name was also used by
    the level-1 parser method, so this producer was shadowed by the later
    definition and the ``run()`` driver's call to ``self.url_in()`` raised
    AttributeError.
    """
    keyword = input('请输入职位类别:')
    keyword = parse.quote(keyword)  # URL-encode (keyword may be Chinese)
    total = self.get_total(keyword)
    for page in range(1, total + 1):
        url = self.one_url.format(keyword, page)
        self.one_q.put(url)
def get_total(self, keyword):
    """Return how many result pages exist for *keyword* (10 jobs per page)."""
    first_page_url = self.one_url.format(keyword, 1)
    payload = self.get_html(url=first_page_url)
    count = payload['Data']['Count']
    # Ceiling division: a partial trailing page still counts as a page.
    pages, remainder = divmod(count, 10)
    return pages if remainder == 0 else pages + 1
大页面提取
def parse_one_page(self):
    """Level-1 worker: drain the result-page queue, extract each posting's
    PostId, and push the corresponding detail-page URL onto the level-2 queue.

    lock1 guards the empty-check + get pair so two workers cannot race
    between the check and the pop.
    """
    while True:
        self.lock1.acquire()
        if not self.one_q.empty():
            one_url = self.one_q.get()
            self.lock1.release()
            one_html = self.get_html(url=one_url)
            # Each result page carries a list of postings under Data.Posts;
            # collect every PostId and build its detail-page URL.
            for one_job in one_html['Data']['Posts']:
                post_id = one_job['PostId']
                job_url = self.two_url.format(post_id)
                # Hand the detail-page URL to the level-2 queue.
                self.two_q.put(job_url)
        else:
            # Queue drained: release the lock and let this worker exit.
            self.lock1.release()
            break
详情页面提取
def parse_two_page(self):
    """Level-2 worker: pull job-detail URLs, fetch each one, and print the
    extracted job fields.

    Fixes over the original:
    - ``q.get(tuneout=1)`` was a typo for ``timeout=1``; it raised TypeError
      on the first iteration, so every worker exited immediately.
    - The except path released lock2 unconditionally, which could call
      release() on an already-released lock (RuntimeError) when the failure
      happened after the first release.
    """
    while True:
        self.lock2.acquire()
        if self.two_q.empty():
            # Nothing left: release and let this worker exit.
            self.lock2.release()
            break
        two_url = self.two_q.get(timeout=1)
        self.lock2.release()
        try:
            two_html = self.get_html(url=two_url)
            data = two_html['Data']
            item = {}
            item['name'] = data['RecruitPostName']
            item['type'] = data['CategoryName']
            item['address'] = data['LocationName']
            item['duty'] = data['Responsibility']
            item['require'] = data['Requirement']
            # NOTE(review): 'LastUpdataTime' looks misspelled but is the key
            # the original extracted — verify against a live API response.
            item['time'] = data['LastUpdataTime']
            print(item)
            with self.lock2:
                self.number += 1
        except Exception:
            # Original behavior preserved: abort this worker on any
            # fetch/parse failure.
            break
创建线程
def run(self):
    """Entry point: fill the level-1 queue, then run 2+2 worker threads."""
    # Enqueue all result-page URLs before any worker starts.
    self.url_in()
    # Two level-1 parser threads followed by two level-2 parser threads,
    # started in that order.
    workers = []
    targets = (self.parse_one_page, self.parse_one_page,
               self.parse_two_page, self.parse_two_page)
    for target in targets:
        worker = Thread(target=target)
        workers.append(worker)
        worker.start()
    for worker in workers:
        worker.join()
    # How many detail pages were processed in total.
    print('number:', self.number)
运行函数
if __name__ == '__main__':  # fixed: original ' __main__' (leading space) never matched
    start_time = time.time()
    spider = TencentSpider()
    spider.run()
    end_time = time.time()
    # Report total crawl wall-clock time.
    print('time:%.2f' % (end_time - start_time))
线程函数
def parse_html(self):
    """Generic single-queue worker: pop a URL under the lock and fetch it.

    NOTE(review): this leftover helper references ``self.lock`` and
    ``self.q``, which ``__init__`` never defines (it creates lock1/lock2 and
    one_q/two_q) — calling it as-is would raise AttributeError; it is not
    used by ``run()``.

    Fix over the original: when the queue was empty the lock was acquired
    but never released and the loop never exited, deadlocking every other
    worker on the same lock. The else branch now releases the lock and
    breaks.
    """
    while True:
        # Lock around the empty-check + get pair.
        self.lock.acquire()
        if not self.q.empty():
            url = self.q.get()
            self.lock.release()
            headers = {'User-Agent': UserAgent().random}
            html = requests.get(url=url, headers=headers)
        else:
            self.lock.release()
            break
完整代码
import time,requests
from threading import Thread,Lock
from queue import Queue
from fake_useragent import UserAgent
from urllib import parse
class TencentSpider:
    """Multi-threaded crawler for the Tencent careers search API.

    Pipeline:
      1. ``url_in`` prompts for a keyword and fills the level-1 queue with
         search-result-page URLs.
      2. Level-1 workers (``parse_one_page``) pull result pages, extract
         each posting's PostId, and push detail-page URLs onto the level-2
         queue.
      3. Level-2 workers (``parse_two_page``) fetch each detail page and
         print the extracted job fields.

    Fixes over the original: result-page URL now varies ``pageIndex``
    (instead of formatting the page number into ``pageSize``), the URL
    producer is renamed ``url_in`` to match ``run()``'s call and stop
    shadowing the parser, ``get(tuneout=1)`` → ``get(timeout=1)``, and lock
    release paths no longer double-release or deadlock.
    """

    def __init__(self):
        # Search (list) API. Placeholders: keyword, pageIndex. pageSize is
        # fixed at 10 because get_total computes the page count as count/10.
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1644370587575&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        # Job-detail API. Placeholder: postId.
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1644370614815&postId={}&language=zh-cn'
        # Level-1 queue (result pages) and level-2 queue (job details).
        self.one_q = Queue()
        self.two_q = Queue()
        # One lock per queue to guard the empty-check + get pair.
        self.lock1 = Lock()
        self.lock2 = Lock()
        # Count of detail pages successfully processed.
        self.number = 0

    def get_html(self, url):
        """Fetch *url* with a random User-Agent and return the parsed JSON."""
        headers = {'User-Agent': UserAgent().random}
        return requests.get(url=url, headers=headers).json()

    def url_in(self):
        """Prompt for a job keyword and enqueue every result-page URL.

        Renamed from the original ``parse_one_page``: that name collided
        with the level-1 parser below, so this producer was shadowed and
        ``run()``'s call to ``self.url_in()`` raised AttributeError.
        """
        keyword = input('请输入职位类别:')
        keyword = parse.quote(keyword)  # URL-encode (keyword may be Chinese)
        total = self.get_total(keyword)
        for page in range(1, total + 1):
            self.one_q.put(self.one_url.format(keyword, page))

    def get_total(self, keyword):
        """Return the number of result pages for *keyword* (10 jobs/page)."""
        url = self.one_url.format(keyword, 1)
        html = self.get_html(url=url)
        count = html['Data']['Count']
        # Ceiling division: a partial trailing page still counts as a page.
        return count // 10 if count % 10 == 0 else count // 10 + 1

    def parse_one_page(self):
        """Level-1 worker: drain the result-page queue and feed level 2."""
        while True:
            self.lock1.acquire()
            if not self.one_q.empty():
                one_url = self.one_q.get()
                self.lock1.release()
                one_html = self.get_html(url=one_url)
                for one_job in one_html['Data']['Posts']:
                    # Detail-page URL for this posting goes to level 2.
                    self.two_q.put(self.two_url.format(one_job['PostId']))
            else:
                # Queue drained: release the lock and let this worker exit.
                self.lock1.release()
                break

    def parse_two_page(self):
        """Level-2 worker: fetch job-detail pages and print their fields.

        Fixes over the original: ``get(tuneout=1)`` was a typo for
        ``timeout=1`` (raised TypeError on the first iteration), and the
        except path could release an already-released lock.
        """
        while True:
            self.lock2.acquire()
            if self.two_q.empty():
                self.lock2.release()
                break
            two_url = self.two_q.get(timeout=1)
            self.lock2.release()
            try:
                data = self.get_html(url=two_url)['Data']
                item = {}
                item['name'] = data['RecruitPostName']
                item['type'] = data['CategoryName']
                item['address'] = data['LocationName']
                item['duty'] = data['Responsibility']
                item['require'] = data['Requirement']
                # NOTE(review): 'LastUpdataTime' looks misspelled but is the
                # key the original extracted — verify against the live API.
                item['time'] = data['LastUpdataTime']
                print(item)
                with self.lock2:
                    self.number += 1
            except Exception:
                # Original behavior: abort this worker on fetch/parse failure.
                break

    def run(self):
        """Entry point: fill the level-1 queue, then run 2+2 workers."""
        self.url_in()
        t1_list = []
        t2_list = []
        for _ in range(2):  # two level-1 parser threads
            t1 = Thread(target=self.parse_one_page)
            t1_list.append(t1)
            t1.start()
        for _ in range(2):  # two level-2 parser threads
            t2 = Thread(target=self.parse_two_page)
            t2_list.append(t2)
            t2.start()
        for t in t1_list:
            t.join()
        for t in t2_list:
            t.join()
        # How many detail pages were processed in total.
        print('number:', self.number)

    def parse_html(self):
        """Leftover single-queue worker skeleton (unused by ``run``).

        NOTE(review): references ``self.lock``/``self.q``, which __init__
        never defines — calling this as-is would raise AttributeError.
        Fix over the original: the empty-queue path now releases the lock
        and exits instead of deadlocking.
        """
        while True:
            self.lock.acquire()
            if not self.q.empty():
                url = self.q.get()
                self.lock.release()
                headers = {'User-Agent': UserAgent().random}
                html = requests.get(url=url, headers=headers)
            else:
                self.lock.release()
                break
if __name__ == '__main__':  # fixed: original ' __main__' (leading space) never matched
    start_time = time.time()
    spider = TencentSpider()
    spider.run()
    end_time = time.time()
    # Report total crawl wall-clock time.
    print('time:%.2f' % (end_time - start_time))