# 生产者消费者模式
# 队列
import threading,requests
from queue import Queue
import pymongo
# 创建生产者类,作用:访问页面,获取数据
# Producer class: fetches a listing page and parses its job postings.
class Productor(threading.Thread):
    """Producer thread.

    Pops page URLs off ``page_queue``, fetches each page from the Tencent
    careers API and pushes one dict per job posting onto ``data_queue``.

    NOTE: the class name keeps the original typo ("Productor") because
    callers construct it by that name.
    """

    def __init__(self, page_queue, data_queue):
        # Initialise the Thread machinery before storing our queues.
        threading.Thread.__init__(self)
        self.page_queue = page_queue  # Queue[str]: URLs still to crawl
        self.data_queue = data_queue  # Queue[dict]: parsed postings

    def run(self):
        # Drain the URL queue until it is exhausted.  get_nowait()/Empty
        # replaces the original empty()-then-get() pair, which is racy:
        # with several producers, two threads could both see one remaining
        # URL, one would take it, and the other would block in get() forever.
        while True:
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.get_content(url)

    def get_content(self, url):
        """Fetch *url*, parse its postings and enqueue them.

        Relies on the module-level ``headers`` dict defined in ``__main__``.
        """
        response = requests.get(url=url, headers=headers)
        posts = response.json()['Data']['Posts']
        for post in posts:
            # Keep only the stored fields; key names are the Mongo document
            # schema ('syq' = 事业群 / business group, from 'BGName').
            dic = {
                'RecruitPostName': post['RecruitPostName'],
                'syq': post['BGName'],
                'LocationName': post['LocationName'],
                'LastUpdateTime': post['LastUpdateTime'],
            }
            # Hand the parsed posting over to the consumers.
            self.data_queue.put(dic)
# 创建消费者类,作用:从队列中取出数据,并保存
# Consumer class: takes parsed postings off the queue and persists them.
class Consumer(threading.Thread):
    """Consumer thread.

    Pops posting dicts off ``data_queue`` and writes them to MongoDB.
    Relies on module-level globals defined in ``__main__``: ``col`` (the
    Mongo collection), ``client`` (the Mongo client) and ``switch``
    (flipped to 1 once every producer has finished).
    """

    def __init__(self, data_queue, page_queue):
        threading.Thread.__init__(self)
        self.data_queue = data_queue  # Queue[dict]: postings to persist
        self.page_queue = page_queue  # Queue[str]: checked only for shutdown

    def run(self):
        while True:
            # Exit once the producers are done (switch == 1) and both
            # queues have been drained.
            if self.data_queue.empty() and self.page_queue.empty() and switch == 1:
                break
            try:
                # The timeout guards against producers dying mid-run.
                # Catch only queue.Empty — the original bare ``except``
                # also silently swallowed Mongo/print errors.
                data = self.data_queue.get(timeout=10)
            except Empty:
                break
            print(data)
            self.save(data)

    def save(self, data):
        # Collection.insert() was removed in pymongo 4.x; insert_one() is
        # the supported replacement for a single document.
        col.insert_one(data)

    def __del__(self):
        # Best effort: close the shared Mongo client when a consumer is
        # garbage-collected.
        client.close()
if __name__ == '__main__':
    # Shutdown flag polled by Consumer.run(): stays 0 while producers work,
    # flipped to 1 once every producer thread has been joined.
    switch = 0
    # Connect to the local MongoDB server.
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    # Select the database ...
    db = client['tencent']
    # ... and the collection the consumers write into.
    col = db['zhaopin']
    # Request headers shared by all producers (read as a module-level
    # global inside Productor.get_content).
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # Paged endpoint of the Tencent careers API; {} is the page index.
    base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageIndex={}&pageSize=10'
    # Queue of parsed postings waiting to be written to Mongo.
    data_queue = Queue(1000)
    # Queue of page URLs for the producers to fetch.
    page_queue = Queue(100)
    # Enqueue pages 1..50 up front; the producers drain this queue.
    for i in range(1, 51):
        page_queue.put(base_url.format(i))
    p_list = []
    # Start three producer threads, keeping handles so we can join them.
    for i in range(3):
        p = Productor(page_queue, data_queue)
        p.start()
        p_list.append(p)
    # Start three consumer threads (not joined; they exit on their own
    # once switch is set and the queues are empty).
    for i in range(3):
        c = Consumer(data_queue, page_queue)
        c.start()
    # Wait for every producer to finish, then raise the shutdown flag so
    # the consumers can stop once the data queue is drained.
    for p in p_list:
        p.join()
    switch = 1
    pass
# 爬虫 腾讯招聘-多线程 (Scraper: Tencent recruitment, multi-threaded)
# 最新推荐文章于 2024-10-08 12:37:10 发布 — blog trailer from the original
# copy-paste, kept as a comment so the file remains valid Python.