@TOC
分析
我们这里搜索Python的职位
打开控制台看数据
从图中可以看到这个请求有我们所需要的数据
代码实现
这里要用到 MongoDB 数据库，所以需要先安装 pymongo 模块：
pip install pymongo
多线程
import threading
import requests
import pymongo
from queue import Queue
class Tencent(threading.Thread):
    """Worker thread that fetches job-listing pages and stores them in MongoDB.

    Each worker pulls page numbers from a shared queue, requests that page
    from the query API at ``url`` (keyword ``python``, 10 postings per page)
    and upserts every returned posting keyed by its ``PostId``.
    """

    def __init__(self, url, name, q_task):
        """Store the crawl settings and open this worker's MongoDB handle.

        Args:
            url: Endpoint of the job-query API.
            name: Label for this thread, shown in the progress output.
            q_task: Queue of page numbers shared by all workers.
        """
        super().__init__()
        self.url = url
        self.name = name
        self.q_task = q_task
        # 1. Create the client connection.
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # Select the database.
        self.db = self.client['tencent_data']

    def write_to_mongo(self, data):
        """Upsert a single posting into the collection, matched on 'PostId'.

        Args:
            data: Mapping describing one posting; must contain 'PostId'.
        """
        # Collection.update() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; update_one() is the supported single-document upsert.
        self.db['招聘信息'].update_one(
            {'PostId': data['PostId']}, {'$set': data}, upsert=True
        )

    def get_rep(self, page, timeout=10):
        """Request one result page from the API.

        Args:
            page: Page index to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the worker forever.

        Returns:
            The ``requests`` response object.
        """
        # Request headers.
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.6.1000'
        }
        params = {
            'timestamp': '1599786948721',
            'countryId': '',
            'cityId': '',
            'bgIds': '',
            'productId': '',
            'categoryId': '',
            'parentCategoryId': '',
            'attrId': '',
            'keyword': 'python',
            'pageIndex': str(page),
            'pageSize': '10',
            'language': 'zh-cn',
            'area': 'cn',
        }
        return requests.get(
            self.url, headers=headers, params=params, timeout=timeout
        )

    def parse_json(self, json_data):
        """Store every posting found under ``json_data['Data']['Posts']``.

        A missing or null 'Data' / 'Posts' entry is treated as an empty page
        so that one page without results does not kill the worker thread.
        """
        posts = (json_data.get('Data') or {}).get('Posts') or []
        for data in posts:
            self.write_to_mongo(data)

    def parse_page(self, page):
        """Fetch page ``page``, decode its JSON body and store the postings."""
        rep = self.get_rep(page).json()
        self.parse_json(rep)

    def run(self):
        """Process page numbers from the shared queue until it is empty."""
        # Local import: the file only imports ``Queue`` from ``queue``.
        from queue import Empty

        while True:
            # A separate empty() check followed by a blocking get() is racy:
            # another worker may take the last page in between, leaving this
            # thread blocked forever. get_nowait() makes the step atomic.
            try:
                page = self.q_task.get_nowait()
            except Empty:
                break
            # Fetch and parse the page.
            print(f"=======================第{page}页==========={self.name}")
            self.parse_page(page)
if __name__ == '__main__':
    # Endpoint of the job-query API that every worker requests.
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query'

    # Fill the shared task queue with page numbers 1 through 99.
    q_page = Queue()
    for page in range(1, 100):
        q_page.put(page)

    # Labels for the worker threads.
    threading_names = ["线程1", "线程2", "线程3", "线程4"]

    # Create one worker per label and start it immediately.
    for name in threading_names:
        Tencent(url, name, q_page).start()
生产者和消费者
import threading
import requests
import pymongo
from queue import Queue
#生产者
class Product(threading.Thread):
    """Producer thread: downloads result pages and queues their JSON.

    Page numbers are taken from ``q_page``; each decoded response is put on
    the module-level ``q_json`` queue, which must be defined before the
    thread is started.
    """

    def __init__(self, url, q_page, name):
        """Store the crawl settings.

        Args:
            url: Endpoint of the job-query API.
            q_page: Queue of page numbers shared by all producers.
            name: Label for this thread, shown in the progress output.
        """
        super().__init__()
        self.url = url
        self.q_page = q_page
        self.name = name

    def get_rep(self, page, timeout=10):
        """Request one result page from the API.

        Args:
            page: Page index to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the producer forever.

        Returns:
            The ``requests`` response object.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.6.1000'
        }
        params = {
            'timestamp': '1599786948721',
            'countryId': '',
            'cityId': '',
            'bgIds': '',
            'productId': '',
            'categoryId': '',
            'parentCategoryId': '',
            'attrId': '',
            'keyword': 'python',
            'pageIndex': str(page),
            'pageSize': '10',
            'language': 'zh-cn',
            'area': 'cn',
        }
        return requests.get(
            self.url, headers=headers, params=params, timeout=timeout
        )

    def parse_page(self, page):
        """Fetch page ``page`` and return its decoded JSON body."""
        return self.get_rep(page).json()

    def run(self):
        """Download pages until ``q_page`` is empty, queueing each result."""
        # Local import: the file only imports ``Queue`` from ``queue``.
        from queue import Empty

        while True:
            # A separate empty() check followed by a blocking get() is racy:
            # another producer may take the last page in between, leaving
            # this thread blocked forever. get_nowait() makes the step atomic.
            try:
                page = self.q_page.get_nowait()
            except Empty:
                break
            # Fetch and decode the page.
            print(f"|=======================第{page}页==========={self.name}| ")
            # Hand the decoded JSON to the consumers.
            q_json.put(self.parse_page(page))
#消费者
class Consumer(threading.Thread):
    """Consumer thread: takes decoded pages off ``q_json`` and stores them.

    Reads two module-level names that must be defined before the thread is
    started: the ``q_json`` queue and the boolean ``flag`` that is set to
    True once all producers have finished.
    """

    def __init__(self, name):
        """Record the thread label and open this worker's MongoDB handle.

        Args:
            name: Label for this thread, shown in the progress output.
        """
        super().__init__()
        self.name = name
        # 1. Create the client connection.
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # Select the database.
        self.db = self.client['tencent_data']

    def write_to_mongo(self, data):
        """Upsert a single posting into the collection, matched on 'PostId'.

        Args:
            data: Mapping describing one posting; must contain 'PostId'.
        """
        # Collection.update() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; update_one() is the supported single-document upsert.
        self.db['招聘信息'].update_one(
            {'PostId': data['PostId']}, {'$set': data}, upsert=True
        )

    def parse_json(self, json_data):
        """Store every posting found under ``json_data['Data']['Posts']``.

        A missing or null 'Data' / 'Posts' entry is treated as an empty page.
        """
        posts = (json_data.get('Data') or {}).get('Posts') or []
        for data in posts:
            self.write_to_mongo(data)

    def run(self):
        """Save queued pages until the queue is drained and producers are done."""
        # Local import: the file only imports ``Queue`` from ``queue``.
        from queue import Empty

        while True:
            # Stop only when nothing is left AND no producer can add more.
            if q_json.empty() and flag:
                break
            try:
                # A short blocking wait replaces the non-blocking get that
                # spun at full CPU speed while the queue was empty.
                json_data = q_json.get(timeout=0.1)
            except Empty:
                continue
            print(f"|============{self.name}正在保存======================|")
            # Parse and save. A failing page is reported instead of being
            # swallowed silently, and the consumer keeps going.
            try:
                self.parse_json(json_data)
            except Exception as exc:
                print(f"{self.name} failed to save a page: {exc!r}")
if __name__ == '__main__':
    # Polling flag read by Consumer.run: False until every producer is done.
    # It has to stay a module-level name (as does q_json) because the
    # Product / Consumer threads look both up as globals.
    flag = False
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
    q_page = Queue()
    q_json = Queue()
    # Fill the task queue with page numbers 1 through 99.
    for page in range(1, 100):
        q_page.put(page)
    # Labels for the producer and consumer threads.
    Product_names = ["生产者1", "生产者2", "生产者3", "生产者4"]
    Consumer_names = ["消费者1", "消费者2", "消费者3", "消费者4"]
    # Keep a reference to every thread so it can be joined later.
    crawl_p = []
    crawl_c = []
    # Start the threads.
    for name in Product_names:
        p = Product(url, q_page, name)
        p.start()
        crawl_p.append(p)
    for name in Consumer_names:
        c = Consumer(name)
        c.start()
        crawl_c.append(c)
    # Block until every producer has finished (a plain loop, since join()
    # is called only for its side effect).
    for p in crawl_p:
        p.join()
    # Tell the consumers no more pages will arrive; this must happen before
    # joining them, otherwise they would never leave their loop.
    flag = True
    for c in crawl_c:
        c.join()