首先要跟大家讲一下什么是生产消费模式,顾名思义就是要有一个生产者生产某个东西,然后将这个东西存放到一个地方等待被消费者消费掉,总的来说生产者和消费者之间还需要一个缓冲区
- 有点抽象,举个例子
比如我们写信,我们本人就是生产者,信就是我们的生产所得物,然后我们把信放入信箱,而信箱就是存放的地方也叫做缓冲区,最后信箱中的信由邮递员取出进行邮递 (这里的邮递员就是消费者,取出信就是消费过程)
- 生产消费模式爬虫
刚才简单讲了一下什么是生产消费模式,那么生产消费模式的爬虫我们就可以理解为:在爬取的时候由生产者将解析提取的数据放入队列,然后消费者将队列中的数据存入数据库 (这里的生产者所做的事情:发送请求---接收响应---解析响应提取数据---放入缓冲区(队列),消费者所做的事情:从队列中取出数据---保存数据)
多线程这里就不多解释了,网上有很多讲解的。
- 接下来看代码
爬取的网站是 “好游快爆” 的排行榜top100的游戏详情、评论等内容 url: https://www.3839.com/top/hot.html
导入需要的模块
import requests
import threading
import os, re, json
from queue import Queue
from lxml import etree
我们先创建生产者类 Game_Top_Producer并继承 Thread,注释有解释
# Producer: pulls game detail-page URLs off a queue, scrapes each page plus
# its first comment (via the site's comment AJAX endpoint), and pushes one
# dict of results per game onto game_info_queue for the consumer threads.
class Game_Top_Producer(threading.Thread):
    # Shared request headers: a browser User-Agent so the site serves normal pages.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }

    def __init__(self, game_urls_queue, game_info_queue, *args, **kwargs):
        """game_urls_queue: queue of game detail-page URLs to scrape.
        game_info_queue: queue the scraped data dicts are put on for consumers."""
        super(Game_Top_Producer, self).__init__(*args, **kwargs)
        self.game_urls_queue = game_urls_queue
        self.game_info_queue = game_info_queue
        self.game_info_dict = {}  # data for the game currently being parsed

    def run(self):
        # Drain the URL queue; this thread exits once it is empty.
        while True:
            if self.game_urls_queue.empty():
                break
            url = self.game_urls_queue.get()
            self.parse_game_info(url)

    def parse_game_info(self, url):
        """Scrape one game detail page and enqueue the extracted fields."""
        # BUG FIX: the original passed the headers dict as requests.get()'s
        # second positional argument, which is `params`, not `headers`,
        # so the User-Agent was never actually sent.
        response = requests.get(url, headers=self.headers)
        response.encoding = "utf-8"  # the site serves utf-8
        html = etree.HTML(response.text)
        # The game id is embedded in the detail-page URL as the second number
        # (the first is the "3839" domain digits). Some URLs carry no id.
        # BUG FIX: the original indexed [1] before checking for emptiness,
        # which raised IndexError instead of returning when the id was missing.
        numbers = re.findall(r"\d+", url)
        if len(numbers) < 2:
            return
        game_id = numbers[1]
        # BUG FIX: start a fresh dict for every game. The original reused one
        # shared dict object, so every entry already sitting on the queue was
        # mutated by later pages and consumers saw duplicated/overwritten data.
        self.game_info_dict = {"game_id": game_id}
        # Game name
        self.game_info_dict["game_name"] = "".join(
            html.xpath("//div[@class='area']//h1/text()"))
        # Game logo image URL; some games have none — skip those entirely.
        game_logo_pic = "".join(
            html.xpath("//div[@class='area']//div[@class='gameDesc']/img/@src"))
        if not game_logo_pic:
            return
        if "https" not in game_logo_pic:  # protocol-relative URL needs a scheme
            game_logo_pic = "https:" + game_logo_pic
        self.game_info_dict["game_logo_pic"] = game_logo_pic
        # Game description
        self.game_info_dict["game_introduce"] = "".join(
            html.xpath("//div[@class='area']//div[@id='zinfoc4']/text()"))
        # Game score
        self.game_info_dict["game_score"] = "".join(
            html.xpath("//div[@class='area']//div[@class='card']/p[2]/text()"))
        # Comment count
        self.game_info_dict["game_comment_count"] = "".join(
            html.xpath("//div[@class='area']//div[@class='tabArea']"
                       "/a[@onclick='sel_tab(2)']/span/text()"))
        # Comments are not in the page; they come from an AJAX endpoint keyed
        # by the game id and returning JSON.
        game_comment_detail_link = (
            "https://www.3839.com/cdn/comment/view_v2-ac-json-pid-1-fid-"
            + game_id + "-p-1-order-1-htmlsafe-1-urltype-1-audit-1.htm")
        self.parse_game_comment(game_comment_detail_link)
        # Hand the finished record to the consumers.
        self.game_info_queue.put(self.game_info_dict)

    def parse_game_comment(self, game_omment_detail_link):
        """Fetch the comment AJAX endpoint and merge the first comment's
        fields into self.game_info_dict. Returns silently on bad/empty data."""
        # BUG FIX: headers were passed positionally (as params) in the original.
        response = requests.get(game_omment_detail_link, headers=self.headers)
        # The endpoint returns \uXXXX-escaped JSON; decode it to readable text.
        json_text = response.text.encode('utf-8').decode('unicode_escape')
        try:
            json_comment_dict = json.loads(json_text)
        except ValueError:  # malformed/empty JSON — narrower than bare Exception
            return
        # BUG FIX: a game with zero comments made ["content"][0] raise and
        # killed the whole producer thread; bail out instead.
        try:
            first_comment = json_comment_dict["content"][0]
        except (KeyError, IndexError, TypeError):
            return
        self.game_info_dict['comment_id'] = first_comment['id']
        user_id = first_comment['uid']
        self.game_info_dict['user_id'] = user_id
        self.game_info_dict['username'] = first_comment['username']
        # The avatar URL is derived from the user id.
        self.game_info_dict['portrait'] = "http:" + "//imga.3839.com/{}".format(user_id)
        self.game_info_dict['comment_create_time'] = first_comment['time']
        self.game_info_dict['comment_content'] = first_comment['comment']
        # The reply list may be absent; default to zero replies.
        try:
            self.game_info_dict['reply_count'] = len(first_comment['reply'])
        except (KeyError, TypeError):
            self.game_info_dict['reply_count'] = 0
- 定义消费者类并继承 Thread
# Consumer: takes scraped game dicts off game_info_queue and persists them —
# the record itself to a text file, plus the game logo and the commenter's
# avatar as image files.
class Game_Top_Consumer(threading.Thread):
    def __init__(self, game_urls_queue, game_info_queue, *args, **kwargs):
        """game_urls_queue: the producers' URL queue (checked only for emptiness).
        game_info_queue: queue of scraped-data dicts to persist."""
        super(Game_Top_Consumer, self).__init__(*args, **kwargs)
        self.game_urls_queue = game_urls_queue
        self.game_info_queue = game_info_queue
        # BUG FIX: the original exists()-then-makedirs() is a TOCTOU race —
        # with several consumer threads constructed concurrently, two can pass
        # the check and one makedirs() raises FileExistsError. exist_ok=True
        # makes directory creation idempotent.
        os.makedirs("D:\\爬虫测试", exist_ok=True)
        os.makedirs("D:\\爬虫测试\\游戏logo图片", exist_ok=True)
        os.makedirs("D:\\爬虫测试\\用户头像", exist_ok=True)

    @staticmethod
    def _safe_filename(name):
        # BUG FIX: strip every character Windows forbids in filenames; the
        # original only removed '*' from usernames, so a game or user name
        # containing e.g. ':' or '?' made open() fail.
        return re.sub(r'[\\/:*?"<>|]', "", name)

    def run(self):
        while True:
            # NOTE(review): these empty() checks are racy — a consumer can
            # exit while a producer is still parsing, or block in get() after
            # the last item. Kept from the original design; a sentinel-based
            # shutdown would be the robust fix.
            if self.game_urls_queue.empty() and self.game_info_queue.empty():
                break
            info_json = self.game_info_queue.get()
            game_name = info_json["game_name"]
            # BUG FIX: the original wrote str(dict) — Python repr, not JSON —
            # with no record separator, so the comment "json格式写入" was false
            # and the file was unparseable. Write real JSON, one line per record.
            with open("D:\\爬虫测试\\game_info.txt", "a+", encoding='utf-8') as f:
                f.write(json.dumps(info_json, ensure_ascii=False).replace("<br>", "") + "\n")
            print("%s游戏详情保存完成" % game_name)
            # Download and save the game logo.
            logo_path = "D:\\爬虫测试\\游戏logo图片\\{}".format(
                self._safe_filename(game_name) + ".jpg")
            with open(logo_path, "wb") as f:
                f.write(requests.get(info_json["game_logo_pic"]).content)
            print("%s 游戏logo下载并保存完成" % game_name)
            # Download and save the commenter's avatar.
            username = self._safe_filename(str(info_json["username"]))
            print(username)
            portrait_path = "D:\\爬虫测试\\用户头像\\{}".format(username + ".jpg")
            with open(portrait_path, "wb") as f:
                f.write(requests.get(info_json["portrait"]).content)
            print("%s 用户头像下载并保存完成" % username)
定义一个主方法,用于获取所有详情链接,并创建、启动多个线程
创建两个队列,一个是game_urls_queue用于存放详情url,一个是存放详情内容数据给消费者消费
# Entry routine: fetch the top-100 ranking page, queue every game's detail
# URL, then start the producer and consumer thread pools.
def main(start_url):
    """Crawl the ranking page at start_url and launch 4 producer threads
    (scrape detail pages) and 4 consumer threads (persist the results)."""
    game_urls_queue = Queue(100)  # detail-page URLs (top 100, so 100 fits)
    game_info_queue = Queue(100)  # scraped data waiting to be consumed
    # BUG FIX: this was the only request in the script sent without the
    # browser User-Agent, which the site may reject; reuse the producer's
    # headers for consistency.
    res = requests.get(start_url, headers=Game_Top_Producer.headers)
    res.encoding = "utf-8"
    html = etree.HTML(res.text)
    hrefs = html.xpath("//ul[@class='ranking-game ranking-list']/li/a[1]/@href")
    for href in hrefs:  # hrefs are protocol-relative; prepend a scheme
        game_urls_queue.put("http:" + str(href))
    # Start 4 producer threads (the original comment said 3, but range(4)
    # has always started 4).
    for _ in range(4):
        Game_Top_Producer(game_urls_queue, game_info_queue).start()
    # Start 4 consumer threads.
    for _ in range(4):
        Game_Top_Consumer(game_urls_queue, game_info_queue).start()
# Script entry point: start the crawl from the hot-games ranking page.
if __name__ == "__main__":
    main("https://www.3839.com/top/hot.html")
好了,今天就到这里,谢谢。。。!