import threading
import time
from queue import Empty, Queue

import pandas as pd
import requests
from lxml import etree
# Shared HTTP request headers: a desktop Chrome User-Agent so the site
# serves the normal page instead of rejecting the default requests UA.
kv = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
# 获取列表页
def get_html(url):
response = requests.get(url, headers=kv).content.decode('utf-8')
tree = etree.HTML(response)
detail_url_list = set(tree.xpath('.//em[@class="name"]/a/@href'))
print(len(detail_url_list))
print(detail_url_list)
for detail_url in detail_url_list:
print('正在爬取{}'.format('https://' + detail_url[2:]))
q.put('https://' + detail_url[2:])
# Producer: pulls detail-page URLs off ``q``, downloads and parses each
# page, and hands the parsed tree to the consumers via ``q_tree``.
class Production(threading.Thread):
    def __init__(self, name=None):
        super().__init__()
        # Overrides threading.Thread's auto-generated name for log output.
        self.name = name

    def run(self):
        """Fetch pages until the URL queue is drained."""
        while True:
            try:
                # Non-blocking get closes the empty()-then-get() race the
                # original had: with several producers, two could both pass
                # an empty() check and one would block forever on get().
                detail_url = q.get_nowait()
            except Empty:
                print('生产者{}结束'.format(self.name))
                break
            try:
                self.get_detail(detail_url)
            except Exception:
                # Best-effort scraping: a page that fails to download or
                # parse is skipped rather than killing the thread.
                pass

    def get_detail(self, detail_url=None):
        """Download *detail_url* and enqueue ``(parsed_tree, url)``.

        Called with no argument it pops a URL from ``q`` itself, which keeps
        the original zero-argument call style working.
        """
        if detail_url is None:
            detail_url = q.get()
        response = requests.get(detail_url, headers=kv).content.decode('utf-8')
        tree = etree.HTML(response)
        q_tree.put((tree, detail_url))
# Consumer: takes parsed trees from ``q_tree`` and extracts game fields
# into ``msg_list``.
class Consumer(threading.Thread):
    def __init__(self, i=None):
        super().__init__()
        # Overrides threading.Thread's auto-generated name for log output.
        self.name = i

    def run(self):
        while True:
            # ``flag`` is set by flow() once all producers finished; exit
            # only when there is also nothing left to parse.
            if q_tree.empty() and flag:
                print('消费者{}结束'.format(self.name))
                break
            try:
                self.parse_data()
            except Empty:
                # Another consumer grabbed the item first within the 0.5s
                # timeout window; loop back and re-check the exit condition.
                continue
            except Exception:
                # Best-effort: skip pages whose layout does not match the
                # expected XPaths instead of killing the thread.
                pass

    def parse_data(self):
        """Pop one ``(tree, url)`` pair from ``q_tree`` and append the
        extracted record to ``msg_list``.

        Raises queue.Empty if no item arrives within 0.5s.
        """
        tree, detail_url = q_tree.get(timeout=0.5)
        dic = {}
        # Game id: last path segment minus 4 chars. NOTE(review): for a
        # '123.html' segment this keeps the trailing '.' — confirm whether
        # [:-5] was intended; behavior kept as-is.
        dic['game_id'] = detail_url.split('/')[-1][:-4]
        # Game name
        dic['game_name'] = tree.xpath('.//h1[@class="name"]/text()')[0]
        # Game introduction. The original '"".strip().join(...)' contained a
        # dead .strip() call ('"".strip()' is just '""'); simplified.
        dic['game_introduce'] = "".join(tree.xpath('.//div[@class="bd txtCon"]/p/text()'))
        # Game icon link
        dic['game_ico'] = tree.xpath('.//div[@class="gameDesc"]/img/@src')[0]
        # Game rating
        dic['game_grade'] = tree.xpath('.//div[@class="card"]/p[@class="score"]/text()')[0]
        # Number of raters
        dic['grade_person'] = tree.xpath('.//div[@class="card"]/p[@class="num"]/text()')[0]
        # Number of players — some pages lack the second 'num' node.
        try:
            dic['play_num'] = tree.xpath('.//p[@class="num"]/text()')[1]
        except IndexError:
            dic['play_num'] = '暂无数据'
        # The info table lists version / size / OS requirement in fixed cell
        # order; evaluate the XPath once instead of three times.
        cells = tree.xpath('.//div[@class="gameTable"]//td/text()')
        dic['game_versions'] = cells[0]
        dic['game_size'] = cells[1]
        dic['versions_require'] = cells[2]
        msg_list.append(dic)
        print('消费者线程{},解析{}完成'.format(self.name, dic['game_name']))
# CSV export
def save_csv(file_name, rows=None):
    """Write scraped records to ``<file_name>.csv`` (no index column).

    :param file_name: output file name without the '.csv' suffix.
    :param rows: optional list of record dicts; defaults to the module-level
                 ``msg_list`` filled by the consumer threads.
    """
    if rows is None:
        rows = msg_list
    df = pd.DataFrame(rows,
                      columns=['game_id', 'game_name', 'game_introduce', 'game_ico', 'game_grade', 'grade_person',
                               'play_num', 'game_size', 'game_versions',
                               'versions_require'])
    # index=False (the original passed the non-idiomatic 0) drops the row index.
    df.to_csv('{}.csv'.format(file_name), index=False)
# Pipeline orchestration: start producers and consumers, wait, save CSV.
def flow(file_name):
    """Run the producer/consumer pipeline over the URLs already queued in
    ``q`` and save the results to ``<file_name>.csv``.
    """
    # Exit signal for consumers; True once all producers have finished.
    global flag
    flag = False
    # NOTE(review): the original primed q with the integers 0..9 here; the
    # producers then tried to fetch them as URLs and failed silently. Removed.
    # Producer thread list, kept so we can join them below.
    production_list = []
    # Start up to 10 producer threads (the original range(1000, 1011)
    # started 11, contradicting its own "10 threads" comments).
    for production_name in range(1000, 1010):
        if q.empty():
            break
        production = Production(str(production_name))
        production.start()
        production_list.append(production)
    # Consumer thread list, kept so we can join them below.
    consumer_list = []
    # Three consumer threads parse pages concurrently.
    for consumer_name in range(1, 4):
        consumer = Consumer(consumer_name)
        consumer.start()
        consumer_list.append(consumer)
    # Wait for every producer to drain the URL queue.
    for production_thread in production_list:
        production_thread.join()
    # All producers are done, so no more trees will arrive: signal the
    # consumers to finish draining q_tree and exit. (The original only set
    # the flag when q happened to be empty, which could hang the consumers.)
    flag = True
    print('生产者执行完毕.')
    # Wait for the consumers to finish parsing.
    for consumer_thread in consumer_list:
        consumer_thread.join()
    # Persist the collected records.
    save_csv(file_name)
if __name__ == '__main__':
    # Shared state for the pipeline: results list plus the two queues.
    msg_list = []
    # URL queue filled by get_html(), drained by the producers.
    q = Queue()
    # Parsed-tree queue filled by the producers, drained by the consumers.
    q_tree = Queue()
    # Hot-games ranking
    url1 = 'https://www.3839.com/top/hot.html'
    url2 = 'https://www.3839.com/top/sugar.html'
    # Map each menu choice to its (list-page URL, output file name).
    targets = {
        '1': (url1, '好游快爆人气榜'),
        '2': (url2, '好游快爆飙升榜'),
    }
    choice = input('请选择要爬取的网站\n 1.好游快爆人气榜\n 2.好游快爆飙升榜\n')
    if choice in targets:
        target_url, file_name = targets[choice]
        # Queue the detail-page URLs, then run the threaded pipeline.
        get_html(target_url)
        flow(file_name)
    else:
        print('输入错误')
# Multithreaded scraping example
# (blog footer: latest recommended article published 2022-02-21 22:09:31)