import threading
import time
from queue import Empty, Queue

import pandas as pd
import requests
from lxml import etree
# Shared HTTP request headers: a desktop Chrome User-Agent so the site
# serves the normal page instead of rejecting the default requests UA.
kv = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
# 获取列表页
def get_html(url):
response = requests.get(url, headers=kv).content.decode('utf-8')
tree = etree.HTML(response)
detail_url_list = set(tree.xpath('.//em[@class="name"]/a/@href'))
print(len(detail_url_list))
print(detail_url_list)
for detail_url in detail_url_list:
print('正在爬取{}'.format('https://' + detail_url[2:]))
q.put('https://' + detail_url[2:])
# Producer: pulls detail-page URLs off ``q``, downloads and parses each
# page, and hands the parsed tree to the consumers via ``q_tree``.
class Production(threading.Thread):
    def __init__(self, name=None):
        super().__init__()
        # Overrides threading.Thread's auto-generated name for log output.
        self.name = name

    def run(self):
        """Fetch pages until the URL queue is drained."""
        while True:
            try:
                # Non-blocking get closes the empty()-then-get() race the
                # original had: with several producers, two could both pass
                # an empty() check and one would block forever on get().
                detail_url = q.get_nowait()
            except Empty:
                print('生产者{}结束'.format(self.name))
                break
            try:
                self.get_detail(detail_url)
            except Exception:
                # Best-effort scraping: a page that fails to download or
                # parse is skipped rather than killing the thread.
                pass

    def get_detail(self, detail_url=None):
        """Download *detail_url* and enqueue ``(parsed_tree, url)``.

        Called with no argument it pops a URL from ``q`` itself, which keeps
        the original zero-argument call style working.
        """
        if detail_url is None:
            detail_url = q.get()
        response = requests.get(detail_url, headers=kv).content.decode('utf-8')
        tree = etree.HTML(response)
        q_tree.put((tree, detail_url))
# Consumer: takes parsed trees from ``q_tree`` and extracts game fields
# into ``msg_list``.
class Consumer(threading.Thread):
    def __init__(self, i=None):
        super().__init__()
        # Overrides threading.Thread's auto-generated name for log output.
        self.name = i

    def run(self):
        while True:
            # ``flag`` is set by flow() once all producers finished; exit
            # only when there is also nothing left to parse.
            if q_tree.empty() and flag:
                print('消费者{}结束'.format(self.name))
                break
            try:
                self.parse_data()
            except Empty:
                # Another consumer grabbed the item first within the 0.5s
                # timeout window; loop back and re-check the exit condition.
                continue
            except Exception:
                # Best-effort: skip pages whose layout does not match the
                # expected XPaths instead of killing the thread.
                pass

    def parse_data(self):
        """Pop one ``(tree, url)`` pair from ``q_tree`` and append the
        extracted record to ``msg_list``.

        Raises queue.Empty if no item arrives within 0.5s.
        """
        tree, detail_url = q_tree.get(timeout=0.5)
        dic = {}
        # Game id: last path segment minus 4 chars. NOTE(review): for a
        # '123.html' segment this keeps the trailing '.' — confirm whether
        # [:-5] was intended; behavior kept as-is.
        dic['game_id'] = detail_url.split('/')[-1][:-4]
        # Game name
        dic['game_name'] = tree.xpath('.//h1[@class="name"]/text()')[0]
        # Game introduction. The original '"".strip().join(...)' contained a
        # dead .strip() call ('"".strip()' is just '""'); simplified.
        dic['game_introduce'] = "".join(tree.xpath('.//div[@class="bd txtCon"]/p/text()'))
        # Game icon link
        dic['game_ico'] = tree.xpath('.//div[@class="gameDesc"]/img/@src')[0]
        # Game rating
        dic['game_grade'] = tree.xpath('.//div[@class="card"]/p[@class="score"]/text()')[0]
        # Number of raters
        dic['grade_person'] = tree.xpath('.//div[@class="card"]/p[@class="num"]/text()')[0]
        # Number of players — some pages lack the second 'num' node.
        try:
            dic['play_num'] = tree.xpath('.//p[@class="num"]/text()')[1]
        except IndexError:
            dic['play_num'] = '暂无数据'
        # The info table lists version / size / OS requirement in fixed cell
        # order; evaluate the XPath once instead of three times.
        cells = tree.xpath('.//div[@class="gameTable"]//td/text()')
        dic['game_versions'] = cells[0]
        dic['game_size'] = cells[1]
        dic['versions_require'] = cells[2]
        msg_list.append(dic)
        print('消费者线程{},解析{}完成'.format(self.name, dic['game_name']))
# CSV export
def save_csv(file_name, rows=None):
    """Write scraped records to ``<file_name>.csv`` (no index column).

    :param file_name: output file name without the '.csv' suffix.
    :param rows: optional list of record dicts; defaults to the module-level
                 ``msg_list`` filled by the consumer threads.
    """
    if rows is None:
        rows = msg_list
    df = pd.DataFrame(rows,
                      columns=['game_id', 'game_name', 'game_introduce', 'game_ico', 'game_grade', 'grade_person',
                               'play_num', 'game_size', 'game_versions',
                               'versions_require'])
    # index=False (the original passed the non-idiomatic 0) drops the row index.
    df.to_csv('{}.csv'.format(file_name), index=False)
# Pipeline orchestration: start producers and consumers, wait, save CSV.
def flow(file_name):
    """Run the producer/consumer pipeline over the URLs already queued in
    ``q`` and save the results to ``<file_name>.csv``.
    """
    # Exit signal for consumers; True once all producers have finished.
    global flag
    flag = False
    # NOTE(review): the original primed q with the integers 0..9 here; the
    # producers then tried to fetch them as URLs and failed silently. Removed.
    # Producer thread list, kept so we can join them below.
    production_list = []
    # Start up to 10 producer threads (the original range(1000, 1011)
    # started 11, contradicting its own "10 threads" comments).
    for production_name in range(1000, 1010):
        if q.empty():
            break
        production = Production(str(production_name))
        production.start()
        production_list.append(production)
    # Consumer thread list, kept so we can join them below.
    consumer_list = []
    # Three consumer threads parse pages concurrently.
    for consumer_name in range(1, 4):
        consumer = Consumer(consumer_name)
        consumer.start()
        consumer_list.append(consumer)
    # Wait for every producer to drain the URL queue.
    for production_thread in production_list:
        production_thread.join()
    # All producers are done, so no more trees will arrive: signal the
    # consumers to finish draining q_tree and exit. (The original only set
    # the flag when q happened to be empty, which could hang the consumers.)
    flag = True
    print('生产者执行完毕.')
    # Wait for the consumers to finish parsing.
    for consumer_thread in consumer_list:
        consumer_thread.join()
    # Persist the collected records.
    save_csv(file_name)
if __name__ == '__main__':
    # Shared state for the pipeline: results list plus the two queues.
    msg_list = []
    # URL queue filled by get_html(), drained by the producers.
    q = Queue()
    # Parsed-tree queue filled by the producers, drained by the consumers.
    q_tree = Queue()
    # Hot-games ranking
    url1 = 'https://www.3839.com/top/hot.html'
    url2 = 'https://www.3839.com/top/sugar.html'
    # Map each menu choice to its (list-page URL, output file name).
    targets = {
        '1': (url1, '好游快爆人气榜'),
        '2': (url2, '好游快爆飙升榜'),
    }
    choice = input('请选择要爬取的网站\n 1.好游快爆人气榜\n 2.好游快爆飙升榜\n')
    if choice in targets:
        target_url, file_name = targets[choice]
        # Queue the detail-page URLs, then run the threaded pipeline.
        get_html(target_url)
        flow(file_name)
    else:
        print('输入错误')
# Multithreaded scraping example
# (blog footer: latest recommended article published 2022-02-21 22:09:31)