Python 3 Crawler: Multithreading and Word Cloud

Compiled in late October 2020, dedicated to those unwilling to settle for the ordinary.
For more enterprise-grade crawler material, see:

https://blog.csdn.net/weixin_45316122/article/details/109840745

Tip: this is a pure demo. Where the heart goes, the results follow.

 

# -*- coding: utf-8 -*-
# Author       :   szy
# Create Date  :   2019/11/29

Building a random User-Agent helper. Two approaches:

Method 1: the fake_useragent package (pip install fake-useragent):

# from fake_useragent import UserAgent
# ua = UserAgent()
# for _ in range(20):
#     print(ua.random)

Method 2: maintain a list of UA strings by hand and pick one at random (import random is required here):

import random

def get_random_ua() -> str:
    ua_list = [
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
        "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
        "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8J2",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; OMNIA7)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; XBLWP7; ZuneWP7)",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
        "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
        "Mozilla/4.0 (compatible; MSIE 60; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; TheWorld)"]
    # random.choice avoids the IndexError risk of ua_list[random.randint(0, len(ua_list))]
    return random.choice(ua_list)

print(get_random_ua())

First, why this project from GitHub? Because it is good: add some comments and a few touches of my own, and it becomes mine. Haha (modular design, multithreading, logging, word-cloud template).

GitHub repo: https://github.com/TM0831/Spiders/tree/master/Bilibili

 

Task

Let's write a Python crawler to scrape the danmaku (bullet comments) from a Bilibili live stream!

Analysis:

First open Bilibili and find the live room for the League of Legends match. The link I got was https://live.bilibili.com/6?broadcast_type=0&visit_id=8abcmywu95s0#/. The broadcast_type and visit_id in this link are randomly generated, but that does not affect the crawl; all we need is the room link.

Open the developer tools, switch to the Network tab and tick XHR. Among the requests you will find https://api.live.bilibili.com/ajax/msg. It takes four parameters (roomid, csrf_token, csrf, visit_id): roomid is the id of the live room, csrf_token and csrf can be copied from the browser, and visit_id is left empty. Each response contains ten danmaku entries, including the comment text, the sender's nickname and so on. So to collect more danmaku, we just keep sending this request!
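To confirm the endpoint behaves as described, here is a minimal one-off probe (a sketch: room 6 is the room from the link above, and the csrf placeholders must be replaced with values copied from your own browser session):

import requests

# single request to the danmaku endpoint; the csrf values are placeholders
resp = requests.post(
    "https://api.live.bilibili.com/ajax/msg",
    headers={"Referer": "https://live.bilibili.com/6"},
    data={
        "roomid": "6",
        "csrf_token": "<copied from browser>",
        "csrf": "<copied from browser>",
        "visit_id": "",
    },
)
# each of the ten entries carries the text, timestamp and sender nickname
for item in resp.json()["data"]["room"]:
    print(item["timeline"], item["nickname"], item["text"])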

Implementation:

The analysis above shows that scraping Bilibili live danmaku is fairly easy, but collecting danmaku in volume calls for multithreading, and the scraped danmaku have to be saved promptly; I chose MongoDB for that. I run four threads to crawl and two threads to parse and save the data, with a queue passing data between them.

The full code follows, with detailed comments.
 

import re
import time
import jieba
import logging
import pymongo
import requests
import threading
from queue import Queue
from collections import Counter
# you need to install one extra package: pip install wordcloud
from wordcloud import WordCloud


MONGO_HOST = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DB = "Spiders"
MONGO_COL = "bilibili"

logging.basicConfig(filename="run.log", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO,
                    format="%(asctime)s - %(name)s - %(module)s: %(message)s")

"""
建了两个类 CrawlThread 和 ParseThread,CrawThread 是用于爬取弹幕的线程,ParseThread 是用于解析
和保存弹幕的线程,两个类都继承了 threading.Thread,并重写了 run() 方法。
"""

# CrawlThread is the thread that crawls the danmaku
class CrawlThread(threading.Thread):
    def __init__(self, url: str, name: str, data_queue: Queue):
        """
        initial function
        :param url: room url
        :param name: thread name
        :param data_queue: data queue
        """

        super(CrawlThread, self).__init__()  # multithreading by subclassing threading.Thread
        self.room_url = url
        self.room_id = re.findall(r"/(\d+)\?", url)[0]  # the regex returns a list; [0] takes the first match
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "https://live.bilibili.com",
            "Referer": "",
            "Sec-Fetch-Mode": "cors",
            "UserAgent": get_random_ua()#导入上面的get_random_ua()函数
        }
        self.name = name
        self.data_queue = data_queue

    def run(self):
        """
        send request and receive response
        :return:
        """
        while 1:
            try:
                time.sleep(1)
                msg_url = "https://api.live.bilibili.com/ajax/msg"
                # set referer
                self.headers["Referer"] = self.room_url
                # set data
                data = {
                    "roomid": self.room_id,
                    "csrf_token": "e7433feb8e629e50c8c316aa52e78cb2",
                    "csrf": "e7433feb8e629e50c8c316aa52e78cb2",
                    "visit_id": ""
                }
                res = requests.post(msg_url, headers=self.headers, data=data)
                self.data_queue.put(res.json()["data"]["room"])  # enqueue the danmaku list from the JSON response
            except Exception as e:
                logging.error("%s: %s", self.name, e)  # let logging format the thread name and error

# ParseThread is the thread that parses and saves the danmaku
class ParseThread(threading.Thread):
    def __init__(self, url: str, name: str, data_queue: Queue):
        """
        initial function
        :param url: room url
        :param name: thread name
        :param data_queue: data queue
        """
        super(ParseThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.room_id = re.findall(r"/(\d+)\?", url)[0]
        client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)  # connect to MongoDB
        self.col = client[MONGO_DB][MONGO_COL + self.room_id]

    def run(self):
        """
        get data from queue
        :return:
        """
        while 1:
            comments = self.data_queue.get()
            logging.info("Comment count: {}".format(len(comments)))
            self.parse(comments)

    def parse(self, comments):
        """
        parse comment to get message
        :return:
        """
        for x in comments:
            comment = {
                "text": x["text"],
                "time": x["timeline"],
                "username": x["nickname"],
                "user_id": x["uid"]
            }
            # print(comment)
            self.save_msg(comment)

    def save_msg(self, msg: dict):
        """
        save comment to MongoDB
        :param msg: comment
        :return:
        """
        try:  # deduplicating on write would improve this part; see the sketch below
            self.col.insert_one(msg)
        except Exception as e:
            logging.info(msg)
            logging.error(e)
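    # A possible improvement, sketched under the assumption that (user_id, time,
    # text) identifies a comment: use update_one() with upsert=True instead of
    # insert_one(), so re-crawled danmaku overwrite themselves rather than pile up.
    # (pymongo's old save() method is deprecated, hence the upsert.)
    def save_msg_dedup(self, msg: dict):
        """
        save comment to MongoDB without duplicates (sketch, not in the original)
        :param msg: comment
        :return:
        """
        try:
            self.col.update_one(
                {"user_id": msg["user_id"], "time": msg["time"], "text": msg["text"]},
                {"$set": msg},
                upsert=True,
            )
        except Exception as e:
            logging.info(msg)
            logging.error(e)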

# create the crawl threads
def create_crawl_thread(url: str, data_queue: Queue):
    """
    create thread to crawl comments
    :param url: room url
    :param data_queue: data queue
    :return:
    """
    crawl_name = ['crawler_1', 'crawler_2', 'crawler_3', 'crawler_4']
    for name in crawl_name:
        crawl_list.append(CrawlThread(url, name, data_queue))

# create the parse threads
def create_parse_thread(url: str, data_queue: Queue):
    """
    create thread to parse comments
    :param url: room url
    :param data_queue: data queue
    :return:
    """
    parse_name = ['parser_1', 'parser_2']
    for name in parse_name:
        parse_list.append(ParseThread(url, name, data_queue))


def is_chinese(word: str) -> bool:
    """
    check whether the text contains any Chinese character
    :param word: word
    :return:
    """
    for ch in word:
        if '\u4e00' <= ch <= '\u9fff':
            return True
    return False


def get_words(txt: str) -> str:
    """
    use jieba to cut words
    :param txt: input text
    :return:
    """
    # cut words
    seg_list = jieba.cut(txt)
    c = Counter()
    # count words
    for x in seg_list:
        if len(x) > 1 and x != '\r\n':
            c[x] += 1
    result = ""
    for (k, v) in c.most_common(300):
        # print('%s %d' % (k, v))
        result += "\n" + k
    return result


def cut_text(url: str):
    """
    query data from database
    :param url: room url
    :return:
    """
    room_id = re.findall(r"/(\d+)\?", url)[0]
    client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    col = client[MONGO_DB][MONGO_COL + room_id]
    # query
    data = [i["text"] for i in col.find({}, {"_id": 0, "text": 1})]
    txt = ""
    for text in data:
        for x in text:
            if x.isalpha() or is_chinese(x):
                txt += x
    jieba.load_userdict("userdict.txt")
    text = get_words(txt)
    generate_word_cloud(text)


def generate_word_cloud(text):
    """
    generate word cloud
    :param text: text
    :return:
    """
    # text cleaning
    with open("stopwords.txt", "r", encoding='utf-8') as f:
        stopwords = set(f.read().split("\n"))
    wc = WordCloud(
        font_path="font.ttf",
        background_color="white",
        width=1200,
        height=800,
        max_words=100,
        max_font_size=200,
        min_font_size=10,
        stopwords=stopwords,  # words filtered out of the cloud
    )
    # generate word cloud
    wc.generate("".join(text))
    # save as an image
    wc.to_file("rng_vs_skt.png")


if __name__ == "__main__":
    # the room href
    href = "https://live.bilibili.com/6?broadcast_type=0&visit_id=8abcmywu95s0#/"
    # create queue
    queue = Queue()
    crawl_list, parse_list = [], []
    create_crawl_thread(href, queue)
    create_parse_thread(href, queue)
    logging.info("Crawl Start!")
    # thread start
    for i in crawl_list:
        i.start()
    for i in parse_list:
        i.start()
    # wait for the threads (they loop forever; see the note below)
    for i in crawl_list:
        i.join()
    for i in parse_list:
        i.join()
    cut_text(href)
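One caveat: both thread classes loop with while 1, so join() never returns and cut_text(href) is never actually reached; as written, you stop the script by hand and then call cut_text() separately. A minimal time-bounded alternative (a sketch; RUN_SECONDS is an assumed knob, not part of the original):

# replace the start/join section of __main__ with a fixed-duration run
RUN_SECONDS = 600  # assumption: crawl for ten minutes, then build the word cloud
for t in crawl_list + parse_list:
    t.daemon = True  # daemon threads exit together with the main thread
    t.start()
time.sleep(RUN_SECONDS)
cut_text(href)  # note: danmaku still waiting in the queue at this point are dropped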
