整理于2020年10月下旬,献给不甘平凡的你
更多企业级爬虫知识请查收于:
https://blog.csdn.net/weixin_45316122/article/details/109840745
Trick:纯demo,心在哪里,结果就在那里
# -*- coding: utf-8 -*-
# Author : szy
# Create Date : 2019/11/29
#请求头函数构造
方式1
# from fake_useragent import UserAgent
# import random
# ua = UserAgent()
# for i in range(20):
# # print(ua.random)
方式2
def get_random_ua() -> str:
    """
    Return a randomly chosen User-Agent string from a built-in pool.

    :return: one User-Agent header value
    """
    # bug fix: `import random` only existed commented-out above (方式1),
    # so the module-level call below raised NameError; import locally here
    import random

    ua_list = [
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
        "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
        "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8J2",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; OMNIA7)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; XBLWP7; ZuneWP7)",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
        "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
        # bug fix: "MSIE 60" is not a valid IE version token; should be "MSIE 6.0"
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; TheWorld)"]
    # bug fix: random.randint(0, len(ua_list)) is inclusive at BOTH ends, so it
    # could return len(ua_list) and raise IndexError; random.choice is exact
    return random.choice(ua_list)
print(get_random_ua())
首先说明为什么选择 GitHub 上的这篇文章:因为写得好。稍微做些注释,加点自己的东西,便是属于我的了。哈哈哈(模块化、多线程、日志、词云模板)
GitHub 地址:https://github.com/TM0831/Spiders/tree/master/Bilibili
任务
Python 写个爬虫来爬取 B 站直播时的弹幕吧!
爬取分析:
首先打开 Bilibili,然后找到英雄联盟比赛的直播间:我得到的直播间的链接为:https://live.bilibili.com/6?broadcast_type=0&visit_id=8abcmywu95s0#/,这个链接中的 broadcast_type 和 visit_id 是随机生成的,不过对我们的爬取也没影响,只要找到直播间的链接就好了。 打开开发者工具,切换到 NetWork,点选上 XHR,在其中能找到一个请求:https://api.live.bilibili.com/ajax/msg。这个请求需要四个参数(roomid,csrf_token,csrf,visit_id),其中 roomid 为直播间的 id,csrf_token 和 csrf 可以从浏览器上 copy,visit_id 为空。该请求返回的结果中包含十条弹幕信息,包括弹幕内容、弹幕发送人昵称等等。所以要获得更多弹幕内容,我们只需要一直发送这个请求就 OK 了!
爬虫实现:
通过前面的分析可以发现要爬取 B 站直播弹幕还是很轻松的,但是要得到大量弹幕可能就需要考虑使用多线程了。对于爬取到的弹幕,还要及时地保存下来,这里我选择使用 MongoDB 数据库来保存弹幕信息。在爬取直播弹幕的时候,我开了四个线程来爬取,开了两个线程来解析和保存数据,线程之间使用队列来处理数据。
接下来代码演示。注释很详细
import re
import time
import jieba
import logging
import pymongo
import requests
import threading
from queue import Queue
from collections import Counter
#需要你下载一个包,再pip安装
from wordcloud import WordCloud
# MongoDB connection settings: local server, database "Spiders",
# collection name prefix "bilibili" (the room id is appended per room)
MONGO_HOST = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DB = "Spiders"
MONGO_COL = "bilibili"
# log to run.log at INFO level with timestamped entries
logging.basicConfig(filename="run.log", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO,
                    format="%(asctime)s - %(name)s - %(module)s: %(message)s")
"""
建了两个类 CrawlThread 和 ParseThread,CrawThread 是用于爬取弹幕的线程,ParseThread 是用于解析
和保存弹幕的线程,两个类都继承了 threading.Thread,并重写了 run() 方法。
"""
# CrawlThread is the worker thread that crawls the live-stream comments
class CrawlThread(threading.Thread):
    """Worker thread that polls the Bilibili live-comment API once per
    second and pushes each raw comment batch onto the shared queue."""

    def __init__(self, url: str, name: str, data_queue: Queue):
        """
        initial function
        :param url: room url
        :param name: thread name
        :param data_queue: data queue shared with the parser threads
        """
        super(CrawlThread, self).__init__()  # multithreading via threading.Thread subclass
        self.room_url = url
        # the numeric room id is the first path segment, e.g. ".../6?..."
        self.room_id = re.findall(r"/(\d+)\?", url)[0]
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "https://live.bilibili.com",
            "Referer": "",
            "Sec-Fetch-Mode": "cors",
            # bug fix: the HTTP header is spelled "User-Agent"; the original
            # key "UserAgent" meant the random UA was never actually used
            "User-Agent": get_random_ua()
        }
        self.name = name
        self.data_queue = data_queue

    def run(self):
        """
        send request and receive response
        :return:
        """
        msg_url = "https://api.live.bilibili.com/ajax/msg"
        # set referer once; it never changes between requests
        self.headers["Referer"] = self.room_url
        # the POST payload is loop-invariant, so build it once outside the loop
        data = {
            "roomid": self.room_id,
            "csrf_token": "e7433feb8e629e50c8c316aa52e78cb2",
            "csrf": "e7433feb8e629e50c8c316aa52e78cb2",
            "visit_id": ""
        }
        while 1:
            try:
                time.sleep(1)
                # timeout added so a stalled connection cannot hang the thread forever
                res = requests.post(msg_url, headers=self.headers, data=data, timeout=10)
                # each response carries the latest ten comments under data.room
                self.data_queue.put(res.json()["data"]["room"])
            except Exception as e:
                # bug fix: logging.error(self.name, e) misuses %-style lazy args
                # (treats e as a format argument for self.name); pass a format string
                logging.error("%s: %s", self.name, e)
# ParseThread is the worker thread that parses and saves the comments
class ParseThread(threading.Thread):
    """Worker thread that drains the shared queue, extracts the useful
    fields from each raw comment, and persists them to MongoDB."""

    def __init__(self, url: str, name: str, data_queue: Queue):
        """
        initial function
        :param url: room url
        :param name: thread name
        :param data_queue: data queue shared with the crawler threads
        """
        super(ParseThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.room_id = re.findall(r"/(\d+)\?", url)[0]
        client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)  # connect to MongoDB
        # one collection per room: prefix + room id
        self.col = client[MONGO_DB][MONGO_COL + self.room_id]

    def run(self):
        """
        get data from queue
        :return:
        """
        while 1:
            comments = self.data_queue.get()
            # lazy %-args instead of .format(): formatting only happens if emitted
            logging.info("Comment count: %s", len(comments))
            self.parse(comments)

    def parse(self, comments):
        """
        parse comment to get message
        :return:
        """
        for x in comments:
            comment = {
                "text": x["text"],
                "time": x["timeline"],
                "username": x["nickname"],
                "user_id": x["uid"]
            }
            self.save_msg(comment)

    def save_msg(self, msg: dict):
        """
        save comment to MongoDB, skipping exact duplicates
        :param msg: comment
        :return:
        """
        # bug fix (the original TODO asked for dedup here): the API returns the
        # latest ten comments on EVERY poll and four crawler threads poll
        # concurrently, so each comment arrives many times; an upsert keyed on
        # the full document stores it only once
        try:
            self.col.update_one(msg, {"$setOnInsert": msg}, upsert=True)
        except Exception as e:
            logging.info(msg)
            logging.error(e)
# build the crawler threads
def create_crawl_thread(url: str, data_queue: Queue):
    """
    create thread to crawl comments
    :param url: room url
    :param data_queue: data queue
    :return:
    """
    # four crawlers named crawler_1 .. crawler_4, registered in the
    # module-level crawl_list for the main block to start/join
    for idx in range(1, 5):
        crawl_list.append(CrawlThread(url, "crawler_{}".format(idx), data_queue))
# build the parser threads
def create_parse_thread(url: str, data_queue: Queue):
    """
    create thread to parse comments
    :param url: room url
    :param data_queue: data queue
    :return:
    """
    # two parsers named parser_1 and parser_2, registered in the
    # module-level parse_list for the main block to start/join
    for idx in range(1, 3):
        parse_list.append(ParseThread(url, "parser_{}".format(idx), data_queue))
def is_chinese(word: str) -> bool:
    """
    Return True if *word* contains at least one Chinese character
    (CJK Unified Ideographs, U+4E00..U+9FFF).
    :param word: word
    :return: whether any character is Chinese
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in word)
def get_words(txt: str) -> str:
    """
    Segment *txt* with jieba and return the 300 most frequent tokens,
    each preceded by a newline (so the result starts with "\\n" when
    non-empty, matching the original accumulation order).
    :param txt: input text
    :return: newline-prefixed frequent words
    """
    # count every multi-character token, skipping literal CR/LF pairs
    counts = Counter(
        tok for tok in jieba.cut(txt)
        if len(tok) > 1 and tok != '\r\n'
    )
    # most_common() already yields in descending frequency
    return "".join("\n" + word for word, _ in counts.most_common(300))
def cut_text(url: str):
    """
    Load all saved comment texts for the room from MongoDB, keep only
    alphabetic and Chinese characters, segment them, and render the
    resulting frequent words as a word cloud.
    :param url: room url
    :return:
    """
    room_id = re.findall(r"/(\d+)\?", url)[0]
    client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    col = client[MONGO_DB][MONGO_COL + room_id]
    # keep letters and Chinese characters only; join once instead of +=
    kept = []
    for doc in col.find({}, {"_id": 0, "text": 1}):
        kept.extend(ch for ch in doc["text"] if ch.isalpha() or is_chinese(ch))
    txt = "".join(kept)
    # custom dictionary improves jieba's segmentation of domain terms
    jieba.load_userdict("userdict.txt")
    generate_word_cloud(get_words(txt))
def generate_word_cloud(text):
    """
    Render *text* as a word-cloud image and save it to rng_vs_skt.png.
    :param text: newline-separated words (as produced by get_words)
    :return:
    """
    # stop-word list: one word per line in stopwords.txt
    with open("stopwords.txt", "r", encoding='utf-8') as fp:
        stop_words = set(fp.read().split("\n"))
    cloud = WordCloud(
        font_path="font.ttf",
        background_color="white",
        width=1200,
        height=800,
        max_words=100,
        max_font_size=200,
        min_font_size=10,
        stopwords=stop_words,  # words to exclude from the cloud
    )
    # joining a str over "" reproduces it unchanged; kept for exact parity
    cloud.generate("".join(text))
    # write the rendered cloud out as a PNG
    cloud.to_file("rng_vs_skt.png")
if __name__ == "__main__":
    # the room href
    href = "https://live.bilibili.com/6?broadcast_type=0&visit_id=8abcmywu95s0#/"
    # create the queue shared between crawler and parser threads
    queue = Queue()
    # registries the create_* helpers append thread objects into
    crawl_list, parse_list = [], []
    create_crawl_thread(href, queue)
    create_parse_thread(href, queue)
    logging.info("Crawl Start!")
    # thread start
    for i in crawl_list:
        i.start()
    for i in parse_list:
        i.start()
    # thread run
    # NOTE(review): every worker thread runs `while 1` and never exits, so
    # these join() calls block forever and cut_text() below is unreachable
    # unless the process is interrupted — consider a stop event, a crawl
    # duration limit, or daemon threads. TODO confirm intended lifecycle.
    for i in crawl_list:
        i.join()
    for i in parse_list:
        i.join()
    cut_text(href)