Today was again spent learning and practicing how to use the database, but since we are still waiting for the data there was not much that could be done.
Since I am interested in web crawling, I followed a blog post and wrote a simple crawler with the Scrapy framework to scrape the danmaku (bullet comments) from Bilibili's bangumi (anime) section:
import scrapy
import json
import time
import os
import random
from bs4 import BeautifulSoup


class BangumiPageSpider(scrapy.Spider):
    """Fetches each season's page, saving the raw HTML and the embedded window.__INITIAL_STATE__ JSON."""
    name = "bangumipage"
    bangumipage_dirname = 'D:\\pycharm\\bangumi\\danmu\\data\\page'
    bangumilist_filename = 'D:\\pycharm\\bangumi\\danmu\\data\\bangumi_all.json'
    html_postfix = '.html'
    json_postfix = '.json'
    keystring = 'window.__INITIAL_STATE__='
    keyendl = ';'

    def start_requests(self):
        # Read the season list produced by the bangumilist spider and request each season's page.
        with open(self.bangumilist_filename, 'r', encoding='utf-8') as f:
            bangumi_all_list = json.load(fp=f)
        for bangumi in bangumi_all_list:
            time.sleep(0.1 + 0.1 * random.random())  # small random delay between requests
            yield scrapy.Request(url=bangumi['link'], meta=bangumi, callback=self.parse)

    def parse(self, response):
        # Save the prettified page HTML for later inspection.
        filename = os.path.join(self.bangumipage_dirname, str(response.meta['season_id']) + self.html_postfix)
        soup = BeautifulSoup(response.text, 'html.parser')
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(soup.prettify())
        self.log(f'Saved file {filename}')
        if response.status != 200:
            return
        # Pull the JSON assigned to window.__INITIAL_STATE__ out of the inline <script> tags.
        content_list = response.css('script::text').getall()
        content_list = [s[len(self.keystring):s.find(self.keyendl)] for s in content_list if self.keystring in s]
        content = json.loads(content_list[0])
        filename = os.path.join(self.bangumipage_dirname, str(response.meta['season_id']) + self.json_postfix)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(content, fp=f, ensure_ascii=False, indent=2)
        self.log(f'Saved file {filename}')
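The key step above is cutting the JSON assigned to window.__INITIAL_STATE__ out of the page's inline script. On a hypothetical script fragment (not part of the spider, just an illustration) the slicing works like this:

import json

# Hypothetical script text as it might appear inside a <script> tag.
script_text = 'window.__INITIAL_STATE__={"mediaInfo": {"ssId": 12345}, "epList": []};(function(){}());'

keystring = 'window.__INITIAL_STATE__='
keyendl = ';'

# Drop the assignment prefix and keep everything up to the first ';',
# which leaves the bare JSON object.
raw_json = script_text[len(keystring):script_text.find(keyendl)]
state = json.loads(raw_json)
print(state['mediaInfo']['ssId'])  # -> 12345

The second spider collects the full season list from the bangumi index API and writes it to bangumi_all.json: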
import scrapy
import json
import time
import random


class BangumilistSpider(scrapy.Spider):
    """Pages through the bilibili bangumi index API and saves the complete season list."""
    name = "bangumilist"
    bangumilist_filename = 'D:\\pycharm\\bangumi\\danmu\\data\\bangumi_all.json'
    bangumi_all_list = []
    start_urls = ['https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1']

    def parse(self, response):
        body = json.loads(response.body)
        one_page_list = body['data']['list']
        self.bangumi_all_list += one_page_list
        if body['data']['has_next']:
            # Request the next page of the paginated API.
            next_page_num = int(body['data']['num']) + 1
            next_page = f'https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page={next_page_num}&season_type=1&pagesize=20&type=1'
            time.sleep(0.1 + 0.1 * random.random())
            yield scrapy.Request(url=next_page, callback=self.parse)
        else:
            # Last page reached: dump the accumulated list to disk.
            with open(self.bangumilist_filename, 'w', encoding='utf-8') as f:
                json.dump(self.bangumi_all_list, fp=f, ensure_ascii=False, indent=2)
            self.log(f'Saved file {self.bangumilist_filename}')
import scrapy
import json
import time
import os
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class BangumiCommentSpider(scrapy.Spider):
    """Downloads the danmaku XML for every episode and converts it to JSON."""
    name = "bangumicomment"
    bangumipage_dirname = 'D:\\pycharm\\bangumi\\danmu\\data\\page'
    bangumicomment_dirname = 'D:\\pycharm\\bangumi\\danmu\\data\\page'
    bangumilist_filename = 'D:\\pycharm\\bangumi\\danmu\\data\\bangumi_all.json'
    html_postfix = '.html'
    json_postfix = '.json'
    xml_postfix = '.xml'
    keystring = 'window.__INITIAL_STATE__='
    keyendl = ';'
    comment_api = 'https://comment.bilibili.com/'
    header = {}
    ua = UserAgent()

    def start_requests(self):
        # Load the verified proxy list, one "host:port" per line.
        with open('D:\\pycharm\\bangumi\\verified_1.txt', 'r', encoding='utf-8') as f:
            PROXIES = f.readlines()
        files = os.listdir(self.bangumipage_dirname)
        for file in files:
            if self.json_postfix in file:
                with open(os.path.join(self.bangumipage_dirname, file), 'r', encoding='utf-8') as f:
                    bangumi = json.load(fp=f)
                ep_list = bangumi['epList']
                del bangumi['epList']
                for ep in ep_list:
                    # Skip episodes whose danmaku have already been saved.
                    bangumi_danmu_path = os.path.join(self.bangumicomment_dirname, str(bangumi['mediaInfo']['ssId']), str(ep['title']) + self.json_postfix)
                    if os.path.exists(bangumi_danmu_path):
                        self.log(f"Existing Path: {bangumi_danmu_path}")
                        continue
                    url = self.comment_api + str(ep['cid']) + '.xml'
                    # Pick a random proxy and User-Agent for each request;
                    # 'proxy' in meta is honoured by Scrapy's built-in HttpProxyMiddleware.
                    proxy = 'http://' + random.choice(PROXIES).strip()
                    useragent = self.ua.random
                    self.log(f"Proxy: {proxy} User-Agent: {useragent}")
                    time.sleep(5 + 1 * random.random())
                    yield scrapy.Request(url=url, meta={'bangumi': bangumi, 'ep': ep, 'proxy': proxy},
                                         headers={'User-Agent': useragent}, callback=self.parse)

    def parse(self, response):
        bangumi_season = response.meta['bangumi']['h1Title']
        bangumi_season_id = response.meta['bangumi']['mediaInfo']['ssId']
        bangumi_season_path = os.path.join(self.bangumicomment_dirname, str(bangumi_season_id))
        if not os.path.exists(bangumi_season_path):
            os.makedirs(bangumi_season_path)
        bangumi_episode = response.meta['ep']['titleFormat']
        bangumi_episode_num = response.meta['ep']['title']
        # Save the raw danmaku XML first.
        bangumi_episode_path = os.path.join(bangumi_season_path, str(bangumi_episode_num) + self.xml_postfix)
        soup = BeautifulSoup(response.text, 'lxml')
        with open(bangumi_episode_path, 'w', encoding='utf-8') as f:
            f.write(soup.prettify())
        self.log(f'Saved file {bangumi_episode_path}')
        # Every <d> element is one danmaku; its 'p' attribute packs the metadata as a comma-separated list.
        item_list = soup.find_all('d')
        danmu_list = []
        for item in item_list:
            danmu = {}
            attributes = item['p'].split(',')
            danmu['bangumi_season'] = bangumi_season
            danmu['bangumi_episode'] = bangumi_episode
            danmu['bangumi_episode_num'] = bangumi_episode_num
            danmu['appear_timestamp'] = float(attributes[0])   # offset in the video, in seconds
            danmu['mode'] = int(attributes[1])                 # display mode (scroll / top / bottom ...)
            danmu['size'] = int(attributes[2])                 # font size
            danmu['color'] = int(attributes[3])                # colour as a decimal RGB value
            danmu['send_timestamp'] = int(attributes[4])       # unix time when the danmaku was sent
            danmu['danmu_pool'] = int(attributes[5])
            danmu['user_hash'] = str(attributes[6])
            danmu['global_id'] = int(attributes[7])
            danmu['text'] = item.text
            # Convert the in-video offset to a readable hour/minute/second string.
            danmu['second'] = int(danmu['appear_timestamp'])
            danmu['minute'] = danmu['second'] // 60
            danmu['second'] = danmu['second'] % 60
            danmu['hour'] = danmu['minute'] // 60
            danmu['minute'] = danmu['minute'] % 60
            temp_time = str(danmu['second']) + "秒"
            if danmu['minute'] > 0:
                temp_time = str(danmu['minute']) + "分" + temp_time
            if danmu['hour'] > 0:
                temp_time = str(danmu['hour']) + "时" + temp_time
            danmu['appear_time'] = temp_time
            danmu_list.append(danmu)
        # Dump the parsed danmaku next to the XML.
        bangumi_danmu_path = os.path.join(bangumi_season_path, str(bangumi_episode_num) + self.json_postfix)
        with open(bangumi_danmu_path, 'w', encoding='utf-8') as f:
            json.dump(danmu_list, fp=f, ensure_ascii=False, indent=2)
        self.log(f'Saved file {bangumi_danmu_path}')
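The three spiders depend on each other's output: the list spider writes bangumi_all.json, the page spider turns it into per-season JSON, and the comment spider reads those to download the danmaku. They therefore have to be run in that order; a minimal way to do this from a script (a sketch, assuming the Scrapy project is set up so that scrapy crawl <name> works from the project directory):

import subprocess

# Run the spiders in dependency order: list -> page -> comment.
for spider_name in ['bangumilist', 'bangumipage', 'bangumicomment']:
    subprocess.run(['scrapy', 'crawl', spider_name], check=True)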
There are three spider files, which crawl the anime pages, the season list, and the danmaku respectively; a proxy pool was also used. The data finally crawled is as follows:
A simple word cloud was made from the danmaku:
Through studying the Scrapy framework I have become more familiar with writing website crawlers. Next, I will keep waiting for the data that the backend teammates are obtaining.
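A word cloud like this can be produced from the saved per-episode JSON roughly as follows (a minimal sketch, assuming the jieba and wordcloud packages; the font path and output file name are placeholders and need a locally available CJK font):

import json
import os
import jieba
from wordcloud import WordCloud

danmu_dir = 'D:\\pycharm\\bangumi\\danmu\\data\\page'  # where the comment spider saved its JSON

# Collect the text of every danmaku from the per-episode JSON files.
texts = []
for root, _, files in os.walk(danmu_dir):
    for name in files:
        if not name.endswith('.json'):
            continue
        with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):  # per-episode files are lists of danmaku dicts
            texts += [d['text'] for d in data if isinstance(d, dict) and 'text' in d]

# Segment the Chinese text with jieba, then render the word cloud.
words = ' '.join(jieba.cut(' '.join(texts)))
wc = WordCloud(font_path='msyh.ttc', width=1200, height=800, background_color='white')
wc.generate(words)
wc.to_file('danmu_wordcloud.png')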