Today was again spent learning and practicing how to use the database, but since we are still waiting for the data there was not much that could be done.
Since I am interested in web crawling, I followed a blog post and wrote a simple crawler with the Scrapy framework to scrape the danmaku (bullet comments) from Bilibili's bangumi (anime) section:
import scrapy
import json
import time
import os
import random
from bs4 import BeautifulSoup


class BangumiPageSpider(scrapy.Spider):
    """Fetches each season's page, saving the raw HTML and the embedded window.__INITIAL_STATE__ JSON."""
    name = "bangumipage"
    bangumipage_dirname = 'D:\\pycharm\\bangumi\\danmu\\data\\page'
    bangumilist_filename = 'D:\\pycharm\\bangumi\\danmu\\data\\bangumi_all.json'
    html_postfix = '.html'
    json_postfix = '.json'
    keystring = 'window.__INITIAL_STATE__='
    keyendl = ';'

    def start_requests(self):
        # Read the season list produced by the bangumilist spider and request each season's page.
        with open(self.bangumilist_filename, 'r', encoding='utf-8') as f:
            bangumi_all_list = json.load(fp=f)
        for bangumi in bangumi_all_list:
            time.sleep(0.1 + 0.1 * random.random())  # small random delay between requests
            yield scrapy.Request(url=bangumi['link'], meta=bangumi, callback=self.parse)

    def parse(self, response):
        # Save the prettified page HTML for later inspection.
        filename = os.path.join(self.bangumipage_dirname, str(response.meta['season_id']) + self.html_postfix)
        soup = BeautifulSoup(response.text, 'html.parser')
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(soup.prettify())
        self.log(f'Saved file {filename}')
        if response.status != 200:
            return
        # Pull the JSON assigned to window.__INITIAL_STATE__ out of the inline <script> tags.
        content_list = response.css('script::text').getall()
        content_list = [s[len(self.keystring):s.find(self.keyendl)] for s in content_list if self.keystring in s]
        content = json.loads(content_list[0])
        filename = os.path.join(self.bangumipage_dirname, str(response.meta['season_id']) + self.json_postfix)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(content, fp=f, ensure_ascii=False, indent=2)
        self.log(f'Saved file {filename}')
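The key step above is cutting the JSON assigned to window.__INITIAL_STATE__ out of the page's inline script. On a hypothetical script fragment (not part of the spider, just an illustration) the slicing works like this:

import json

# Hypothetical script text as it might appear inside a <script> tag.
script_text = 'window.__INITIAL_STATE__={"mediaInfo": {"ssId": 12345}, "epList": []};(function(){}());'

keystring = 'window.__INITIAL_STATE__='
keyendl = ';'

# Drop the assignment prefix and keep everything up to the first ';',
# which leaves the bare JSON object.
raw_json = script_text[len(keystring):script_text.find(keyendl)]
state = json.loads(raw_json)
print(state['mediaInfo']['ssId'])  # -> 12345

The second spider collects the full season list from the bangumi index API and writes it to bangumi_all.json: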
import scrapy
import json
import time
import random


class BangumilistSpider(scrapy.Spider):
    """Pages through the bilibili bangumi index API and saves the complete season list."""
    name = "bangumilist"
    bangumilist_filename = 'D:\\pycharm\\bangumi\\danmu\\data\\bangumi_all.json'
    bangumi_all_list = []
    start_urls = ['https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1']

    def parse(self, response):
        body = json.loads(response.body)
        one_page_list = body['data']['list']
        self.bangumi_all_list += one_page_list
        if body['data']['has_next']:
            # Request the next page of the paginated API.
            next_page_num = int(body['data']['num']) + 1
            next_page = f'https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page={next_page_num}&season_type=1&pagesize=20&type=1'
            time.sleep(0.1 + 0.1 * random.random())
            yield scrapy.Request(url=next_page, callback=self.parse)
        else:
            # Last page reached: dump the accumulated list to disk.
            with open(self.bangumilist_filename, 'w', encoding='utf-8') as f:
                json.dump(self.bangumi_all_list, fp=f, ensure_ascii=False, indent=2)
            self.log(f'Saved file {self.bangumilist_filename}')
import scrapy
import json
import time
import os
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class BangumiCommentSpider(scrapy.Spider):
    """Downloads the danmaku XML for every episode and converts it to JSON."""
    name = "bangumicomment"
    bangumipage_dirname = 'D:\\pycharm\\bangumi\\danmu\\data\\page'
    bangumicomment_dirname = 'D:\\pycharm\\bangumi\\danmu\\data\\page'
    bangumilist_filename = 'D:\\pycharm\\bangumi\\danmu\\data\\bangumi_all.json'
    html_postfix = '.html'
    json_postfix = '.json'
    xml_postfix = '.xml'
    keystring = 'window.__INITIAL_STATE__='
    keyendl = ';'
    comment_api = 'https://comment.bilibili.com/'
    header = {}
    ua = UserAgent()

    def start_requests(self):
        # Load the verified proxy list, one "host:port" per line.
        with open('D:\\pycharm\\bangumi\\verified_1.txt', 'r', encoding='utf-8') as f:
            PROXIES = f.readlines()
        files = os.listdir(self.bangumipage_dirname)
        for file in files:
            if self.json_postfix in file:
                with open(os.path.join(self.bangumipage_dirname, file), 'r', encoding='utf-8') as f:
                    bangumi = json.load(fp=f)
                ep_list = bangumi['epList']
                del bangumi['epList']
                for ep in ep_list:
                    # Skip episodes whose danmaku have already been saved.
                    bangumi_danmu_path = os.path.join(self.bangumicomment_dirname, str(bangumi['mediaInfo']['ssId']), str(ep['title']) + self.json_postfix)
                    if os.path.exists(bangumi_danmu_path):
                        self.log(f"Existing Path: {bangumi_danmu_path}")
                        continue
                    url = self.comment_api + str(ep['cid']) + '.xml'
                    # Pick a random proxy and User-Agent for each request;
                    # 'proxy' in meta is honoured by Scrapy's built-in HttpProxyMiddleware.
                    proxy = 'http://' + random.choice(PROXIES).strip()
                    useragent = self.ua.random
                    self.log(f"Proxy: {proxy} User-Agent: {useragent}")
                    time.sleep(5 + 1 * random.random())
                    yield scrapy.Request(url=url, meta={'bangumi': bangumi, 'ep': ep, 'proxy': proxy},
                                         headers={'User-Agent': useragent}, callback=self.parse)

    def parse(self, response):
        bangumi_season = response.meta['bangumi']['h1Title']
        bangumi_season_id = response.meta['bangumi']['mediaInfo']['ssId']
        bangumi_season_path = os.path.join(self.bangumicomment_dirname, str(bangumi_season_id))
        if not os.path.exists(bangumi_season_path):
            os.makedirs(bangumi_season_path)
        bangumi_episode = response.meta['ep']['titleFormat']
        bangumi_episode_num = response.meta['ep']['title']
        # Save the raw danmaku XML first.
        bangumi_episode_path = os.path.join(bangumi_season_path, str(bangumi_episode_num) + self.xml_postfix)
        soup = BeautifulSoup(response.text, 'lxml')
        with open(bangumi_episode_path, 'w', encoding='utf-8') as f:
            f.write(soup.prettify())
        self.log(f'Saved file {bangumi_episode_path}')
        # Every <d> element is one danmaku; its 'p' attribute packs the metadata as a comma-separated list.
        item_list = soup.find_all('d')
        danmu_list = []
        for item in item_list:
            danmu = {}
            attributes = item['p'].split(',')
            danmu['bangumi_season'] = bangumi_season
            danmu['bangumi_episode'] = bangumi_episode
            danmu['bangumi_episode_num'] = bangumi_episode_num
            danmu['appear_timestamp'] = float(attributes[0])   # offset in the video, in seconds
            danmu['mode'] = int(attributes[1])                 # display mode (scroll / top / bottom ...)
            danmu['size'] = int(attributes[2])                 # font size
            danmu['color'] = int(attributes[3])                # colour as a decimal RGB value
            danmu['send_timestamp'] = int(attributes[4])       # unix time when the danmaku was sent
            danmu['danmu_pool'] = int(attributes[5])
            danmu['user_hash'] = str(attributes[6])
            danmu['global_id'] = int(attributes[7])
            danmu['text'] = item.text
            # Convert the in-video offset to a readable hour/minute/second string.
            danmu['second'] = int(danmu['appear_timestamp'])
            danmu['minute'] = danmu['second'] // 60
            danmu['second'] = danmu['second'] % 60
            danmu['hour'] = danmu['minute'] // 60
            danmu['minute'] = danmu['minute'] % 60
            temp_time = str(danmu['second']) + "秒"
            if danmu['minute'] > 0:
                temp_time = str(danmu['minute']) + "分" + temp_time
            if danmu['hour'] > 0:
                temp_time = str(danmu['hour']) + "时" + temp_time
            danmu['appear_time'] = temp_time
            danmu_list.append(danmu)
        # Dump the parsed danmaku next to the XML.
        bangumi_danmu_path = os.path.join(bangumi_season_path, str(bangumi_episode_num) + self.json_postfix)
        with open(bangumi_danmu_path, 'w', encoding='utf-8') as f:
            json.dump(danmu_list, fp=f, ensure_ascii=False, indent=2)
        self.log(f'Saved file {bangumi_danmu_path}')
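The three spiders depend on each other's output: the list spider writes bangumi_all.json, the page spider turns it into per-season JSON, and the comment spider reads those to download the danmaku. They therefore have to be run in that order; a minimal way to do this from a script (a sketch, assuming the Scrapy project is set up so that scrapy crawl <name> works from the project directory):

import subprocess

# Run the spiders in dependency order: list -> page -> comment.
for spider_name in ['bangumilist', 'bangumipage', 'bangumicomment']:
    subprocess.run(['scrapy', 'crawl', spider_name], check=True)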
There are three spider files, which crawl the anime pages, the season list, and the danmaku respectively; a proxy pool was also used. The data finally crawled is as follows:
A simple word cloud was made from the danmaku:
Through studying the Scrapy framework I have become more familiar with writing website crawlers. Next, I will keep waiting for the data that the backend teammates are obtaining.
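A word cloud like this can be produced from the saved per-episode JSON roughly as follows (a minimal sketch, assuming the jieba and wordcloud packages; the font path and output file name are placeholders and need a locally available CJK font):

import json
import os
import jieba
from wordcloud import WordCloud

danmu_dir = 'D:\\pycharm\\bangumi\\danmu\\data\\page'  # where the comment spider saved its JSON

# Collect the text of every danmaku from the per-episode JSON files.
texts = []
for root, _, files in os.walk(danmu_dir):
    for name in files:
        if not name.endswith('.json'):
            continue
        with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):  # per-episode files are lists of danmaku dicts
            texts += [d['text'] for d in data if isinstance(d, dict) and 'text' in d]

# Segment the Chinese text with jieba, then render the word cloud.
words = ' '.join(jieba.cut(' '.join(texts)))
wc = WordCloud(font_path='msyh.ttc', width=1200, height=800, background_color='white')
wc.generate(words)
wc.to_file('danmu_wordcloud.png')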