bilibili crawler


Crawl bilibili video danmaku (bullet comment) data, hehe.

First, collect the videos on Bilibili's popular list. A video's cid is its unique identifier; the script below gathers each video's short link and saves it to bilibli_cid.txt, and the cid itself is extracted from the video page in a later step.

import os
import requests
import json
import time
import random

def crawl_bilibili_popular(start_page=1, total_pages=1000, ps=20):
    # request headers
    headers = {
        'Accept': '*/*',
        'Cookie': 'your_cookie_here',  # replace with your own Cookie
        'Origin': 'https://www.bilibili.com',
        'Referer': 'https://www.bilibili.com/v/popular/all/?spm_id_from=333.1007.0.0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }

    # if a record file exists, resume from the page it stores
    record_file = 'crawl_record.txt'
    start_page = load_record(record_file) if os.path.exists(record_file) else start_page

    # initialise the set of links that have already been collected
    existing_links = load_existing_links('bilibli_cid.txt')

    # crawl page by page
    for page in range(start_page, total_pages + 1):
        # short links of the videos on the current page
        short_links = []

        # build the request URL
        url = f'https://api.bilibili.com/x/web-interface/popular?ps={ps}&pn={page}&web_location=333.934'

        # sleep a random interval before sending the request
        random_sleep = random.uniform(1, 3)
        time.sleep(random_sleep)

        # send the GET request
        response = requests.get(url, headers=headers)

        # report progress
        print(f"Fetching page {page}...")

        # extract the short links if the request succeeded
        if response.status_code == 200:
            data = response.json()
            short_links = [item['short_link_v2'] for item in data['data']['list']]
            # report progress
            print(f"Page {page} fetched successfully, {len(short_links)} short links collected on this page.")
        else:
            print(f"Request for page {page} failed, status code:", response.status_code)
            # skip to the next page on failure
            continue

        # add links that have not been seen before
        new_links = set()
        for link in short_links:
            if link not in existing_links:
                existing_links.add(link)
                new_links.add(link)

        # append the new links to the output file
        if new_links:
            save_links_to_file('bilibli_cid.txt', new_links)

        # update the resume record
        update_record(record_file, page)

def load_record(record_file):
    # read the last crawled page number from the record file
    with open(record_file, 'r', encoding='utf-8') as f:
        start_page = int(f.read().strip())
    return start_page

def update_record(record_file, page):
    # write the current page number to the record file
    with open(record_file, 'w', encoding='utf-8') as f:
        f.write(str(page))

def load_existing_links(file_path):
    # load the links that have already been saved
    existing_links = set()
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            existing_links = set(line.strip() for line in f)
    return existing_links

def save_links_to_file(file_path, links):
    # append the new links to the file
    with open(file_path, 'a', encoding='utf-8') as f:
        for link in links:
            f.write(link + '\n')

if __name__ == "__main__":
    # crawl up to 1000 pages
    crawl_bilibili_popular(start_page=1, total_pages=1000)

    # final message
    print("All video short links have been saved to bilibli_cid.txt.")

Next, crawl the video comments (danmaku).

import requests
import re
from bs4 import BeautifulSoup
import operator
import traceback
import os
import pandas as pd
from lxml import etree
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'
}

timeout = 5


def getHTML(url):
    try:
        response = requests.get(url=url, headers=headers,
                                timeout=timeout)
        # auto-detect the response encoding
        response.encoding = response.apparent_encoding
        return response.text
        # the line below would be equivalent to the two lines above
        # return response.text.encode(response.encoding).decode('utf-8')
    except:
        print(f"request url : {url} error...")
        print(traceback.format_exc())
        return None


def parsePage(page):
    try:
        print("parsing...")
        html_ = etree.HTML(page)
        meta_title = html_.xpath('//meta[@name="title"]/@content')[0]
        if meta_title == '视频去哪了呢?_哔哩哔哩_bilibili':
            print('video 404 not found')
            return {}, 'video 404 not found'
        syntax = [':', '=']
        flag = 0
        # the cid usually appears in the page source as "cid":12345 (or cid=12345 in older pages)
        keys = re.findall(r'"cid":[\d]*', page)
        if not keys:
            keys = re.findall(r'cid=[\d]*', page)
            flag = 1
        comments, title = {}, None
        # the second cid match is normally the video's own cid; fall back to the first if only one is found
        keys = [keys[1]] if len(keys) > 1 else [keys[0]]
        for index, item in enumerate(keys):
            key = item.split(syntax[flag])[1]
            print(f'{index + 1}/{len(keys)}: {key}')
            comment_url = f'https://comment.bilibili.com/{key}.xml'  # danmaku XML address
            comment_text = getHTML(comment_url)
            bs4 = BeautifulSoup(comment_text, "html.parser")
            if not title:
                title = BeautifulSoup(page, "html.parser").find('h1').get_text().strip()
            for comment in bs4.find_all('d'):
                time = float(comment.attrs['p'].split(',')[0])
                time = timeFormatter(time)
                comments[time] = comment.string
        sorted_comments = sorted(comments.items(), key=operator.itemgetter(0))  # sort by timestamp
        comments = dict(sorted_comments)
        print("parse finish")
        return comments, title
    except:
        print("parse error")
        print(traceback.format_exc())
        return {}, None  # keep the return shape consistent so callers can still unpack it


def validateTitle(title):
    re_str = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    new_title = re.sub(re_str, "_", title)  # replace characters that are illegal in file names with underscores
    return new_title


def timeFormatter(param):
    # convert seconds into an mm:ss.xx string, e.g. 75.5 -> '01:15.50'
    minute = int(param) // 60
    second = float(param) - minute * 60
    return f'{minute:02d}:{second:05.2f}'


def main():
    bvs = ['BV1pq421A7UU', 'BV1CC4y1a7ee', 'BV1hx411e7KP', 'BV16F411B7Ek']
    for bv in bvs:
        url = f"https://www.bilibili.com/video/{bv}"
        save_folder = "BarRage"
        if not os.path.exists(save_folder):
            os.mkdir(save_folder)
        comments, title = parsePage(getHTML(url))
        if len(comments) == 0:
            continue
        title = validateTitle(title)
        df = pd.DataFrame({'时刻': list(comments.keys()), '弹幕文本': list(comments.values())})
        df.drop_duplicates(subset=['时刻', '弹幕文本'], keep='first', inplace=True)
        df.to_csv(f"{save_folder}/{title}.csv", index=False, encoding='utf-8-sig')
        print(f'Saved {df.shape[0]} danmaku lines to {save_folder}/{title}.csv\n\n')

        sleep(10)


if __name__ == '__main__':
    main()
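For reference, each entry in the danmaku XML returned by comment.bilibili.com looks roughly like <d p="75.5,1,25,...">text</d>, where the first comma-separated field of the p attribute is the playback time in seconds; that is the value parsePage feeds into timeFormatter. A tiny standalone sketch of just that parsing step (the XML snippet here is made up for illustration):

from bs4 import BeautifulSoup

sample_xml = '<i><d p="75.5,1,25,16777215,1700000000,0,abcd1234,0">hello danmaku</d></i>'
soup = BeautifulSoup(sample_xml, "html.parser")
for d in soup.find_all('d'):
    seconds = float(d.attrs['p'].split(',')[0])  # playback time in seconds
    print(seconds, d.string)  # 75.5 hello danmaku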

An improved version that reads the links from the text file. It still has a few small issues; suggestions and fixes are welcome, hehe.

def read_cid_from_file(filename, n=0):
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            cid_array = file.readlines()
            # strip the trailing newline from each line
            cid_array = [cid.strip() for cid in cid_array]
        return cid_array[n:]
    except FileNotFoundError:
        print(f"File '{filename}' does not exist.")
        return []
import requests
import re
from bs4 import BeautifulSoup
import operator
import traceback
import os
import pandas as pd
from lxml import etree
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'
}

timeout = 5


def getHTML(url):
    try:
        response = requests.get(url=url, headers=headers,
                                timeout=timeout)
        # auto-detect the response encoding
        response.encoding = response.apparent_encoding
        return response.text
        # the line below would be equivalent to the two lines above
        # return response.text.encode(response.encoding).decode('utf-8')
    except:
        print(f"request url : {url} error...")
        print(traceback.format_exc())
        return None


def parsePage(page):
    try:
        print("parsing...")
        html_ = etree.HTML(page)
        meta_title = html_.xpath('//meta[@name="title"]/@content')[0]
        if meta_title == '视频去哪了呢?_哔哩哔哩_bilibili':
            print('video 404 not found')
            return {}, 'video 404 not found'
        syntax = [':', '=']
        flag = 0
        # the cid usually appears in the page source as "cid":12345 (or cid=12345 in older pages)
        keys = re.findall(r'"cid":[\d]*', page)
        if not keys:
            keys = re.findall(r'cid=[\d]*', page)
            flag = 1
        comments, title = {}, None
        # the second cid match is normally the video's own cid; fall back to the first if only one is found
        keys = [keys[1]] if len(keys) > 1 else [keys[0]]
        for index, item in enumerate(keys):
            key = item.split(syntax[flag])[1]
            print(f'{index + 1}/{len(keys)}: {key}')
            comment_url = f'https://comment.bilibili.com/{key}.xml'  # danmaku XML address
            comment_text = getHTML(comment_url)
            bs4 = BeautifulSoup(comment_text, "html.parser")
            if not title:
                title = BeautifulSoup(page, "html.parser").find('h1').get_text().strip()
            for comment in bs4.find_all('d'):
                time = float(comment.attrs['p'].split(',')[0])
                time = timeFormatter(time)
                comments[time] = comment.string
        sorted_comments = sorted(comments.items(), key=operator.itemgetter(0))  # sort by timestamp
        comments = dict(sorted_comments)
        print("parse finish")
        return comments, title
    except:
        print("parse error")
        print(traceback.format_exc())
        return {}, None  # keep the return shape consistent so callers can still unpack it


def validateTitle(title):
    re_str = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    new_title = re.sub(re_str, "_", title)  # replace characters that are illegal in file names with underscores
    return new_title


def timeFormatter(param):
    # convert seconds into an mm:ss.xx string, e.g. 75.5 -> '01:15.50'
    minute = int(param) // 60
    second = float(param) - minute * 60
    return f'{minute:02d}:{second:05.2f}'


def main(bvs):
    for bv in bvs:
        try:
            # each line of bilibli_cid.txt is a video link, so it can be requested directly
            url = bv
            save_folder = "BarRage"
            if not os.path.exists(save_folder):
                os.mkdir(save_folder)
            comments, title = parsePage(getHTML(url))
            if len(comments) == 0:
                continue
            # sanitise the title first, so the existence check looks at the same file name we would save to
            title = validateTitle(title)
            csv_filename = f"{save_folder}/{title}.csv"
            if os.path.exists(csv_filename):
                print(f"CSV file '{csv_filename}' already exists, skipping...")
                continue
            df = pd.DataFrame({'时刻': list(comments.keys()), '弹幕文本': list(comments.values())})
            df.drop_duplicates(subset=['时刻', '弹幕文本'], keep='first', inplace=True)
            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
            print(f'Saved {df.shape[0]} danmaku lines to {csv_filename}\n\n')

            sleep(5)
        except:
            print("Failed to parse this link, skipping.")
            print(traceback.format_exc())
            continue


if __name__ == '__main__':
    # example usage
    filename = 'bilibli_cid.txt'
    n = 1  # skip the first n lines of the file
    cids = read_cid_from_file(filename, n)
    main(cids)
    # print(cids)
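One of the remaining small issues with this file-driven version is that a crash means adjusting n by hand and starting over. Below is a minimal sketch of a resume record, reusing the same idea as crawl_record.txt from the first script; the file name danmaku_record.txt and both helpers are assumptions added here for illustration, not part of the original scripts:

import os

RECORD_FILE = 'danmaku_record.txt'  # hypothetical progress file

def load_progress():
    # index of the next link to process; 0 if no record exists yet
    if os.path.exists(RECORD_FILE):
        with open(RECORD_FILE, 'r', encoding='utf-8') as f:
            return int(f.read().strip())
    return 0

def save_progress(index):
    # remember how far we got, so the next run can pick up from here
    with open(RECORD_FILE, 'w', encoding='utf-8') as f:
        f.write(str(index))

# usage idea: n = load_progress(), then call save_progress(i + 1) after finishing the i-th link in main()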
