大数据项目实践 数据采集清洗存储——b站编程课程采集与分析_大数据数据清洗界面

先自我介绍一下,小编浙江大学毕业,去过华为、字节跳动等大厂,目前阿里P7

深知大多数程序员想要提升技能时,往往只能靠自己摸索;但不成体系的自学既低效又漫长,而且极易碰到天花板,导致技术停滞不前!

因此收集整理了一份《2024年最新大数据全套学习资料》,初衷也很简单,就是希望能够帮助到想自学提升又不知道该从何学起的朋友。
img
img
img
img
img

既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,涵盖了95%以上大数据知识点,真正体系化!

由于文件比较多,这里只是将部分目录截图出来,全套包含大厂面经、学习笔记、源码讲义、实战项目、大纲路线、讲解视频,并且后续会持续更新

如果你需要这些资料,可以添加V获取:vip204888 (备注大数据)
img

正文

4、爬虫代码
import pymysql
import requests
from lxml import etree
import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.charts import Bar


# Open the MySQL connection and the cursor shared by the rest of the script.
db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', database='BiliBili')
cursor = db.cursor()

# Crawler settings: the search URL and a browser-like User-Agent header.
base_url = 'https://search.bilibili.com/all?vt=77434542&keyword=%E7%BC%96%E7%A8%8B%E8%AF%BE%E7%A8%8B&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"}

# Paging state: the crawl loop copies these into the 'page' and 'o' query
# parameters before every request.
params = dict()
page_number, o_number = 1, 0


# Convert a count string that may carry a "万"/"亿" unit into a real number.
def convert_to_actual_number(string_with_unit):
    """Convert a count such as ``'1.5万'`` into a float.

    Args:
        string_with_unit: Text of the count. It may end with the unit
            ``万`` (x10,000) or ``亿`` (x100,000,000), or be a plain number.
            Plain ``int``/``float`` values are accepted and returned as float.

    Returns:
        The numeric value as a float, or ``None`` when the input is not a
        string/number or cannot be parsed (for example ``'--'`` or ``''``).
    """
    if isinstance(string_with_unit, (int, float)):
        return float(string_with_unit)
    if not isinstance(string_with_unit, str):
        # None or any other type: nothing sensible to convert.
        return None

    text = string_with_unit.strip()
    # Larger unit first so each unit is tested exactly once, in a fixed order.
    unit_factors = (('亿', 100000000), ('万', 10000))
    try:
        for unit, factor in unit_factors:
            if unit in text:
                return float(text.replace(unit, '')) * factor
        return float(text)
    except ValueError:
        # The remaining text is not a valid number.
        return None


# Links that were already stored; used to drop duplicate search results.
unique_links = set()
# Category keywords. Kept as an ordered list (not a set) so that, when a title
# matches several keywords, the chosen category is the same on every run
# instead of depending on set iteration order.
video_data_by_keyword = ['C语言', 'C++', 'Python', 'PHP', '算法', 'Java', 'go语言', 'Mysql', 'C#', 'Scratch', 'web', '计算机']
# Not used by the code visible in this script; kept in case other code reads it.
k=0

# Per-keyword counters that feed the pie chart.
keyword_count = {keyword: 0 for keyword in video_data_by_keyword}

# Open the output CSV and store every crawled record in both the CSV file and
# the MySQL table. The file is opened in 'w' mode, so each run rewrites it.
# If the data already exists or a "permission denied" error is raised, delete
# the CSV file and the database rows before running again.
with open('video_data.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Header row.
    csv_writer.writerow(['视频链接', '视频名称', '作者', '类别', '播放量', '评论数', '时长'])

    # Loop invariants, built once instead of on every row.
    insert_sql = "INSERT INTO videos(VideoName, VideoAuther, Category, VideoView, Comment, Duration) VALUES (%s, %s, %s, %s, %s, %s )"
    lowered_keywords = [(keyword, keyword.lower()) for keyword in video_data_by_keyword]

    while True:
        # Paging parameters for the current request.
        params['page'] = str(page_number)
        params['o'] = str(o_number)

        # A timeout keeps the script from hanging forever on a stalled
        # connection; a timeout error aborts the crawl with a traceback.
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        html = response.content.decode('utf-8')
        print(response.url)

        doc = etree.HTML(html)
        if doc is None:
            # Nothing parsable came back; there is no further page to read.
            break

        # Video link
        contentPath = doc.xpath('//div[@class="bili-video-card__info--right"]/a/@href')
        # Video title
        contentname = doc.xpath('//div[@class="bili-video-card__info--right"]//h3[@class="bili-video-card__info--tit"]/@title')
        # Video author
        contentauthor = doc.xpath('//div[@class="bili-video-card__info--right"]//span[@class="bili-video-card__info--author"]/text()')
        # Video View (VV): play count
        contentVV = doc.xpath('//div[@class="bili-video-card__stats--left"]/span[@class="bili-video-card__stats--item"][1]/span/text()')
        # Comment (CM): comment count
        contentCM = doc.xpath('//div[@class="bili-video-card__stats--left"]/span[@class="bili-video-card__stats--item"][2]/span/text()')
        # Duration (DR): video length
        contentDR = doc.xpath('//div[@class="bili-video-card__stats"]/span[@class="bili-video-card__stats__duration"]/text()')

        # NOTE(review): the six lists are matched purely by position. If a card
        # lacks one field the lists differ in length and rows get misaligned;
        # zip() then silently stops at the shortest list.
        for link, name, author, vv, cm, dr in zip(contentPath, contentname, contentauthor, contentVV, contentCM, contentDR):
            lower_name = name.lower()
            matched_keyword = None
            for keyword, lower_keyword in lowered_keywords:
                if lower_keyword in lower_name:
                    # No early exit: the last matching keyword wins.
                    matched_keyword = keyword

            # Skip titles without a category and links that were stored already.
            if matched_keyword is None or link in unique_links:
                continue

            vv = convert_to_actual_number(vv)
            cm = convert_to_actual_number(cm)
            csv_writer.writerow([link, name, author, matched_keyword, vv, cm, dr])
            cursor.execute(insert_sql, (name, author, matched_keyword, vv, cm, dr))
            db.commit()
            unique_links.add(link)

        # Update the per-keyword counters for the pie chart. Matching is
        # case-insensitive, the same rule used for the category above, so
        # keywords such as 'Mysql' or 'web' also count 'MySQL' / 'Web' titles.
        counts_changed = False
        for _link, name, _author in zip(contentPath, contentname, contentauthor):
            lower_name = name.lower()
            for keyword, lower_keyword in lowered_keywords:
                if lower_keyword in lower_name:
                    keyword_count[keyword] += 1
                    counts_changed = True

        # Render the chart once per page rather than once per keyword per row;
        # the file written last is the same.
        if counts_changed:
            keyword_list = list(keyword_count.items())
            pie = (
                Pie()
                .add("", keyword_list)
                .set_colors(['red', 'blue', 'green', 'purple', 'orange', 'cyan', 'pink', 'brown'])
                .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
                .set_global_opts(title_opts=opts.TitleOpts(title="主要视频统计"))
            )
            pie.render('keyword_pie_chart.html')

        # Move on to the next page.
        page_number += 1
        o_number += 24
        print(page_number)
        print(o_number)

        # A page without any video link means the results are exhausted.
        if not contentPath:
            break


# Query the 20 records with the highest view count.
sql_query = "SELECT Category, VideoView FROM videos ORDER BY VideoView DESC LIMIT 20"
cursor.execute(sql_query)
result = cursor.fetchall()

# Split the rows into the two chart axes.
video_category = [row[0] for row in result]
vv_values = [row[1] for row in result]

# Bar chart of the view counts (vertical: reversal_axis() is not applied).
bar = (
    Bar()
    .add_xaxis(video_category)
    .add_yaxis("Views", vv_values)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="播放量最高的前20条视频的类型"),
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(
                rotate=-45,
                font_size=11,
                interval=0,
            )
        ),
    )
)
bar.render("videoview.html")

# Categories of the 20 most-commented videos.
sql_query1 = "SELECT Category, Comment FROM videos ORDER BY Comment DESC LIMIT 20"
cursor.execute(sql_query1)
result = cursor.fetchall()
# All queries are done: release the cursor as well as the connection.
cursor.close()
db.close()

video_category = [row[0] for row in result]
vv_comment = [row[1] for row in result]

# Horizontal bar chart of the comment counts.
bar = (
    Bar()
    .add_xaxis(video_category)  # categories on the category axis
    # Series label fixed: this chart shows comment counts, not views.
    .add_yaxis("Comments", vv_comment)
    .reversal_axis()  # swap the x and y axes
    .set_global_opts(
        title_opts=opts.TitleOpts(title="评论量最高的前20条视频"),
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(
                font_size=11,  # label font size
                interval=0,  # label display interval
            )
        ),
    )
)
bar.render("comment.html")

这个代码包括了爬取(requests模块)和对数据进行标签分类、生成图表(pyecharts模块),同时将数据保存在了对应的数据库中,并且还生成了一个csv文件保存对应数据。但是代码仍有缺陷,可能会导致爬取错误。

爬取并分类好的数据大致如下:

5、界面设计

使用QtDesigner进行界面设计

设计界面如下:

1、采集数据界面

2、数据可视化界面

3、数据分析界面

如果对于QtDesigner不熟悉的,可以看看我的这篇文章利用PySide2模块以及Qt设计师(Qt-Designer)设计简易的系统窗体

6、QThread多线程设计

想要实现爬取数据动态显示到界面上,我们需要使用Qt中的QThread类实现对于线程的创建与管理。

我们需要新建一个派生类DataThread(名称任意),该派生类由QThread这个基类派生,可以使用QThread中的相关成员函数,同时我们可以在DataThread这个由我们自己定义的派生类中进行相关修改。

DataThread派生类大致如下:

from PyQt5.QtCore import *

class DataThread(QThread):
    """Skeleton of the crawler thread; ``run`` is still a placeholder."""

    # One emission per record: four strings, two ints, one string.
    signal = pyqtSignal(str, str, str, str, int, int, str)

    def __init__(self):
        """Create the thread with ``state`` set to 1."""
        super().__init__()
        self.state = 1

    def run(self):
        """Thread body; intentionally empty in this skeleton."""

    def Stop(self):
        """Set ``state`` to 0."""
        self.state = 0

1、其中 signal = pyqtSignal(str, str, str, str, int, int, str) 对应的每一个类型为我们所爬取的数据的类型,比如我使用的数据为:

VideoID(视频编号)、VideoName(视频名称)、VideoAuther(视频作者)、Category(视频类型)、VideoView(观看量)、Comment(评论数)、Duration(视频时长),对应的数据类型为字符串(str),字符串(str),字符串(str),字符串(str),整型(int),整型(int),字符串(str)。

2、其中 def __init__(self): 这个是初始化函数,可以用来设置全局变量,例如 self.state 表示的是当前线程的状态,1为进行中,0为停止,初始化为1。

3、其中 def run(self): 这个函数比较重要,我们需要将爬虫代码写在run函数中,并且每爬取一条数据,进行数据处理、分类后都要将数据传递出去

4、其中  def Stop(self): 这个函数用来控制线程的停止,当我需要停止的时候就调用该函数,那么线程就会停止,爬虫也会随之停止

加上爬虫代码并根据个人的需求完善代码,完整DataThread类的代码如下:

from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
import requests
from lxml import etree

class DataThread(QThread):
    """Worker thread that crawls the search pages and emits one signal per video.

    The emitted values are, in order: VideoID, VideoName, VideoAuther,
    Category, VideoView, Comment, Duration.
    """

    signal = pyqtSignal(str, str, str, str, int, int, str)

    # Last result page that is requested.
    MAX_PAGE = 34

    def __init__(self):
        QThread.__init__(self)
        # 1 = running, 0 = stop requested.
        self.state = 1
        self.page_number = 1
        self.o_number = 0
        self.key = []
        self.value = []
        # Running record number used as VideoID. Initialised here so run()
        # works even when the owner does not assign it.
        self.data = 1
        # Links that were already emitted. Kept on the instance so a restarted
        # thread does not emit the same video twice.
        self._seen_links = set()

    @staticmethod
    def _parse_count(text):
        """Turn a count such as '1.2万' or '345' into an int.

        Raises:
            ValueError: if the text is empty or not numeric.
        """
        text = text.strip()
        for unit, factor in (('亿', 100000000), ('万', 10000)):
            if text.endswith(unit):
                return int(float(text[:-1]) * factor)
        return int(text)

    def run(self):
        """Crawl page by page until MAX_PAGE is passed or Stop() is called.

        page_number and o_number only advance after a page was processed
        completely, so a thread that was stopped resumes with the page it
        was working on.
        """
        # Function-scope import: run() calls time.sleep() but the import list
        # next to this class does not provide the module.
        import time

        base_url = 'https://search.bilibili.com/all?vt=77434542&keyword=%E7%BC%96%E7%A8%8B%E8%AF%BE%E7%A8%8B&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
        params = {}
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
        }
        video_data_by_keyword = ['C语言', 'C++', 'Python', 'PHP', '算法', 'Java', 'go语言','Mysql','C#','Scratch','web','计算机']
        lowered_keywords = [(keyword, keyword.lower()) for keyword in video_data_by_keyword]

        # Checking the state here makes Stop() end the crawl; previously the
        # loop kept requesting pages and silently dropped their rows.
        while self.state and self.page_number <= self.MAX_PAGE:
            params['page'] = str(self.page_number)
            params['o'] = str(self.o_number)
            # The timeout prevents the thread from blocking forever.
            response = requests.get(base_url, params=params, headers=headers, timeout=10)
            html = response.content.decode('utf-8')
            doc = etree.HTML(html)
            if doc is None:
                # Nothing parsable came back; stop instead of crashing on xpath.
                break

            contentPath = doc.xpath('//div[@class="bili-video-card__info--right"]/a/@href')
            contentname = doc.xpath('//div[@class="bili-video-card__info--right"]//h3[@class="bili-video-card__info--tit"]/@title')
            contentauthor = doc.xpath('//div[@class="bili-video-card__info--right"]//span[@class="bili-video-card__info--author"]/text()')
            contentVV = doc.xpath('//div[@class="bili-video-card__stats--left"]/span[@class="bili-video-card__stats--item"][1]/span/text()')
            contentCM = doc.xpath('//div[@class="bili-video-card__stats--left"]/span[@class="bili-video-card__stats--item"][2]/span/text()')
            contentDR = doc.xpath('//div[@class="bili-video-card__stats"]/span[@class="bili-video-card__stats__duration"]/text()')

            # NOTE(review): the lists are matched by position only; a card that
            # lacks one field shifts every following row.
            for link, name, auther, vv, cm, dr in zip(contentPath,contentname,contentauthor,contentVV,contentCM,contentDR):
                if not self.state:
                    # Stop requested in the middle of a page: leave without
                    # advancing, so the rest of this page is fetched on restart.
                    return
                if link in self._seen_links:
                    continue

                lower_name = name.lower()
                # The first keyword contained in the title is the category.
                Category = next(
                    (keyword for keyword, lowered in lowered_keywords if lowered in lower_name),
                    None,
                )
                if Category is None:
                    continue

                try:
                    VideoView = self._parse_count(vv)
                    Comment = self._parse_count(cm)
                except ValueError:
                    # Empty or non-numeric counter: skip this record rather
                    # than letting the exception kill the thread.
                    continue

                self.signal.emit(str(self.data), name, auther, Category, VideoView, Comment, dr)
                self.data += 1
                self._seen_links.add(link)
                time.sleep(0.1)

            self.page_number += 1
            self.o_number += 24

    def Stop(self):
        """Request the crawl loop to stop."""
        self.state = 0

在这个代码中,爬虫代码不断爬取数据,通过 self.signal.emit(VideoID, VideoName, VideoAuther, Category, VideoView, Comment, Duration) 这行代码将数据传递到界面上。

7、通过UI界面实时显示数据

到最为关键的一步了,我们需要对设计好的UI界面进行初始化,并利用爬取好传递过来的数据来生成可视化图表,并实现实时自动刷新图表以及手动刷新图表。

还是和DataThread派生类一样,我们可以创建一个自己的窗口类MyWindow,继承于我们通过QtDesigner设计好的界面生成的类Ui_MainWindow,同时还要继承于PyQt5中的QMainWindow类。

class MyWindow(Ui_MainWindow,QMainWindow):
    """Main window: sets up the generated UI, the database and the crawler thread."""

    signal = pyqtSignal(str, str, str, str, int, int, str)

    def __init__(self):
        QMainWindow.__init__(self)
        self.setupUi(self)
        # Create the instance state before Operate() wires up the thread, so
        # no callback can ever see a window without these attributes.
        self.dt = {'C语言': 0, 'C++': 0, 'Python': 0, 'PHP': 0, '算法': 0, 'Java': 0, 'go语言': 0, 'Mysql': 0, 'C#': 0, 'Scratch': 0, 'web': 0, '计算机': 0}
        self.showflag = 0
        self.Operate()

    def Operate(self):
        """Run the one-time initialisation steps."""
        # NOTE(review): nothing visible here connects
        # pButton_data_collection.clicked to StartThread - confirm the wiring.
        self.InitTable()
        self.ConnectDB()
        self.CreateThread()

    def InitTable(self):
        """Configure the table that lists every collected record."""
        table = self.tableWidget_show_all_datas
        table.setColumnCount(7)
        table.setHorizontalHeaderLabels(
            ['视频ID号', '视频名称', '视频作者', '相关分类', '视频观看量', '评论数', '视频时长'])
        # True is passed where Qt expects a SelectionBehavior value; as an
        # integer it is 1, which is SelectRows in Qt's enum.
        table.setSelectionBehavior(True)
        # Column widths, in column order.
        for column, width in enumerate((147, 630, 247, 147, 120, 120, 147)):
            table.setColumnWidth(column, width)
        # Allow sorting by clicking a header.
        table.setSortingEnabled(True)
        # Hide the default row numbers.
        table.verticalHeader().setHidden(True)

    def ConnectDB(self):
        """Open the database connection and empty the ``videos`` table."""
        self.con = GetConn()
        if self.con is None:
            # GetConn() returns None when connecting fails; calling cursor()
            # on it would raise AttributeError.
            self.cur = None
            print('清空数据失败', '数据库未连接')
            return

        self.cur = self.con.cursor()
        sql = "delete from videos"
        try:
            self.cur.execute(sql)
            self.con.commit()
            print('清空数据成功')
        except Exception as e:
            print('清空数据失败', e)

    def ShowSelWindow(self):
        """Re-apply the current tab index (it selects the tab that is already shown)."""
        selindex = self.tabWidget.currentIndex()
        self.tabWidget.setCurrentIndex(selindex)

    def CreateThread(self):
        """Create the crawler thread and connect its signal to DoWork."""
        self.datathread = DataThread()
        # First record number; DataThread uses it as the VideoID.
        self.datathread.data = 1
        self.datathread.signal.connect(self.DoWork)
        self.signal.connect(self.datathread.Stop)

    def StartThread(self):
        """Toggle collecting: start the thread or ask it to stop."""
        text = self.pButton_data_collection.text()
        if text == '开始采集':
            self.pButton_data_collection.setText('停止采集')
            self.datathread.state = 1
            self.datathread.start()
        else:
            self.pButton_data_collection.setText('开始采集')
            self.datathread.Stop()

    def DoWork(self, VideoID, VideoName, VideoAuther, Category, VideoView, Comment, Duration):
        """Slot that receives one record per thread signal; placeholder in this outline."""
        pass

我的MyWindow类大体架构就是这样,在这个类中,最关键的成员函数是DoWork函数。第6步多线程中爬虫代码爬取的每一条数据,都会通过signal信号依次传递到DoWork函数中。线程处于进行中状态时,每发出一条数据就会调用一次DoWork函数;线程停止后,DoWork函数不再被调用。

8、效果展示

b站编程课程爬虫多线程客户端示例展示视频

9.完整代码

bilibili_UI.py

# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'bilibili_UI.ui'
#
# Created by: PyQt5 UI code generator 5.15.9
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """Widget layout generated by pyuic5 from 'bilibili_UI.ui'.

    setupUi() builds a title label and a QTabWidget with three top-level
    tabs; the third tab contains a nested QTabWidget with three more tabs.
    """

    def setupUi(self, MainWindow):
        """Create every widget on *MainWindow* and set its geometry and font.

        QtWebEngineWidgets is referenced here but imported at the bottom of
        the generated module, after this class.
        """
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(1450, 853)
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")
        # Title bar: a label centred between two expanding spacers.
        self.layoutWidget = QtWidgets.QWidget(self.centralwidget)
        self.layoutWidget.setGeometry(QtCore.QRect(0, 0, 1451, 29))
        self.layoutWidget.setObjectName("layoutWidget")
        self.horizontalLayout = QtWidgets.QHBoxLayout(self.layoutWidget)
        self.horizontalLayout.setContentsMargins(0, 0, 0, 0)
        self.horizontalLayout.setObjectName("horizontalLayout")
        spacerItem = QtWidgets.QSpacerItem(40, 20, QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Minimum)
        self.horizontalLayout.addItem(spacerItem)
        self.label = QtWidgets.QLabel(self.layoutWidget)
        font = QtGui.QFont()
        font.setFamily("华文楷体")
        font.setPointSize(15)
        font.setBold(True)
        font.setWeight(75)
        self.label.setFont(font)
        self.label.setObjectName("label")
        self.horizontalLayout.addWidget(self.label)
        spacerItem1 = QtWidgets.QSpacerItem(40, 20, QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Minimum)
        self.horizontalLayout.addItem(spacerItem1)
        # Top-level tab widget.
        self.tabWidget = QtWidgets.QTabWidget(self.centralwidget)
        self.tabWidget.setGeometry(QtCore.QRect(0, 30, 1451, 791))
        font = QtGui.QFont()
        font.setBold(True)
        font.setWeight(75)
        self.tabWidget.setFont(font)
        self.tabWidget.setObjectName("tabWidget")
        # First tab: collect button, data table, status label, pie web view
        # and a text label next to the pie.
        self.tab = QtWidgets.QWidget()
        self.tab.setObjectName("tab")
        self.pButton_data_collection = QtWidgets.QPushButton(self.tab)
        self.pButton_data_collection.setGeometry(QtCore.QRect(0, 10, 121, 41))
        font = QtGui.QFont()
        font.setFamily("华文仿宋")
        font.setPointSize(9)
        font.setBold(True)
        font.setItalic(False)
        font.setUnderline(False)
        font.setWeight(75)
        font.setStrikeOut(False)
        font.setKerning(False)
        self.pButton_data_collection.setFont(font)
        self.pButton_data_collection.setObjectName("pButton_data_collection")
        self.tableWidget_show_all_datas = QtWidgets.QTableWidget(self.tab)
        self.tableWidget_show_all_datas.setGeometry(QtCore.QRect(0, 50, 1451, 331))
        self.tableWidget_show_all_datas.setObjectName("tableWidget_show_all_datas")
        self.tableWidget_show_all_datas.setColumnCount(0)
        self.tableWidget_show_all_datas.setRowCount(0)
        self.Label_current_collection_data = QtWidgets.QLabel(self.tab)
        self.Label_current_collection_data.setGeometry(QtCore.QRect(0, 380, 1441, 31))
        self.Label_current_collection_data.setText("")
        self.Label_current_collection_data.setObjectName("Label_current_collection_data")
        self.WebEngineView_show_data_pie = QtWebEngineWidgets.QWebEngineView(self.tab)
        self.WebEngineView_show_data_pie.setGeometry(QtCore.QRect(0, 420, 811, 351))
        self.WebEngineView_show_data_pie.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_data_pie.setObjectName("WebEngineView_show_data_pie")
        self.label_pie = QtWidgets.QLabel(self.tab)
        self.label_pie.setGeometry(QtCore.QRect(820, 420, 621, 341))
        font = QtGui.QFont()
        font.setFamily("华文仿宋")
        font.setPointSize(17)
        font.setBold(True)
        font.setWeight(75)
        self.label_pie.setFont(font)
        self.label_pie.setText("")
        self.label_pie.setObjectName("label_pie")
        self.tabWidget.addTab(self.tab, "")
        # Second tab: one button and four web views arranged in a 2x2 grid.
        self.tab_2 = QtWidgets.QWidget()
        self.tab_2.setObjectName("tab_2")
        self.WebEngineView_show_vv_Bar = QtWebEngineWidgets.QWebEngineView(self.tab_2)
        self.WebEngineView_show_vv_Bar.setGeometry(QtCore.QRect(0, 50, 701, 361))
        self.WebEngineView_show_vv_Bar.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_vv_Bar.setObjectName("WebEngineView_show_vv_Bar")
        self.WebEngineView_show_cm_Bar = QtWebEngineWidgets.QWebEngineView(self.tab_2)
        self.WebEngineView_show_cm_Bar.setGeometry(QtCore.QRect(0, 410, 701, 341))
        self.WebEngineView_show_cm_Bar.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_cm_Bar.setObjectName("WebEngineView_show_cm_Bar")
        self.pButton_show_four_picture = QtWidgets.QPushButton(self.tab_2)
        self.pButton_show_four_picture.setGeometry(QtCore.QRect(10, 0, 121, 41))
        self.pButton_show_four_picture.setObjectName("pButton_show_four_picture")
        self.WebEngineView_show_ca_Tunnel = QtWebEngineWidgets.QWebEngineView(self.tab_2)
        self.WebEngineView_show_ca_Tunnel.setGeometry(QtCore.QRect(700, 50, 751, 361))
        self.WebEngineView_show_ca_Tunnel.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_ca_Tunnel.setObjectName("WebEngineView_show_ca_Tunnel")
        self.WebEngineView_show_ca_Cloud = QtWebEngineWidgets.QWebEngineView(self.tab_2)
        self.WebEngineView_show_ca_Cloud.setGeometry(QtCore.QRect(700, 410, 751, 341))
        self.WebEngineView_show_ca_Cloud.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_ca_Cloud.setObjectName("WebEngineView_show_ca_Cloud")
        self.tabWidget.addTab(self.tab_2, "")
        # Third tab: hosts the nested tab widget (tab_4, tab_5, tab_6).
        self.tab_3 = QtWidgets.QWidget()
        self.tab_3.setObjectName("tab_3")
        self.tabWidget1 = QtWidgets.QTabWidget(self.tab_3)
        self.tabWidget1.setGeometry(QtCore.QRect(0, 0, 1441, 761))
        font = QtGui.QFont()
        font.setBold(True)
        font.setWeight(75)
        self.tabWidget1.setFont(font)
        self.tabWidget1.setObjectName("tabWidget1")
        # Nested tab 1: web view, button and a text label.
        self.tab_4 = QtWidgets.QWidget()
        self.tab_4.setObjectName("tab_4")
        self.WebEngineView_show_vv_Bar2 = QtWebEngineWidgets.QWebEngineView(self.tab_4)
        self.WebEngineView_show_vv_Bar2.setGeometry(QtCore.QRect(0, 60, 921, 661))
        self.WebEngineView_show_vv_Bar2.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_vv_Bar2.setObjectName("WebEngineView_show_vv_Bar2")
        self.pButton_show1 = QtWidgets.QPushButton(self.tab_4)
        self.pButton_show1.setGeometry(QtCore.QRect(20, 10, 111, 41))
        self.pButton_show1.setObjectName("pButton_show1")
        self.label_vv = QtWidgets.QLabel(self.tab_4)
        self.label_vv.setGeometry(QtCore.QRect(920, 60, 511, 661))
        font = QtGui.QFont()
        font.setFamily("华文仿宋")
        font.setPointSize(15)
        font.setBold(True)
        font.setWeight(75)
        self.label_vv.setFont(font)
        self.label_vv.setText("")
        self.label_vv.setObjectName("label_vv")
        self.tabWidget1.addTab(self.tab_4, "")
        # Nested tab 2: button, web view and a text label.
        self.tab_5 = QtWidgets.QWidget()
        self.tab_5.setObjectName("tab_5")
        self.pButton_show2 = QtWidgets.QPushButton(self.tab_5)
        self.pButton_show2.setGeometry(QtCore.QRect(30, 10, 111, 41))
        self.pButton_show2.setObjectName("pButton_show2")
        self.WebEngineView_show_cm_Bar2 = QtWebEngineWidgets.QWebEngineView(self.tab_5)
        self.WebEngineView_show_cm_Bar2.setGeometry(QtCore.QRect(0, 60, 981, 661))
        self.WebEngineView_show_cm_Bar2.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_cm_Bar2.setObjectName("WebEngineView_show_cm_Bar2")
        self.label_cm = QtWidgets.QLabel(self.tab_5)
        self.label_cm.setGeometry(QtCore.QRect(990, 60, 441, 661))
        font = QtGui.QFont()
        font.setFamily("华文仿宋")
        font.setPointSize(15)
        font.setBold(True)
        font.setWeight(75)
        self.label_cm.setFont(font)
        self.label_cm.setText("")
        self.label_cm.setObjectName("label_cm")
        self.tabWidget1.addTab(self.tab_5, "")
        # Nested tab 3: button, two stacked web views and a text label.
        self.tab_6 = QtWidgets.QWidget()
        self.tab_6.setObjectName("tab_6")
        self.pButton_show3 = QtWidgets.QPushButton(self.tab_6)
        self.pButton_show3.setGeometry(QtCore.QRect(20, 10, 111, 41))
        self.pButton_show3.setObjectName("pButton_show3")
        self.WebEngineView_show_ca_Tunnel2 = QtWebEngineWidgets.QWebEngineView(self.tab_6)
        self.WebEngineView_show_ca_Tunnel2.setGeometry(QtCore.QRect(0, 60, 961, 341))
        self.WebEngineView_show_ca_Tunnel2.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_ca_Tunnel2.setObjectName("WebEngineView_show_ca_Tunnel2")
        self.WebEngineView_show_ca_Cloud2 = QtWebEngineWidgets.QWebEngineView(self.tab_6)
        self.WebEngineView_show_ca_Cloud2.setGeometry(QtCore.QRect(0, 400, 961, 321))
        self.WebEngineView_show_ca_Cloud2.setStyleSheet("border: 1px solid black;")
        self.WebEngineView_show_ca_Cloud2.setObjectName("WebEngineView_show_ca_Cloud2")
        self.label_ca = QtWidgets.QLabel(self.tab_6)
        self.label_ca.setGeometry(QtCore.QRect(970, 60, 451, 661))
        font = QtGui.QFont()
        font.setFamily("华文仿宋")
        font.setPointSize(15)
        font.setBold(True)
        font.setWeight(75)
        self.label_ca.setFont(font)
        self.label_ca.setText("")
        self.label_ca.setObjectName("label_ca")
        self.tabWidget1.addTab(self.tab_6, "")
        self.tabWidget.addTab(self.tab_3, "")
        # Central widget, menu bar and status bar of the main window.
        MainWindow.setCentralWidget(self.centralwidget)
        self.menubar = QtWidgets.QMenuBar(MainWindow)
        self.menubar.setGeometry(QtCore.QRect(0, 0, 1450, 26))
        self.menubar.setObjectName("menubar")
        MainWindow.setMenuBar(self.menubar)
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        # Apply the texts, then select index 2 (the third tab) in both tab
        # widgets as the initially shown page.
        self.retranslateUi(MainWindow)
        self.tabWidget.setCurrentIndex(2)
        self.tabWidget1.setCurrentIndex(2)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title, label, button and tab texts via Qt's translate()."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        self.label.setText(_translate("MainWindow", "B站编程课程采集与分析"))
        self.pButton_data_collection.setText(_translate("MainWindow", "开始采集"))
        self.tabWidget.setTabText(self.tabWidget.indexOf(self.tab), _translate("MainWindow", "采集课程数据"))
        self.pButton_show_four_picture.setText(_translate("MainWindow", "数据可视化"))
        self.tabWidget.setTabText(self.tabWidget.indexOf(self.tab_2), _translate("MainWindow", "数据可视化"))
        self.pButton_show1.setText(_translate("MainWindow", "显示分析"))
        self.tabWidget1.setTabText(self.tabWidget1.indexOf(self.tab_4), _translate("MainWindow", "分析点1"))
        self.pButton_show2.setText(_translate("MainWindow", "显示分析"))
        self.tabWidget1.setTabText(self.tabWidget1.indexOf(self.tab_5), _translate("MainWindow", "分析点2"))
        self.pButton_show3.setText(_translate("MainWindow", "显示分析"))
        self.tabWidget1.setTabText(self.tabWidget1.indexOf(self.tab_6), _translate("MainWindow", "分析点3"))
        self.tabWidget.setTabText(self.tabWidget.indexOf(self.tab_3), _translate("MainWindow", "数据分析"))
from PyQt5 import QtWebEngineWidgets

main.py

import sys
from PyQt5.QtWidgets import *
from Function import *

# Script entry point: create the application, show the main window and hand
# the event loop's exit status to the interpreter.
if __name__ == '__main__':
    app = QApplication([])
    mywin = MyWindow()
    mywin.show()
    exit_status = app.exec_()
    sys.exit(exit_status)

DB.py

import pymysql

def GetConn():
    """Open the MySQL connection.

    Returns:
        The connection object, or ``None`` when connecting fails (the error
        is printed instead of being raised).
    """
    try:
        connection = pymysql.connect(host='localhost',user='root',password='123456',database='bilibili')
    except Exception as error:
        # Report the failure; the caller receives None.
        print("连接失败!\n",error)
        print()
        return None
    print("连接成功!\n")
    return connection

def CloseConn(cur,conn):
    """Close the cursor and then the connection.

    Each object is closed in its own try block, so a failure while closing
    the cursor no longer leaves the connection open. Falsy arguments (for
    example ``None``) are skipped.

    Args:
        cur: Cursor to close, or ``None``.
        conn: Connection to close, or ``None``.
    """
    for resource in (cur, conn):
        try:
            if resource:
                resource.close()
        except Exception as e:
            # Best effort: report the error and go on with the next object.
            print("操作异常!!!\n", e)

Function.py

from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWebEngineWidgets import *
import datetime
import time
from bilibili_UI import *
from DB import *
import pymysql
import requests
from lxml import etree
from pyecharts import options as opts
from pyecharts.charts import Funnel
from pyecharts.render import make_snapshot
from pyecharts.globals import ThemeType
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.charts import Bar
from pyecharts.charts import WordCloud


class DataThread(QThread):
    signal = pyqtSignal(str, str, str, str, int, int, str)
    def __init__(self):
        QThread.__init__(self)
        self.state = 1
        self.page_number = 1
        self.o_number = 0
        self.key = []
        self.value = []
    def run(self):
        # while(self.state):
            # 爬虫准备工作
        base_url = 'https://search.bilibili.com/all?vt=77434542&keyword=%E7%BC%96%E7%A8%8B%E8%AF%BE%E7%A8%8B&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
        params = {}

        headers = {


**网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。**

**需要这份系统化的资料的朋友,可以添加V获取:vip204888 (备注大数据)**
![img](https://img-blog.csdnimg.cn/img_convert/220cf4e2c7ddbdb0b9c826a71d2baf77.png)

**一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!**

from pyecharts.charts import WordCloud


class DataThread(QThread):
    signal = pyqtSignal(str, str, str, str, int, int, str)
    def __init__(self):
        QThread.__init__(self)
        self.state = 1
        self.page_number = 1
        self.o_number = 0
        self.key = []
        self.value = []
    def run(self):
        # while(self.state):
            # 爬虫准备工作
        base_url = 'https://search.bilibili.com/all?vt=77434542&keyword=%E7%BC%96%E7%A8%8B%E8%AF%BE%E7%A8%8B&from_source=webtop_search&spm_id_from=333.1007&search_source=5'
        params = {}

        headers = {


**网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。**

**需要这份系统化的资料的朋友,可以添加V获取:vip204888 (备注大数据)**
[外链图片转存中...(img-VgHr2oNU-1713186009243)]

**一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!**

  • 14
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值