Python——爬虫+词云+数据库

最新推荐文章于 2024-04-21 15:51:20 发布

I_love_hanser_QAQ

最新推荐文章于 2024-04-21 15:51:20 发布

阅读量2.4k

点赞数

文章标签： python 爬虫数据库 mysql 正则表达式

本文链接：https://blog.csdn.net/YQ15161839467/article/details/122279004

版权

爬取指定新闻网站，对爬取到的数据做词频统计并生成词云图，同时生成相应的词频分析图（饼状图、柱状图、散点图），最后将词频数据存入 MySQL 数据库。

开发环境：PyCharm

数据库下载：AppServ8.5

需用到的模块有：

import time
from PySide2.QtCore import QFile
from PySide2.QtGui import QPixmap
from PySide2.QtWidgets import QApplication, QGraphicsScene, QGraphicsPixmapItem
from PySide2.QtUiTools import QUiLoader
from urllib import request
from bs4 import BeautifulSoup
import wordcloud  # 词云图
import collections
import jieba
import re
import numpy as np
from PIL import Image
import threading    # 多线程
from string import punctuation as Englishpunctuation    # 获取英文标点集合
from zhon.hanzi import punctuation as zhonpunctuation       # 获取中文标点集合
from selenium import webdriver
import csv
import xlwt
import pymysql
import pyecharts.options as opts
from pyecharts.charts import Pie,Bar,EffectScatter
from pyecharts.globals import ThemeType

主程序：

import time
from PySide2.QtCore import QFile                    # .ui文件获取
from PySide2.QtGui import QPixmap                   # 加载图片
from PySide2.QtWidgets import QApplication, QGraphicsScene, QGraphicsPixmapItem    # 加载图片
from PySide2.QtUiTools import QUiLoader
from urllib import request      # 爬取URL
from bs4 import BeautifulSoup   # 筛取爬完的html文件
import wordcloud  # 词云图
import collections      # 词频计数
import jieba            # 分词
import re
import numpy as np
from PIL import Image   # 加载图片
import threading    # 多线程
from string import punctuation as Englishpunctuation    # 获取英文标点集合
from zhon.hanzi import punctuation as zhonpunctuation       # 获取中文标点集合
import showpie     # 自己定义
from selenium import webdriver  # 用浏览器打开html
import 数据库      # 自己定义
import CSV  # 自己定义
# 三个ui文件“UI.ui”,"denlu.ui","zhuce.ui"
# 词云图片保存为“wc.png”
# 词云数据保存为“word_counts_topALL.text”
# xls文件保存为“myexcel.xls”
# 数据库 URL:"http://localhost:8080/phpMyAdmin/"   登录名:root  密码:12345678  数据库名:db.db
# News article URLs to crawl, grouped by site. Every entry must be followed by
# a comma: adjacent string literals are silently concatenated by Python, which
# would merge two URLs into one invalid address.
url_sina = [
    'https://news.sina.com.cn/c/xl/2022-01-01/doc-ikyakumx7683060.shtml',
    'https://news.sina.com.cn/o/2022-01-01/doc-ikyamrmz2472300.shtml',
    'https://news.sina.com.cn/o/2022-01-01/doc-ikyamrmz2467548.shtml',
    'https://news.sina.com.cn/c/2022-01-01/doc-ikyamrmz2515302.shtml',
    'https://news.sina.com.cn/c/2021-12-31/doc-ikyamrmz2441156.shtml',
    'https://finance.sina.com.cn/roll/2022-01-01/doc-ikyakumx7644198.shtml',
    'https://news.sina.com.cn/c/2021-12-31/doc-ikyakumx7537811.shtml',
    'https://news.sina.com.cn/w/2021-12-30/doc-ikyamrmz2171966.shtml',
    'https://news.sina.com.cn/c/2022-01-01/doc-ikyakumx7666807.shtml',
    'https://news.sina.com.cn/c/xl/2021-12-30/doc-ikyakumx7357600.shtml',
]
url_163 = [
    'https://www.163.com/dy/article/GSG29CEJ05346RC6.html',
    'https://www.163.com/dy/article/GSIEC0U70514R9OJ.html',
    'https://www.163.com/dy/article/GSGOQOQE05346RC6.html',
    'https://www.163.com/dy/article/GSK18CJD0514R9OJ.html',
    'https://www.163.com/news/article/GSIBLSAH000189FH.html',
    'https://www.163.com/dy/article/GSKR7DJ60514R9M0.html',
    'https://www.163.com/dy/article/GSH9FIT90514R9M0.html?clickfrom=w_yw',
    'https://www.163.com/gov/article/GD3TBM6R002399RB.html',
    'https://www.163.com/dy/article/GA4CC6I20512D3VJ.html',
    'https://www.163.com/dy/article/GSKP64C80514R9OJ.html',
]
url_ifeng = [
    'https://news.ifeng.com/c/8CRYT9RnXii',
    'https://news.ifeng.com/c/8CQQcdns5Jg',
    'https://news.ifeng.com/c/8CRl1IGC2vG',
    'https://news.ifeng.com/c/8CRE4AyY0NX',
    'https://finance.ifeng.com/c/8CRE4AyY0P0',
    'https://news.ifeng.com/c/8CCxj8nMaVl',
    'https://news.ifeng.com/c/8CQKzempy4j',
    'https://news.ifeng.com/c/8CRa0Nir1bc',
    'https://news.ifeng.com/c/8CRYT9RnXkL',
    'https://news.ifeng.com/c/8CRU98XbWBp',
]
# Index 0 = Sina, 1 = NetEase (163), 2 = ifeng; UIPython.ciyun relies on this order.
url = [url_sina, url_163, url_ifeng]


class UIPython:
    """Main window: crawl a news site, build a word cloud and frequency charts.

    The widgets are created at runtime from "UI.ui". Clicking ``ciyunButton``
    runs the whole pipeline (see ``ciyun``).
    """

    def __init__(self):
        # Load the UI definition and build the window object from it.
        qfile = QFile("UI.ui")
        qfile.open(QFile.ReadOnly)
        self.ui = QUiLoader().load(qfile)
        # Close only after loading so no file handle is left open.
        qfile.close()
        self.ui.ciyunButton.clicked.connect(self.ciyun)

    def writetext(self, url_name, address):
        """Download every page in ``url_name`` and append its text to ``address``.

        ``address`` is truncated first. For each page the raw HTML is appended
        to "try.html", and the text of its <title>, <content> and <p> tags is
        appended to ``address``, one tag per line. A page that cannot be
        downloaded is reported on stdout and skipped, so one blocked request
        does not abort the whole crawl.

        Args:
            url_name: iterable of URL strings to fetch.
            address: path of the text file that receives the extracted text.
        """
        # Start from an empty output file.
        with open(address, mode='w', encoding='utf-8') as f:
            f.write('')
        for j in url_name:
            try:
                r = request.Request(j)
                r.add_header('User-agent', 'PyMOTW(https://pymotw.com/)')
                with request.urlopen(r) as responce:
                    data = responce.read().decode('utf-8', 'ignore')
            except Exception:
                # Deliberately broad: any failure for one page (bad URL,
                # network error, anti-crawler rejection) must only skip it.
                print("页面加载失败{0}\n".format(j))
                continue
            with open('try.html', mode='a', encoding='utf-8') as f:
                f.write(data)
            data = BeautifulSoup(data, 'lxml')
            tag_groups = (
                data.find_all('title'),
                data.find_all('content'),
                data.find_all('p'),
            )
            with open(address, mode='a', encoding='utf-8') as f:
                for group in tag_groups:
                    for i in group:
                        f.write(str(i.text) + '\n')

    def definewc(self):
        """Build a ``wordcloud.WordCloud`` from the options chosen in the UI.

        Returns:
            A configured, not yet generated, ``WordCloud`` instance.
        """
        # Read the current selections from the UI widgets.
        max_words = self.ui.max_words.currentText()
        max_font_size = self.ui.max_font_size.currentText()
        colormap = self.ui.colormap.currentText()
        background_color = self.ui.background_color.currentText()
        font_path = self.ui.font_path.currentText()
        mask_name = self.ui.maskname.currentText()
        contour_color = self.ui.contour_color.currentText()
        contour_width = self.ui.contour_width.text()

        # Map the displayed font name to a font file; an unknown name is
        # passed through unchanged. Raw strings keep the backslashes literal.
        font_files = {
            '中文简体': r'fonts\simfang.ttf',
            '方正舒体': r'fonts\FZSTK.TTF',
            '华文行楷': r'fonts\STXINGKA.TTF',
        }
        font_path = font_files.get(font_path, font_path)

        # Map the displayed mask name to an image. Any other selection means
        # "no mask": WordCloud needs an array or None, never the label text.
        mask_files = {
            '中国地图': 'E://语音包//ChinaMap.png',
            '爱丽丝': 'E://语音包//alice_mask.png',
        }
        mask_file = mask_files.get(mask_name)
        mask = np.array(Image.open(mask_file)) if mask_file else None

        wc = wordcloud.WordCloud(
            font_path=str(font_path),
            background_color=str(background_color),
            mask=mask,
            colormap=str(colormap),
            max_words=int(max_words),
            max_font_size=int(max_font_size),
            contour_color=contour_color,
            contour_width=int(contour_width)
        )
        return wc

    def showwordcloud(self, address):
        """Analyse the crawled text and present every derived artefact.

        Reads ``address``, removes punctuation, Latin letters and digits,
        segments the text with jieba and counts word frequencies. It then
        renders the word cloud to "wc.png" and shows it in the UI, stores the
        counts through ``数据库.insertdb``, writes "word_counts_topALL.text"
        plus the CSV/XLS exports, renders the selected chart and starts a
        daemon thread running ``keepdriver``.

        Args:
            address: path of the text file produced by ``writetext``.
        """
        with open(address, mode='r', encoding='utf-8') as f:
            string_data = f.read()

        # Pre-processing: drop English and Chinese punctuation in one pass,
        # then drop Latin letters and digits.
        string_data = string_data.translate(
            str.maketrans('', '', Englishpunctuation + zhonpunctuation))
        string_data = re.sub(r'[a-zA-Z\d]', '', string_data)

        # Custom stop words; a set gives O(1) membership tests.
        remove_words = {
            u'的', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都',
            u'中', u'在', u'了', u'通常', u'如果', u'我们', u'需要', u'他',
            u'要', u"\u3000", u'年', u'月', u'也', u'你', u'\n', u' ', u'▎',
        }
        # Precise-mode segmentation, keeping only non-stop words.
        object_list = [
            word for word in jieba.cut(string_data, cut_all=False)
            if word not in remove_words
        ]

        # Word-frequency statistics.
        word_counts = collections.Counter(object_list)
        if not word_counts:
            # Nothing was crawled (e.g. every download failed); a word cloud
            # cannot be generated from zero words.
            print("没有可用于生成词云的文本\n")
            return
        maxword_number = self.ui.number.currentText()
        # Top-N words for the charts, and the full ranking for the exports.
        word_counts_top = word_counts.most_common(int(maxword_number))
        word_counts_topall = word_counts.most_common()
        print(word_counts_top)

        # Render the word cloud image.
        wc = self.definewc()
        wc.generate_from_frequencies(word_counts)
        wc.to_file('wc.png')

        # Show the rendered image inside the UI's graphics view.
        self.ui.GraphView.scene_img = QGraphicsScene()
        self.imgShow = QPixmap()
        self.imgShow.load('wc.png')
        self.imgShowItem = QGraphicsPixmapItem()
        self.imgShowItem.setPixmap(QPixmap(self.imgShow))
        self.ui.GraphView.scene_img.addItem(self.imgShowItem)
        self.ui.GraphView.setScene(self.ui.GraphView.scene_img)
        self.ui.GraphView.fitInView(self.imgShowItem)

        # Store the full ranking in the MySQL database.
        数据库.insertdb(word_counts_topall)

        # Write the full ranking as tab-separated text.
        with open("word_counts_topALL.text", mode='w', encoding='utf8') as f:
            for i in word_counts_topall:
                f.write(i[0]+'\t'+str(i[1])+'\n')

        # Export to .csv and .xls.
        CSV.writecsv(word_counts_topall)
        CSV.openxls()

        # Translate the selected colour name to a hex code; other values are
        # passed through unchanged.
        colour_codes = {
            'blue': '#abddff',
            'yellow': '#ffff7f',
            'green': '#7cff9d',
            'red': '#ff0000',
        }
        colour = self.ui.colour.currentText()
        colour = colour_codes.get(colour, colour)

        # Render the selected analysis chart from the top-N words.
        word = [list(i) for i in word_counts_top]
        tuxing = self.ui.tuxing.currentText()
        if tuxing == '饼状图':
            showpie.pietu(word, colour)
        elif tuxing == '柱状图':
            showpie.Bartu(word, colour)
        elif tuxing == '散点图':
            showpie.Scatter(word, colour)

        # Show the chart in a browser from a daemon thread so the UI stays
        # responsive and the thread does not keep the process alive.
        thread1 = threading.Thread(target=self.keepdriver, daemon=True)
        thread1.start()

    def keepdriver(self):
        """Open the rendered chart in Edge and reload it in an endless loop."""
        driver = webdriver.Edge()
        while True:
            driver.get('file://C://Users//hp//Desktop//Python//课设//Lib//customized.html')
            driver.maximize_window()
            time.sleep(1000)

    def ciyun(self):
        """Slot for ``ciyunButton``: crawl the selected site and show results."""
        # Displayed source name -> (index into the global ``url``, output file).
        sources = {
            '新浪新闻': (0, 'news_sina.html'),
            '网易新闻': (1, 'news_163.html'),
            '凤凰新闻': (2, 'news_ifeng.html'),
        }
        selected = self.ui.news_address.currentText()
        if selected not in sources:
            # No URL list or output file is defined for this selection.
            print("未知的新闻来源{0}\n".format(selected))
            return
        index, address = sources[selected]
        self.writetext(url[index], address)
        self.showwordcloud(address)


# 登陆界面
class Denlu:
    """Login window loaded from "denlu.ui".

    Buttons: ``acknowledge`` checks the credentials, ``delete_2`` closes the
    window and ``zhuce`` opens the registration window.
    """

    def __init__(self):
        # Load the UI definition and build the window object from it.
        qfile = QFile("denlu.ui")
        qfile.open(QFile.ReadOnly)
        self.ui = QUiLoader().load(qfile)
        # Close only after loading so no file handle is left open.
        qfile.close()
        self.ui.acknowledge.clicked.connect(self.acknowledge)
        self.ui.delete_2.clicked.connect(self.ui.close)
        self.ui.zhuce.clicked.connect(self.zhuce)

    def acknowledge(self):
        """Slot for the confirm button: validate input, then try to log in.

        Empty fields are rejected before the database is queried. On success
        the main window is shown and this window is closed; otherwise an error
        message is displayed.
        """
        name = self.ui.name.text()
        password = self.ui.password.text()
        if name == '' or password == '':
            self.ui.Error.setText('Error! Do not enter a user name or password')
            return
        if 数据库.searchusername_password(name, password):
            # Keep a reference so the main window is not garbage-collected.
            self.state = UIPython()
            self.state.ui.show()
            self.ui.close()
        else:
            self.ui.Error.setText('Error! Incorrect user name or password')

    def zhuce(self):
        """Slot for the register button: open the registration window."""
        # Stored under a separate name so the attribute does not shadow this
        # method on the instance.
        self.zhuce_window = Zhuce()
        self.zhuce_window.ui.show()
        self.ui.close()


# 注册界面
class Zhuce:
    """Registration window loaded from "zhuce.ui"."""

    def __init__(self):
        # Build the window object from the UI definition file.
        ui_file = QFile("zhuce.ui")
        ui_file.open(QFile.ReadOnly)
        ui_file.close()
        self.ui = QUiLoader().load(ui_file)
        # Wire the buttons: submit registers, delete_2 closes the window.
        self.ui.submit.clicked.connect(self.submit)
        self.ui.delete_2.clicked.connect(self.ui.close)

    def submit(self):
        """Slot for the submit button: validate the form and register the user.

        Shows an error when a field is empty, when the two passwords differ,
        or when ``数据库.searchusername`` reports the name as already taken.
        Otherwise the account is inserted and the login window is reopened.
        """
        name = self.ui.username.text()
        first_entry = self.ui.password.text()
        second_entry = self.ui.password1.text()

        if '' in (name, first_entry, second_entry):
            self.ui.Error.setText('Error! Do not enter a user name or password')
            return
        if first_entry != second_entry:
            self.ui.Error.setText('Error! Two times to enter the password do not match')
            return
        # searchusername returns a truthy value when the name is still free.
        if not 数据库.searchusername(name):
            self.ui.Error.setText('Error! 用户名重复')
            return

        数据库.insertusername(name, first_entry)
        self.denlu = Denlu()
        self.denlu.ui.show()
        self.ui.close()


if __name__ == '__main__':
    app = QApplication([])
    denlu = Denlu()
    denlu.ui.show()
    # Run the Qt event loop directly; it blocks until the application exits.
    # The previous code passed the *result* of app.exec_() as a thread target,
    # so the loop already ran here and the thread then tried to call an int.
    app.exec_()

展示分析图片程序：

import pyecharts.options as opts
from pyecharts.charts import Pie,Bar,EffectScatter
from pyecharts.globals import ThemeType
# 图形保存为“customized.html”


# 饼图
def pietu(data_pair, colour):
    """Render a rose-type pie chart of ``data_pair`` to "customized.html".

    Args:
        data_pair: list of [name, value] items; sorted in place by value.
        colour: colour applied to the series labels.
    """
    # Ascending by value; note this reorders the caller's list.
    data_pair.sort(key=lambda item: item[1])

    chart = Pie(
        init_opts=opts.InitOpts(
            width="900px",
            height="600px",
            theme=ThemeType.MACARONS,
        )
    )

    # Series: rosetype "radius" encodes the value in both the sector angle
    # and the sector radius; the pie is centred in its container.
    chart.add(
        series_name="访问来源",
        data_pair=data_pair,
        rosetype="radius",
        radius="55%",
        center=["50%", "50%"],
        label_opts=opts.LabelOpts(is_show=False, position="center"),
    )

    # Global options: centred title and a visible legend.
    chart_title = opts.TitleOpts(
        title="Customized Pie",
        pos_left="center",
        pos_top="20",
        title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
    )
    chart.set_global_opts(
        title_opts=chart_title,
        legend_opts=opts.LegendOpts(is_show=True),
    )

    # Series options: per-item tooltip and the requested label colour.
    chart.set_series_opts(
        tooltip_opts=opts.TooltipOpts(
            trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
        ),
        label_opts=opts.LabelOpts(color=colour),
    )

    chart.render("customized.html")


# 柱状图
def Bartu(data_pair, colour):
    """Render a bar chart of ``data_pair`` to "customized.html".

    Args:
        data_pair: list of [word, count] items; sorted in place by count.
        colour: value forwarded to the chart's ``set_colors``.
    """
    # Ascending by count; note this reorders the caller's list.
    data_pair.sort(key=lambda item: item[1])
    words = [item[0] for item in data_pair]
    counts = [item[1] for item in data_pair]

    chart = Bar(
        init_opts=opts.InitOpts(
            width="900px",
            height="600px",
            theme=ThemeType.MACARONS,
        )
    )
    chart.add_xaxis(words)
    chart.add_yaxis('频率', counts)
    chart.set_colors(colour)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="新闻词出现频率"),
        yaxis_opts=opts.AxisOpts(name="频率"),
        xaxis_opts=opts.AxisOpts(name="词"),
    )
    chart.render("customized.html")


# 散点图
def Scatter(data_pair, colour):
    """Render an effect-scatter chart of ``data_pair`` to "customized.html".

    Args:
        data_pair: list of [word, count] items, plotted in the given order.
        colour: value forwarded to the chart's ``set_colors``.
    """
    words = [item[0] for item in data_pair]
    counts = [item[1] for item in data_pair]

    chart = EffectScatter(
        init_opts=opts.InitOpts(
            width="900px",
            height="600px",
            theme=ThemeType.MACARONS,
        )
    )
    chart.add_xaxis(words)
    chart.add_yaxis('频率', counts)
    chart.set_colors(colour)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="新闻词出现频率"),
        yaxis_opts=opts.AxisOpts(name="频率"),
        xaxis_opts=opts.AxisOpts(name="词"),
    )
    chart.render("customized.html")

数据库登录方法：在浏览器中访问 http://localhost:8080/phpMyAdmin/ （页面标题为“localhost:8080 / localhost | phpMyAdmin 4.6.4”）。

数据库程序：

import pymysql


# 向MySQL服务器插入词云数据
def insertdb(data_pair):
    """Replace the CIYUN table with the given word-frequency data.

    The table is dropped and recreated on every call, then one row is
    inserted per item.

    Args:
        data_pair: iterable of (word, count) items.
    """
    # NOTE(review): credentials are hard-coded; consider moving them to config.
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()

        cursor.execute("DROP TABLE IF EXISTS CIYUN")
        sql = """CREATE TABLE CIYUN(
            NAME CHAR(100) NOT NULL,
            NUMBER INT )"""
        cursor.execute(sql)

        sql = "INSERT INTO CIYUN(NAME,NUMBER) VALUES (%s,%s)"
        rows = [(i[0], str(i[1])) for i in data_pair]
        # One batched call instead of a Python-level loop of execute().
        cursor.executemany(sql, rows)
        db.commit()
    finally:
        # Always release the connection, even when a statement fails.
        db.close()


# 在MySQL服务器中搜索用户名和密码
def searchusername_password(usename,usepassword):
    """Check a user name / password pair against the USERNAME table.

    Args:
        usename: user name to look up.
        usepassword: password expected for that user.

    Returns:
        1 if a row matches both values exactly, otherwise 0.
    """
    # NOTE(review): passwords are stored and compared in plain text.
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        # Let the server narrow the candidates instead of fetching the table.
        sql = "SELECT NAME, PASSWORD FROM USERNAME WHERE NAME = %s"
        cursor.execute(sql, (usename,))
        for row in cursor.fetchall():
            # Exact comparison in Python: the column collation may be looser
            # (e.g. case-insensitive) than a password check should be.
            if row[0] == usename and row[1] == usepassword:
                return 1
        return 0
    finally:
        # Close on every path; previously the no-match path leaked it.
        db.close()


# 在MySQL服务器中搜索用户名
def searchusername(usename):
    """Report whether a user name is still available in the USERNAME table.

    Args:
        usename: user name to look up.

    Returns:
        0 if a row with exactly this name exists, otherwise 1 (name is free).
    """
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        # Let the server narrow the candidates instead of fetching the table.
        sql = "SELECT NAME FROM USERNAME WHERE NAME = %s"
        cursor.execute(sql, (usename,))
        for row in cursor.fetchall():
            # Exact comparison in Python; the column collation may be looser.
            if row[0] == usename:
                return 0
        return 1
    finally:
        # Close on every path; previously the not-found path leaked it.
        db.close()


# 注册用户名和密码
def insertusername(username, usepassword):
    """Insert a new account row into the USERNAME table.

    Args:
        username: name of the new account.
        usepassword: password of the new account.
    """
    # NOTE(review): the password is stored in plain text; hashing it would
    # also require changing searchusername_password and existing rows.
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        sql = "INSERT INTO USERNAME(NAME,PASSWORD) VALUES (%s,%s)"
        value = (str(username), str(usepassword))
        cursor.execute(sql, value)
        db.commit()
    finally:
        # Always release the connection, even when the insert fails.
        db.close()


# 辅助测试用 主程序并未调用 可删除
def creatusername():
    """Recreate the USERNAME table and seed it with two sample accounts.

    Testing helper; it drops any existing USERNAME table first.
    """
    db = pymysql.connect(host='localhost', user='root', password='12345678', db='DB', charset='utf8')
    try:
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS USERNAME")
        sql = """CREATE TABLE USERNAME(
                NAME CHAR(100) NOT NULL,
                PASSWORD  CHAR(100))"""
        cursor.execute(sql)

        sql = "INSERT INTO USERNAME(NAME,PASSWORD) VALUES (%s,%s)"
        value = (('2537148609', '12345678'), ('yangqun', 'kuaile'))
        cursor.executemany(sql, value)
        db.commit()
    finally:
        # Always release the connection, even when a statement fails.
        db.close()

.csv 文件和 .xls 文件程序：

import csv
import xlwt
# csv文件保存为“ciping.csv”
# xls文件保存为“myexcel.xls”


# 写csv文件
def writecsv(data_pair):
    """Write the word-frequency data to "ciping.csv" (UTF-8, with a header).

    Args:
        data_pair: iterable of (word, count) items.
    """
    headers = ['词语', '频率']
    rows = [(i[0], str(i[1])) for i in data_pair]
    # newline='' lets the csv module control line endings; without it every
    # row ends in "\r\r\n" on Windows, which shows up as blank rows.
    with open('ciping.csv', mode='w', encoding='utf8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)


# 写xls文件
def openxls():
    """Copy "ciping.csv" cell by cell into sheet "testsheet" of "myexcel.xls".

    Every value is written as text, exactly as read from the CSV file. Blank
    CSV rows are skipped so the sheet has no empty rows.
    """
    myexcel = xlwt.Workbook()
    mysheet = myexcel.add_sheet("testsheet")
    # newline='' is the csv module's documented way to open files for reading.
    with open("ciping.csv", mode='r', encoding='utf8', newline='') as csvfile:
        # Blank lines (e.g. from a file written with doubled line endings)
        # come back as empty lists; drop them.
        rows = (line for line in csv.reader(csvfile) if line)
        for row_index, line in enumerate(rows):
            for col_index, cell in enumerate(line):
                mysheet.write(row_index, col_index, cell)
    myexcel.save("myexcel.xls")