import requests as req
from nltk import *
import re
import sqlite3
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from IPython.display import display_html
# News section of the Chengdu University website
# timeout prevents the script from hanging forever on a dead connection
res = req.get("http://news.cdu.edu.cn/", timeout=10)
res.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
# print(res.text)
# Parse the listing page
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text, "html.parser")
news = soup.select(".content__secon--list li")

i = 0      # running article index; doubles as the DB primary key
src1 = []  # cleaned article bodies, in fetch order

# Open (or create) the local SQLite database
conn = sqlite3.connect('mydb.db')
cur = conn.cursor()
def getNewInfo(url):
    """Fetch one news article, clean it, persist it, and print jieba cuts.

    Side effects (all via module-level globals):
      - appends the cleaned body text to ``src1``
      - appends the text to ``test.txt``
      - inserts (i, title, body) into the ``mydb`` table through ``cur``
      - prints three jieba segmentations of the body to stdout

    :param url: absolute URL of a single news article page
    """
    soup = BeautifulSoup(req.get(url, timeout=10).text, "html.parser")
    # Title: first line of the title element's text
    title = soup.select(".content__left .content__left--title")[0].text.strip().split("\n")[0]
    # Article body nodes
    src = soup.select(".content__left .content__left--article")
    # Strip HTML tags, then collapse all whitespace runs into single spaces
    tag_re = re.compile(r'<[^>]+>', re.S)
    no_tags = tag_re.sub('', str(src))
    srcReal = re.sub(r'\s+', " ", no_tags)  # raw string avoids invalid-escape warning
    src1.append(srcReal)
    # gb18030 matches the encoding used when test.txt is read back later
    with open("test.txt", "a+", encoding='gb18030') as f:
        # write the freshly computed text directly (was src1[i]: fragile global index)
        f.write(srcReal)
    # Parameterized INSERT; the "mydb" table is created before the crawl loop runs
    cur.execute("INSERT INTO mydb VALUES (?,?,?)", (i, title, srcReal))
    # jieba segmentation demos
    print("/".join(jieba.lcut(srcReal)))                # precise mode
    print("/".join(jieba.lcut(srcReal, cut_all=True)))  # full mode
    print("/".join(jieba.lcut_for_search(srcReal)))     # search-engine mode
# Create the article table.
# IF NOT EXISTS makes re-runs safe: a previous crash before the final
# DROP TABLE would otherwise leave the table behind and abort this run.
cur.execute("CREATE TABLE IF NOT EXISTS mydb("
            "Id int NOT NULL,"
            "title varchar(255) NOT NULL,"
            "src varchar(2000) NOT NULL,"
            "PRIMARY KEY (Id))")
# Walk every article listed on the front page; failures skip to the next one
for item in news:
    try:
        link = item.select("h3 a")[0]['href']
        # print(link)
        getNewInfo(link)
    except Exception as err:
        # best-effort crawl: report the problem and keep going
        print("————————>", err)
    else:
        # only count articles that were fetched and stored successfully
        i = i + 1
# Query the database file.
# NOTE(review): the cursor is lazy — no rows are actually fetched here, so
# `row` is an unused cursor object; confirm whether the results were meant
# to be printed or checked.
row = cur.execute("select * from mydb")
# NOTE(review): dropping the table discards everything just inserted; the
# database file ends up empty after every run — confirm this is intended.
cur.execute("DROP TABLE mydb")
# Commit the transaction
conn.commit()
# Close the database connection
conn.close()
# Build a word cloud from the full corpus written to test.txt during the crawl
with open("test.txt", encoding='gb18030') as file:
    # 1. Read the whole corpus; the with-block closes the file, so the
    #    original redundant file.close() inside the block is removed.
    t = file.read()

ls = jieba.lcut(t)  # segment the corpus (precise mode)

# Word-frequency distribution, sorted by count (computed for inspection;
# WordCloud does its own frequency counting from the joined text below)
fdist = FreqDist(ls)
fd_sort = sorted(fdist.items(), key=lambda d: d[1], reverse=True)

wc1 = WordCloud(
    background_color="white", width=600,
    height=300, max_words=50,
    # A CJK-capable font is required; without it glyphs render as boxes
    font_path="C:\\Windows\\Fonts\\STFANGSO.ttf",
)
wc2 = wc1.generate(' '.join(ls))

# Render the word cloud
plt.imshow(wc2)
plt.axis("off")
plt.show()
# Crawl news articles, run word segmentation, and generate a word cloud.
# (Footer text from the original blog post, published 2023-04-28 20:23:06.)