import requests as req
from nltk import *
import re
import sqlite3
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from IPython.display import display_html
# News section of the Chengdu University website
# timeout prevents the script from hanging forever on a dead connection
res = req.get("http://news.cdu.edu.cn/", timeout=10)
res.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
# print(res.text)
# Parse the listing page
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text, "html.parser")
news = soup.select(".content__secon--list li")

i = 0      # running article index; doubles as the DB primary key
src1 = []  # cleaned article bodies, in fetch order

# Open (or create) the local SQLite database
conn = sqlite3.connect('mydb.db')
cur = conn.cursor()
def getNewInfo(url):
    """Fetch one news article, clean it, persist it, and print jieba cuts.

    Side effects (all via module-level globals):
      - appends the cleaned body text to ``src1``
      - appends the text to ``test.txt``
      - inserts (i, title, body) into the ``mydb`` table through ``cur``
      - prints three jieba segmentations of the body to stdout

    :param url: absolute URL of a single news article page
    """
    soup = BeautifulSoup(req.get(url, timeout=10).text, "html.parser")
    # Title: first line of the title element's text
    title = soup.select(".content__left .content__left--title")[0].text.strip().split("\n")[0]
    # Article body nodes
    src = soup.select(".content__left .content__left--article")
    # Strip HTML tags, then collapse all whitespace runs into single spaces
    tag_re = re.compile(r'<[^>]+>', re.S)
    no_tags = tag_re.sub('', str(src))
    srcReal = re.sub(r'\s+', " ", no_tags)  # raw string avoids invalid-escape warning
    src1.append(srcReal)
    # gb18030 matches the encoding used when test.txt is read back later
    with open("test.txt", "a+", encoding='gb18030') as f:
        # write the freshly computed text directly (was src1[i]: fragile global index)
        f.write(srcReal)
    # Parameterized INSERT; the "mydb" table is created before the crawl loop runs
    cur.execute("INSERT INTO mydb VALUES (?,?,?)", (i, title, srcReal))
    # jieba segmentation demos
    print("/".join(jieba.lcut(srcReal)))                # precise mode
    print("/".join(jieba.lcut(srcReal, cut_all=True)))  # full mode
    print("/".join(jieba.lcut_for_search(srcReal)))     # search-engine mode
# Create the article table.
# IF NOT EXISTS makes re-runs safe: a previous crash before the final
# DROP TABLE would otherwise leave the table behind and abort this run.
cur.execute("CREATE TABLE IF NOT EXISTS mydb("
            "Id int NOT NULL,"
            "title varchar(255) NOT NULL,"
            "src varchar(2000) NOT NULL,"
            "PRIMARY KEY (Id))")
# Walk every article listed on the front page; failures skip to the next one
for item in news:
    try:
        link = item.select("h3 a")[0]['href']
        # print(link)
        getNewInfo(link)
    except Exception as err:
        # best-effort crawl: report the problem and keep going
        print("————————>", err)
    else:
        # only count articles that were fetched and stored successfully
        i = i + 1
# Query the database file.
# NOTE(review): the cursor is lazy — no rows are actually fetched here, so
# `row` is an unused cursor object; confirm whether the results were meant
# to be printed or checked.
row = cur.execute("select * from mydb")
# NOTE(review): dropping the table discards everything just inserted; the
# database file ends up empty after every run — confirm this is intended.
cur.execute("DROP TABLE mydb")
# Commit the transaction
conn.commit()
# Close the database connection
conn.close()
# Build a word cloud from the full corpus written to test.txt during the crawl
with open("test.txt", encoding='gb18030') as file:
    # 1. Read the whole corpus; the with-block closes the file, so the
    #    original redundant file.close() inside the block is removed.
    t = file.read()

ls = jieba.lcut(t)  # segment the corpus (precise mode)

# Word-frequency distribution, sorted by count (computed for inspection;
# WordCloud does its own frequency counting from the joined text below)
fdist = FreqDist(ls)
fd_sort = sorted(fdist.items(), key=lambda d: d[1], reverse=True)

wc1 = WordCloud(
    background_color="white", width=600,
    height=300, max_words=50,
    # A CJK-capable font is required; without it glyphs render as boxes
    font_path="C:\\Windows\\Fonts\\STFANGSO.ttf",
)
wc2 = wc1.generate(' '.join(ls))

# Render the word cloud
plt.imshow(wc2)
plt.axis("off")
plt.show()
# Crawl news articles, run word segmentation, and generate a word cloud.
# (Footer text from the original blog post, published 2023-04-28 20:23:06.)