新浪新闻数据爬取与清洗+新浪新闻数据管理系统
设计要求
新浪新闻数据爬取与清洗
基本要求:完成新浪新闻排行中文章的数据爬取,包括标题、媒体、时间、内容。
进阶要求:对最近一周出现次数最多的关键字排名并进行可视化显示。
新浪新闻数据管理系统
基本要求:完成新浪新闻的展示功能,同时提供删除和查询功能,查询可以按照题目、时间、题目和内容关键字进行查询;完成用户的登录、注册、退出功能。
进阶要求:完成数据导出excel功能、分页功能。
运行部分截图
源码
运行代码之前,先在数据库创建一个如下所示的表格(库名:xinlangxinwen)
# Ciyun.py
import jieba
import wordcloud
class Ciyun:
def __init__(self, tablena, txt):
excluds = {'目前', '没有', '责任编辑', '标题', '他们', '已经'}
words = jieba.lcut(txt)
counts = {}
for word in words:
if len(word) == 1:
continue
else:
rword = word
counts[rword] = counts.get(rword, 0) + 1
for word in excluds:
del counts[word]
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
text = ''
for i in range(20):
word, count = items[i]
text = text + ' ' + word
w = wordcloud.WordCloud(background_color='white', font_path='C/Windows/Font/simkai.ttf')
w.generate(text)
w.to_file('D:/python object/test/新浪新闻爬取/Ciyun/' + tablena + '.png')
# Excel.py
from openpyxl import Workbook
class Excel:
def __init__(self, tablena, list):
wb = Workbook()
ws = wb["Sheet"]
ws['A1'] = 'idnum'
ws['B1'] = 'title'
ws['C1'] = 'media'
ws['D1'] = 'date'
ws['E1'] = 'text'
for i in range(2, len(list) + 2):
for j in range(1, 6):
ws.cell(row=i, column=j).value = str(list[i - 2][j - 1]) # 写单元格
wb.save('D:/python object/test/新浪新闻爬取/Excel/' + tablena + '.xlsx')
# getsHTML.py
import re
import requests
from bs4 import BeautifulSoup
class getsHTML:
# 获取页面的HTML
def getHTML(self, url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except Exception as e:
print('wrong! ' + e)
# 提取每一份排名的js子链接
def getJS(self, string, divid):
so = BeautifulSoup(string, 'html.parser')
Js = so.find(attrs={'id': divid})
jsurl = Js.script.attrs['src']
return jsurl
# 获取分享数排行的js子连接
def getfenxiang(self, string, divid):
so = BeautifulSoup(string, 'html.parser')
Js = so.find(attrs={'id': divid})
i = 0
for k in Js.find_all('script'):
i += 1
if len(Js.find_all('script')) == i:
jsurl = k.attrs['src']
return jsurl
# 提取js中的列表
def gettitle(self, jshtml):
pat = re.compile("\[\{.*?\}\]")
data = pat.findall(jshtml)
return data[0]
# 提取新闻内容
def gettext(self, xinwen):
jsurl = ''
so = BeautifulSoup(xinwen, 'html.parser')
mt = so.find(attrs={'name': 'mediaid'}).attrs['content']
for k in so.find_all('p'):
neirong = k.string
if neirong is None:
continue
if neirong[:5] == '电话:40' or neirong[:8] == '“掌”握科技鲜闻' or neirong[:9] == '24小时滚动播报最' or neirong[
:8] == '更多娱乐八卦、明' or neirong[
:6] == '更多猛料!欢' or neirong[
:11] == 'Copyright ©':
break
jsurl += neirong
return mt, jsurl
# 提取图片内容
def getPicture(self, title, picurl):
idnum = 0
text = ''
so = BeautifulSoup(picurl, 'html.parser')
Js = so.find(attrs={'id': 'eData'})
for k in Js.find_all('dl'):
idnum += 1
picU = k.find('dd').string
houzhui = picU[-4:]
pwd = 'D:/python object/test/新浪新闻爬取/Picture/' + title + str(idnum) + houzhui
with open(pwd, 'wb') as f:
r = requests.get(picU)
f.write(r.content)
f.close()
text += pwd + '; '
return text
# main.py
import getsHTML
import mySql
import zhuce
import TKinter
import os
import shutil
import tkinter as tk
from tkinter import messagebox
class main:
def mains(self, html, tablena, div):
if div[4] == '3':
jsurl = self.gH.getfenxiang(html, div)
else:
jsurl = self.gH.getJS(html, div)
jshtml = self.gH.getHTML(jsurl)
jslist = self.gH.gettitle(jshtml)
jslist = jslist.replace('false', '""') # 'Con92'可能会出现false,防止程序出错,去除(为防止其它也出现所以全部检验一遍)
lists = eval(jslist)
for i in range(len(lists)):
xinwenurl = lists[i].get('url')
xinwenurls = eval(repr(xinwenurl).replace('\\', '')) # 去除转义符
xinwen = self.gH.getHTML(xinwenurls)
try:
media, txt = self.gH.gettext(xinwen)
except:
continue
if txt is None:
continue
txt = txt.replace(' ', '') # 去除文章中的空格
title = lists[i].get('title')
date = lists[i].get('create_date')
self.idnum += 1
self.mS.inSql(tablena, self.idnum, title, media, date, txt)
print(self.idnum)
def maines(self, html, tablena, div):
shutil.rmtree('D:/python object/test/新浪新闻爬取/Picture') # 清除老旧图片,强制删除文件夹
os.mkdir('D:/python object/test/新浪新闻爬取/Picture') # 重新创建文件夹
jsurl = self.gH.getJS(html, div)
jshtml = self.gH.getHTML(jsurl)
jslist = self.gH.gettitle(jshtml)
jslist = jslist.replace('false', '""') # 'Con92'可能会出现false,防止程序出错,去除(为防止其它也出现所以全部检验一遍)
lists = eval(jslist)
for i in range(len(lists)):
xinwenurl = lists[i].get('url')
xinwenurls = eval(repr(xinwenurl).replace('\\', '')) # 去除转义符
xinwen = self.gH.getHTML(xinwenurls)
title = lists[i].get('title')
date = lists[i].get('create_date')
media = lists[i].get('media')
try:
txt = self.gH.getPicture(title, xinwen)
except:
continue
if txt is None:
continue
self.idnum += 1
self.mS.inSql(tablena, self.idnum, title, media, date, txt)
print(self.idnum)
def main(self):
self.idnum = 0
tableName = {
'Con11': '点击量总排行',
'Con12': '评论数总排行',
'Con15': '图片总排行',
'Con21': '国内新闻点击量',
'Con22': '国内新闻评论数',
'Con23': '国内新闻分享数',
'Con31': '国际新闻点击量',
'Con32': '国际新闻评论数',
'Con33': '国际新闻分享数',
'Con41': '社会新闻点击量',
'Con42': '社会新闻评论数',
'Con43': '社会新闻分享数',
'Con51': '体育新闻点击量',
'Con52': '体育新闻评论数',
'Con53': '体育新闻分享数',
'Con61': '科技新闻点击量',
'Con62': '科技新闻评论数',
'Con63': '科技新闻分享数',
'Con71': '财经新闻点击量',
'Con72': '财经新闻评论数',
'Con73': '财经新闻分享数',
'Con81': '娱乐新闻点击量',
'Con82': '娱乐新闻评论数',
'Con83': '娱乐新闻分享数',
'Con91': '军事新闻点击量',
'Con92': '军事新闻评论数',
'Con93': '军事新闻分享数'
}
list_se = self.mS.useSql(self.name.get())
if list_se == ():
messagebox.showerror(title='爬取失败', message='用户名不存在')
elif list_se[0][0] == self.passwd.get():
YN = messagebox.askokcancel('警告', '爬虫会覆盖原有数据,在提示完成前,请勿进行操作!') # 是/否,返回值yes/no
if YN:
html = self.gH.getHTML(r'http://news.sina.com.cn/hotnews/')
for i in range(1, 10):
for j in range(1, 4):
if i == 1 and j == 3:
continue
else:
tablena = tableName.get('Con' + str(i) + str(j))
self.mS.crSql(tablena)
self.mains(html, tablena, 'Con' + str(i) + str(j))
tablena = tableName.get('Con15')
self.mS.crSql(tablena)
self.maines(html, tablena, 'Con15')
messagebox.showinfo(title='成功', message='爬取完成')
else:
messagebox.showerror(title='爬取失败', message='密码错误')
def wins(self):
self.gH = getsHTML.getsHTML()
self.mS = mySql.mySql()
self.zc = zhuce.zhuce()
self.TK = TKinter.TKinter()
login = tk.Tk()
login.title('用户登录')
login.geometry('210x180+100+100')
tk.Label(login, text='用户登录').grid(row=0, column=0, columnspan=2, pady=5)
tk.Label(login, text='用户名:').grid(row=1, column=0, pady=5)
self.name = tk.Entry(login)
self.name.grid(row=1, column=1, pady=5)
tk.Label(login, text='密码:').grid(row=2, column=0, sticky=tk.E, pady=5)
self.passwd = tk.Entry(login, show='*')
self.passwd.grid(row=2, column=1, pady=5)
tk.Button(login, text='登录', command=self.logins).grid(row=3, column=1, pady=5)
tk.Button(login, text='注册', command=self.zhuce).grid(row=3, column=0, pady=5)
tk.Button(login, text='注销', command=self.zhuxiao).grid(row=4, column=0, pady=5)
tk.Button(login, text='爬虫', command=self.main).grid(row=4, column=1, pady=5)
login.mainloop()
def logins(self):
list_se = self.mS.useSql(self.name.get())
if list_se == ():
messagebox.showerror(title='登录失败', message='用户名不存在')
elif list_se[0][0] == self.passwd.get():
self.TK.wins()
else:
messagebox.showerror(title='登录失败', message='密码错误')
def zhuce(self):
self.zc.wins()
def zhuxiao(self):
list_se = self.mS.useSql(self.name.get())
if list_se == ():
messagebox.showerror(title='注销失败', message='用户名不存在')
elif list_se[0][0] == self.passwd.get():
YN = messagebox.askokcancel('警告', '注销后将无法找回') # 是/否,返回值yes/no
if YN:
self.mS.deUse(self.name.get())
messagebox.showinfo(title='成功', message='注销成功')
else:
messagebox.showerror(title='注销失败', message='密码错误')
ma = main()
ma.wins()
# mySql.py
import pymysql
import time
from tkinter import messagebox
import Ciyun
import Excel
class mySql:
# 建立连接
def __init__(self):
self.connection = pymysql.connect('localhost', 'root', '123456', 'xinlangxinwen')
self.cursor = self.connection.cursor()
# 建立数据库表格
def crSql(self, tablena):
sql = 'create table ' + tablena + '(idnum int not null, title varchar(50) not null, media varchar(50), date date, text text )'
self.cursor.execute('use xinlangxinwen')
self.cursor.execute('DROP TABLE IF EXISTS ' + tablena)
self.cursor.execute(sql)
# 数据导入数据库
def inSql(self, tablena, idnum, title, media, date, text):
title = title.replace('"', r'\"')
media = media.replace('"', r'\"')
text = text.replace('"', r'\"') # 去除双引号对输入的影响
insert = 'insert into ' + tablena + ' values ("' + str(
idnum) + '","' + title + '","' + media + '","' + date + '","' + text + '")'
try:
time.strptime(date, "%Y-%m-%d")
except:
messagebox.showerror(title='错误', message='日期格式不正确')
try:
self.cursor.execute(insert)
self.connection.commit()
except:
pass # 1406, "Data too long for column 'text' at row 1"
# 从数据库爬取新闻题目
def outSql(self, tablena, title, media, date, text, page):
select = 'select title from ' + tablena + ' where title like "%' + title + '%" and media like "%' + media + '%" and date like "%' + date + '%" and text like "%' + text + '%" limit ' + str(
page) + ',20'
self.cursor.execute(select)
list_se = self.cursor.fetchall()
return list_se
# 查询新闻的详细信息
def selSql(self, tablena, title):
select = 'select * from ' + tablena + ' where title = "' + title + '"'
self.cursor.execute(select)
list_se = self.cursor.fetchall()
return list_se
# 更新新闻信息
def upSql(self, tablena, idnum, title, media, date, text):
sql = 'update ' + tablena + ' set title = "' + title + '", media = "' + media + '", date = "' + date + '", text="' + text + '" where idnum = ' + str(
idnum)
self.cursor.execute(sql)
self.connection.commit()
# 删除新闻
def delSql(self, tablena, idnum):
sql = 'delete from ' + tablena + ' where idnum = ' + str(idnum)
self.cursor.execute(sql)
self.connection.commit()
# 用户登录验证
def useSql(self, name):
select = 'select password from user where name= "' + name + '"'
self.cursor.execute(select)
list_se = self.cursor.fetchall()
return list_se
# 用户注册
def crUse(self, name, passwd):
insert = 'insert into user values ("' + name + '" , "' + passwd + '")'
self.cursor.execute(insert)
self.connection.commit()
# 用户注销
def deUse(self, name):
sql = 'delete from user where name = "' + name + '"'
self.cursor.execute(sql)
self.connection.commit()
# 读取内容,生成词云
def Ciyun(self, tablena):
txt = ''
select = 'select text from ' + tablena
self.cursor.execute(select)
list_se = self.cursor.fetchall()
for i in list_se:
txt += str(i)
Ciyun.Ciyun(tablena, txt)
# 读取数据库全部数据,用于导出到Excel
def Excel(self, tablena):
select = 'select * from ' + tablena
self.cursor.execute(select)
list_se = self.cursor.fetchall()
Excel.Excel(tablena, list_se)
# 析构函数,关闭通道
def __del__(self):
self.cursor.close()
self.connection.close()
# TKinter.py
import mySql
import xiangxi
import time
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
class TKinter:
def wins(self):
self.idnum = 0
self.win = tk.Tk()
self.win.title('新闻总览')
self.win.geometry('500x700+350+50')
tk.Label(self.win, text='新闻类别:').grid(row=0, column=0, pady=5, sticky=tk.E)
self.cmb = ttk.Combobox(self.win, state='readonly') # 设置新闻类别的下拉框
self.cmb.grid(row=0, column=1, sticky=tk.W)
self.cmb['value'] = (
'点击量总排行', '评论数总排行', '图片总排行', '国内新闻点击量', '国内新闻评论数', '国内新闻分享数', '国际新闻点击量', '国际新闻评论数',
'国际新闻分享数',
'社会新闻点击量', '社会新闻评论数', '社会新闻分享数', '体育新闻点击量', '体育新闻评论数', '体育新闻分享数', '科技新闻点击量', '科技新闻评论数', '科技新闻分享数',
'财经新闻点击量',
'财经新闻评论数', '财经新闻分享数', '娱乐新闻点击量', '娱乐新闻评论数', '娱乐新闻分享数', '军事新闻点击量', '军事新闻评论数', '军事新闻分享数')
self.cmb.current(0)
self.mS = mySql.mySql()
tk.Label(self.win, text='标题:').grid(row=1, column=0, pady=5, sticky=tk.E)
self.titleEntry = tk.Entry(self.win, width=40)
self.titleEntry.grid(row=1, column=1, pady=5, sticky=tk.W)
tk.Label(self.win, text='媒体:').grid(row=2, column=0, pady=5, sticky=tk.E)
self.mediaEntry = tk.Entry(self.win, width=40)
self.mediaEntry.grid(row=2, column=1, pady=5, sticky=tk.W)
tk.Label(self.win, text='时间:').grid(row=3, column=0, pady=5, sticky=tk.E)
self.dateEntry = tk.Entry(self.win, width=40)
self.dateEntry.insert(tk.END, 'YYYY-MM-DD')
self.dateEntry.grid(row=3, column=1, pady=5, sticky=tk.W)
self.dateEntry.bind("<Button-1>", self.clear) # 设定点击事件,点击文本框清除默认值
tk.Label(self.win, text='内容:').grid(row=4, column=0, pady=5, sticky=tk.E)
self.txtEntry = tk.Entry(self.win, width=40)
self.txtEntry.grid(row=4, column=1, pady=5, sticky=tk.W)
self.creatButton = tk.Button(self.win, text='添加', width=10, height=1, command=self.Creat)
self.creatButton.grid(row=5, column=0, pady=5)
self.selectButton = tk.Button(self.win, text='查询', width=10, height=1, command=self.shouye)
self.selectButton.grid(row=5, column=1, pady=5)
self.selectButton = tk.Button(self.win, text='词云', width=10, height=1, command=self.ciyun)
self.selectButton.grid(row=6, column=0, pady=5)
self.selectButton = tk.Button(self.win, text='导出', width=10, height=1, command=self.daochu)
self.selectButton.grid(row=6, column=1, pady=5)
self.listBox = tk.Listbox(self.win, width=70, height=20)
self.listBox.grid(row=7, column=0, columnspan=2, pady=5)
self.listBox.bind('<<ListboxSelect>>', self.click) # 绑定点击事件
self.pageEntry = tk.Entry(self.win, width=5)
self.pageEntry.grid(row=8, column=0, columnspan=2, pady=5)
self.selectButton = tk.Button(self.win, text='上一页', width=10, height=1, command=self.last)
self.selectButton.grid(row=9, column=0)
self.selectButton = tk.Button(self.win, text='下一页', width=10, height=1, command=self.next)
self.selectButton.grid(row=9, column=1)
self.win.mainloop()
def clear(self, event):
if self.dateEntry.get() == 'YYYY-MM-DD':
self.dateEntry.delete(0, tk.END)
def Creat(self):
self.idnum -= 1
self.tablena = self.cmb.get()
title = self.titleEntry.get()
media = self.mediaEntry.get()
date = self.dateEntry.get()
txt = self.txtEntry.get()
if date == 'YYYY-MM-DD':
date = ''
else:
try:
time.strptime(date, "%Y-%m-%d")
except:
messagebox.showerror(title='错误', message='日期格式不正确')
return
if title == '' or media == '' or date == '' or txt == '':
messagebox.showerror(title='失败', message='添加信息有空值')
return
self.mS.inSql(self.tablena, self.idnum, title, media, date, txt)
messagebox.showinfo(title='成功', message='添加成功')
# 查询按钮调用方法,重置页面为首页
def shouye(self):
self.page = 0
self.Select()
self.pageEntry.delete(0, tk.END)
self.pageEntry.insert(0, str(int(self.page / 20 + 1)))
# 上一页
def last(self):
self.page -= 20
if self.page < 0:
self.page = 0
self.Select()
self.pageEntry.delete(0, tk.END)
self.pageEntry.insert(0, str(int(self.page / 20 + 1)))
# 下一页
def next(self):
self.page += 20
self.Select()
self.pageEntry.delete(0, tk.END)
self.pageEntry.insert(0, str(int(self.page / 20 + 1)))
def Select(self):
self.tablena = self.cmb.get()
self.listBox.delete(0, tk.END)
title = self.titleEntry.get()
media = self.mediaEntry.get()
date = self.dateEntry.get()
if date == 'YYYY-MM-DD' or date == '':
date = ''
else:
try:
time.strptime(date, "%Y-%m-%d")
except:
messagebox.showerror(title='错误', message='日期格式不正确')
return
txt = self.txtEntry.get()
list_se = self.mS.outSql(self.tablena, title, media, date, txt, self.page)
for i in list_se:
i = str(i)[2:-3]
self.listBox.insert(tk.END, i)
def ciyun(self):
self.mS.Ciyun(self.tablena)
messagebox.showinfo(title='成功', message='词云导出在Ciyun目录中')
def daochu(self):
self.tablena = self.cmb.get()
self.mS.Excel(self.tablena)
messagebox.showinfo(title='成功', message='表格导出在Excel目录中')
def click(self, event):
w = event.widget
title = w.get(w.curselection()) # 获取鼠标选中的标题
xx = xiangxi.xiangxi(self.tablena, title)
xx.wins()
# xiangxi.py
import mySql
import time
import tkinter as tk
from tkinter import messagebox
class xiangxi:
def __init__(self, tablena, title):
self.tablena = tablena
self.mS = mySql.mySql()
self.list_se = self.mS.selSql(tablena, title)
def wins(self):
self.win = tk.Tk()
self.win.title('详细信息')
self.win.geometry('650x500+870+150')
tk.Label(self.win, text='标题:').grid(row=0, column=0, pady=5, sticky=tk.E)
self.titleEntry = tk.Entry(self.win, width=40)
self.titleEntry.grid(row=0, column=1, pady=5, sticky=tk.W)
self.titleEntry.insert(tk.END, self.list_se[0][1])
tk.Label(self.win, text='媒体:').grid(row=1, column=0, pady=5, sticky=tk.E)
self.mediaEntry = tk.Entry(self.win, width=40)
self.mediaEntry.grid(row=1, column=1, pady=5, sticky=tk.W)
self.mediaEntry.insert(tk.END, self.list_se[0][2])
tk.Label(self.win, text='时间:').grid(row=2, column=0, pady=5, sticky=tk.E)
self.dateEntry = tk.Entry(self.win, width=40)
self.dateEntry.grid(row=2, column=1, pady=5, sticky=tk.W)
self.dateEntry.insert(tk.END, self.list_se[0][3])
tk.Label(self.win, text='内容:').grid(row=3, column=0, pady=5, sticky=tk.E)
self.txtText = tk.Text(self.win)
self.txtText.grid(row=3, column=1, pady=5, sticky=tk.W)
self.txtText.insert(tk.END, self.list_se[0][4])
self.updatetButton = tk.Button(self.win, text='修改', width=10, height=1, command=self.Update)
self.updatetButton.grid(row=4, column=0, columnspan=2, pady=5)
self.deleteButton = tk.Button(self.win, text='删除', width=10, height=1, command=self.Delete)
self.deleteButton.grid(row=4, column=1, columnspan=2, pady=5, sticky=tk.W)
self.win.mainloop()
def Update(self):
title = self.titleEntry.get()
media = self.mediaEntry.get()
date = self.dateEntry.get()
try:
time.strptime(date, "%Y-%m-%d")
except:
messagebox.showerror(title='错误', message='日期格式不正确')
return
text = self.txtText.get('1.0', tk.END)
self.mS.upSql(self.tablena, self.list_se[0][0], title, media, date, text)
messagebox.showinfo(title='成功', message='修改成功')
def Delete(self):
YN = messagebox.askokcancel('警告', '删除后信息无法恢复') # 是/否,返回值yes/no
if YN:
self.mS.delSql(self.tablena, self.list_se[0][0])
messagebox.showinfo(title='成功', message='删除成功')
# zhuce.py
import mySql
from tkinter import *
from tkinter import messagebox
class zhuce:
def wins(self):
self.registered = Tk()
self.registered.title('registered')
self.registered.geometry('230x150+100+380')
Label(self.registered, text='用户注册').grid(row=0, column=0, columnspan=2)
Label(self.registered, text='用户名:').grid(row=1, column=0, sticky=E)
self.names = Entry(self.registered)
self.names.grid(row=1, column=1)
Label(self.registered, text='密码:').grid(row=2, column=0, sticky=E)
self.passwds = Entry(self.registered, show='*')
self.passwds.grid(row=2, column=1)
Label(self.registered, text='确认密码:').grid(row=3, column=0)
self.repasswd = Entry(self.registered, show='*')
self.repasswd.grid(row=3, column=1)
Button(self.registered, text='注册', command=self.registeredes).grid(row=4, column=0, columnspan=2, pady=5)
self.registered.mainloop()
def registeredes(self):
mS = mySql.mySql()
name = self.names.get()
passwd = self.passwds.get()
re = mS.useSql(name)
if passwd != self.repasswd.get():
messagebox.showerror(title='注册失败', message='两次密码不一致')
elif name == '':
messagebox.showerror(title='注册失败', message='用户名为空')
elif passwd == '':
messagebox.showerror(title='注册失败', message='密码为空')
elif re:
messagebox.showerror(title='注册失败', message='用户名已存在')
else:
mS.crUse(name, passwd)
messagebox.showinfo(title='成功', message='注册成功')
self.registered.destroy()
请注意源码中的绝对路径,请做相应的修改