一:需要的模块
import csv
import json
import re
import time

import matplotlib.pyplot as plt
import pymysql.cursors
import requests
import stylecloud
二:数据爬取
使用requests库爬取网站源代码
def spider(type):
    """Download one Baidu hot-search board page and pass its HTML to ``write``.

    Args:
        type: Value substituted into the ``tab`` query parameter of the board
            URL. The name shadows the builtin but is kept because it is part
            of the existing signature.

    Raises:
        requests.RequestException: If the request times out, cannot be
            completed, or the server answers with an HTTP error status.
    """
    url = "https://top.baidu.com/board?tab={}".format(type)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    # requests has no default timeout; without one a stalled connection
    # would block the crawler forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Stop on 4xx/5xx instead of feeding an error page to the parser.
    response.raise_for_status()
    write(response.text, type)  # hand the page source to the persistence step
三:保存数据
保存数据到 txt、json、csv 中
def write(text, type):
    """Extract hot-search entries from board HTML and persist every entry.

    For each regex match the entry is appended to the module-level
    ``data_list`` (defined outside this block), written to
    ``data_json.json``, ``data.txt`` and ``data_csv.csv`` (all opened in
    append mode), and passed to ``get_con`` for database storage.

    The three output files are opened once for the whole call rather than
    once per entry, so they are created even when ``text`` has no matches.

    NOTE(review): the source had lost its indentation; the per-entry writes
    and the ``get_con`` call are placed inside the loop because they use the
    per-match values, and the two final prints after it. Confirm against the
    original script.

    Args:
        text: HTML source of a hot-search board page.
        type: Board name, forwarded unchanged to ``get_con``.
    """
    # ``names`` captures the text of the "c-single-text-ellipsis" div and
    # ``ranks`` the text of the following "hot-index_1Bl1a" div; re.S lets
    # ``.`` span the newlines between them.
    obj = re.compile(r'<div class="c-single-text-ellipsis">(?P<names>.*?)</div>.*?'
                     r'<div class="trend_2RttY">.*?<div class="hot-index_1Bl1a">(?P<ranks>.*?)</div>.*?<div class="text_1lUwZ">',
                     re.S)
    with open('data_json.json', 'a', encoding='utf-8') as json_file, \
            open('data.txt', 'a', encoding='utf-8') as txt_file, \
            open('data_csv.csv', 'a', encoding='utf-8', newline='') as csv_file:
        csvwriter = csv.writer(csv_file)
        for match in obj.finditer(text):
            title = match.group("names")
            rank = match.group("ranks")
            data = {'热搜名字': title, '热搜指数': rank}
            data_list.append(data)
            # One pretty-printed JSON object per entry, appended back to back
            # (same on-disk layout as before; the file as a whole is not a
            # single JSON document).
            json_file.write(json.dumps(data, indent=4, ensure_ascii=False))
            # Terminate each title with a real newline. The previous lone
            # '\r' is not a line ending for most tools; Python's default
            # universal-newline reading yields the same lines either way.
            txt_file.write(title + '\n')
            # Same column order as the regex groups: names, ranks.
            csvwriter.writerow([title, rank])
            get_con(title, rank, type)  # store the entry in the database
    print("-------数据写入完成-------")
    print("-------数据保存成功-------")
四:把爬取的数据保存到数据库中
def get_con(title, rank, type):
    """Store one hot-search entry in MySQL, then chart the board's table.

    The function validates ``type``, reads the MySQL credentials from
    ``zhanghao.txt``, makes sure the ``hotsearch`` database and the board
    tables exist, inserts the entry, reads every row of the board's table
    back and shows it as a bar chart. ``plt.show()`` is called on every
    invocation.

    Args:
        title: Hot-search title, stored in the ``name`` column.
        rank: Hot-search index, stored in the ``rank`` column.
        type: Board name; must be exactly one of ``_BOARD_TABLES`` because it
            is used as the table name.

    Raises:
        ValueError: If ``type`` is not a known board (checked before any
            file or network access).
        OSError: If ``zhanghao.txt`` cannot be read.
        pymysql.MySQLError: If connecting, selecting the database or reading
            the table back fails. Failures while creating the schema or
            inserting the row are reported with ``print`` and not raised.
    """
    table = _resolve_table(type)
    print("开始写入数据库")
    name, pwd = _read_credentials()
    connect = pymysql.connect(host="localhost", port=3306, user=name,
                              password=pwd, charset='utf8')
    try:
        with connect.cursor() as cursor:
            _ensure_schema(connect, cursor)
            _insert_entry(connect, cursor, table, title, rank)
            # Part 5 of the write-up: query the data back out of the
            # database so it can be analysed.
            # `rank` is a reserved word in MySQL 8, hence the backticks; the
            # table name comes from the whitelist, never from raw input.
            cursor.execute("SELECT `name`, `rank` FROM `{}`".format(table))
            select_res = cursor.fetchall()  # tuple of (name, rank) rows
    finally:
        # Always release the connection, even when a statement fails.
        connect.close()
    _plot_board(select_res)


# Boards that have a table in the `hotsearch` database. `realtime` is
# included so that inserts for that board have a table to go to.
_BOARD_TABLES = ('car', 'documentary', 'game', 'movie', 'novel',
                 'teleplay', 'variety', 'cartoon', 'realtime')

# Single DDL template shared by every board table (same columns, engine and
# charset as the hand-written statements it replaces).
_CREATE_TABLE_SQL = (
    "CREATE TABLE IF NOT EXISTS `{table}` ("
    "`name` varchar(255) NOT NULL,"
    "`time` datetime NOT NULL,"
    "`rank` int(255) NOT NULL"
    ") ENGINE=MyISAM DEFAULT CHARSET=utf8"
)


def _resolve_table(board):
    """Return the table name for ``board`` or raise ValueError if unknown."""
    # Exact match: the old substring test sent 'cartoon' to the `car` table.
    if board not in _BOARD_TABLES:
        raise ValueError("unknown hot-search board: {!r}".format(board))
    return board


def _read_credentials(path='zhanghao.txt'):
    """Read the MySQL user (line 1) and password (line 2) from ``path``."""
    with open(path) as f:
        name = f.readline().rstrip()
        pwd = f.readline().rstrip()
    return name, pwd


def _ensure_schema(connect, cursor):
    """Create the `hotsearch` database and board tables when missing."""
    # Schema setup stays best effort, but failures are now reported instead
    # of being swallowed. IF NOT EXISTS makes the "already exists" case a
    # non-error, and no sleep is needed after the statement returns.
    try:
        cursor.execute('CREATE DATABASE IF NOT EXISTS hotsearch')
    except pymysql.MySQLError as exc:
        print("创建数据库失败", exc)
    connect.select_db('hotsearch')
    for table in _BOARD_TABLES:
        try:
            cursor.execute(_CREATE_TABLE_SQL.format(table=table))
        except pymysql.MySQLError as exc:
            print("创建数据表失败", table, exc)


def _insert_entry(connect, cursor, table, title, rank):
    """Insert one (title, rank, now) row into ``table`` and commit it."""
    sql = "INSERT INTO `{}` (`name`, `rank`, `time`) VALUES (%s, %s, NOW())".format(table)
    try:
        cursor.execute(sql, (title, rank))
        connect.commit()
    except pymysql.MySQLError as exc:
        print("插入失败", exc)
        connect.rollback()
    else:
        # Report success only when the insert really went through.
        print('成功插入', cursor.rowcount, '条数据')


def _plot_board(rows):
    """Show a bar chart of hot-search index per title for ``rows``."""
    nm = [row[0] for row in rows]
    rk = [row[1] for row in rows]
    # Figure size and style; SimHei is selected so the Chinese labels render.
    plt.rcParams['figure.figsize'] = (19.0, 12.0)
    plt.rcParams['image.interpolation'] = 'nearest'
    plt.rcParams['image.cmap'] = 'gray'
    plt.rcParams['font.family'] = ['sans-serif']
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.bar(nm, rk)
    plt.xticks(rotation=-18)  # tilt the labels so long titles overlap less
    plt.xlabel("百度热点", fontsize="5")
    plt.ylabel("热度指数", fontsize="5")
    plt.show()
    print("分析完成")
六:生成一个词云
def ciyun():
    """Generate the word-cloud image ``CIYUN.png`` from ``data.txt``.

    The lines of ``data.txt`` are also passed to stylecloud as
    ``custom_stopwords``.

    NOTE(review): using every line of the input file as a stop word looks
    unintentional; confirm the desired stop-word source.
    """
    # Read through a context manager so the file handle is closed promptly.
    with open('data.txt', encoding='utf-8') as f:
        stopwords = f.read().split('\n')
    stylecloud.gen_stylecloud(file_path='data.txt',
                              # Raw string: '\W', '\F' and '\m' are invalid
                              # escape sequences in a normal literal. The
                              # resulting path is unchanged.
                              font_path=r'C:\Windows\Fonts\msyhbd.ttc',
                              output_name='CIYUN.png',
                              icon_name='fas fa-question-circle',
                              size=500,
                              custom_stopwords=stopwords)
    print("-------词云制作完成-------")
七:运行结果