import re
import matplotlib
import requests
import xlwt
import matplotlib.pyplot as plt
import pandas as pd
def open_xlsx():
    """Load the exported ranking workbook, report its quality, clean it and save it.

    Reads sheet '腾讯动漫排行榜' of '腾讯动漫排行榜.xlsx', prints duplicate and
    missing-value diagnostics, drops every row that contains a missing cell,
    casts the '评分数' and '收藏人数' columns to int and overwrites the same
    workbook with the cleaned rows.

    Incomplete rows are dropped *before* the integer cast: a column that still
    holds NaN cannot be converted to int, so casting first would abort the
    whole clean-up as soon as one cell is missing.
    """
    # Load the worksheet into a DataFrame.
    data = pd.read_excel('腾讯动漫排行榜.xlsx', sheet_name='腾讯动漫排行榜')

    # Duplicate check: one boolean per row, True where the title repeats an
    # earlier row.
    print("查重:")
    print(data.duplicated(subset='动漫名字'))

    # Missing-value check: boolean mask, True where a cell is empty.
    print("查缺失值:")
    print(data.isnull())

    # Preview only: show the table with gaps replaced by a placeholder.
    # The result is not assigned back, so `data` itself is left unchanged.
    print("处理缺失数据用空值代替缺失")
    print(data.fillna(value='空值'))

    # Number of missing cells per column, largest first.
    missing_per_column = data.isnull().sum().sort_values(ascending=False)
    print("检查缺失数据所在的列和数目:")
    print(missing_per_column)

    # Remove every row that has at least one missing cell.
    print("删除有空值的不完整的行")
    cleaned = data.dropna(axis=0, how='any')

    # Normalise the count columns to integers now that no NaN remains.
    # DataFrame.astype returns a new frame, so no chained assignment occurs.
    print("规范化数据类型:")
    cleaned = cleaned.astype({'评分数': 'int', '收藏人数': 'int'})
    print(cleaned['观看人数(亿)'])
    print(cleaned['收藏人数'])

    # Persist the cleaned table over the original workbook.
    cleaned.to_excel('腾讯动漫排行榜.xlsx', sheet_name='腾讯动漫排行榜', index=False)
    print("保存结果成功")
def we0(datalist):
    """Draw, save and show a scatter plot of the favourite counts of the top ten.

    Args:
        datalist: sequence of scraped records. Only the first ten are used;
            element 0 of each record becomes the x label and element 6 is
            converted with ``int()`` to give the y value.

    Raises:
        ValueError: if element 6 of a used record is not an integer string.

    The figure is written to ``D:\\画图\\散点图.png`` and then displayed.
    """
    # Configure a CJK-capable font here as well, so the Chinese title and
    # labels render even when this function is called on its own.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False

    top_records = datalist[0:10]
    names = [record[0] for record in top_records]
    favourites = [int(record[6]) for record in top_records]

    # 1. Create the canvas.
    plt.figure(figsize=(20, 10))
    # 2. Plot the points; the label is what the legend shows.
    plt.scatter(names, favourites, c='red', label='收藏人数')
    # Annotate every point with its own value, just above the marker.
    for name, count in zip(names, favourites):
        plt.text(name, count, count, ha='center', va='bottom', fontsize=20)
    plt.title("腾讯动漫畅销榜前十收藏人数散点图")
    # 3. Legend, save, display.
    plt.legend()
    # Raw string: the backslashes are path separators, not escape sequences.
    plt.savefig(r'D:\画图\散点图.png')
    plt.show()
def we1(datalist):
    """Draw, save and show a line chart of the view counts of the top ten.

    Args:
        datalist: sequence of scraped records. Only the first ten are used;
            element 0 of each record becomes the x label and element 5 is
            converted with ``float()`` to give the y value.

    Raises:
        ValueError: if element 5 of a used record is not a numeric string.

    The figure is written to ``D:\\画图\\折线图.png`` and then displayed.
    """
    # Configure a CJK-capable font here as well, so the Chinese title and
    # axis labels render even when this function is called on its own.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False

    top_records = datalist[0:10]
    names = [record[0] for record in top_records]
    views = [float(record[5]) for record in top_records]

    plt.figure(figsize=(20, 10), dpi=100)
    # Line plus markers on the same points.
    plt.plot(names, views, c='red')
    plt.scatter(names, views, c='red')
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.xlabel("动漫名", fontdict={'size': 16})
    plt.ylabel("观看人数(亿)", fontdict={'size': 16})
    plt.title("腾讯动漫畅销榜前十观看人数折线图", fontdict={'size': 20})
    # Annotate every point with its own value, just above the marker.
    for name, value in zip(names, views):
        plt.text(name, value, value, ha='center', va='bottom', fontsize=20)
    # Raw string: the backslashes are path separators, not escape sequences.
    plt.savefig(r'D:\画图\折线图.png')
    plt.show()
def we(datalist):
    """Draw, save and show a pie chart built from element 2 of the top ten records.

    Args:
        datalist: sequence of scraped records. Only the first ten are used;
            element 0 of each record is the wedge label and element 2 is
            converted with ``float()`` to give the wedge size.

    Raises:
        ValueError: if element 2 of a used record is not a numeric string.

    The figure is written to ``D:\\画图\\饼形图.png`` and then displayed.
    """
    # Use a font that contains CJK glyphs and keep the minus sign renderable.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False

    top_records = datalist[0:10]
    names = [record[0] for record in top_records]
    # Convert explicitly, as histogram() does for the same field, instead of
    # handing raw strings to matplotlib.
    # NOTE(review): the title mentions rating counts, while element 2 is the
    # field histogram() plots as the score - confirm the intended column.
    values = [float(record[2]) for record in top_records]

    plt.pie(values, labels=names, labeldistance=1.1, autopct='%.2f%%', pctdistance=1.5)
    plt.title("腾讯动漫畅销榜前十评价数占比")
    # Raw string: the backslashes are path separators, not escape sequences.
    plt.savefig(r'D:\画图\饼形图.png')
    plt.show()
def histogram(datalist):
    """Draw, save and show a horizontal bar chart of element 2 of the top ten records.

    Args:
        datalist: sequence of scraped records. Only the first ten are used;
            element 0 of each record is the bar label and element 2 is
            converted with ``float()`` to give the bar length.

    Raises:
        ValueError: if element 2 of a used record is not a numeric string.

    The figure is written to ``D:\\画图\\条形图.png`` and then displayed.
    """
    matplotlib.rc('font', family='SimHei', weight='bold')
    plt.rcParams['axes.unicode_minus'] = False

    # barh draws position 0 at the bottom, so reverse the records to put the
    # first entry on top. Names and values are reversed TOGETHER; reversing
    # only the names would attach every label to another entry's bar.
    top_records = datalist[0:10][::-1]
    names = [record[0] for record in top_records]
    scores = [float(record[2]) for record in top_records]

    fig, ax = plt.subplots()
    positions = range(len(names))
    bars = ax.barh(positions, scores, color='black')

    # Put the numeric value at the right-hand end of every bar.
    for rect in bars:
        width = rect.get_width()
        ax.text(width, rect.get_y() + rect.get_height() / 2, '%.1f' % float(width),
                ha='left', va='center')

    # One tick per bar on the y axis, labelled with the entry name.
    ax.set_yticks(positions)
    ax.set_yticklabels(names)
    # Hide the x-axis tick labels; the values are already printed on the bars.
    plt.xticks(())
    plt.title('腾讯动漫畅销榜前十评价', loc='center', fontsize='25',
              fontweight='bold', color='red')
    # Raw string: the backslashes are path separators, not escape sequences.
    plt.savefig(r'D:\画图\条形图.png')
    plt.show()
# ---------------------------------------------------------------------------
# Script body: scrape the ranking page, follow every entry to its detail page,
# export the extracted fields to a workbook, then draw the charts and clean
# the exported data.
# ---------------------------------------------------------------------------

# Seconds to wait for the server. Without a timeout requests.get() can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

child_href_list = []
b = []
datalist = []

domain = "https://ac.qq.com/Rank/comicRank/type/pay"
resp = requests.get(domain, timeout=REQUEST_TIMEOUT)

# obj1 matches one <li> of the ranking page (groups: rank, name, url, name1).
obj1 = re.compile(r'<li class=".*?">.*?<sub class="mod-rank-.*? ui-left">(?P<rank>.*?)</sub>.*?<a class="mod-rank-name ui-left text-overflow" title="(?P<name>.*?)".*?href="(?P<url>.*?)">(?P<name1>.*?)</a>.*?</li>',re.S)
# obj2 matches the header of a detail page (groups, in order: name, picture,
# grade, grade1, writer, popular, collection).
obj2 = re.compile(r'<div class="works-cover ui-left">.*?title="(?P<name>.*?)">.*?<img src="(?P<picture>.*?)".*?评分:<strong class="ui-text-orange">(?P<grade>.*?)</strong>.*?<span>(?P<grade1>.*?)</span>人评分.*?作者:<em style="max-width: 168px;">(?P<writer>.*?) </span>.*?人气:<em>(?P<popular>.*?)亿</em>.*?收藏数:<em id="coll_count">(?P<collection>.*?)</em>',re.S)

# Collect the relative detail-page link of every ranking entry.
result1 = obj1.finditer(resp.text)
for it in result1:
    a = it.groups()
    print(a)
    b.append(it.group('url'))

# Turn the relative links into absolute URLs.
domain1 = "https://ac.qq.com"
child_href_list.extend(domain1 + relative_href for relative_href in b)

# Fetch every detail page and keep the captured groups of each match.
for child_href in child_href_list:
    child_resp = requests.get(child_href, timeout=REQUEST_TIMEOUT)
    datalist.extend(match.groups() for match in obj2.finditer(child_resp.text))

# Export: header row first, then one row per record, with a running number in
# the first column.
# NOTE(review): xlwt is documented as a writer for the legacy .xls format,
# while the file name ends in .xlsx - confirm open_xlsx() can read it back in
# the target environment.
workbook = xlwt.Workbook("utf-8")
sheet = workbook.add_sheet("腾讯动漫排行榜")
col = ["排名","动漫名字", "封面链接", "评分", "评分数", "作者", "观看人数(亿)", "收藏人数"]
for column_index, header in enumerate(col):
    sheet.write(0, column_index, header)
for row_index, record in enumerate(datalist, start=1):
    sheet.write(row_index, 0, row_index)
    # The seven captured fields go into columns 1..7.
    for column_index, value in enumerate(record[:7], start=1):
        sheet.write(row_index, column_index, value)
workbook.save('腾讯动漫排行榜.xlsx')
print("保存完毕")

# Charts first (each call blocks on plt.show()), then clean the exported file.
histogram(datalist)
we(datalist)
we1(datalist)
we0(datalist)
open_xlsx()
# Article title: 基于Python爬虫的腾讯动漫排行分析 (Tencent Comics ranking analysis with a Python crawler)
# First published: 2023-01-11 19:21:15