一、爬虫部分
爬取红楼梦精彩点评
import requests # 导入网络请求模块requests
from lxml import etree#导入etree子模块
from bs4 import BeautifulSoup
import pandas as pd
from lxml import etree#导入etree子模块
import numpy
# Review-list page to scrape; the bookId query parameter selects the book.
url = 'https://weread.qq.com/web/bookReview/list?bookId=0da329707210329f0da4d39'
# Send a desktop-browser User-Agent header with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
# requests waits indefinitely unless a timeout is given; bound the wait so a
# stalled connection raises instead of hanging the script.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)  # HTTP status of the fetch, shown for a quick sanity check
# Parse the fetched page and save the review text to a local file.
html = etree.HTML(response.text)
# Text of every <p> directly under the review-content container.
title = html.xpath('//div[@class="bookReviewList_content"]/p/text()')
# Text of every <p> directly under each clickable review list item.
text = html.xpath('//ul[@class="bookReviewList_list"]/li[@class="bookReviewList_item clickable"]/p/text()')
# A context manager guarantees the file is flushed and closed; a bare
# `f.close` (no parentheses) only references the method and never calls it.
with open('红楼梦精彩点评.txt', 'w', encoding='utf-8') as f:
    f.writelines(title)
    f.write('\n')  # separate the two groups of text
    f.writelines(text)
print("写入成功")
爬取文学类小说数据
import requests # 导入网络请求模块requests
from lxml import etree#导入etree子模块
from bs4 import BeautifulSoup
import pandas as pd #导入Pandas模块
import numpy as np
import re
# Category ranking page to scrape.
url = 'https://weread.qq.com/web/category/300000'
# Send a desktop-browser User-Agent header with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
# requests waits indefinitely unless a timeout is given; bound the wait so a
# stalled connection raises instead of hanging the script.
response = requests.get(url, headers=headers, timeout=10)
# Parse the returned markup into an element tree for the XPath queries.
html = etree.HTML(response.text)
# Collect the title, author, daily reader count and recommendation value of
# each listed book.  All four queries share the same path down to the
# per-book info container, so that prefix is spelled out once.
_item_info_xpath = (
    '//ul[@class="ranking_content_bookList"]/li'
    '/div[@class="wr_bookList_item_container"]'
    '/div[@class="wr_bookList_item_info"]'
)
# Book titles.
data_book_name = html.xpath(
    _item_info_xpath + '/p[@class="wr_bookList_item_title"]/text()'
)
# Author names (text of the link inside the author paragraph).
data_book_author = html.xpath(
    _item_info_xpath + '/p[@class="wr_bookList_item_author"]/a/text()'
)
# Reader-count text.
data_reading_number = html.xpath(
    _item_info_xpath + '/p/span/span[@class="wr_bookList_item_reading_number"]/text()'
)
# Recommendation-value text, still carrying its '%' sign at this point.
percents = html.xpath(
    _item_info_xpath + '/p/span/span[@class="wr_bookList_item_reading_percent"]/text()'
)
# Strip the '%' sign from the recommendation values by keeping only the
# numeric part of each scraped string.
# Raw string: '\d' and '\.' in a plain literal are invalid escape sequences
# and trigger a warning on recent Python versions.
pat = r'-?\d+\.?\d*e?-?\d*?'
# Flatten every match from every string into a single list.  A nested
# comprehension avoids sum(list_of_lists, []), which re-copies the
# accumulated list on each addition.
data_reading_percent = [
    number
    for percent in percents
    for number in re.findall(pat, percent)
]
# Assemble the scraped columns into a DataFrame and write it to Excel.
# Raise the maximum number of columns and the line width used when printing.
pd.set_option("display.max_columns",1000)
pd.set_option('display.width',5000)
# Keep column headers aligned when the output contains East Asian characters.
pd.set_option("display.unicode.east_asian_width",True)
# One row per book; the four lists must all have the same length, otherwise
# the DataFrame constructor raises.
databook=pd.DataFrame({
'书名':data_book_name,
'作者':data_book_author,
'单日阅读人数(单位:人)':data_reading_number,
'推荐值(单位:%)':data_reading_percent
})
# Only row 0 of column 2 (the reader count) is converted to float and
# multiplied by 10000; every other row keeps its scraped value.
# NOTE(review): presumably the first entry is scraped in units of ten
# thousand while the others are plain counts — confirm against the page.
data=float(databook.iloc[0,2])
databook.iloc[0,2]=data*10000
# Save the table as an Excel workbook without writing the index column.
databook.to_excel("微信读书文学类书籍榜单.xlsx",index=None)
print('写入成功')
print(databook)
二、数据处理
import seaborn as sns
# Load the saved ranking workbook and drop exact duplicate rows in one chain.
df = pd.read_excel('微信读书文学类书籍榜单.xlsx').drop_duplicates()
# Print column names, dtypes and non-null counts.
df.info()
# Descriptive statistics, shaded with an orange gradient palette.  The styled
# object is not assigned to anything here.
color_map = sns.light_palette('orange', as_cmap=True)
df.describe().style.background_gradient(color_map)
三、可视化分析
红楼梦精彩点评词云可视化
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace stripped from each line.
    """
    # The context manager closes the file as soon as it has been read; a bare
    # open() inside a comprehension leaves the handle to the garbage collector.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
def seg_sentence(sentence, stopwords=None):
    """Segment a sentence with jieba and drop stop words and tab tokens.

    Args:
        sentence: Text to segment; leading and trailing whitespace is
            stripped before segmentation.
        stopwords: Optional collection of words to discard.  When omitted,
            the words are loaded from 'cn_stopwords.txt', so callers that
            process many lines can load the file once and pass it in.

    Returns:
        The kept tokens in their original order, each followed by a single
        space (a non-empty result therefore ends with a space).
    """
    if stopwords is None:
        stopwords = stopwordslist('cn_stopwords.txt')  # default stop-word file
    # A set gives constant-time membership tests; with a plain list every
    # token lookup scans the whole stop-word list.
    blocked = set(stopwords)
    kept = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in blocked and word != '\t'
    ]
    # Build the result in one pass instead of repeated string concatenation.
    return ''.join(word + ' ' for word in kept)
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# Segment the scraped reviews line by line and save the tokenised text.
# Both files are managed by `with`, so the input and output handles are
# closed even if segmentation fails part-way through.
with open("红楼梦精彩点评.txt", 'r', encoding='utf-8') as inputs, \
        open('output.txt', 'w', encoding='utf-8') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # space-separated tokens for this line
        outputs.write(line_seg + '\n')  # one segmented line per input line
# Build the word cloud from the segmented text, save it and display it.
mask = np.array(Image.open("mask.jpg"))  # mask image handed to WordCloud
# Read the whole segmented text up front and release the file immediately,
# so the handle is not held open (or leaked on error) during rendering.
with open('output.txt', 'r', encoding='utf-8') as inputs:
    mytext = inputs.read()
wordcloud = WordCloud(
    mask=mask,
    width=3000,
    height=3000,
    background_color="white",
    margin=1,
    colormap='inferno',  # colour scheme for the words
    max_words=300,
    min_font_size=10,
    max_font_size=None,
    repeat=False,
    font_path="FZKaTong-M19S.ttf",  # font file used to draw the words
).generate(mytext)
wordcloud.to_file('红楼梦精彩点评词云.jpg')
plt.figure(dpi=150)  # adjust dpi to enlarge or shrink the displayed figure
plt.imshow(wordcloud)
plt.axis('off')  # hide the axes around the image
plt.show()
文学类作品榜单可视化
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Line chart of the recommendation-value column from the saved ranking.
sns.set_style('darkgrid')# dark grid background
# The font override is applied after sns.set_style.
# NOTE(review): presumably the order matters because set_style may reset
# font settings — confirm before reordering.
plt.rcParams['font.sans-serif']=["SimHei"]# use SimHei so Chinese text renders instead of placeholder boxes
df1=pd.read_excel("微信读书文学类书籍榜单.xlsx")# load the saved ranking workbook
# Plot the recommendation-value column; it is wrapped in a one-element list
# before being handed to lineplot.
dfs=[df1['推荐值(单位:%)']]
sns.lineplot(data=dfs)
plt.title("文学类榜前20本文学类作品推荐值情况")
# Save before show().
plt.savefig("文学类榜前20本文学类作品推荐值情况折线图.png")
plt.show()
# Scatter plot of daily reader count against recommendation value.
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
sns.set_style('darkgrid')
plt.rcParams['font.sans-serif']=["SimHei"]# use SimHei so Chinese text renders instead of placeholder boxes
# Load the saved ranking and plot the daily-reader column (x) against the
# recommendation-value column (y) with red markers.
data=pd.read_excel('微信读书文学类书籍榜单.xlsx')
sns.relplot(x='单日阅读人数(单位:人)',y='推荐值(单位:%)',data=data,color='r')
plt.title("文学类榜前20本文学类作品单日阅读人数与推荐值散点图")
# Save before show().
plt.savefig("文学类榜前20本文学类作品单日阅读人数与推荐值散点图.png")
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Pie chart: each book's share of the summed daily reader counts.
df1 = pd.read_excel('微信读书文学类书籍榜单.xlsx')
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.figure(figsize=(8, 6))  # canvas size
labels = df1['书名']
sizes = df1['单日阅读人数(单位:人)']
# Fill colours for the wedges.
colors = ['red', 'yellow', 'slateblue', 'green', 'magenta', 'cyan',
          'darkorange', 'lawngreen', 'pink', 'gold']
# Keyword arguments are gathered in one mapping so each option can carry its
# own note and the call itself stays short.
_pie_options = {
    'labels': labels,                              # wedge labels
    'colors': colors,                              # custom fill colours
    'labeldistance': 1.02,                         # label distance from the centre
    'autopct': '%.1f%%',                           # percentage text, one decimal place
    'startangle': 90,                              # angle of the first wedge
    'radius': 0.5,                                 # pie radius
    'center': (0.2, 0.2),                          # pie centre
    'textprops': {'fontsize': 9, 'color': 'k'},    # label text properties
    'pctdistance': 0.6,                            # percentage text distance from the centre
}
plt.pie(sizes, **_pie_options)
# Equal axis scaling keeps the pie circular.
plt.axis('equal')
plt.title("微信读书文学类榜单前20本作品单日阅读人数占比情况分析")
plt.savefig("单日阅读人数占比情况分析图.png")
plt.show()
#绘制分裂饼图
import pandas as pd
import matplotlib.pyplot as plt
# Exploded pie chart: each book's share of the summed recommendation values,
# with the first wedge pulled out of the pie.
df1 = pd.read_excel('微信读书文学类书籍榜单.xlsx')
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.figure(figsize=(8, 6))  # canvas size
labels = df1['书名']
sizes = df1['推荐值(单位:%)']
# plt.pie requires `explode` to have exactly one entry per wedge.  Deriving it
# from the data keeps the chart working when the sheet does not hold exactly
# 20 rows; only the first wedge is offset.
explode = [0.1 if position == 0 else 0 for position in range(len(sizes))]
# Fill colours for the wedges.
colors = ['red', 'yellow', 'slateblue', 'green', 'magenta', 'cyan',
          'darkorange', 'lawngreen', 'pink', 'gold']
plt.pie(sizes,                                   # data to plot
        explode=explode,                         # per-wedge offset from the centre
        labels=labels,                           # wedge labels
        colors=colors,                           # custom fill colours
        shadow=True,                             # draw a shadow beneath the pie
        labeldistance=1.02,                      # label distance from the centre
        autopct='%.1f%%',                        # percentage text, one decimal place
        startangle=90,                           # angle of the first wedge
        radius=0.5,                              # pie radius
        center=(0.2, 0.2),                       # pie centre
        textprops={'fontsize': 9, 'color': 'k'}, # label text properties
        pctdistance=0.6)                         # percentage text distance from the centre
# Equal axis scaling keeps the pie circular.
plt.axis('equal')
plt.title("微信读书文学类榜单前20本作品推荐率占比情况分析")
plt.savefig("单日作品推荐率占比情况分析图.png")
plt.show()