实验内容:基于给定的图书出版数据(book_list.csv)完成以下任务:
1. 根据数据分析和可视化需求对给定的图书数据进行预处理,并将处理后的数据保存到新的数据文件processedBookInfo.csv中。
实验代码:
import csv
# Task 1: preprocess book_list.csv into processedBookInfo.csv.
#
# Column 5 of every input row is split on '/' into three parts that are used
# as publisher, publication date and price. Rows that pass the filter are
# written out with the publisher name, the 4-character publication year, the
# bare price, the currency unit and the price expressed in RMB.
#
# NOTE(review): both files are opened with the platform default encoding, as
# before -- confirm that it matches the encoding of book_list.csv.

# Fixed conversion factor applied to prices quoted in US dollars.
USD_TO_RMB_RATE = 7

# `with` guarantees both files are closed even if a row fails to parse.
with open("book_list.csv", 'r') as r_file, \
        open("processedBookInfo.csv", 'w', newline='') as w_file:
    bookInfo = csv.reader(r_file)
    csvwriter = csv.writer(w_file)
    csvwriter.writerow(['序号', '书名', '评分', '评价人数', '作者', '出版社', '出版年份', '价格', '货币单位', '人民币价格'])

    for line in bookInfo:
        # csv.reader yields [] for blank lines; such rows have no column 5.
        if len(line) < 6:
            continue

        pubInfo = line[5]
        publishInfo = pubInfo.split('/')
        # Keep the original length filter, and additionally require the three
        # '/'-separated parts that are indexed below.
        if len(pubInfo) <= 8 or len(publishInfo) < 3:
            continue

        # The first 6 characters of the publisher part are dropped
        # (presumably a fixed label prefix -- TODO confirm against the data).
        publisherName = publishInfo[0][6:].strip()
        # Only the first 4 characters of the date are kept as the year.
        publishYear = publishInfo[1].strip()[:4]

        # Strip once up front so prices without a recognised currency marker
        # are written without surrounding whitespace as well.
        bookPrice = publishInfo[2].strip()
        currencyUnit = "元"  # default currency unit
        RMBPrice = bookPrice
        if "元" in bookPrice or "CNY" in bookPrice:
            bookPrice = bookPrice.replace("元", "").strip()
            bookPrice = bookPrice.replace("CNY", "").strip()
            RMBPrice = bookPrice
            currencyUnit = "元"
        if "$" in bookPrice or "USD" in bookPrice:
            bookPrice = bookPrice.replace("$", "").strip()
            bookPrice = bookPrice.replace("USD", "").strip()
            currencyUnit = "美元"
            RMBPrice = float(bookPrice) * USD_TO_RMB_RATE

        # Progress output (the author column is not echoed).
        print([line[0], line[1], line[2], line[3], publisherName, publishYear, bookPrice, currencyUnit])
        csvwriter.writerow([line[0], line[1], line[2], line[3], line[4], publisherName, publishYear, bookPrice, currencyUnit, RMBPrice])
运行结果:
2. 使用折线图对清华大学出版社、电子工业出版社和人民邮电出版社在[2005,2015]年间每年出版图书量的变化情况进行分析。
实验代码:
import csv
import matplotlib.pyplot as plt
# Task 2: line chart of the number of books published per year in 2005-2015
# by three publishers, read from processedBookInfo.csv
# (column 5 = publisher, column 6 = publication year).

# Use a font with CJK glyphs so the Chinese labels render correctly.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = [u'SimHei']

publishers = ['清华大学出版社', '电子工业出版社', '人民邮电出版社']
years = [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]

# Pre-fill every (publisher, year) pair with 0 so that each series has exactly
# one y value per x value even when a publisher released nothing in a year.
yearlyCount = {name: {y: 0 for y in years} for name in publishers}

# Read everything first so the file is closed before the window is shown.
with open("processedBookInfo.csv") as file:
    cont = csv.reader(file)
    next(cont, None)  # skip the header row (tolerates an empty file)
    for line in cont:
        if len(line) < 7:
            continue  # blank or truncated row
        publisher = line[5]
        # Exact match: the previous `publisher in '<name>'` test was a
        # reversed substring check, so an empty publisher or a fragment such
        # as '出版社' was counted for all three publishers.
        if publisher not in yearlyCount:
            continue
        try:
            pubYear = int(line[6])
        except ValueError:
            continue  # year column is not numeric
        if pubYear in yearlyCount[publisher]:
            yearlyCount[publisher][pubYear] += 1

# One style per publisher, in the same order as `publishers`.
lineStyles = [
    {'color': 'r', 'marker': '*'},
    {'color': 'b', 'marker': 'o'},
    {'color': 'c', 'marker': 'v', 'linestyle': '--'},
]

peak = 0
for name, style in zip(publishers, lineStyles):
    # Build the series in year order as a plain list.
    series = [yearlyCount[name][y] for y in years]
    peak = max(peak, max(series))
    plt.plot(years, series, **style)

plt.xlabel('出版年份', fontsize=14)
plt.ylabel('出版书籍数量', fontsize=14)
plt.title('出版社出版量统计', fontsize=14)
if peak <= 15:
    # Integer ticks 0..15, as long as they cover the data.
    plt.yticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], fontproperties='SimHei', fontsize=14)
else:
    # Larger counts: keep matplotlib's automatic ticks, only style the labels.
    plt.yticks(fontproperties='SimHei', fontsize=14)
plt.legend(['清华大学出版社', '电子工业出版社', '人民邮电出版社'], fontsize=12)
plt.show()
运行结果:
3. 对出版量前五的出版社的优秀图书的分布情况进行分析,要求使用气泡图:x,y和size分别为出版社、评分和评价数量;优秀图书的评价标准:评分在8分以上,评价数量在100以上。
实验代码:
import csv
import matplotlib.pyplot as plt
import numpy as np
# Task 3: bubble chart of the "excellent" books of the five publishers with
# the most titles in processedBookInfo.csv.
# x = publisher, y = rating, bubble size = number of ratings.
# A book counts as excellent when its rating is above 8 and it has more than
# 100 ratings.

# Use a font with CJK glyphs so the Chinese labels render correctly.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

# Bubble sizes are rescaled linearly into [SIZE_MIN, SIZE_MAX].
# With SIZE_MIN = 0 the book with the fewest ratings is drawn with size 0.
SIZE_MIN = 0
SIZE_MAX = 100

# publisher -> {'pubNum': number of titles, 'goodPub': [(rating, ratings count), ...]}
publisherCount = {}
with open("processedBookInfo.csv") as file:
    cont = csv.reader(file)
    next(cont, None)  # skip the header row (tolerates an empty file)
    for line in cont:
        if len(line) < 6:
            continue  # blank or truncated row
        publisher = line[5]
        rate = float(line[2])
        commentNumber = int(line[3])
        stats = publisherCount.setdefault(publisher, {'pubNum': 0, 'goodPub': []})
        stats['pubNum'] += 1
        if rate > 8 and commentNumber > 100:
            stats['goodPub'].append((rate, commentNumber))

# Sort (publisher, stats) pairs by title count, largest first, and keep five.
publisherCount = sorted(publisherCount.items(), key=lambda x: x[1]['pubNum'], reverse=True)
publisherCount = publisherCount[:5]

# Flatten into three parallel lists, one entry per excellent book.
bookPublisher = []       # publisher name
bookRate = []            # rating
bookCommentsNumber = []  # number of ratings
for data in publisherCount:
    for info in data[1]['goodPub']:  # info is one (rating, ratings count) tuple
        bookPublisher.append(data[0])
        bookRate.append(info[0])
        bookCommentsNumber.append(info[1])

bubbleSize = np.array(bookCommentsNumber)
if bookCommentsNumber:
    highest = max(bookCommentsNumber)
    lowest = min(bookCommentsNumber)
    print("最大评论数:", highest)
    print("最小评论数:", lowest)
    # Min-max normalisation [L, R] -> [l, r]:
    #     v = (x - L) * (r - l) / (R - L) + l
    # L and R come from the data rather than from constants copied from one
    # particular run.
    if highest > lowest:
        bubbleSize = np.divide((bubbleSize - lowest) * (SIZE_MAX - SIZE_MIN), highest - lowest) + SIZE_MIN
    else:
        # All books have the same count: draw them all at the largest size.
        bubbleSize = np.full(len(bookCommentsNumber), float(SIZE_MAX))

plt.figure(figsize=(8, 6))
plt.scatter(bookPublisher, bookRate, c='r', s=bubbleSize)
plt.xlabel("出版社")
plt.ylabel("图书评分")
plt.show()
运行结果:
4. 使用饼状图对各图书评分区间的图书数量分布情况进行统计:[6,7),[7,8),[8,9),[9,10),要求显示图例并对比例最高的部分进行突出显示。
实验代码:
# 使用饼状图对各图书评分区间的图书数量分布情况进行统计:
# [6,7),[7,8),[8,9),[9,10)
# 要求显示图例并对比例最高的部分进行突出显示
import csv
import matplotlib.pyplot as plt
# Task 4: pie chart of how many books in processedBookInfo.csv fall into each
# rating interval [6,7), [7,8), [8,9), [9,10); the largest slice is pulled
# out of the pie and a legend is shown.

# Use a font with CJK glyphs so the Chinese labels render correctly.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

label = ['[6,7)', '[7,8)', '[8,9)', '[9,10)']  # slice labels
color = ['greenyellow', 'lightcyan', 'lightcoral', 'moccasin']  # slice colours
size = [0, 0, 0, 0]  # number of books per interval, same order as `label`

with open("processedBookInfo.csv") as file:
    cont = csv.reader(file)
    next(cont, None)  # skip the header row (tolerates an empty file)
    for line in cont:
        if len(line) < 3:
            continue  # blank or truncated row
        grade = float(line[2])
        # Only ratings inside [6, 10) are counted. The previous elif chain
        # had no lower bound, so every rating below 6 landed in [7,8).
        if 6 <= grade < 10:
            size[int(grade) - 6] += 1

# Offset the slice with the highest count (the first one on a tie) instead of
# assuming it is always [8,9).
largest = size.index(max(size))
explode = tuple(0.1 if i == largest else 0 for i in range(len(size)))

# autopct prints each slice's percentage inside the pie.
pie = plt.pie(size, colors=color, explode=explode, labels=label, autopct='%1.1f%%')
for font in pie[1]:  # pie[1]: label texts outside the pie
    font.set_size(10)
for digit in pie[2]:  # pie[2]: percentage texts inside the pie
    digit.set_size(12)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title(u'各图书评分区间的图书数量', fontsize=12)
# `prop` is given as a dict of font properties, which every matplotlib
# version accepts; a bare string is only understood by newer releases.
plt.legend(bbox_to_anchor=(0.82, 1), prop={'family': 'SimHei'})
plt.show()
运行结果: