实验2 探索性数据分析和可视化

对方正在长头发发

已于 2024-05-19 10:20:31 修改

阅读量318

点赞数 7

分类专栏：数据科学导论　文章标签：python

于 2024-05-19 10:19:36 首次发布

本文链接：https://blog.csdn.net/m0_73968650/article/details/139038975

版权

数据科学导论专栏收录该内容

3 篇文章 0 订阅

订阅专栏

实验内容：基于给定的图书出版数据完成以下任务（book_list.csv）：

1. 根据数据分析和可视化需求对给定的图书数据进行预处理，构建新的数据文件processedBookInfo.csv对处理后的数据进行保存。

实验代码：

import csv

# Clean the raw catalogue (book_list.csv) into processedBookInfo.csv.
#
# Column 5 of every input row is split on '/' into publisher, publication
# date and price; the price is further split into a number, a currency unit
# and an RMB amount.

# Fixed exchange rate used to convert US-dollar prices to RMB.
USD_TO_RMB = 7

# The with-statement closes both files even when a row raises part-way
# through; the input is opened first so a missing input file does not leave
# an empty output file behind.
with open("book_list.csv", 'r') as r_file, \
        open("processedBookInfo.csv", 'w', newline='') as w_file:
    csvwriter = csv.writer(w_file)
    csvwriter.writerow(['序号', '书名', '评分', '评价人数', '作者', '出版社', '出版年份', '价格', '货币单位', '人民币价格'])

    for line in csv.reader(r_file):
        # Rows that do not even have a sixth column cannot be parsed.
        if len(line) < 6:
            continue

        pubInfo = line[5]
        # Same length filter as before: very short publication strings are
        # dropped (presumably this also drops the header row - confirm
        # against book_list.csv).
        if len(pubInfo) <= 8:
            continue

        publishInfo = pubInfo.split('/')
        # Publisher, date and price are all required; skip rows that would
        # otherwise fail with an IndexError below.
        if len(publishInfo) < 3:
            continue

        # Publisher name: the first six characters of the first field are
        # discarded (presumably a fixed label prefix - confirm against the
        # data file).
        publisherName = publishInfo[0][6:].strip()

        # Publication year: first four characters of the date field.
        publishYear = publishInfo[1].strip()[:4]

        # Price: remove the RMB markers and surrounding whitespace; the unit
        # defaults to RMB when no currency marker is present.
        bookPrice = publishInfo[2].replace("元", "").replace("CNY", "").strip()
        currencyUnit = "元"
        RMBPrice = bookPrice

        if "$" in bookPrice or "USD" in bookPrice:
            bookPrice = bookPrice.replace("$", "").replace("USD", "").strip()
            currencyUnit = "美元"
            try:
                RMBPrice = float(bookPrice) * USD_TO_RMB
            except ValueError:
                # Keep the row but leave the converted price blank rather
                # than aborting the whole run on one malformed price.
                RMBPrice = ""

        print([line[0], line[1], line[2], line[3], publisherName, publishYear, bookPrice, currencyUnit])

        # Write the cleaned row.
        csvwriter.writerow([line[0], line[1], line[2], line[3], line[4],
                            publisherName, publishYear, bookPrice, currencyUnit, RMBPrice])

运行结果：

2. 使用折线图对清华大学出版社、电子工业出版社和人民邮电出版社在[2005,2015]年间每年出版图书量的变化情况进行分析。

实验代码：

import csv
import matplotlib.pyplot as plt

# Line chart: books published per year in 2005-2015 by three publishers.

# Use a font with Chinese glyphs so the labels do not render as boxes.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = [u'SimHei']

publishers = ['清华大学出版社', '电子工业出版社', '人民邮电出版社']
years = list(range(2005, 2016))

# Pre-fill every (publisher, year) cell with 0 so each series always has one
# value per year, even when a publisher released nothing that year.
# Years are keyed as strings because that is how they appear in the CSV.
counts = {name: {str(y): 0 for y in years} for name in publishers}

with open("processedBookInfo.csv") as file:
    cont = csv.reader(file)
    next(cont, None)  # skip the header row
    for line in cont:
        publisher = line[5]
        pubYear = line[6]
        # Exact publisher match: the previous `publisher in '<name>'` test
        # was a substring check in the wrong direction, so an empty or
        # partial publisher string was counted for all three houses.
        # Looking the year up in the pre-filled dict also skips years that
        # are out of range or not numeric instead of raising.
        if publisher in counts and pubYear in counts[publisher]:
            counts[publisher][pubYear] += 1

# One style per publisher, in the same order as `publishers`.
styles = [
    {'color': 'r', 'marker': '*'},
    {'color': 'b', 'marker': 'o'},
    {'color': 'c', 'marker': 'v', 'linestyle': '--'},
]

# Draw the three series in year order.
for name, style in zip(publishers, styles):
    yearly = [counts[name][str(y)] for y in years]
    plt.plot(years, yearly, **style)

plt.xlabel('出版年份', fontsize=14)
plt.ylabel('出版书籍数量', fontsize=14)
plt.title('出版社出版量统计', fontsize=14)
plt.yticks(list(range(16)), fontproperties='SimHei', fontsize=14)
plt.legend(publishers, fontsize=12)
plt.show()

运行结果：

3. 对出版量前五的出版社的优秀图书的分布情况进行分析，要求使用气泡图：x,y和size分别为出版社、评分和评价数量；优秀图书的评价标准：评分在8分以上，评价数量在100以上。

实验代码：

import csv
import matplotlib.pyplot as plt
import numpy as np

# Bubble chart of the "excellent" books of the five most prolific publishers:
# x = publisher, y = rating, bubble size = number of ratings.
# A book is excellent when its rating is above 8 and it has more than 100
# ratings.

# Use a font with Chinese glyphs so the labels do not render as boxes.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

# publisher -> {'pubNum': total books, 'goodPub': [(rating, comment count), ...]}
publisherCount = {}
with open("processedBookInfo.csv") as file:
    cont = csv.reader(file)
    next(cont, None)  # skip the header row
    for line in cont:
        publisher = line[5]
        rating = float(line[2])
        commentCount = int(line[3])
        stats = publisherCount.setdefault(publisher, {'pubNum': 0, 'goodPub': []})
        stats['pubNum'] += 1
        if rating > 8 and commentCount > 100:
            stats['goodPub'].append((rating, commentCount))

# Keep the five publishers with the most books, as (name, stats) pairs.
topPublishers = sorted(publisherCount.items(), key=lambda x: x[1]['pubNum'], reverse=True)[:5]

# Flatten into the three parallel sequences that scatter() expects.
bookPublisher = []  # x: publisher name
bookRate = []  # y: rating
bookCommentsNumber = []  # size: comment count
for name, stats in topPublishers:
    # Each entry of 'goodPub' is one (rating, comment count) tuple.
    for rating, commentCount in stats['goodPub']:
        bookPublisher.append(name)
        bookRate.append(rating)
        bookCommentsNumber.append(commentCount)

if bookCommentsNumber:
    largest = max(bookCommentsNumber)
    smallest = min(bookCommentsNumber)
    print("最大评论数：", largest)
    print("最小评论数：", smallest)

    # Min-max scale the comment counts to [0, 100] for use as marker areas:
    #     v = (x - min) * 100 / (max - min)
    # The bounds come from the data instead of the previously hard-coded
    # 102 and 22812, so the scaling stays correct if the CSV changes.
    # Note the book with the fewest comments gets size 0 and is not drawn.
    span = largest - smallest
    rawCounts = np.array(bookCommentsNumber)
    if span:
        bubbleSizes = (rawCounts - smallest) * 100 / span
    else:
        # All counts are equal: use one uniform size instead of dividing by 0.
        bubbleSizes = np.full(len(rawCounts), 100.0)
else:
    # No excellent books at all: draw an empty chart rather than crash in max().
    bubbleSizes = np.array([])

plt.figure(figsize=(8, 6))
plt.scatter(bookPublisher, bookRate, c='r', s=bubbleSizes)
plt.xlabel("出版社")
plt.ylabel("图书评分")
plt.show()

运行结果：

4. 使用饼状图对各图书评分区间的图书数量分布情况进行统计：[6,7)、[7,8)、[8,9)、[9,10)，要求显示图例并对比例最高的部分进行突出显示。

实验代码：

# 使用饼状图对各图书评分区间的图书数量分布情况进行统计：
# [6，7)，[7,8)，[8,9)，[9,10)
# 要求显示图例并对比例最高的部分进行突出显示

import csv
import matplotlib.pyplot as plt

# Pie chart of how many books fall into each rating interval
# [6,7), [7,8), [8,9), [9,10), with the largest slice pulled out.

# Use a font with Chinese glyphs so the labels do not render as boxes.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

label = ['[6,7)', '[7,8)', '[8,9)', '[9,10)']  # slice labels
color = ['greenyellow', 'lightcyan', 'lightcoral', 'moccasin']  # slice colours
size = [0, 0, 0, 0]  # number of books per interval, same order as `label`

with open("processedBookInfo.csv") as file:
    cont = csv.reader(file)
    next(cont, None)  # skip the header row
    for line in cont:
        grade = float(line[2])
        # Only ratings inside [6, 10) are counted. The previous elif chain
        # had no lower bound on its second branch, so every rating below 6
        # was counted as [7,8).
        if 6 <= grade < 10:
            size[int(grade) - 6] += 1

# Pull out whichever slice is largest (the first one on a tie) instead of
# always the third.
largest = size.index(max(size))
explode = [0.1 if i == largest else 0 for i in range(len(size))]

# autopct prints each slice's percentage inside the pie.
patches, outerTexts, pctTexts = plt.pie(size, colors=color, explode=explode, labels=label, autopct='%1.1f%%')
for text in outerTexts:  # labels drawn outside the pie
    text.set_size(10)
for text in pctTexts:  # percentages drawn inside the pie
    text.set_size(12)

# Equal axis scaling keeps the pie circular.
plt.axis('equal')
plt.title(u'各图书评分区间的图书数量', fontsize=12)
# Legend; `prop` is given as a dict of font properties, the documented form.
plt.legend(bbox_to_anchor=(0.82, 1), prop={'family': 'SimHei'})
plt.show()

运行结果：