数据分析基础笔记（pandas、matplotlib、numpy）

最新推荐文章于 2023-09-06 22:40:42 发布

小虎随笔

最新推荐文章于 2023-09-06 22:40:42 发布

阅读量343

点赞数 1

本文链接：https://blog.csdn.net/clygo9/article/details/117136233

版权

一、matplotlib

作用：将数据可视化

示例1：

假设一天中每隔两个小时(range(2, 26, 2))的气温分别是[15, 13, 14.5, 17, 20, 25, 26, 26, 27, 22, 18, 15]

pyplot用来画图。

import matplotlib.pyplot as plt


# Canvas of 20 x 8 inches at 80 dots per inch.
fig = plt.figure(dpi=80, figsize=(20, 8))

# One temperature reading every two hours; x holds the hours 2, 4, ..., 24.
y = [15, 13, 14.5, 17, 20, 25, 26, 26, 24, 22, 18, 15]
x = list(range(2, 26, 2))

plt.plot(x, y)

# Tick placement. Only the first variant is active; the others are kept as
# reference notes:
#   plt.xticks(x)                            one tick per sampled hour
#   plt.xticks(x[::2])                       every second sampled hour
#   plt.xticks(range(25, 50))                integer ticks from 25 to 49
#   [i / 2 for i in range(2, 49)]            half steps 1, 1.5, 2, ..., 24
#   plt.yticks(range(min(y), max(y) + 1))    one y tick per whole degree
plt.xticks(x)

# Save the figure as a PNG next to the script, then display it.
plt.savefig("./sig_size.png")
plt.show()

结果：

作业：

如果列表a表示从10点到12点的每一分钟的气温，如何绘制折线图观察每分钟气温的变化情况呢？

import matplotlib.pyplot as plt
import random
import matplotlib

# Select a font with Chinese glyphs so the Chinese labels below can render.
matplotlib.rc("font", weight="bold", family='MicroSoft YaHei')

# Figure size in inches and dpi (number of dots per inch).
fig = plt.figure(dpi=80, figsize=(20, 8))

# 120 minutes from 10:00 to 12:00, each with a random integer
# temperature between 20 and 35 (both ends included).
x = range(120)
y = [random.randint(20, 35) for _ in x]

plt.plot(x, y)

# Statements run in order: the first line builds all 60 "10点" labels, and
# only afterwards does += append the 60 "11点" labels to that same list.
_xtick_labels = ["10点{}分".format(minute) for minute in range(60)]
_xtick_labels += ["11点{}分".format(minute) for minute in range(60)]

# Tick positions (1st argument) and tick labels (2nd argument) are paired
# one-to-one, so both are thinned with the same step of 3.
plt.yticks(range(20, 36))
plt.xticks(x[::3], _xtick_labels[::3], rotation=270)

# Chart title and axis captions.
plt.title("10点到12点的温度变化")
plt.xlabel("时间")
plt.ylabel("温度")

plt.show()

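Q：这里的 += 为什么是先生成完前 60 个（10 点）标签，再生成后 60 个（11 点）标签？
A：Python 按顺序逐条执行语句：第一条语句先完整地生成 10 点的 60 个标签；第二条语句先算出右边 11 点的 60 个标签，再由 += 把它们依次追加到同一个列表的末尾。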
# The first statement finishes building the 60 "10点" labels before the
# second one runs; += then appends the 60 "11点" labels to the same list.
_xtick_labels = ["10点{}分".format(minute) for minute in range(60)]
_xtick_labels += ["11点{}分".format(minute) for minute in range(60)]

作业：

假设大家在30岁的时候，根据自己的实际情况，统计出来了从11岁到30岁每年交的女朋友数量，如列表a，请绘制出该数据的折线图，以便分析自己每年交女朋友的数量走势。

a = [1,0,1,1,2,4,3,2,3,4,4,5,6,5,4,3,3,1,1,1]

要求：

y轴表示个数

x轴表示年龄

import matplotlib.pyplot as plt
import matplotlib

# Select a font with Chinese glyphs so the legend and tick labels render.
matplotlib.rc("font", weight="bold", family='MicroSoft YaHei')

plt.figure(dpi=80, figsize=(20, 8))

# Ages 11..30 on the x axis, with one count per age for each of two series.
x = range(11, 31)
y = [1, 0, 1, 1, 2, 4, 3, 2, 3, 4, 4, 5, 6, 5, 4, 3, 3, 1, 1, 1]
y2 = [1, 0, 3, 1, 2, 2, 3, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]

xtick_labels = ["{}岁".format(age) for age in x]

# Custom line style for the first series: red, dashed, thick, half transparent.
own_style = dict(color='r', linestyle='--', linewidth=5, alpha=0.5)
plt.plot(x, y, label="自己", **own_style)
plt.plot(x, y2, label="同桌")
plt.xticks(x, xtick_labels)

# Draw the grid; alpha is its transparency. Changing the x/y ticks is the
# indirect way to change how dense the grid is.
plt.grid(alpha=0.4)

# Show the label= of each line in a legend; loc chooses its position.
plt.legend(loc='upper left')

plt.show()

day1总结:

绘制散点图：

from matplotlib import pyplot as plt
from matplotlib import font_manager

plt.figure(dpi=80, figsize=(20, 8))

# Two series of 31 values each.
# NOTE(review): the names suggest daily readings for March and October, but
# the notes do not say so - confirm.
y_3 = [11, 17, 16, 11, 12, 11, 12, 6, 6, 7, 8, 9, 12, 15, 14, 17,
       18, 21, 16, 17, 20, 14, 15, 15, 15, 19, 21, 22, 22, 22, 23]
y_10 = [26, 26, 28, 19, 21, 17, 16, 19, 18, 20, 20, 19, 22, 23, 17, 20,
        21, 20, 22, 15, 11, 15, 5, 13, 17, 10, 11, 13, 12, 13, 6]

# Both series share the same x positions 1..31.
x = range(1, 32)

for series in (y_3, y_10):
    plt.scatter(x, series)

plt.show()

绘制条形图（plt.bar 画的是竖直条形图，横向条形图需用 plt.barh）：

from matplotlib import pyplot as plt
from matplotlib import font_manager
import matplotlib

# Select a font with Chinese glyphs so the movie titles on the x axis render.
matplotlib.rc("font", weight="bold", family="MicroSoft YaHei")

plt.figure(dpi=80, figsize=(20, 8))

# Movie titles and one numeric value per title.
# NOTE(review): the unit of the values is not stated in the notes.
a = [
    "战狼2", "速度与激情8", "功夫瑜伽", "西游伏妖篇",
    "变形金刚5：最后的骑士", "摔跤吧！爸爸", "加勒比海盗5：死无对证",
    "金刚：骷髅岛", "极限特工：终极回归", "生化危机6：终章",
    "乘风破浪", "神偷奶爸3", "智取威虎山", "大闹天竺",
    "金刚狼3：殊死一战", "蜘蛛侠：英雄归来", "悟空传",
    "银河护卫队2", "情圣", "新木乃伊",
]
b = [
    56.01, 26.94, 17.53, 16.49, 15.45, 12.96, 11.8, 11.61, 11.28, 11.12,
    10.49, 10.3, 8.75, 7.55, 7.32, 6.99, 6.88, 6.86, 6.58, 6.23,
]

# One bar per title at positions 0..len(a)-1; the titles are rotated by
# 90 degrees when used as tick labels.
positions = range(len(a))
plt.bar(positions, b)
plt.xticks(positions, a, rotation=90)

plt.show()

条形图：

from matplotlib import pyplot as plt
import matplotlib

# Select a font with Chinese glyphs so the movie titles on the x axis render.
matplotlib.rc("font", family="MicroSoft YaHei", weight="bold")

plt.figure(figsize=(20, 8), dpi=80)

# Four movies and three value series, one value per movie in each series.
# NOTE(review): the notes do not say what the suffixes 14/15/16 stand for.
a = ["猩球崛起3：终极之战", "敦刻尔克", "蜘蛛侠：英雄归来", "战狼2"]
b_16 = [15746, 312, 4497, 319]
b_15 = [12357, 156, 2045, 168]
b_14 = [2358, 399, 2358, 362]

bar_width = 0.2

# x positions of the three bars inside each movie's group: each series is
# shifted right by one bar width so the bars sit side by side.
x_14 = list(range(len(a)))
x_15 = [i + bar_width for i in x_14]
x_16 = [i + bar_width * 2 for i in x_14]

# Use the precomputed x_14 for the first series, consistent with x_15 / x_16.
plt.bar(x_14, b_14, width=bar_width)
plt.bar(x_15, b_15, width=bar_width)
plt.bar(x_16, b_16, width=bar_width)

# Label every group with its movie title, placed under the middle bar.
# Without this the axis only shows bare numbers and the titles in `a`
# (and the Chinese font configured above) are never used.
plt.xticks(x_15, a)

plt.show()

直方图：

# Histogram-style bar chart: 12 bars at integer positions 0..11, each of
# width 1, so neighbouring bars touch.
# NOTE(review): `quantity` is not defined anywhere in these notes; it
# presumably holds 12 pre-counted bin values - confirm.
plt.bar(range(12), quantity, width = 1)

二、numpy

创建数组：

import numpy as np
# Build an array from an explicit list of values.
a = np.array([12, 32])

# Build an array of consecutive integers 0, 1, ..., 9.
b = np.arange(10)

# Build an array from a built-in range object (0, 1, ..., 14).
c = np.array(range(15))

读取数据：

三、pandas

import pandas as  pd
import numpy as np

# Column data for a 10-row demo frame.
data_dict = {
    'color': ['black', 'white'] * 5,
    'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
    # freq is the date offset between consecutive entries ('W' = weekly).
    'date': pd.date_range('1/1/2019', periods=10, freq='W'),
    # 10 samples drawn from the standard normal distribution.
    'feature_1': np.random.randn(10),
    # 10 samples drawn from a normal distribution with mean 0.5 and std 2.
    'feature_2': np.random.normal(0.5, 2, 10),
}

# One list per index level, one entry per row.
array = [
    list('ABBBCABACC'),
    ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA'],
]

# Two-level row index; `names` gives the level titles.
index = pd.MultiIndex.from_arrays(array, names=['class', 'country'])

# Frame whose rows are addressed by the composite (class, country) index.
data_df = pd.DataFrame(data_dict, index=index)
print(data_df)

3.2 分组聚合

import pandas as  pd
import numpy as np

# Column data for a 10-row demo frame.
data_dict = {
    'color': ['black', 'white'] * 5,
    'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
    # freq is the date offset between consecutive entries ('W' = weekly).
    'date': pd.date_range('1/1/2019', periods=10, freq='W'),
    # 10 samples drawn from the standard normal distribution.
    'feature_1': np.random.randn(10),
    # 10 samples drawn from a normal distribution with mean 0.5 and std 2.
    'feature_2': np.random.normal(0.5, 2, 10),
}

# One list per index level, one entry per row.
array = [
    list('ABBBCABACC'),
    ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA'],
]

# Two-level row index; `names` gives the level titles.
index = pd.MultiIndex.from_arrays(array, names=['class', 'country'])

# Frame whose rows are addressed by the composite (class, country) index.
data_df = pd.DataFrame(data_dict, index=index)
print(data_df)

# Group the rows by a single column.
group_1 = data_df.groupby('size')
# Inspect the groups, or fetch one group by its key:
#   for i in list(group_1):
#       print(i)
#   print(group_1.get_group('M'))

# Group the rows by several columns at once.
group_2 = data_df.groupby(['size', 'color'])
#   for i in list(group_2):
#       print(i)

# Number of rows in each group:
#   print(group_1.size())
#   print(group_2.size())

# Grouping by a function: columns whose name contains 'feature' form one
# group and all other columns form another. (groupby(..., axis=1) is
# deprecated in recent pandas releases.)
#   def get_letter_type(letter):
#       if 'feature' in letter:
#           return 'feature'
#       else:
#           return 'other'
#
#   for i in list(data_df.groupby(get_letter_type, axis = 1)):
#       print(i)

# Grouping by both index levels:
#   for i in list(data_df.groupby(level = [0, 1])):
#       print(i)

# A different aggregation per column:
#   print(group_2.agg({'feature_1': np.max, 'feature_2': np.mean}))

# transform with a custom function (max minus min within each group):
#   data_range = lambda x : x.max() - x.min()
#   print(data_df.groupby('size').transform(data_range))

# Blank out two feature cells, then fill gaps with the mean of the group:
#   data_df.iloc[1, 3:5] = np.nan
#   f = lambda x : x.fillna(x.mean())
#   df_trans = group_1.transform(f)
#   print(df_trans)

# Author's note: rolling is not understood yet. rolling(3) takes a sliding
# window of 3 consecutive rows inside each group and mean() averages it.
#   print(data_df.groupby('color').rolling(3).feature_1.mean())

3.3 时间序列

暂不看

小虎随笔

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
1
评论
数据分析基础笔记（pandas、matplotlib、numpy）

一、matplotlib作用：将数据可视化示例1：假设一天中每隔两个小时(range(2, 26, 2))的气温分别是[15, 13, 14.5, 17, 20, 25, 26, 26, 27, 22, 18, 15]pyplot用来画图。
复制链接

扫一扫