数据分析基础笔记(pandas、matplotlib、numpy)

一、matplotlib

作用:将数据可视化

示例1:

假设一天中每隔两个小时(range(2, 26, 2),共12个时刻)的气温分别是[15, 13, 14.5, 17, 20, 25, 26, 26, 24, 22, 18, 15]

pyplot用来画图。

import matplotlib.pyplot as plt


# Data: one temperature reading every two hours (2, 4, ..., 24 o'clock).
x = list(range(2, 26, 2))
y = [15, 13, 14.5, 17, 20, 25, 26, 26, 24, 22, 18, 15]

# Canvas: figsize is (width, height), dpi is the number of dots per inch.
fig = plt.figure(figsize=(20, 8), dpi=80)

# Draw the line chart.
plt.plot(x, y)

# Axis ticks. The active call puts one x tick on every data point.
# Alternatives noted while experimenting:
#   plt.xticks([i / 2 for i in range(2, 49)])  # half-step ticks 1, 1.5, 2, ..., 24
#   plt.xticks(range(25, 50))                  # ticks at 25..49
#   plt.xticks(x[::2])                         # only every second data point
#   plt.yticks(range(min(y), max(y) + 1))      # one y tick per integer from min(y) to max(y)
plt.xticks(x)

# Write the image to disk first, then open the interactive window.
plt.savefig("./sig_size.png")
plt.show()

结果:

 

作业:

如果列表a表示从10点到12点的每一分钟的气温,如何绘制折线图观察每分钟气温的变化情况呢?

import matplotlib.pyplot as plt
import random
import matplotlib

# Select the font used for all text so the Chinese labels below can be drawn
# (assumes the 'MicroSoft YaHei' font is available on this machine).
font_settings = {"family": 'MicroSoft YaHei', "weight": "bold"}
matplotlib.rc("font", **font_settings)

# Canvas size and resolution.
fig = plt.figure(figsize=(20, 8), dpi=80)

# 120 random temperatures (20..35 inclusive), one per minute from 10:00 to 11:59.
y = [random.randint(20, 35) for _ in range(120)]
x = range(120)

plt.plot(x, y)

# Build the 120 tick labels in the same order as x.
# List concatenation keeps order: the 60 labels for 10 o'clock come first and
# the 60 labels for 11 o'clock are appended after them, so label k belongs to x[k].
labels_10 = ["10点{}分".format(minute) for minute in range(60)]
labels_11 = ["11点{}分".format(minute) for minute in range(60)]
_xtick_labels = labels_10 + labels_11

# The tick positions (first argument) and tick texts (second argument) are
# paired element by element, so both are thinned with the same step.
step = 3
plt.xticks(x[::step], _xtick_labels[::step], rotation=270)
plt.yticks(range(20, 36))

# Axis captions and chart title.
plt.xlabel("时间")
plt.ylabel("温度")
plt.title("10点到12点的温度变化")

plt.show()

#指定图像尺寸和dpi(每英寸对应的点的数量)

# fig = plt.figure(figsize = (20, 8), dpi = 80)

 

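Q:这个+= 为什么前60跑完,再跑第二个11点的?
_xtick_labels = ["10点{}分".format(i) for i in range(60)]
_xtick_labels += ["11点{}分".format(i) for i in range(60)]

A:两行代码是按顺序执行的。第一行先生成"10点0分"到"10点59分"共60个标签;第二行的 += 对列表来说相当于 extend,会把右边新生成的"11点0分"到"11点59分"这60个标签依次追加到原列表末尾。最终得到120个按时间顺序排列的标签,正好与 x = range(120) 的120个点一一对应。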

 

作业:

假设大家在30岁的时候,根据自己的实际情况,统计出来了从11岁到30岁每年交的女朋友数量,如列表a,请绘制出该数据的折线图,以便分析自己每年交女朋友的数量走势。

a = [1,0,1,1,2,4,3,2,3,4,4,5,6,5,4,3,3,1,1,1]

要求:

y轴表示个数

x轴表示年龄

import matplotlib.pyplot as plt
import matplotlib

# Select the font used for all text so the Chinese labels can be drawn
# (assumes the 'MicroSoft YaHei' font is available on this machine).
matplotlib.rc("font", **{"family": 'MicroSoft YaHei', "weight": "bold"})

plt.figure(figsize=(20, 8), dpi=80)

# Ages 11..30 on the x axis; one count per age for each of the two series.
x = range(11, 31)
y = [1, 0, 1, 1, 2, 4, 3, 2, 3, 4, 4, 5, 6, 5, 4, 3, 3, 1, 1, 1]
y2 = [1, 0, 3, 1, 2, 2, 3, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# One text label per age, e.g. "11岁".
xtick_labels = ["{}岁".format(age) for age in x]

# First series with a custom line style: red, dashed, thick, half transparent.
own_style = dict(label="自己", color='r', linestyle='--', linewidth=5, alpha=0.5)
plt.plot(x, y, **own_style)
# Second series with the default style.
plt.plot(x, y2, label="同桌")

plt.xticks(x, xtick_labels)

# Grid lines follow the tick positions, so changing xticks/yticks also changes
# the grid density; alpha sets the transparency of the grid lines.
plt.grid(alpha=0.4)

# Show the legend built from the label= values above; loc chooses its position.
plt.legend(loc='upper left')

plt.show()

 

day1总结:

 

 

绘制散点图:

from matplotlib import pyplot as plt
from matplotlib import font_manager

plt.figure(figsize=(20, 8), dpi=80)

# Two series of 31 values each.
# NOTE(review): presumably daily temperatures for March (y_3) and October
# (y_10) -- the notes do not say; confirm against the course material.
y_3 = [11, 17, 16, 11, 12, 11, 12, 6, 6, 7, 8, 9, 12, 15, 14, 17, 18, 21, 16,
       17, 20, 14, 15, 15, 15, 19, 21, 22, 22, 22, 23]
y_10 = [26, 26, 28, 19, 21, 17, 16, 19, 18, 20, 20, 19, 22, 23, 17, 20, 21,
        20, 22, 15, 11, 15, 5, 13, 17, 10, 11, 13, 12, 13, 6]

# Positions 1..31, shared by both series.
x = range(1, 32)

# Draw one scatter layer per series, y_3 first and y_10 second.
for series in (y_3, y_10):
    plt.scatter(x, series)

plt.show()

 

 

绘制条形图(plt.bar 画的是竖直条形;横向条形图可用 plt.barh):

from matplotlib import pyplot as plt
from matplotlib import font_manager
import matplotlib

# Select the font used for all text so the Chinese movie titles can be drawn
# (assumes the 'MicroSoft YaHei' font is available on this machine).
matplotlib.rc("font", **{"family": "MicroSoft YaHei", "weight": "bold"})

plt.figure(figsize=(20, 8), dpi=80)

# Bar captions (movie titles) and bar heights, matched by position.
# NOTE(review): the unit of the values in b is not stated in these notes.
a = [
    "战狼2", "速度与激情8", "功夫瑜伽", "西游伏妖篇", "变形金刚5:最后的骑士",
    "摔跤吧!爸爸", "加勒比海盗5:死无对证", "金刚:骷髅岛", "极限特工:终极回归",
    "生化危机6:终章", "乘风破浪", "神偷奶爸3", "智取威虎山", "大闹天竺",
    "金刚狼3:殊死一战", "蜘蛛侠:英雄归来", "悟空传", "银河护卫队2", "情圣",
    "新木乃伊",
]
b = [
    56.01, 26.94, 17.53, 16.49, 15.45, 12.96, 11.8, 11.61, 11.28, 11.12,
    10.49, 10.3, 8.75, 7.55, 7.32, 6.99, 6.88, 6.86, 6.58, 6.23,
]

# One bar per title at positions 0, 1, 2, ...
positions = range(len(a))
plt.bar(positions, b)

# Replace the numeric ticks with the titles, rotated so they do not overlap.
plt.xticks(positions, a, rotation=90)

plt.show()

 

条形图:

from matplotlib import pyplot as plt
import matplotlib

# Select the font used for all text so the Chinese movie titles can be drawn
# (assumes the 'MicroSoft YaHei' font is available on this machine).
matplotlib.rc("font", family="MicroSoft YaHei", weight="bold")

plt.figure(figsize=(20, 8), dpi=80)

# Four movies and three value series, matched by position.
# NOTE(review): the suffixes 14/15/16 presumably denote three consecutive
# days -- the notes do not say; confirm against the course material.
a = ["猩球崛起3:终极之战", "敦刻尔克", "蜘蛛侠:英雄归来", "战狼2"]
b_16 = [15746, 312, 4497, 319]
b_15 = [12357, 156, 2045, 168]
b_14 = [2358, 399, 2358, 362]

bar_width = 0.2

# Each movie gets a group of three bars: the second and third bar are shifted
# right by one and two bar widths so the bars sit side by side.
x_14 = list(range(len(a)))
x_15 = [i + bar_width for i in x_14]
x_16 = [i + bar_width * 2 for i in x_14]

# label= feeds the legend so the three series can be told apart.
plt.bar(x_14, b_14, width=bar_width, label="b_14")
plt.bar(x_15, b_15, width=bar_width, label="b_15")
plt.bar(x_16, b_16, width=bar_width, label="b_16")

# Put the movie title under the middle bar of each group; without this the
# axis only shows the bare positions 0..3 and `a` is never used.
plt.xticks(x_15, a)

plt.legend()

plt.show()

 

直方图:

# Histogram-style bar chart: bars sit at positions 0..11 and width=1 makes
# each bar one full unit wide, so neighbouring bars touch with no gap.
# NOTE(review): `quantity` is not defined anywhere in these notes; presumably
# it holds the 12 bar heights (counts per bin) -- define it before running.
plt.bar(range(12), quantity, width = 1)

 

二、numpy

创建数组:

import numpy as np
# Three ways to build a one-dimensional integer array.

# 1. From an explicit Python list.
a = np.array([12, 32])

# 2. With arange(stop): counts 0, 1, ..., 9 (start defaults to 0).
b = np.arange(10)

# 3. From the values of a Python range: 0, 1, ..., 14.
c = np.array(list(range(15)))

读取数据:

 

三、pandas

import pandas as  pd
import numpy as np

# Column data for a 10-row demo frame.
data_dict = {
    # 'black' and 'white' alternating, five times.
    'color': ['black', 'white'] * 5,
    'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
    # Ten timestamps; freq is the offset between consecutive dates ('W' = weekly).
    'date': pd.date_range('1/1/2019', periods=10, freq='W'),
    # Ten samples drawn from the standard normal distribution.
    'feature_1': np.random.randn(10),
    # Ten samples drawn from a normal distribution with mean 0.5 and
    # standard deviation 2.
    'feature_2': np.random.normal(0.5, 2, 10),
}

# Two parallel label lists, one entry per row, forming a two-level row index.
class_labels = ['A', 'B', 'B', 'B', 'C', 'A', 'B', 'A', 'C', 'C']
country_labels = ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA']
array = [class_labels, country_labels]

# names gives each index level its header.
index = pd.MultiIndex.from_arrays(array, names=['class', 'country'])

# Frame whose rows are addressed by the (class, country) pair.
data_df = pd.DataFrame(data_dict, index=index)
print(data_df)

3.2 分组聚合

import pandas as  pd
import numpy as np

# ---------------------------------------------------------------------------
# Demo data: the same 10-row frame with a (class, country) row index.
# ---------------------------------------------------------------------------
data_dict = {
    # 'black' and 'white' alternating, five times.
    'color': ['black', 'white'] * 5,
    'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
    # Ten timestamps; freq is the offset between consecutive dates ('W' = weekly).
    'date': pd.date_range('1/1/2019', periods=10, freq='W'),
    # Ten samples drawn from the standard normal distribution.
    'feature_1': np.random.randn(10),
    # Ten samples drawn from a normal distribution with mean 0.5 and
    # standard deviation 2.
    'feature_2': np.random.normal(0.5, 2, 10),
}

class_labels = ['A', 'B', 'B', 'B', 'C', 'A', 'B', 'A', 'C', 'C']
country_labels = ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA']
array = [class_labels, country_labels]

# names gives each index level its header.
index = pd.MultiIndex.from_arrays(array, names=['class', 'country'])
data_df = pd.DataFrame(data_dict, index=index)
print(data_df)

# ---------------------------------------------------------------------------
# Grouping
# ---------------------------------------------------------------------------
# Group the rows by a single column.
group_1 = data_df.groupby('size')
# Iterating a GroupBy yields (group key, sub-frame) pairs:
#     for item in list(group_1):
#         print(item)
# Fetch the rows of one group only:
#     print(group_1.get_group('M'))

# Group the rows by several columns at once.
group_2 = data_df.groupby(['size', 'color'])
#     for item in list(group_2):
#         print(item)

# Number of rows in each group:
#     print(group_1.size())
#     print(group_2.size())

# Grouping COLUMNS with a function: columns whose name contains 'feature'
# form one group, all other columns form another.
#     def get_letter_type(letter):
#         if 'feature' in letter:
#             return 'feature'
#         else:
#             return 'other'
#
#     for item in list(data_df.groupby(get_letter_type, axis = 1)):
#         print(item)
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas releases;
# grouping the transposed frame (data_df.T.groupby(get_letter_type)) is the
# documented replacement -- check the pandas version in use.

# Grouping by the row-index levels (0 = class, 1 = country):
#     for item in list(data_df.groupby(level = [0, 1])):
#         print(item)

# ---------------------------------------------------------------------------
# Aggregation and transformation
# ---------------------------------------------------------------------------
# A different aggregation per column:
#     print(group_2.agg({'feature_1': np.max, 'feature_2': np.mean}))
# NOTE(review): recent pandas warns when NumPy callables are passed here; the
# string names 'max' and 'mean' are the equivalent spelling.

# transform returns a result aligned with the original rows; here every row
# receives the value range (max - min) of its own size group.
#     data_range = lambda x : x.max() - x.min()
#     print(data_df.groupby('size').transform(data_range))
# NOTE(review): the frame also holds string and date columns; on recent pandas
# the subtraction fails on the string column, so select the numeric columns
# first, e.g. data_df.groupby('size')[['feature_1', 'feature_2']].

# Fill missing values with the mean of the row's own group: blank out
# feature_1/feature_2 of the second row, then fill per size group.
#     data_df.iloc[1, 3:5] = np.nan
#     f = lambda x : x.fillna(x.mean())
#     df_trans = group_1.transform(f)
#     print(df_trans)
# NOTE(review): group_1 covers the non-numeric columns too; same caveat as above.

# Rolling window inside each group: within every colour group a window of 3
# consecutive rows slides down the rows and feature_1 is averaged over each
# window. The first two rows of each group have fewer than 3 values in their
# window, so their result is NaN.
#     print(data_df.groupby('color').rolling(3).feature_1.mean())

 

3.3 时间序列

暂不看

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值