数据分析入门

最新推荐文章于 2024-08-22 07:51:42 发布

清风若过@

最新推荐文章于 2024-08-22 07:51:42 发布

阅读量175

点赞数

文章标签： matplotlib python 开发语言

本文链接：https://blog.csdn.net/weixin_44108577/article/details/131262695

版权

matplotlib-画图

基本绘制表格

from matplotlib import pyplot as plt

y = [15,13,14.5,17,20,25,26,26,27,22,18,15]
x=range(2,26,2)

# 绘图
plt.plot(x,y)
# 展示图形
plt.show()

设置图片大小

plt.figure(figsize=(20,8),dpi = 80)

figsize 中20，8表示宽和高，dpi每英寸上点的个数，让图片更加清晰

from matplotlib import pyplot as plt

# 设置图片大小
plt.figure(figsize = (20, 8), dpi=80)

y = [15,13,14.5,17,20,25,26,26,27,22,18,15]
x=range(2,26,2)

# 绘图
plt.plot(x,y)
# 展示图形
plt.show()

图片保存

savefig()

#保存
plt.savefig("./t1.png")

绘制x轴的刻度

plt.xticks(x)

plt.xticks(range(2,25))

x=range(2,26,2)
# 绘图
plt.plot(x,y)

# 绘制x轴的刻度
# xtick_labels = [i/2 for i in range(4,49)]
# plt.xticks(xtick_labels[::3])

plt.xticks(range(25,50))

传了什么，才有什么

x轴的数据是2到24，但是我在x刻度上只显示25-50，所以前面都是空的

让轴上显示字符串

# 调整x轴的刻度
_x = list(x)[::10]  #取了一个步长
xtrick_lables = ["hello,{}".format(i) for i in _x]
# 如果想要在列表上显示字符串的时候,把数字和字符串做个一一对应
plt.xticks(_x,xtrick_lables)

几点几分的设置

# Adjust the x-axis ticks
_x = list(x)  
xtrick_lables = ["10点{}分".format(i) for i in range(60)]
xtrick_lables +=["11点{}分".format(i) for i in range(60)]
# xtrick_lables +=["{0}:{1:2d}".format(x,y) for x in range(10,12) for y in range(0,60,10)]

# To show strings on the axis, pass tick positions and labels that correspond one-to-one
plt.xticks(_x[::3], xtrick_lables[::3],rotation = 45)  # keep every 3rd tick; rotate the labels 45 degrees

处理不显示中文的问题

plt.xticks(fontproperties="STSong")

添加描述信息

# 添加描述信息
plt.xlabel("时间", fontproperties="STSong")
plt.ylabel("温度 单位(℃)", fontproperties="STSong")
plt.title("10点到12点每分钟的气温变化情况", fontproperties="STSong")

绘制网格

# 绘制网格
plt.grid(alpha=0.1) #设置透明度

绘制两条线

plt.plot(x, y_1,label="自己")
plt.plot(x, y_2, label="同桌")
# 添加图例
plt.legend(prop="STSong",loc = "upper left")

所学代码

from matplotlib import pyplot as plt
import random

y_1 = [1,0,1,2,1,2,3,4,5,6,1,2,1,2]
y_2 = [1,2,3,2,3,2,3,2,3,2,3,2,3,2]
x = range(20,34)

# 绘图
# 设置图片大小
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(x, y_1,label="自己",color = "orange",linestyle=":",linewidth = 5,alpha = 0.8)
plt.plot(x, y_2, label="同桌", color="cyan", linestyle="--")

# 调整x轴的刻度
xtrick_label = ["{}岁".format(i) for i in x]
plt.xticks(x, xtrick_label, fontproperties="STSong")
plt.yticks(range(0,9))
# 添加描述信息
plt.xlabel("个数", fontproperties="STSong")
plt.ylabel("年龄", fontproperties="STSong")
plt.title("男朋友的个数", fontproperties="STSong")

# 绘制网格
plt.grid(alpha=0.4, linestyle=":")  # 设置透明度

# 添加图例
plt.legend(prop="STSong",loc = "upper left")

plt.show()

添加文本注释

添加水印

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-htX2NKXi-1686993936008)(D:\总结\img\数据分析\微信图片_20230527161320.jpg)]

总结

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-TDl6EUd5-1686993936010)(D:\总结\img\数据分析\微信图片_20230527162246.png)]
[外链图片转存失败,源站可能有防盗Alt]!链机制,建(https://img-3mhoqd.csnimg.cn/images/202205A4100510.png)D:\总结\img\数据分析\微信图片_20230527162246.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-YL210S8b-1686993936012)(D:\总结\img\数据分析\微信图片_20230527162254.png)]

散点图

使用scatter方法绘制散点图，和之前绘制折线图的唯一区别

from matplotlib import pyplot as plt

y_3 = [11,17,16,11,12,6,6,7,8,9,12,15,14,17,18,21,20,16,17,20,14,15,15,15,19,21,22,22,22,23]
y_10 = [26,26,26,19,21,17,16,19,22,23,17,20,21,20,22,15,11,15,5,14,17,10,11,13,12,14,6,12,14,15,16]

x_3 = range(1,31)
x_10 = range(51, 82)



# 设置图形大小
plt.figure(figsize=(20,8),dpi = 80)

# 绘图
plt.scatter(x_3, y_3, label="3月份")
plt.scatter(x_10, y_10, label="10月份")

# 调整x轴的刻度
_x = list(x_3)+list(x_10)
xticks_labels = ["3月{}日".format(i) for i in x_3]
xticks_labels += ["10月{}日".format(i) for i in x_10]
plt.xticks(_x[::3], xticks_labels[::3], fontproperties="STSong",rotation = 45)

# 添加图例
plt.legend(loc="upper left", prop="STSong")

# 添加描述信息
plt.xlabel("时间", fontproperties="STSong")
plt.ylabel("温度", fontproperties="STSong")
plt.title("标题", fontproperties="STSong")



# 展示
plt.show()

绘制横向条形图

# 绘制横着的条形图
from matplotlib import pyplot as plt

a = ['战狼2','速度与激情2','我们的少年','人生路不熟','长沙夜生活','极限特工\n:终极回归','加勒比海盗：死无对证']
b = [56.01,26.97,12.96,23.67,45.56,23.56,34.56]

# 设置图形大小
plt.figure(figsize = (20,8),dpi = 80)

# 绘制条形图
# plt.bar(range(len(a)),b,width=0.3)
plt.barh(range(len(a)),b,height=0.3,color="orange")

plt.yticks(range(len(a)), a, fontproperties="STSong",rotation=45)
plt.grid(alpha=0.3)
# plt.xticks(range(len(a)), a, fontproperties="STSong",rotation=45)
plt.show()

绘制直方图

自己总结

第一步：先放数据
第二步：绘图

# Plot: use the call that matches the chart type (line, scatter, horizontal bar)
plt.plot(x,y)
plt.scatter(x_3, y_3, label="3月份")
plt.barh(range(len(a)),b,height=0.3,color="orange")

第三步：展示/保存

#保存
plt.savefig("./t1.png")

# 展示图形
plt.show()

到这里就可以大致看到表格了

后面就是增加细节

第四步：绘制横坐标和纵坐标的描述信息

# Adjust the x-axis ticks
_x = list(x)  
xtrick_lables = ["10点{}分".format(i) for i in range(60)]
xtrick_lables +=["11点{}分".format(i) for i in range(60)]
# xtrick_lables +=["{0}:{1:2d}".format(x,y) for x in range(10,12) for y in range(0,60,10)]

# To show strings on the axis, pass tick positions and labels that correspond one-to-one
plt.xticks(_x[::3], xtrick_lables[::3],rotation = 45)  # keep every 3rd tick; rotate the labels 45 degrees

plt.yticks(x_15, a, fontproperties="STSong", rotation=45)

增加图例和描述信息

# 添加描述信息
plt.xlabel("时间", fontproperties="STSong")
plt.ylabel("温度 单位(℃)", fontproperties="STSong")
plt.title("10点到12点每分钟的气温变化情况", fontproperties="STSong")

图形的不同应用

折线图体现变化

散点图体现x轴和Y轴之间的关系

条形图体现离散数据

直方图统计连续数据

numpy-数值型的数据

cmd中查找已下载安装包的指令

python -m pip list

安装国内镜像

pip install -i https://pypi.tuna.tsinghua.edu.cn/simple/ opencv-python

阿里云：http://mirrors.aliyun.com/pypi/simple/
清华大学：https://pypi.tuna.tsinghua.edu.cn/simple/
中国科学技术大学：http://pypi.mirrors.ustc.edu.cn/simple/

创建数组

import numpy as np

# Build arrays with NumPy; every result is an ndarray
t1 = np.array([1,2,3])
print(t1)
print(type(t1))

t2 = np.array(range(10))
print(t2)
print(type(t2))  # inspect the array that was just built

t3 = np.arange(4,10,2)
print(t3)
print(type(t3))

调整他的数据类型和查找他的类型以及生成小数

import numpy as np
import random

# Build arrays with NumPy; every result is an ndarray
t1 = np.array([1,2,3])
print(t1)
print(type(t1))

t2 = np.array(range(10))
print(t2)
print(type(t2))  # inspect the array that was just built

t3 = np.arange(4,10,2)
print(t3)
print(type(t3))

# Look up the element type
print(t3.dtype)
print("*"*100)
# Request a specific element type when creating the array ("i1" = 1-byte integer)
t4 = np.array(range(1,4),dtype = "i1")
print(t4)
print(t4.dtype)

# Boolean arrays
t5 = np.array([1,1,0,1,0,0],dtype = bool)
print(t5)
print(t5.dtype)

# Convert to another element type; astype returns a new array
t6 = t5.astype("int8")
print(t6)
print(t6.dtype)

# Generate floats
t7 = np.array([random.random() for _ in range(10)])
print(t7)

# Round to 3 decimal places
t8 = np.round(t7,3)
print(t8)

数组形状

shape

b = a.reshape(3,4)
b.shape
# 得到的结果是（3，4）

reshape和flatten

reshape是return类型的函数

两个数字就是二维，一个数字就是一维。

import numpy as np

# Start from a 3-D array: 2 blocks, each with 3 rows and 4 columns
t3 = np.arange(24).reshape(2, 3, 4)

print(t3)

# reshape returns a new array and leaves t3 untouched, so these two results are discarded
t3.reshape(4, 6)   # two dimensions
t3.reshape(24)     # one dimension

# Flatten without hard-coding the element count:
# the shape holds (blocks, rows, columns); their product is the total size
blocks, rows, cols = t3.shape
t4 = t3.reshape(blocks * rows * cols)
print(t4)
# flatten() gives the same 1-D result with no size arithmetic
t5 = t3.flatten()
print(t5)

数值的加减乘除以及多维数组下的加减乘除

# A scalar operand is applied to every element
t6 = t5 + 2
print(t6)
t6 = t5 * 2
# Division by zero does not raise for arrays: 0/0 gives nan, anything else gives inf
t6 = t5 / 0

/0得出的结果 [nan inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf]

nan不是数字

inf无穷无限

读取本地数据

轴

0表示行，1表示列

0表示块 1表示行 2表示列

读取数据

np.loadtxt(fname, dtype=float, delimiter=None, skiprows=0, usecols=None, unpack=False)

参数	解释
fname	文件，字符串或产生器，可以是.gz或bz2压缩文件
dtype	数据类型，可选，csv的字符串以什么数据类型读入数组中，默认float
delimiter	分割字符串，默认是任何空格，改为逗号
skiprows	跳过前x行，一般跳过第一行表头
usecols	读取指定的列，索引，元组类型
unpack	如果是true，读入属性将分别写入不同数组变量，false读入数据只写入一个数组变量，默认false 转置

t1 = np.loadtxt(us_file_path,delimiter=",",dtype="int")

转置的方法：t2.transpose()/t2.swapaxes()

索引和切片

取文件里面的行和列

# Select one row
print(t2[2])

# Select consecutive rows
print(t2[2:])

# Select non-adjacent rows
print(t2[[2,8,10]])

# The same row selections with the column axis written out; ":" means every column
print(t2[2,:])
print(t2[2:,:])
print(t2[[2,10,3],:])
# Select the first column
print(t2[:,0])

# Select consecutive columns
print(t2[:,2:])

# Select non-adjacent columns
print(t2[:,[0,2]])

# Select a block: third to fifth row, second to fourth column
b = t2[2:5,1:4]

# Select several unrelated points: the two index lists are paired element by element,
# so this picks positions (0,0), (2,1), (2,3)
c = t2[[0,2,2],[0,1,3]]
# A second colon sets the step, e.g. ::2 takes every second item

数值的修改

重新赋值

# Assign to a slice: set columns 2 and 3 to 0
t[:,2:4] =0
# Boolean indexing: set every value of t2 below 10 to 3
t2[t2<10] = 3

# Python's conditional expression picks one of two values
a = 3 if 3>2 else 4  # 3
a = 3 if 3<2 else 4  # 4
# np.where is the element-wise equivalent: 0 where t<10, otherwise 10
np.where(t<10,0,10)

clip裁剪

t.clip(10,18)

把小于10的替换成10，大于18的替换成18

赋值nan

nan是float类型

t2 = t2.astype(float)
t2[3,3] = np.nan

数组的拼接

np.vstack((t1,t2))  # stack vertically
np.hstack((t1,t2))  # stack horizontally

数组的行列交换

t[[1,2],:] = t[[2,1],:]  # swap rows 1 and 2
t[:,[0,2]] = t[:,[2,0]]  # swap columns 0 and 2

创建全为0或者全为1的数组 ones()/zeros()

np.zeros((3,4))
np.ones((3,4))

创建对角线为1的正方形数组（方阵）

np.eye(3)  # 3x3 array with ones on the diagonal

获取最大值和最小值的位置

np.argmax(t,axis = 0)
np.argmin(t,axis = 1)

生成随机数random

np.random.randint(10,20,(4,5))

创建10-20的4行5列的整数

下次随机种子和上次一样的

np.random.seed(10)
np.random.randint(0,20,(3,4))

seed(10)中的10相当于一个记号，第10号的随机数就固定了，你可以标记每一号产生的随机数

a=b，a = b[:]和b.copy()

浅拷贝

a=b 完全不复制，a和b相互影响

a = b[:]视图操作，一种切片，会创建新的对象a，但是a的数据完全由b保管，他们两个的数据变化是一致的

a= b.copy()复制，a和b互不影响

nan和常用的统计方法

不为0的个数

np.count_nonzero(t2)

当前数组里面到底有哪些结果是nan的值

np.count_nonzero(t2 != t2)  # nan is the only value that is not equal to itself

因为只有nan不等于自身：np.nan == np.nan 是False，np.nan != np.nan 是True

np.isnan(t2)

np.count_nonzero(np.isnan(t2))

nan和任何数值计算都是nan

np.sum(t2)
np.sum(t2,axis = 0)
np.sum(t2,axis = 1)

t2.mean(axis=0)计算均值

np.median(t2,axis=0)中值

np.max(t2,axis=0)最大值最小值

np.ptp(t,axis=None)极值：最大值和最小值的差

np.std(t,axis=None)标准差：数据相对于平均值的分散程度

numpy练习代码

import numpy as np

def fill_ndarray(t1):
    """Replace every NaN in a 2-D float array with the mean of its column.

    The array is modified in place and is also returned.

    Args:
        t1: 2-D float ndarray; columns are processed one at a time.

    Returns:
        The same array object, with each NaN replaced by the mean of the
        non-NaN values in its column. A column that is entirely NaN is left
        unchanged because it has no values to average.
    """
    for i in range(t1.shape[1]):
        temp_col = t1[:, i]  # basic slice, so writes below land in t1
        nan_mask = np.isnan(temp_col)
        # Nothing to fill, or nothing to average from: leave the column alone.
        if not nan_mask.any() or nan_mask.all():
            continue
        temp_col[nan_mask] = temp_col[~nan_mask].mean()
    return t1



if __name__ == '__main__':
    # Demo input: a 3x4 float grid with NaN in the last two cells of the middle row
    t1 = np.arange(12, dtype=float).reshape(3, 4)
    t1[1, 2:] = np.nan
    print(t1)
    # Fill the gaps and show the result
    t1 = fill_ndarray(t1)
    print(t1)

总结

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-v63Jprl0-1686993936014)(D:\总结\img\新建文件夹\微信图片_20230612220036.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-4ZCybPAO-1686993936015)(D:\总结\img\新建文件夹\python1.PNG)]

pandas-所有数据类型

下载

pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pandas这个贼快

series

基础知识掌握

import pandas as pd

t1 = pd.Series([1,2,31,12,3,4])

print(t1)
print(type(t1))

t2 = pd.Series([1, 23, 2, 2, 1], index=list("abcde"))
print(t2)

temp_dict = {"name":"xiaohong","age":30,"tel":10086}
t3 = pd.Series(temp_dict)
print(t3)

切片和索引

t4 = t3["age"]  # by label
t4 = t3.iloc[1]  # by position; integer keys on a label-indexed Series should go through iloc
t3[:3]  # first three rows
t3.iloc[[1,2]]  # non-adjacent rows, by position
t3[["age","tel"]]  # non-adjacent rows, by label
# Boolean indexing
t1[t1>10]

print(t3.index)
for i in t3.index:
    print(i)
t3.values  # the underlying values

series对象本质上由两个数组构成

一个数组构成对象的键（index，索引）一个数组构成对象的值（values) ,键->值

读取外部文件

csv文件

df = pd.read_csv("./dogName2.csv")

sql文件

pd.read_sql(sql_sentence,connection)

dataFrame

t = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))

字典的方式

d1 = {"name":["xiaoming","xiaogang"],"age":[20,32],"tel":[10086,10010]}

列表的方式

d1 = [{"name":"xiaoming","age":20,"tel":10086},{"name":"xiaogang","age":32,"tel":10010}]

pd.DataFrame(d1)

结果：

age name tel

0 20 xiaoming 10086

1 32 xiaogang 10010

行是一条数据，列是多条数据

字典是无序的

如果那个位置是缺失的，得到的结果为Nan

去掉不想要的数据

data = collection.find()
data_list = []
for i in data:
    # Keep only the fields of interest from each record
    temp = {}
    temp["info"] = i["info"]
    temp["rating_count"] = i["rating_count"]
    temp["rating_value"] = i["rating_value"]
    temp["title"] = i["title"]
    temp["country"] = i["country"]
    data_list.append(temp)
df = pd.DataFrame(data_list)
print(df)

常用属性

ndim 返回DataFrame的维数；
shape 返回DataFrame的形状；（3，3）3行3列
dtypes 返回DataFrame中每一列元素的数据类型；
size 返回DataFrame中元素的个数；
T 返回DataFrame的转置结果；
index 返回DataFrame中的索引；
columns 返回DataFrame中的列索引；
values 返回DataFrame中的数值；

df.head (5) 可以显示前几行

df.tail(3)

df.info() 展示df的概览 df.describe() 数字类型的那几列

dataFrame中排序的方法

df.sort_values(by="id", ascending=False)

by后面传的是通过什么来排序的

ascending默认升序，改为false为降序

取行和索引

#前20行
df[:20]
#只取某列
df["id"]
#选择行和列
df[:20]["id"]

pandas取行或者列的注意点：

方括号里写数字切片（如 df[:20]），表示取行，对行进行操作

写字符串，表示取列，对列进行操作

某些行和列

df.loc通过标签索引行数据

df.iloc通过位置获取行数据

loc

import pandas as pd
import numpy as np

t3 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
print(t3)

# A single cell: row label, column label
t5 = t3.loc["a","Z"]
print(type(t5))

# A whole row
t6 = t3.loc["a"]

# A whole column; ":" selects every row
t6 = t3.loc[:,"Z"]

# Several rows and/or columns
t7 = t3.loc[["a","c"],:]
t7 = t3.loc[:,["X","Y"]]
t7 = t3.loc[["a","b"],["X","Y"]]
# Label slices include both ends, so "a":"c" selects rows a, b and c
t7 = t3.loc["a":"c",["X","Y"]]

iloc

t7 = t3.iloc[1]
t7 = t3.iloc[1,2]
t7 = t3.iloc[:,[2,1]]
t7 = t3.iloc[[0,2],[2,1]]
t3.iloc[1:,:2]
print(t7)

布尔索引

df[(df["Count_AnimalName"]<100)&(df["Count_AnimalName"]>80)]

字符串的方法

df["info"].str.split("/").tolist()

缺失数据的处理

dropna()函数的作用是去除读入的数据中（DataFrame）含有NaN的行。

# Float data from the start, so NaN can be stored without changing the column dtype
t3 = pd.DataFrame(np.arange(12, dtype=float).reshape(3,4),index=list("abc"),columns=list("WXYZ"))
t3.iloc[[0, 2], [2, 1]] = np.nan
print(t3)
print(pd.isnull(t3))

print(pd.notnull(t3))
# Option 1: keep the rows whose "X" value is not NaN
print(t3[pd.notnull(t3["X"])])
# Option 2: dropna(). how="any" drops a row with at least one NaN, how="all" only
# drops rows that are entirely NaN. inplace=True edits t3 itself and returns None,
# so print t3 afterwards instead of the call's result.
t3.dropna(axis=0,how="any",inplace=True)
print(t3)

填充数据

t3.fillna(0)
# Fill each column with its own mean (use t3.median() for the median)
t3.fillna(t3.mean())
# Fill a single column with that column's own mean
t3["age"] = t3["age"].fillna(t3["age"].mean())
# Treat zeros as missing values
t3[t3 == 0] = np.nan

注：

pandas中的nan是不进行计算的，而numpy中直接是nan了

电影数直方图

import pandas as pd
from matplotlib import pyplot as plt

data_path = "./movie.csv"
df = pd.read_csv(data_path)
print(df.head(1))
print(df.info())

# df["Runtime"] is a Series; .values gives the underlying ndarray
runtime_data = df["Runtime"].values

max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Number of bins: value range divided by a bin width of 10
num_bin = (max_runtime-min_runtime)//10

plt.figure(figsize=(20,8),dpi=80)
plt.hist(runtime_data,num_bin)

# One tick every 5 units across the value range
plt.xticks(range(min_runtime, max_runtime+5,5))

plt.show()

步长设置每0.5的操作

# Build tick positions from min_runtime to max_runtime in steps of 0.5
# (range() only accepts integer steps)
_x = [min_runtime]
i = min_runtime
while i < max_runtime:
    i = i + 0.5
    _x.append(i)
plt.xticks(_x)

常用方法

import pandas as pd
import numpy as np

data_path = "./movie.csv"
df = pd.read_csv(data_path)
print(df.head(1))
print(df.info())

# Mean rating
df["Rating"].mean()

# Number of distinct directors
# A set removes duplicates
len(set(df["Director"].tolist()))
# unique() returns the distinct values directly
len(df["Director"].unique())

# Number of distinct actors: each cell holds a comma-separated list of names
temp_actors_list = df["Actor"].str.split(",").tolist()
# Flatten the list of lists with a nested comprehension
actors_list = [i for j in temp_actors_list for i in j]
actor_num = len(set(actors_list))
print(actor_num)

# Position of the maximum, and the maximum itself
df["Rating"].argmax()
df["Rating"].max()
# Position of the minimum, and the minimum itself
df["Rating"].argmin()
df["Rating"].min()

统计每个分类的电影的数量和（字符串离散化的案例）

重要思想：将字符串转化为数据进行统计

df = pd.get_dummies(df)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

data_path = "./movie.csv"
df = pd.read_csv(data_path)

# Each "Genre" cell is a comma-separated string; split into a list per movie: [[...], [...], ...]
temp_list = df["Genre"].str.split(",").tolist()

# Distinct genre names across all movies
genre_list = list(set([i for j in temp_list for i in j]))

# All-zero frame: one row per movie, one column per genre (np.zeros takes the shape as a tuple)
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),columns = genre_list)

# Set 1 in the columns of the genres each movie belongs to
for i in range(df.shape[0]):
    # e.g. zeros_df.loc[0,["Sci_fi","musical"]] = 1
    zeros_df.loc[i,temp_list[i]] = 1

# Column sums give the number of movies per genre
genre_count = zeros_df.sum(axis = 0)


# Sort ascending
genre_count = genre_count.sort_values()

# Plot
_x = genre_count.index
_y = genre_count.values

plt.figure(figsize = (20,8),dpi = 80)
plt.bar(range(len(_x)),_y)
plt.xticks(range(len(_x)),_x)
plt.show()

数据合并join

行join

join:默认情况下是把行索引相同的数据合并在一起

所有的行数以第一个参数的行数为准

join就是通过index行索引进行合并的

merge根据列索引进行合并

df1.merge(df3,on="a")

根据第a列进行内连接

how：选择进行什么链接

“outer“外连接 ”left"左连接 “right”右链接

df1.merge(t2,left_on="0",right_on="x")

df1以"0"这一列为准，t2以"x"这一列为准

分组

grouped = df.groupby(by="columns_name")

是DataFrameGroupBy这个对象

可以进行遍历

统计中国和美国星巴克的数量

# Each item is a (group key, sub-DataFrame) pair
for i,j in grouped:
    print(i)
    print(j,type(j))
# Rows of a single group can also be selected with a boolean mask
df[df["country"]=="US"]

调用聚合方法

country_count = grouped["Brand"].count()
print(country_count["US"])
print(country_count["CH"])

grouped是一个DataFrameGroupBy对象，是可迭代的

grouped中的每一个元素是一个元组

元组里面是（索引（分组的值），分组之后的DataFrame）

DataFrameGroupBy对象方法

count 分组中非nan得数量
sum
mean
median
std.var
min.max

统计中国每个省份统计星巴克的数量

china_data = df[df["Country"] == "CN"]
grouped = china_data.groupby(by="State").count()["Brand"]

数据按照多个条件进行分组,返回series

df["Brand"].groupby(by=[df["Country"],df["State"]]).count()

数据按照多个条件进行分组,返回dataframe

多加了一个方括号

# Three equivalent spellings; the double brackets [["Brand"]] keep the result a DataFrame
grouped = df[["Brand"]].groupby(by=[df["Country"],df["State"]]).count()
df.groupby(by=[df["Country"],df["State"]])[["Brand"]].count()
df.groupby(by=[df["Country"],df["State"]]).count()[["Brand"]]

索引得方法和属性

grouped.index

grouped.index=["a","b"]

grouped.reindex(["a","f"])

取a和f两行

f不存在，所以它的值都为nan

grouped.reindex(["a","f"])

指定某一列为index

grouped.set_index("a",drop=False)

drop得意思是仍然希望把这一列保留

grouped["d"].unique()

交换层级(如果有两层索引)

如果想取内层索引one中的数据，先将两层索引的层级对调，再取

d.swaplevel()["one"]

b.loc["one"].loc["h"]

这样才能取到值

这样也可以取到

x["one","h"]

使用matplotlib呈现出店铺总数排名前10的国家

import pandas as pd
from matplotlib import pyplot as plt

# NOTE(review): the path says movie.csv but the columns used are store data — confirm the file name
data_path = "./movie.csv"
df = pd.read_csv(data_path)

# Rows per country, largest first, top 10 ("Brand" is the column counted in the other snippets)
data1 = df.groupby(by="Country").count()["Brand"].sort_values(ascending=False)[:10]

# Labels come from the index, bar heights from the values
_x = data1.index
_y = data1.values

plt.figure(figsize = (20,8),dpi=80)

plt.bar(range(len(_x)),_y)
plt.xticks(range(len(_x)),_x)
plt.show()

当这一行中有缺失的处理办法(去除这一列中nan的行)

# Keep only rows whose year is present, then count titles per year
data1 = df[pd.notnull(df["orignal_publication_year"])]
grouped = data1.groupby(by="orignal_publication_year").count()["title"]

总结

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-PifqhXjX-1686993936017)(D:\总结\img\新建文件夹\微信图片_20230615154100.jpg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-aXFavyy3-1686993936017)(D:\总结\img\新建文件夹\微信图片_20230615154111.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-MejnrwlB-1686993936018)(D:\总结\img\新建文件夹\3.png)]

统计出这些数据中不同类型的紧急情况的次数，根据不同月份不同类型紧急电话的次数的变化情况

Pandas的tolist函数就是可以将Dataframe，Series等格式转化为list的数据类型

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

data_path = "./911.csv"
df = pd.read_csv(data_path)

# print(df.head(1))
# print(df.info())

# The category is the part of "title" before the first ":"
# print(df["title"].str.split(":"))

temp_list = df["title"].str.split(":").to_list()
cate_list = list(set([i[0] for i in temp_list]))
print(cate_list)

# All-zero frame: one row per call, one column per category
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(cate_list))),columns = cate_list)

# Approach 1: set 1 where the title contains the category name.
# A single .loc write is used because chained indexing (zeros_df[cate][mask] = 1)
# may assign to a temporary copy instead of zeros_df.
for cate in cate_list:
    zeros_df.loc[df["title"].str.contains(cate), cate] = 1
print(zeros_df)
print(zeros_df.sum(axis=0))
# Approach 2
# for i in range(df.shape[0]):
#   zeros_df.loc[i,temp_list[i][0]] = 1
# print(zeros_df)

# Approach 3: add the category as a column and group by it
temp_list = df["title"].str.split(":").to_list()
cate_list = [i[0] for i in temp_list]
df["cate"] = pd.DataFrame(np.array(cate_list).reshape((df.shape[0],1)))
print(np.array(cate_list).reshape((df.shape[0], 1)))
print(df.groupby(by="cate").count()["title"])

时间序列

date_range

start=None

end=None

periods 时间段

freq 频率

# Combine start with end, or start with periods
date = pd.date_range(start="20171230",end="20181223",freq="10D")
date = pd.date_range(start="20171230",periods=10,freq="M")
print(date)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lOMmixTu-1686993936019)(D:\总结\img\新建文件夹\微信图片_20230616171626.jpg)]

在DataFrame中使用时间序列

date = pd.date_range(start="20171230",periods=10,freq="M")
df = pd.DataFrame(np.random.rand(10),index = date)

将时间字符串转化为时间序列类型

# format is optional; when given it must be a strftime pattern matching the strings,
# e.g. format="%Y-%m-%d %H:%M:%S"
df["timeStamp"] = pd.to_datetime(df["timeStamp"])

重采样

将时间序列从一个频率转化为另一个频率处理的过程，将高频率数据转化为低频率数据为降采样，低频率数据转化为高频率为升采样

df.resample("M")

dir(i)可以看所有的方法

不同月份电话次数

_x = [i.strftime("%Y%m%d") for i in _x]是用来把时间戳格式化成字符串

import pandas as pd
from matplotlib import pyplot as plt

data_path = "./911.csv"
df = pd.read_csv(data_path)

df["timeStamp"] = pd.to_datetime(df["timeStamp"])

df.set_index("timeStamp",inplace=True)
# print(df.head())

# 统计出911数据中不同月份电话次数
count_by_month = df.resample("M").count()["title"]
print(count_by_month)

# 画图
_x = count_by_month.index
_y = count_by_month.values

# for i in _x:
#   print(dir(i))
#   break
_x = [i.strftime("%Y/%m/%d") for i in _x]

plt.figure(figsize=(20,8),dpi=80)

plt.plot(range(len(_x)),_y)
plt.xticks(range(len(_x)),_x,rotation=45)
plt.show()

不同月份不同类型的电话的次数的变化情况

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

data_path = "./911.csv"
df = pd.read_csv(data_path)

# Convert the time strings to datetimes (used as the index below)
df["timeStamp"] = pd.to_datetime(df["timeStamp"])

# Add a category column: the part of "title" before the first ":"
temp_list = df["title"].str.split(":").to_list()
cate_list = [i[0] for i in temp_list]
df["cate"] = pd.DataFrame(np.array(cate_list).reshape((df.shape[0], 1)))

df.set_index("timeStamp", inplace=True)

# Create the figure before plotting; calling plt.figure afterwards would open
# a new, empty figure and the lines drawn in the loop would not be on it
plt.figure(figsize=(20, 8), dpi=80)

# One line per category
for group_name,group_data in df.groupby(by="cate"):
    # Calls per month within this category
    count_by_month = group_data.resample("M").count()["title"]

    # Plot
    _x = count_by_month.index
    print(group_name)
    print(group_data)
    _y = count_by_month.values

    _x = [i.strftime("%Y/%m/%d") for i in _x]

    plt.plot(range(len(_x)), _y, label=group_name)

plt.legend(loc="best")
# Tick labels come from the last group plotted
plt.xticks(range(len(_x)), _x, rotation=45)
plt.show()

groupBy对象支持迭代，可以产生一组二元元组（由分组名和数据块组成）

PeriodIndex

period = pd.PeriodIndex(year=df["year"],month=df["month"],hour=df["hour"],freq="H")

##　绘制出５个城市ＰＭ２.５的变化情况

import pandas as pd
from matplotlib import pyplot as plt


file_path = './PM2.5/BeijingPM20100101_20151231.csv'

df = pd.read_csv(file_path)

# print(df.head())
# print(df.info())

# Combine the separate date/time columns into one pandas period index.
# NOTE(review): assumes the CSV has a "day" column next to year/month/hour — confirm.
# Without a day field, every hour of a month is mapped onto that month's first day.
period = pd.PeriodIndex(year=df["year"],month=df["month"],day=df["day"],hour=df["hour"],freq="H")
df["datetime"] = period

# Use datetime as the index
df.set_index("datetime",inplace=True)

# Downsample to 7-day means
df.index = pd.to_datetime(df.index)

df = df.resample("7D").mean()
print(df)

# Handle missing data by dropping it
data = df["PM_US Post"].dropna()

# Plot
_x = data.index
_y = data.values

plt.figure(figsize = (20,8),dpi=80)
plt.plot(range(len(_x)),_y)

# plt.xticks(range(0,len(_x),20),list(_x)[::20])

plt.show()

清风若过@

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
数据分析入门

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-TDl6EUd5-1686993936010)(D:\总结\img\数据分析\微信图片_20230527162246.png)][外链图片转存失败,源站可能有防盗Alt]!链机制,建(https://img-3mhoqd.csnimg.cn/images/202205A4100510.png)D:\总结\img\数据分析\微信图片_20230527162246.png)]
复制链接

扫一扫