目录
一、为什么要学习pandas?
二、pandas的常用数据类型
三、pandas之Series创建
1、使用列表创建Series:
import pandas as ps # 使用pandas之前需要安装pandas模块
# Build a Series from a plain list; pandas supplies the default 0..n-1 index.
a = ps.Series(data=[9, 8, 7, 6])
# The printed left column is the index, the right column holds the values.
print(a, "\n", "左边为索引值,右边数值")
2、使用字典创建Series:
import pandas as ps # 使用pandas之前需要安装pandas模块
# A dict maps straight onto a Series: keys become index labels, values become data.
record = {"name": "AA", "age": "sex", "sex": "boy"}
c = ps.Series(record)
print(c, "\n")
3、手动指定Series的index
import pandas as ps # 使用pandas之前需要安装pandas模块
# Pass index=... to replace the default integer index with custom labels.
b = ps.Series([9, 8, 7, 6], index=["a", "b", "c", "d"])
print(b)
四、改变数据类型
# Option 1: convert the dtype while creating the Series (chained call).
d = ps.Series([1, 2, 3, 4]).astype(float)
# Option 2: convert an existing Series. astype() returns a NEW Series and does
# not modify the original in place, so the result must be assigned back.
d = d.astype(float)
print(d, "\n")
五、pandas之Series切片和索引
import pandas as ps # 使用pandas之前需要安装pandas模块
# Series indexing and slicing on a label-indexed Series.
a = ps.Series([1, 2, 3, 4], index=list("abcd"))

# --- Single element ---
print(a["a"])  # look up by label
print("#" * 50)  # separator
# Look up by position. Use .iloc explicitly: a[0] on a Series whose index is
# not integer relies on a positional fallback that is deprecated in pandas 2.x
# and no longer works in pandas 3.
print(a.iloc[0])
print("#" * 50)  # separator

# --- Several elements ---
print(a.iloc[:3])  # first three elements, by position
print("#" * 50)  # separator
print(a.iloc[[1, 3]])  # second and fourth elements, by position
print("#" * 50)  # separator
print(a[["b", "d"]])  # elements labelled "b" and "d"
print("#" * 50)  # separator
print(a[a > 2])  # boolean mask: only the values greater than 2
六、pandas之Series的索引和值
import pandas as ps # 使用pandas之前需要安装pandas模块
a = ps.Series([1, 2, 3, 4], index=["a", "b", "c", "d"])
# Printed in order: the whole index, the whole value array,
# the first index label, and the first value.
for piece in (a.index, a.values, a.index[0], a.values[0]):
    print(piece)
七、pandas之读取外部数据
import pandas as ps # 使用pandas之前需要安装pandas模块
# read_csv(path) loads a CSV file into a DataFrame.
csv_path = "./123.csv"
a = ps.read_csv(csv_path)
print(a)
八、pandas之DataFrame
import pandas as ps
import numpy as np
# A DataFrame can wrap a 2-D array; rows and columns get default integer labels.
grid = np.arange(12).reshape(3, 4)
a = ps.DataFrame(grid)
print(a)
1、DataFrame可传入的参数
1、data
2、index
3、columns
4、dtype
5、copy
# index= names the rows, columns= names the columns.
b = ps.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=["a", "b", "c"],
    columns=["w", "x", "y", "z"],
)
print(b)
2、字典创建DataFrame
import pandas as ps
# Dict of equal-length lists: each key becomes one column.
e = {
    "name": ["xiaozhang", "xiaoming", "xiaofang"],
    "age": [69, 96, 6],
    "sex": ["男", "女", "男"],
}
e1 = ps.DataFrame(data=e)
# The dict keys are now the column labels; rows get the default 0..n-1 index.
print(e1)
填入多个字典,且键值对数量不一致时:
import pandas as ps
# Three records (one dict per row) that do not all carry the same keys.
# The values match the sample output listed for this section
# (rows xiaozhang / laoli / dagang).
f = (
    {"name": "xiaozhang", "age": 69, "sex": "男"},
    {"name": "laoli", "age": 96},
    {"name": "dagang", "sex": "男"},
)
f1 = ps.DataFrame(f)
# Keys missing from a record show up as NaN in that row.
print(f1)
字典中缺失的键所对应的值会被 NaN 代替
3、DataFrame的几个常用方法
print(f1, "\n")  # the whole DataFrame
print(f1.index)  # row index
print(f1.columns, "\n")  # column index
print(f1.values, "\n")  # underlying values as an array
print(f1.shape, "\n")  # (rows, columns)
print(f1.dtypes, "\n")  # dtype of each column
print(f1.head(n=1), "\n")  # first n rows (5 when n is omitted)
print(f1.tail(n=1), "\n")  # last n rows (5 when n is omitted)
# info() prints its report (index, columns, memory usage) itself and returns
# None, so this print() additionally emits "None".
print(f1.info(), "\n")
print(f1.describe(), "\n")  # summary stats: count, mean, std, min, quartiles, max
输出结果:
name age sex
0 xiaozhang 69.0 男
1 laoli 96.0 NaN
2 dagang NaN 男
RangeIndex(start=0, stop=3, step=1)
Index(['name', 'age', 'sex'], dtype='object')
[['xiaozhang' 69.0 '男']
['laoli' 96.0 nan]
['dagang' nan '男']]
(3, 3)
name object
age float64
sex object
dtype: object
name age sex
0 xiaozhang 69.0 男
name age sex
2 dagang NaN 男
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 3 non-null object
1 age 2 non-null float64
2 sex 2 non-null object
dtypes: float64(1), object(2)
memory usage: 200.0+ bytes
None
age
count 2.000000
mean 82.500000
std 19.091883
min 69.000000
25% 75.750000
50% 82.500000
75% 89.250000
max 96.000000
Process finished with exit code 0
4、DataFrame排序
import pandas as ps
a = ps.read_csv("./排序.txt")
print(a.head())
print("#" * 50)  # separator
# sort_values(by=<column>, ascending=<bool>) returns the rows ordered by that column.
ordered = a.sort_values(by="D", ascending=True)
print(ordered)
5、DataFrame和Series的数据类型转换(括号)
import pandas as ps
import numpy as np
A1 = ps.DataFrame(np.arange(6).reshape(3, 2), columns=list("AB"))
stars = "*" * 50
print(A1, "\n", stars)
# Single brackets with one label give a Series; double brackets (a list of
# labels) give a DataFrame.
single = A1["A"]
double = A1[["A"]]
print("一个括号的类型为:", type(single), "\n", single, "\n", stars)
print("两个括号的类型为:", type(double), "\n", double, "\n", stars)
九、pandas之取行或取列
import pandas as ps
a = ps.read_csv("./排序.txt")
sep = "#" * 50
print(a, "\n", sep)
# A slice inside [] selects rows; a string inside [] selects a column.
first_two = a[:2]
print(first_two, "\n", sep)  # first two rows
print(first_two["D"])  # first two rows of column D
1、loc
import pandas as ps
a = ps.read_csv("./排序.txt")
sep = "#" * 50
print(a, "\n", sep)

# A slice inside [] selects rows; a string inside [] selects a column.
print(a[:2], "\n", sep)  # first two rows
print(a[:2]["D"], "\n", sep)  # first two rows of column D

# loc: a single element -> loc[row_label, column_label]
b = a.loc[0, "B"]
print(b, "\n", sep)

# loc: one whole row -> loc[row_label] (same as a.loc[0, :])
b = a.loc[0]
print(b, "\n", sep)

# loc: one whole column -> loc[:, column_label]
b = a.loc[:, "B"]
print(b, "\n", sep)

# loc: several rows -> loc[[row_label_1, row_label_2, ...], :]
b = a.loc[[0, 2], :]
print(b, "\n", sep)

# loc: several columns -> loc[:, [column_label_1, column_label_2, ...]]
b = a.loc[:, ["A", "C"]]
print(b, "\n", sep)

# loc: several rows and several columns at once
b = a.loc[[0, 2], ["A", "C"]]
print(b, "\n", sep)
输出结果:
A B C D
0 45 26 57 92
1 215 548 5 78
2 784 51 102 36
3 584 56 98 72
##################################################
A B C D
0 45 26 57 92
1 215 548 5 78
##################################################
0 92
1 78
Name: D, dtype: int64
##################################################
26
##################################################
A 45
B 26
C 57
D 92
Name: 0, dtype: int64
##################################################
0 26
1 548
2 51
3 56
Name: B, dtype: int64
##################################################
A B C D
0 45 26 57 92
2 784 51 102 36
##################################################
A C
0 45 57
1 215 5
2 784 102
3 584 98
##################################################
A C
0 45 57
2 784 102
##################################################
Process finished with exit code 0
2、iloc
import pandas as ps
a = ps.read_csv("./排序.txt")
sep = "#" * 50
print(a, "\n", sep)

# iloc: one whole row -> iloc[row_position] (same as a.iloc[0, :])
b = a.iloc[0]
print(b, "\n", sep)

# iloc: one whole column -> iloc[:, column_position]
b = a.iloc[:, 1]
print(b, "\n", sep)

# iloc: several rows -> iloc[[row_pos_1, row_pos_2, ...], :]
b = a.iloc[[0, 2], :]
print(b, "\n", sep)

# iloc: several columns -> iloc[:, [col_pos_1, col_pos_2, ...]]
b = a.iloc[:, [1, 3]]
print(b, "\n", sep)

# iloc: several rows and several columns at once
b = a.iloc[[0, 2], [1, 3]]
print(b, "\n", sep)
输出结果:
A B C D
0 45 26 57 92
1 215 548 5 78
2 784 51 102 36
3 584 56 98 72
##################################################
A 45
B 26
C 57
D 92
Name: 0, dtype: int64
##################################################
0 26
1 548
2 51
3 56
Name: B, dtype: int64
##################################################
A B C D
0 45 26 57 92
2 784 51 102 36
##################################################
B D
0 26 92
1 548 78
2 51 36
3 56 72
##################################################
B D
0 26 92
2 51 36
##################################################
Process finished with exit code 0
注:iloc 按整数位置(从 0 开始)取值,loc 按标签取值;标签既可以是字符串,也可以是数字(例如上面的 a.loc[0])
3、pandas之布尔索引
import pandas as ps
a = ps.read_csv("./排序.txt")
sep = "#" * 50
print(a, "\n", sep)

# Boolean indexing: keep the rows whose column A is greater than 200.
over_200 = a["A"] > 200
b = a[over_200]
print(b, "\n", sep)

# Combine conditions element-wise with & (and) / | (or); the Python keywords
# `and` / `or` do not work on Series. Wrap every comparison in parentheses.
col_a = a["A"]
b = a[(col_a > 100) & (col_a < 300)]  # rows with 100 < A < 300
print(b, "\n", sep)
四、pandas之缺失数据处理
删除处理:
import pandas as ps
import numpy as np
# Load the sample file and show it, followed by a separator line.
source_path = "./456.txt"
a = ps.read_csv(source_path)
print(a, "\n", "#" * 50)
# Build a 4x4 float array (NaN is a float value, hence the cast).
a = np.arange(16).reshape((4, 4)).astype(float)
# Mark two cells as missing. np.nan is used directly instead of relying on
# NumPy parsing the strings "NaN" / "nan" into a float.
a[1, 1] = np.nan
a[2, 3] = np.nan
a = ps.DataFrame(a)  # wrap the array in a DataFrame
print(a, "\n", "#" * 50)
# dropna(axis=..., how=..., inplace=...):
#   axis=0 drops rows, axis=1 drops columns
#   how="any" drops when at least one NaN is present, how="all" only when every value is NaN
#   inplace=True modifies `a` itself (the call then returns None)
a.dropna(axis=1, how="any", inplace=True)
print(a)
替换处理:
import pandas as ps
import numpy as np
# Load the sample file and show it, followed by a separator line.
source_path = "./456.txt"
a = ps.read_csv(source_path)
print(a, "\n", "#" * 50)
# Build a 4x4 float array (NaN is a float value, hence the cast).
a = np.arange(16).reshape((4, 4)).astype(float)
# Mark two cells as missing. np.nan is used directly instead of relying on
# NumPy parsing the strings "NaN" / "nan" into a float.
a[1, 1] = np.nan
a[2, 3] = np.nan
b = ps.DataFrame(a)
# fillna(b.mean()) fills every NaN with the mean of its own column.
print(b.fillna(b.mean()), "\n", "#" * 50)
# Same idea for a single column: fill column 1 with that column's mean.
print(b[1].fillna(b[1].mean()))
# fillna() returns a new object; to keep the result, assign it back, e.g.
#   b[1] = b[1].fillna(b[1].mean())
十、pandas常用统计方法
set() 函数将列表、元组等可迭代对象转换为集合,len() 函数计算列表,元组等长度。
unique() 取唯一的元素
import pandas as ps
import numpy as np
a = ps.read_csv("./456.txt")
sep = "#" * 50
print(a, "\n", sep)
a1 = a["A"]  # the column labelled "A"
# Count distinct values by turning the column into a list and then a set
# (a set drops duplicates).
distinct_values = set(a1.tolist())
print(len(distinct_values), "\n", sep)
# Same idea using the Series' own unique() method.
print(len(a1.unique()), "\n", sep)
使用两个for循环将嵌套列表平铺
import pandas as ps
import numpy as np
a2 = [[1, 2], [3, 4], [5, 6]]  # a list of lists
# Flatten with two explicit loops: outer over the sub-lists, inner over their items.
a3 = []
for sub in a2:
    for item in sub:
        a3.append(item)
print(a3)
十一、数据合并之join()
import pandas as ps
import numpy as np
sep = "#" * 50
t1 = ps.DataFrame(np.ones((2, 4)), index=list("AB"), columns=["a", "b", "c", "d"])
print(t1)
# t1
#      a    b    c    d
# A  1.0  1.0  1.0  1.0
# B  1.0  1.0  1.0  1.0
t2 = ps.DataFrame(np.zeros((3, 3)), index=list("ABC"), columns=list("xyz"))
print(t2, "\n", sep)
# t2
#      x    y    z
# A  0.0  0.0  0.0
# B  0.0  0.0  0.0
# C  0.0  0.0  0.0
# join() aligns on the row index and, by default, keeps the rows of the frame
# it is called on: extra rows of the other frame are dropped, and rows the
# other frame lacks are filled with NaN.
joined_onto_t1 = t1.join(t2)  # based on t1: rows A, B (t2's row C is dropped)
print(joined_onto_t1, "\n", sep)
joined_onto_t2 = t2.join(t1)  # based on t2: rows A, B, C (row C gets NaN for a-d)
print(joined_onto_t2, "\n", sep)
十二、数据合并之merge()
1、内拼接(inner):
import pandas as ps
import numpy as np
# ---------------- merge(): inner join (the default) ----------------
divider = "#" * 50
t3 = ps.DataFrame(np.arange(9).reshape(3, 3), index=list("ABC"), columns=list("ayz"))
t3.loc["A", "a"] = 1.0
# Tip: also set t3.loc["B", "a"] = 4 to get a second matching key.
print(t3, "\n", divider)
# t3:
#    a  y  z
# A  1  1  2
# B  3  4  5
# C  6  7  8
t4 = ps.DataFrame(np.arange(8).reshape(2, 4), index=list("AB"), columns=["a", "b", "c", "d"])
t4.loc["A", "a"] = 1
print(t4, "\n", divider)
# t4:
#    a  b  c  d
# A  1  1  2  3
# B  4  5  6  7
# merge(other, on=<column>) matches rows by the values of a shared column.
# The default is an inner join: only key values present in both frames are
# kept, so with no common value the result is empty.
print(ps.merge(t4, t3, on="a"), "\n", divider)
输出结果:
2、外拼接(outer):
import pandas as ps
import numpy as np
# ---------------- merge(): outer join ----------------
divider = "#" * 50
t3 = ps.DataFrame(np.arange(9).reshape(3, 3), index=list("ABC"), columns=list("ayz"))
t3.loc["A", "a"] = 1.0
# Tip: also set t3.loc["B", "a"] = 4 to get a second matching key.
print(t3, "\n", divider)
# t3:
#    a  y  z
# A  1  1  2
# B  3  4  5
# C  6  7  8
t4 = ps.DataFrame(np.arange(8).reshape(2, 4), index=list("AB"), columns=["a", "b", "c", "d"])
t4.loc["A", "a"] = 1
print(t4, "\n", divider)
# t4:
#    a  b  c  d
# A  1  1  2  3
# B  4  5  6  7
# how="outer": keep the key values of both frames; rows are combined where the
# key matches and the missing side is filled with NaN where it does not.
print(ps.merge(t4, t3, on="a", how="outer"), "\n", divider)
输出结果:
3、左拼接(left):
import pandas as ps
import numpy as np
# ---------------- merge(): left join ----------------
divider = "#" * 50
t3 = ps.DataFrame(np.arange(9).reshape(3, 3), index=list("ABC"), columns=list("ayz"))
t3.loc["A", "a"] = 1.0
# Tip: also set t3.loc["B", "a"] = 4 to get a second matching key.
print(t3, "\n", divider)
# t3:
#    a  y  z
# A  1  1  2
# B  3  4  5
# C  6  7  8
t4 = ps.DataFrame(np.arange(8).reshape(2, 4), index=list("AB"), columns=["a", "b", "c", "d"])
t4.loc["A", "a"] = 1
print(t4, "\n", divider)
# t4:
#    a  b  c  d
# A  1  1  2  3
# B  4  5  6  7
# how="left": every row of the left frame (t4) is kept; columns coming from
# the right frame are NaN where the key has no match.
print(ps.merge(t4, t3, on="a", how="left"), "\n", divider)
输出结果:
4、右拼接(right):
import pandas as ps
import numpy as np
# ---------------- merge(): right join ----------------
divider = "#" * 50
t3 = ps.DataFrame(np.arange(9).reshape(3, 3), index=list("ABC"), columns=list("ayz"))
t3.loc["A", "a"] = 1.0
# Tip: also set t3.loc["B", "a"] = 4 to get a second matching key.
print(t3, "\n", divider)
# t3:
#    a  y  z
# A  1  1  2
# B  3  4  5
# C  6  7  8
t4 = ps.DataFrame(np.arange(8).reshape(2, 4), index=list("AB"), columns=["a", "b", "c", "d"])
t4.loc["A", "a"] = 1
print(t4, "\n", divider)
# t4:
#    a  b  c  d
# A  1  1  2  3
# B  4  5  6  7
# how="right": every row of the right frame (t3) is kept; columns coming from
# the left frame are NaN where the key has no match.
print(ps.merge(t4, t3, on="a", how="right"), "\n", divider)
输出结果: