"""
Pandas常用的两个工具数据结构:
1.Series
2.DataFrame
DataFrame:
1.DataFrame表示的是矩阵的数据表,它包含已排序的列集合,每一列可以是不同的值类型(数值、字符串、布尔值等)
2.DataFrame既有行索引,也有列索引,可以被视为一个共享相同索引的Series的字典
3.在DataFrame中,数据被存储为一个以上的二维块,而不是列表、字典或其他一维数组的集合
DataFrame的创建方式:
利用包含等长度列表或Numpy数组的字典来形成DataFrame
"""
import pandas as pd
# Simplest construction: a dict that maps every column label to a list of
# values.  All lists have the same length (one element here, so one row).
player = {
    "Faker": [1001],
    "Uzi": [1002],
    "Khan": [1003],
    "Ming": [1004],
    "MLXG": [1005],
}
df_1 = pd.DataFrame(data=player)
print(df_1)
# Output (alignment approximate):
#    Faker   Uzi  Khan  Ming  MLXG
# 0   1001  1002  1003  1004  1005
# A dict of equal-length lists: each key becomes a column label and each list
# supplies that column's values, one entry per row.
data = {
    "player": ["Faker", "Uzi", "Khan", "Ming", "MLXG", "ClearLove"],
    "rank": [2001, 2002, 2003, 2004, 2005, 2006],
    "country": ["South Korea", "China", "South Korea", "China", "China", "China"],
}
# The rows automatically receive a default integer index (0..5).  The columns
# keep the key order of the dict (player, rank, country); they are NOT sorted
# alphabetically, as the output below shows.
df_2 = pd.DataFrame(data)
print(df_2)
# Output (alignment approximate):
#       player  rank      country
# 0      Faker  2001  South Korea
# 1        Uzi  2002        China
# 2       Khan  2003  South Korea
# 3       Ming  2004        China
# 4       MLXG  2005        China
# 5  ClearLove  2006        China

# Point 1: for a large DataFrame, head() selects the first five rows and
# tail() does the opposite, selecting the last five rows.
print(df_2.head())  # the sixth row (ClearLove) is not shown
# Output (alignment approximate):
#   player  rank      country
# 0  Faker  2001  South Korea
# 1    Uzi  2002        China
# 2   Khan  2003  South Korea
# 3   Ming  2004        China
# 4   MLXG  2005        China

# Point 2: when an explicit column order is given, the DataFrame's columns
# are arranged in exactly that order.
df_3 = pd.DataFrame(data, columns=["player", "country", "rank"])
print(df_3)
# Output (alignment approximate; all six rows are printed):
#       player      country  rank
# 0      Faker  South Korea  2001
# 1        Uzi        China  2002
# 2       Khan  South Korea  2003
# 3       Ming        China  2004
# 4       MLXG        China  2005
# 5  ClearLove        China  2006

# Point 3: a requested column that is not a key of the dict ("year" here)
# shows up in the result filled with missing values.
df_4 = pd.DataFrame(
    data,
    columns=["player", "country", "rank", "year"],
    index=["one", "two", "three", "four", "five", "six"],
)
print(df_4)
# Output (alignment approximate):
#           player      country  rank year
# one        Faker  South Korea  2001  NaN
# two          Uzi        China  2002  NaN
# three       Khan  South Korea  2003  NaN
# four        Ming        China  2004  NaN
# five        MLXG        China  2005  NaN
# six    ClearLove        China  2006  NaN
# Point 4: a single column of a DataFrame can be retrieved as a Series,
# either with dict-style indexing or with attribute access.
# Point 5: note that frame[column] works for any column label, whereas
# frame.column only works when the label is a valid Python variable name.

# Way 1: dict-style indexing
print(df_4["player"])
# Output (alignment approximate):
# one          Faker
# two            Uzi
# three         Khan
# four          Ming
# five          MLXG
# six      ClearLove
# Name: player, dtype: object

# Way 2: attribute access
print(df_4.player)
# Output (alignment approximate): identical to way 1
# one          Faker
# two            Uzi
# three         Khan
# four          Ming
# five          MLXG
# six      ClearLove
# Name: player, dtype: object

# Point 6: a row can be selected by integer position (iloc) or by its index
# label through the special loc attribute.
print(df_4.iloc[0])
print(df_4.loc["one"])
# Both calls print the same row (alignment approximate):
# player           Faker
# country    South Korea
# rank              2001
# year               NaN
# Name: one, dtype: object
# Point 7: a column reference can be assigned to; an empty column can be set
# to a scalar (broadcast to every row) or to an array of values.
df_4["year"] = 20
# Point 8: when a list or array is assigned to a column, its length must
# match the length of the DataFrame.
df_4["sex"] = ["male"] * len(df_4)
print(df_4)
# Output (alignment approximate):
#           player      country  rank  year   sex
# one        Faker  South Korea  2001    20  male
# two          Uzi        China  2002    20  male
# three       Khan  South Korea  2003    20  male
# four        Ming        China  2004    20  male
# five        MLXG        China  2005    20  male
# six    ClearLove        China  2006    20  male

# Point 9: when a Series is assigned to a column, its labels are realigned
# to the DataFrame's index and the gaps are filled with missing values.
val = pd.Series(
    data=["哈哈", "啧啧", "嘿嘿"],
    index=["two", "four", "six"],
)
# Point 10: assigning to a column that does not exist yet creates it.
df_4["remark"] = val
print(df_4)
# Output (alignment approximate):
#           player      country  rank  year   sex remark
# one        Faker  South Korea  2001    20  male    NaN
# two          Uzi        China  2002    20  male     哈哈
# three       Khan  South Korea  2003    20  male    NaN
# four        Ming        China  2004    20  male     啧啧
# five        MLXG        China  2005    20  male    NaN
# six    ClearLove        China  2006    20  male     嘿嘿

# Point 11: the del keyword removes a column, just as it removes a key from
# a dict.
del df_4["remark"]
print(df_4)
# Output (alignment approximate):
#           player      country  rank  year   sex
# one        Faker  South Korea  2001    20  male
# two          Uzi        China  2002    20  male
# three       Khan  South Korea  2003    20  male
# four        Ming        China  2004    20  male
# five        MLXG        China  2005    20  male
# six    ClearLove        China  2006    20  male
# Point 12: per the original note, a column selected from a DataFrame is a
# view of the data rather than a copy, so modifying the Series is reflected in
# the DataFrame.
# NOTE(review): this depends on the pandas version -- with Copy-on-Write
# enabled (believed to be the default from pandas 3.0) such a modification no
# longer propagates to the parent frame; confirm against the installed version.
# Point 13: when an independent copy is needed, call the Series' copy()
# method explicitly.
# Point 14: another common input is a dict of dicts.  pandas uses the outer
# keys as column labels and the inner keys as row-index labels.
pop = {
    "XiaoMing": {2001: 2.1, 2002: 2.2},
    "XiaoHong": {2003: 2.3, 2004: 2.4, 2002: 2.0},
}
df_5 = pd.DataFrame(pop)
print(df_5)
# Output (alignment approximate):
#       XiaoMing  XiaoHong
# 2001       2.1       NaN
# 2002       2.2       2.0
# 2003       NaN       2.3
# 2004       NaN       2.4

# Point 15: transposing swaps rows and columns.
print(df_5.T)
# Output (alignment approximate):
#           2001  2002  2003  2004
# XiaoMing   2.1   2.2   NaN   NaN
# XiaoHong   NaN   2.0   2.3   2.4

# Point 16: the keys of the inner dicts are unioned to form the row index
# (the original note says they are also sorted; for this data the result is
# ascending either way).  With an explicit index the rows follow that index
# order instead.
df_6 = pd.DataFrame(pop, index=[2004, 2003, 2001, 2002])
print(df_6)
# Output (alignment approximate):
#       XiaoMing  XiaoHong
# 2004       NaN       2.4
# 2003       NaN       2.3
# 2001       2.1       NaN
# 2002       2.2       2.0

# Point 17: a dict of Series can also be used to build a DataFrame.
# The row labels here are integers, so the positional slices are written with
# .iloc to make explicit that they mean "first two rows" / "all but the last
# row" rather than label lookups.
data_1 = {
    "Sakura": df_5["XiaoMing"].iloc[:2],
    "Saber": df_5["XiaoHong"].iloc[:-1],
}
df_7 = pd.DataFrame(data_1)
print(df_7)
# Output (alignment approximate):
#       Sakura  Saber
# 2001     2.1    NaN
# 2002     2.2    2.0
# 2003     NaN    2.3

# Point 18: when the index and the columns of a DataFrame have a name
# attribute, those names are displayed as well.
df_7.index.name = "year"
df_7.columns.name = "state"
print(df_7)
# Output (alignment approximate):
# state  Sakura  Saber
# year
# 2001      2.1    NaN
# 2002      2.2    2.0
# 2003      NaN    2.3

# Point 19: as with Series, the values attribute returns the data held in the
# DataFrame as a two-dimensional ndarray.
# Point 20: when the columns have different dtypes, the dtype of values is
# chosen automatically so that it can accommodate every column.
print(df_7.values)
# Output:
# [[2.1 nan]
#  [2.2 2. ]
#  [nan 2.3]]
# Part 2 - Pandas-2. Pandas data structures: DataFrame
# (Scraped page footer) Latest recommended article published on 2024-04-28 17:36:58