Pandas
数据合并
- join:默认情况下它按行索引对齐,把行索引相同的数据合并到一起(默认是左连接 how="left":保留左表的全部行,右表中没有对应行索引的位置用 NaN 填充)
import pandas as pd
# Demo: DataFrame.join lines two frames up on their row index.
# t1 carries rows A, B, C while t2 only carries rows A, B.
data = {
    'M': pd.Series([1] * 3),
    'N': pd.Series([1] * 3),
    'O': pd.Series(list('abc')),
    'P': pd.Series([1] * 3),
}
t1 = pd.DataFrame(data)
t1.index = list('ABC')

data1 = {
    'V': pd.Series([0] * 2),
    'W': pd.Series([0] * 2),
    'X': pd.Series(list('cd')),
    'Y': pd.Series([0] * 2),
    'Z': pd.Series([0] * 2),
}
t2 = pd.DataFrame(data1)
t2.index = list('AB')

# Show both inputs, then the joined frame; a divider line follows each one.
# join() keeps every row of t1, so row C gets NaN in the columns from t2.
for _frame in (t1, t2, t1.join(t2)):
    print(_frame)
    print("-----------------------------")
- merge
import pandas as pd
# Demo: DataFrame.merge matches rows by column values (t1.O against t2.X)
# and shows how the `how` argument changes which rows survive.
data = {
    'M': pd.Series([1] * 3),
    'N': pd.Series([1] * 3),
    'O': pd.Series(list('abc')),
    'P': pd.Series([1] * 3),
}
t1 = pd.DataFrame(data)
t1.index = list('ABC')

data1 = {
    'V': pd.Series([0] * 2),
    'W': pd.Series([0] * 2),
    'X': pd.Series(list('cd')),
    'Y': pd.Series([0] * 2),
    'Z': pd.Series([0] * 2),
}
t2 = pd.DataFrame(data1)
t2.index = list('AB')

# Print the two input frames first, each followed by a divider line.
for _frame in (t1, t2):
    print(_frame)
    print("-----------------------------")

# One merge per join type, in this order:
#   inner - the default when `how` is omitted; only keys present in both frames
#   outer - every row from both frames
#   left  - every row of the left frame (t1)
#   right - every row of the right frame (t2)
for _how in ("inner", "outer", "left", "right"):
    print(t1.merge(t2, left_on="O", right_on="X", how=_how))
    print("-----------------------------")
分组和聚合
# Grouping
# NOTE(review): `df` is not defined in this snippet, and "columns_name" looks
# like a placeholder for a real column label -- confirm before running.
# The object returned by groupby() is not bound to a name here.
df.groupby(by="columns_name")
# Aggregation
# NOTE(review): the calls below are made on `df` itself, not on the grouped
# object from the line above; presumably the same methods are meant to be
# applied to the GroupBy result -- confirm the intent.
df.count()  # number of non-NA values
df.sum()  # sum
df.mean()  # arithmetic mean
df.median()  # median
df.min()  # minimum
df.max()  # maximum
df.std()  # standard deviation (pandas defaults to the sample form, ddof=1)
df.var()  # variance (pandas defaults to the sample form, ddof=1)
索引和复合索引
- 索引
  - 获取 index:df.index
  - 指定 index:df.index = ['x','y']
  - 重新设置 index:df.reindex(list("abcedf"))(按新的标签重新取行,原索引中不存在的标签对应的行会填充 NaN,并不是给原有的行改名)
  - 指定某一列作为 index:df.set_index("Country",drop=False)
  - 返回 index 的唯一值:df.set_index("Country").index.unique()
- 复合索引
import pandas as pd
# Demo: build a two-level (compound) row index from columns 'c' and 'd',
# then look up a single value by giving one label per level.
data = {
    'a': pd.Series(list(range(7))),
    'b': pd.Series(list(range(7, 0, -1))),
    'c': pd.Series(['one'] * 3 + ['two'] * 4),
    'd': pd.Series(list('hjklmno')),
}
t1 = pd.DataFrame(data)
print(t1)

# Move 'c' and 'd' into the index, then keep only column 'a' as a Series.
_indexed = t1.set_index(['c', 'd'])
X = _indexed['a']
print(X)

# A tuple key addresses both index levels at once: outer 'one', inner 'h'.
print(X[('one', 'h')])
   a  b    c  d
0  0  7  one  h
1  1  6  one  j
2  2  5  one  k
3  3  4  two  l
4  4  3  two  m
5  5  2  two  n
6  6  1  two  o
c    d
one  h    0
     j    1
     k    2
two  l    3
     m    4
     n    5
     o    6
Name: a, dtype: int64
0
Process finished with exit code 0