import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def demo_data_structures():
    """Introduce the two core pandas containers: Series and DataFrame.

    Builds one Series and three DataFrames, printing each object together
    with a few of its attributes and derived views.

    Returns:
        tuple: ``(s, df, df1, df2)`` -- the Series and the three DataFrames
        created by the examples, in the order they are built.
    """
    # A Series is a one-dimensional labelled array; np.nan marks a missing value.
    s = pd.Series([1, 3, 6, np.nan, 44, 1])
    print(s)

    # A DataFrame is a two-dimensional labelled table.
    dates = pd.date_range('20160101', periods=6)
    df = pd.DataFrame(
        np.random.randn(6, 4),         # table data
        index=dates,                   # row labels
        columns=['a', 'b', 'c', 'd'],  # column names
    )
    print(df)
    print(df['b'])

    # No explicit labels are given here, so pandas supplies default ones.
    df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
    print(df1)

    # Building from a dict: each key becomes a column; scalar values are
    # repeated for every row.
    df2 = pd.DataFrame({
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    })
    print(df2)
    print(df2.dtypes)      # dtype of every column
    print(df2.index)       # row labels
    print(df2.columns)     # column names
    print(df2.values)      # all values as a NumPy array
    print(df2.describe())  # summary statistics
    print(df2.T)           # transposed view
    print(df2.sort_index(axis=1, ascending=False))  # columns sorted by name, descending
    print(df2.sort_values(by='B'))                  # rows sorted by the values in column B
    return s, df, df1, df2
def demo_selection():
    """Show the common ways to select rows, columns and cells of a DataFrame.

    Demonstrates plain indexing, label-based ``loc``, position-based ``iloc``
    and boolean-mask selection on a 6x4 frame indexed by dates, printing
    every result.

    Returns:
        tuple: ``(df, head_ac, filtered)`` -- the example frame, its first
        three rows restricted to columns A and C, and the rows where A > 8.
    """
    dates = pd.date_range('20130101', periods=6)
    df = pd.DataFrame(
        np.arange(24).reshape((6, 4)),
        index=dates,
        columns=['A', 'B', 'C', 'D'],
    )
    print(df)

    print(df['A'])  # column A, selected by name
    print(df.A)     # the same column via attribute access
    print(df[0:3])  # first three rows, sliced by position
    # Rows sliced by label: unlike the positional slice above, both end
    # labels are included, so this is 2013-01-02 through 2013-01-04.
    print(df['20130102':'20130104'])

    print(df.loc['20130102'])              # one row, selected by its label
    print(df.loc[:, ['A', 'B']])           # every row, columns A and B
    print(df.loc['20130102', ['A', 'B']])  # one row, columns A and B

    print(df.iloc[3, 1])      # a single value, by position
    print(df.iloc[3:5, 1:3])  # a block of values, by position

    # The original used df.ix[:3, ['A', 'C']]; .ix was removed in pandas 1.0.
    # Take the rows by position first, then the columns by name.
    head_ac = df.iloc[:3][['A', 'C']]
    print(head_ac)

    # Boolean mask: keep only the rows whose A value is greater than 8.
    filtered = df[df.A > 8]
    print(filtered)
    return df, head_ac, filtered
def demo_setting_values():
    """Show how to assign values into an existing DataFrame.

    Covers assignment by position, by label, by boolean condition, and
    adding whole new columns. The frame is printed after every step.

    Returns:
        pandas.DataFrame: the frame after all assignments, with columns
        ``A, B, C, D, F, E``.
    """
    dates = pd.date_range("20130101", periods=6)
    df = pd.DataFrame(
        np.arange(24).reshape((6, 4)),
        index=dates,
        columns=["A", "B", "C", "D"],
    )
    print(df)

    # Assign a single cell by position (third row, third column).
    df.iloc[2, 2] = 1111
    print(df)

    # Assign a single cell by row label and column name.
    df.loc['20130101', 'B'] = 2222
    print(df)

    # Conditional assignment. The original wrote `df.B[df.A > 4] = 0`, which
    # is chained assignment: it may modify a temporary copy instead of `df`.
    # A single .loc call always writes into `df` itself.
    df.loc[df.A > 4, 'B'] = 0
    print(df)

    # Add a new column filled with a scalar.
    df['F'] = np.nan
    print(df)

    # Add a new column from a Series; values are aligned on the index.
    df['E'] = pd.Series(
        [1, 2, 3, 4, 5, 6],
        index=pd.date_range('20130101', periods=6),
    )
    print(df)
    return df
def demo_missing_data():
    """Show the basic tools for handling missing values (NaN).

    Plants two NaN values in a 6x4 frame, then demonstrates ``dropna``,
    ``fillna`` and ``isnull``, printing each result.

    Returns:
        tuple: ``(df, df2, df3, df4)`` -- the frame containing NaN, the frame
        with incomplete rows dropped, the frame with NaN replaced by 0, and
        the boolean mask of missing cells.
    """
    dates = pd.date_range('20130101', periods=6)
    df = pd.DataFrame(
        np.arange(24).reshape((6, 4)),
        index=dates,
        columns=['A', 'B', 'C', 'D'],
    )
    # Integer columns cannot hold NaN. Cast the two columns that receive a
    # missing value to float explicitly instead of relying on silent
    # upcasting during assignment (deprecated since pandas 2.1).
    df = df.astype({'B': 'float64', 'C': 'float64'})
    df.iloc[0, 1] = np.nan  # first row, column B
    df.iloc[1, 2] = np.nan  # second row, column C
    print(df)

    # Drop every row or column that contains NaN.
    df2 = df.dropna(
        axis=0,     # 0: operate on rows; 1: operate on columns
        how='any',  # 'any': drop if any NaN is present; 'all': only if all values are NaN
    )
    print(df2)

    # Replace NaN with another value, here 0.
    df3 = df.fillna(value=0)
    print(df3)

    # Boolean mask of the same shape: True where a value is missing.
    df4 = df.isnull()
    print(df4)
    return df, df2, df3, df4
def demo_io(csv_path='data.csv', pickle_path='student.pickle'):
    """Read a CSV file, print it, and save the data as a pickle file.

    Args:
        csv_path: Path of the CSV file to read. Defaults to ``'data.csv'``,
            the path used by the original example.
        pickle_path: Path of the pickle file to write. Defaults to
            ``'student.pickle'``, the path used by the original example.

    Returns:
        pandas.DataFrame: the data read from ``csv_path``.
    """
    # Read the CSV file.
    data = pd.read_csv(csv_path)
    # Print the loaded data.
    print(data)
    # Store the data as a pickle file.
    data.to_pickle(pickle_path)
    return data
def demo_concat():
    """Stack DataFrames (and a Series) vertically with ``pd.concat``.

    The original examples used ``DataFrame.append``, which was removed in
    pandas 2.0; the equivalent ``pd.concat`` calls are used instead.

    Returns:
        tuple: ``(stacked, two, three, with_series)`` -- df1/df2/df3 stacked,
        df2 added below df1, df2 and df3 added below df1, and the Series s1
        added below df1 as one extra row. All have a fresh 0..n-1 index.
    """
    # Define the data sets.
    df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
    df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
    df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

    # Vertical concatenation with the index renumbered.
    stacked = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
    print(stacked)
    # Expected output:
    #      a    b    c    d
    # 0  0.0  0.0  0.0  0.0
    # 1  0.0  0.0  0.0  0.0
    # 2  0.0  0.0  0.0  0.0
    # 3  1.0  1.0  1.0  1.0
    # 4  1.0  1.0  1.0  1.0
    # 5  1.0  1.0  1.0  1.0
    # 6  2.0  2.0  2.0  2.0
    # 7  2.0  2.0  2.0  2.0
    # 8  2.0  2.0  2.0  2.0

    # Put df2 below df1 and reset the index.
    two = pd.concat([df1, df2], ignore_index=True)
    print(two)

    # Put df2 and df3 below df1 and reset the index.
    three = pd.concat([df1, df2, df3], ignore_index=True)
    print(three)

    # Add a Series below df1 as a new row: turn it into a one-row frame
    # first so that its labels line up with df1's columns.
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    with_series = pd.concat([df1, s1.to_frame().T], ignore_index=True)
    print(with_series)
    return stacked, two, three, with_series


def demo_concat_join():
    """Show how ``pd.concat`` treats frames whose labels only partly overlap.

    Returns:
        tuple: ``(outer, inner, side, aligned)`` -- vertical outer join,
        vertical inner join, horizontal concatenation, and horizontal
        concatenation restricted to df1's row labels.
    """
    df1 = pd.DataFrame(
        np.ones((3, 4)) * 0,
        columns=['a', 'b', 'c', 'd'],
        index=[1, 2, 3],
    )
    df2 = pd.DataFrame(
        np.ones((3, 4)) * 1,
        columns=['b', 'c', 'd', 'e'],
        index=[2, 3, 4],
    )

    # Vertical "outer" join: keep the union of the columns, NaN where missing.
    outer = pd.concat([df1, df2], axis=0, join='outer', sort=True, ignore_index=True)
    print(outer)

    # Vertical "inner" join: keep only the columns both frames share.
    inner = pd.concat([df1, df2], axis=0, join='inner', sort=True, ignore_index=True)
    print(inner)

    # Horizontal concatenation: rows are matched on their index labels.
    side = pd.concat([df1, df2], axis=1)
    print(side)

    # Horizontal concatenation keeping only df1's row labels. The original
    # used concat(..., join_axes=[df1.index]); join_axes was removed in
    # pandas 1.0 and reindexing the result is the replacement.
    aligned = pd.concat([df1, df2], axis=1).reindex(df1.index)
    print(aligned)
    return outer, inner, side, aligned
def demo_merge():
    """Walk through ``pd.merge``: key columns, join types, indicator, index joins, suffixes.

    Every input frame and every merge result is printed.

    Returns:
        dict: merge results keyed by example name -- ``'on_key'``,
        ``'inner'``, ``'outer'``, ``'left'``, ``'right'``, ``'indicator'``,
        ``'indicator_named'``, ``'index_outer'``, ``'index_inner'`` and
        ``'suffixes'``.
    """
    results = {}

    # --- Merging on key columns -------------------------------------------
    left = pd.DataFrame({'key': ['K0', 'K0', 'K1', 'K2'],
                         'key2': ['K0', 'K1', 'K0', 'K1'],
                         'A': ['A0', 'A1', 'A2', 'A3'],
                         'B': ['B0', 'B1', 'B2', 'B3']})
    right = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K2'],
                          'key2': ['K0', 'K0', 'K0', 'K0'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})
    print(left)
    #   key key2   A   B
    # 0  K0   K0  A0  B0
    # 1  K0   K1  A1  B1
    # 2  K1   K0  A2  B2
    # 3  K2   K1  A3  B3
    print(right)
    #   key key2   C   D
    # 0  K0   K0  C0  D0
    # 1  K1   K0  C1  D1
    # 2  K1   K0  C2  D2
    # 3  K2   K0  C3  D3

    # Merge on the single column 'key'. The non-key column 'key2' exists in
    # both frames, so it gets the default _x / _y suffixes.
    results['on_key'] = pd.merge(left, right, on='key')
    print(results['on_key'])
    #   key key2_x   A   B key2_y   C   D
    # 0  K0     K0  A0  B0     K0  C0  D0
    # 1  K0     K1  A1  B1     K0  C0  D0
    # 2  K1     K0  A2  B2     K0  C1  D1
    # 3  K1     K0  A2  B2     K0  C2  D2
    # 4  K2     K1  A3  B3     K0  C3  D3

    # Merge on both 'key' and 'key2' with each of the four join types.
    # The tables below list the expected rows; the row order of a result
    # may differ between pandas versions.
    results['inner'] = pd.merge(left, right, on=['key', 'key2'], how='inner')
    print(results['inner'])
    #   key key2   A   B   C   D
    # 0  K0   K0  A0  B0  C0  D0
    # 1  K1   K0  A2  B2  C1  D1
    # 2  K1   K0  A2  B2  C2  D2

    results['outer'] = pd.merge(left, right, on=['key', 'key2'], how='outer')
    print(results['outer'])
    #   key key2    A    B    C    D
    # 0  K0   K0   A0   B0   C0   D0
    # 1  K0   K1   A1   B1  NaN  NaN
    # 2  K1   K0   A2   B2   C1   D1
    # 3  K1   K0   A2   B2   C2   D2
    # 4  K2   K1   A3   B3  NaN  NaN
    # 5  K2   K0  NaN  NaN   C3   D3

    results['left'] = pd.merge(left, right, on=['key', 'key2'], how='left')
    print(results['left'])
    #   key key2   A   B    C    D
    # 0  K0   K0  A0  B0   C0   D0
    # 1  K0   K1  A1  B1  NaN  NaN
    # 2  K1   K0  A2  B2   C1   D1
    # 3  K1   K0  A2  B2   C2   D2
    # 4  K2   K1  A3  B3  NaN  NaN

    results['right'] = pd.merge(left, right, on=['key', 'key2'], how='right')
    print(results['right'])
    #   key key2    A    B   C   D
    # 0  K0   K0   A0   B0  C0  D0
    # 1  K1   K0   A2   B2  C1  D1
    # 2  K1   K0   A2   B2  C2  D2
    # 3  K2   K0  NaN  NaN  C3  D3

    # --- The indicator parameter ------------------------------------------
    df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
    df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
    print(df1)
    #    col1 col_left
    # 0     0        a
    # 1     1        b
    print(df2)
    #    col1  col_right
    # 0     1          2
    # 1     2          2
    # 2     2          2

    # indicator=True adds a '_merge' column naming the source of each row.
    results['indicator'] = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
    print(results['indicator'])
    #    col1 col_left  col_right      _merge
    # 0     0        a        NaN   left_only
    # 1     1        b        2.0        both
    # 2     2      NaN        2.0  right_only
    # 3     2      NaN        2.0  right_only

    # A string value for indicator chooses the name of that column.
    results['indicator_named'] = pd.merge(
        df1, df2, on='col1', how='outer', indicator='indicator_column')
    print(results['indicator_named'])
    #    col1 col_left  col_right indicator_column
    # 0     0        a        NaN        left_only
    # 1     1        b        2.0             both
    # 2     2      NaN        2.0       right_only
    # 3     2      NaN        2.0       right_only

    # --- Merging on the index ---------------------------------------------
    left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                         'B': ['B0', 'B1', 'B2']},
                        index=['K0', 'K1', 'K2'])
    right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                          'D': ['D0', 'D2', 'D3']},
                         index=['K0', 'K2', 'K3'])
    print(left)
    #      A   B
    # K0  A0  B0
    # K1  A1  B1
    # K2  A2  B2
    print(right)
    #      C   D
    # K0  C0  D0
    # K2  C2  D2
    # K3  C3  D3

    # Join on the index of both frames, how='outer'.
    results['index_outer'] = pd.merge(
        left, right, left_index=True, right_index=True, how='outer')
    print(results['index_outer'])
    #       A    B    C    D
    # K0   A0   B0   C0   D0
    # K1   A1   B1  NaN  NaN
    # K2   A2   B2   C2   D2
    # K3  NaN  NaN   C3   D3

    # Join on the index of both frames, how='inner'.
    results['index_inner'] = pd.merge(
        left, right, left_index=True, right_index=True, how='inner')
    print(results['index_inner'])
    #      A   B   C   D
    # K0  A0  B0  C0  D0
    # K2  A2  B2  C2  D2

    # --- Resolving overlapping column names with suffixes -----------------
    boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
    girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
    # Both frames have an 'age' column; suffixes tell the two apart.
    results['suffixes'] = pd.merge(
        boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner')
    print(results['suffixes'])
    #     k  age_boy  age_girl
    # 0  K0        1         4
    # 1  K0        1         5
    return results
# 随机生成1000个数据
# data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# 为了方便观看效果, 我们累加这个数据
# data = data.cumsum()  # cumsum() returns a new object; assign it so the plot shows the cumulative data
# pandas 数据可以直接观看其可视化形式
# data.plot()
# plt.show()
# data = pd.DataFrame(
# np.random.randn(1000,4),
# index=np.arange(1000),
# columns=list("ABCD")
# )
# data = data.cumsum()  # cumsum() returns a new object; assign it so the plot shows the cumulative data
# data.plot()
# plt.show()
# pandas basics
# (web page residue) Latest recommended article published 2024-09-13 08:50:19