pandas基础

最新推荐文章于 2024-09-13 08:50:19 发布
迷心兔
最新推荐文章于 2024-09-13 08:50:19 发布
阅读量101
点赞数
原文链接：https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/
版权
Python numpy 专栏收录该内容
12 篇文章 0 订阅
订阅专栏
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Disabled demo: introduction to pandas data structures.
# The whole section is a bare string literal, so nothing in it runs; remove the
# surrounding triple quotes to execute it.
# It builds a Series that contains a NaN, a 6x4 random DataFrame indexed by six
# consecutive dates, a 3x4 DataFrame from np.arange(12), and a DataFrame (df2)
# built from a dict of mixed-type values. It then prints df2's dtypes, index,
# columns, values, describe() summary, transpose and two sorted views.
# NOTE(review): the inline note calls Series a "linked-list" structure; pandas
# documents Series as a one-dimensional labelled array. Likewise, df2.index
# holds the row labels, not column numbers.
"""pandas的数据结构介绍
s = pd.Series([1, 3, 6, np.nan, 44, 1])  # 链表型的数据结构
print(s)

dates = pd.date_range('20160101', periods=6)
df = pd.DataFrame(  # 表格型的数据结构
    np.random.randn(6, 4),  # 表格数据
    index=dates,  # 索引
    columns=['a', 'b', 'c', 'd']  # 列名
)
print(df)
print(df['b'])

df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})

print(df2)
print(df2.dtypes)  # 数据中的类型
print(df2.index)  # 看对列的序号
print(df2.columns)  # 每种数据的名称
print(df2.values)  # 所有df2的值
print(df2.describe())  # 数据的总结
print(df2.T)  # 翻转并查看数据
print(df2.sort_index(axis=1, ascending=False))  # 对索引排序，并让列名升序为False
print(df2.sort_values(by='B'))  # 对列名B进行排序
"""


# Disabled demo: selecting data from a DataFrame.
# The whole section is a bare string literal, so nothing in it runs; remove the
# surrounding triple quotes to execute it.
# It builds a 6x4 integer frame indexed by six consecutive dates and shows
# column access, row slicing, label-based .loc, position-based .iloc, a mixed
# "first N rows + named columns" selection, and boolean filtering.
# The mixed selection used the `.ix` indexer, which was removed in pandas 1.0;
# it is now written with .loc on a positional slice of the index.
"""选择数据,并查看数据
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D']
)

print(df)
print(df['A'])  # 查看A列名（根据列名）
print(df.A)  # 同上

print(df[0:3])  # 查看前三行数据（根据索引）
print(df['20130102':'20130104'])  # 同上

print(df.loc['20130102'])  # 通过索引（索引不能切片）
print(df.loc[:,['A','B']]) # 通过列名（全部索引）
print(df.loc['20130102', ['A', 'B']])  # 通过列名（限定索引）

print(df.iloc[3, 1])  # 通过下标，查看单个值
print(df.iloc[3:5, 1:3])  # 通过下标，查看多个值

print(df.loc[df.index[:3], ['A', 'C']])  # first three rows (by position), columns A and C; replaces the removed .ix indexer
print(df[df.A > 8])  # 筛选出A列大于8的值，并查看数据
"""


# Disabled demo: assigning values into a DataFrame.
# The whole section is a bare string literal, so nothing in it runs; remove the
# surrounding triple quotes to execute it.
# It builds a 6x4 integer frame indexed by six consecutive dates, then sets a
# single cell by position (.iloc) and by label (.loc), zeroes column B where
# column A is greater than 4, adds an all-NaN column F, and adds column E from
# a Series aligned on the same date index.
# The conditional update is written as one .loc[mask, column] assignment: the
# chained form `df.B[mask] = 0` assigns through an intermediate object and is
# not guaranteed to modify df.
"""设置值
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"]
)
print(df)

df.iloc[2, 2] = 1111
print(df)

df.loc['20130101', 'B'] = 2222
print(df)

df.loc[df.A > 4, 'B'] = 0  # one-step .loc assignment instead of chained df.B[mask] = 0
print(df)

df['F'] = np.nan
print(df)

df['E'] = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range('20130101', periods=6)
)
print(df)
"""

# Disabled demo: handling missing data.
# The whole section is a bare string literal, so nothing in it runs; remove the
# surrounding triple quotes to execute it.
# It builds a 6x4 integer frame indexed by six consecutive dates, puts NaN into
# two cells, and then shows dropna (drop rows containing any NaN), fillna
# (replace NaN with 0) and isnull (boolean mask of missing cells).
# Columns B and C are cast to float before the NaN assignments: NaN cannot be
# stored in an integer column, and relying on the implicit int -> float upcast
# during item assignment is deprecated in recent pandas releases.
""" 处理丢失数据
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D']
)
df = df.astype({'B': 'float64', 'C': 'float64'})  # make the target columns float so they can hold NaN
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
df2 = df.dropna(  # 直接去掉有 NaN 的行或列
    axis=0,     # 0: 对行进行操作; 1: 对列进行操作
    how='any'   # 'any': 只要存在 NaN 就 drop 掉; 'all': 必须全部是 NaN 才 drop
)
print(df2)
df3 = df.fillna(value=0)  # 将 NaN 的值用其他值代替, 比如代替成 0
print(df3)

df4 = df.isnull()  # 判断是否有缺失数据 NaN, 为 True 表示缺失数据
print(df4)
"""


# Disabled demo: import / export.
# The whole section is a bare string literal, so nothing in it runs; remove the
# surrounding triple quotes to execute it.
# It reads 'data.csv' (path relative to the working directory) with
# pd.read_csv, prints the resulting frame, and writes it to 'student.pickle'
# with DataFrame.to_pickle.
"""导入导出
# 读取csv
data = pd.read_csv('data.csv')

# 打印出data
print(data)

data.to_pickle('student.pickle')  # 将资料存取成pickle
"""

# Disabled demo: combining frames with pd.concat.
# The whole section is a bare string literal, so nothing in it runs; remove the
# surrounding triple quotes to execute it.
# It stacks three 3x4 frames vertically with a fresh index, appends one frame,
# several frames, and a Series (as a single row) below df1, and keeps a
# commented-out sub-section on join='outer' / join='inner' and on horizontal
# concatenation aligned to df1's index.
# DataFrame.append was removed in pandas 2.0, so the "append" examples use
# pd.concat; the join_axes argument of pd.concat was removed in pandas 1.0, so
# the index-aligned example reindexes the concatenated result instead.
"""合并 concat
# 定义资料集
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# concat纵向合并
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
'''
# 将df2合并到df1的下面，以及重置index，并打印出结果
res = pd.concat([df1, df2], ignore_index=True)  # replaces df1.append(df2, ...), removed in pandas 2.0
print(res)
# 合并多个df，将df2与df3合并至df1的下面，以及重置index，并打印出结果
res = pd.concat([df1, df2, df3], ignore_index=True)  # replaces df1.append([df2, df3], ...)
print(res)
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# 合并series，将s1合并至df1，以及重置index，并打印出结果
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)  # Series -> one-row frame, then concat
print(res)


# join (合并方式)----------------------------------------------------------------
# df1 = pd.DataFrame(
#     np.ones((3, 4)) * 0,
#     columns=['a', 'b', 'c', 'd'],
#     index=[1, 2, 3])
# df2 = pd.DataFrame(
#     np.ones((3, 4)) * 1,
#     columns=['b', 'c', 'd', 'e'],
#     index=[2, 3, 4])

# 纵向"外"合并df1与df2
# res = pd.concat([df1, df2], axis=0, join='outer', sort=True, ignore_index=True)
# print(res)
# 纵向"内"合并df1与df2
# res = pd.concat([df1, df2], axis=0, join='inner', sort=True, ignore_index=True)
# print(res)


# res = pd.concat([df1, df2], axis=1)
# print(res)
# 依照`df1.index`进行横向合并
# res = pd.concat([df1, df2], axis=1).reindex(df1.index)  # replaces join_axes=[df1.index], removed in pandas 1.0
# print(res)
"""

# Disabled demo: joining frames with pd.merge.
# The whole section is a bare string literal, so nothing in it runs; remove the
# surrounding triple quotes to execute it. The expected result of each call is
# recorded in a nested '''...''' block right after it.
# Covered cases, in order:
#   * merge on the single column 'key';
#   * merge on ['key', 'key2'] with how='inner', 'outer', 'left' and 'right';
#   * indicator=True and a custom indicator column name;
#   * merge on the index of both frames (left_index / right_index) with
#     how='outer' and how='inner';
#   * suffixes to disambiguate the overlapping 'age' column.
# Several print calls inside the section are themselves commented out.
# NOTE(review): the recorded outputs were presumably captured on an older
# pandas release; the row order of the outer merges may differ on newer
# versions -- verify before relying on them.
"""合并 merge
# 定义资料集并打印出
left = pd.DataFrame({'key': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
'''
  key key2   A   B
0  K0   K0  A0  B0
1  K0   K1  A1  B1
2  K1   K0  A2  B2
3  K2   K1  A3  B3
'''
print(right)
'''
  key key2   C   D
0  K0   K0  C0  D0
1  K1   K0  C1  D1
2  K1   K0  C2  D2
3  K2   K0  C3  D3
'''
# 依据key column合并，并打印出
res = pd.merge(left, right, on='key')  # 依据key合并
# print(res)
'''
  key key2_x   A   B key2_y   C   D
0  K0     K0  A0  B0     K0  C0  D0
1  K0     K1  A1  B1     K0  C0  D0
2  K1     K0  A2  B2     K0  C1  D1
3  K1     K0  A2  B2     K0  C2  D2
4  K2     K1  A3  B3     K0  C3  D3
'''
# 依据key1与key2 columns进行合并，并打印出四种结果['left', 'right', 'outer', 'inner']
res = pd.merge(left, right, on=['key', 'key2'], how='inner')  # 依据两组key合并
# print(res)
''' 
  key key2   A   B   C   D
0  K0   K0  A0  B0  C0  D0
1  K1   K0  A2  B2  C1  D1
2  K1   K0  A2  B2  C2  D2
'''
res = pd.merge(left, right, on=['key', 'key2'], how='outer')
# print(res)
'''
  key key2    A    B    C    D
0  K0   K0   A0   B0   C0   D0
1  K0   K1   A1   B1  NaN  NaN
2  K1   K0   A2   B2   C1   D1
3  K1   K0   A2   B2   C2   D2
4  K2   K1   A3   B3  NaN  NaN
5  K2   K0  NaN  NaN   C3   D3
'''
res = pd.merge(left, right, on=['key', 'key2'], how='left')
# print(res)
'''
  key key2   A   B    C    D
0  K0   K0  A0  B0   C0   D0
1  K0   K1  A1  B1  NaN  NaN
2  K1   K0  A2  B2   C1   D1
3  K1   K0  A2  B2   C2   D2
4  K2   K1  A3  B3  NaN  NaN
'''
res = pd.merge(left, right, on=['key', 'key2'], how='right')
print(res)
'''
  key key2    A    B   C   D
0  K0   K0   A0   B0  C0  D0
1  K1   K0   A2   B2  C1  D1
2  K1   K0   A2   B2  C2  D2
3  K2   K0  NaN  NaN  C3  D3
'''

# 定义资料集并打印出(Indicator参数)
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
# print(df1)
'''
   col1 col_left
0     0        a
1     1        b
'''
# print(df2)
'''
   col1  col_right
0     1          2
1     2          2
2     2          2
'''
# 依据col1进行合并，并启用indicator=True，最后打印出
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# print(res)
'''
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only
'''
# 自定indicator column的名称，并打印出
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
# print(res)
'''
   col1 col_left  col_right indicator_column
0     0        a        NaN        left_only
1     1        b        2.0             both
2     2      NaN        2.0       right_only
3     2      NaN        2.0       right_only
'''

# 定义资料集并打印出（依据index合并）
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
'''
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
'''
print(right)
'''
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
'''
# 依据左右资料集的index进行合并，how='outer',并打印出
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
'''
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
'''
# 依据左右资料集的index进行合并，how='inner',并打印出
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
'''
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
'''
# 定义资料集
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

# 使用suffixes解决overlapping的问题
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
'''
    k  age_boy  age_girl
0  K0        1         4
1  K0        1         5
'''
"""
# 随机生成1000个数据
# data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# 为了方便观看效果, 我们累加这个数据
# data = data.cumsum()
# pandas 数据可以直接观看其可视化形式
# data.plot()
# plt.show()

# data = pd.DataFrame(
#     np.random.randn(1000,4),
#     index=np.arange(1000),
#     columns=list("ABCD")
#     )
# data = data.cumsum()
# data.plot()
# plt.show()