learn pandas ---学习笔记3

最新推荐文章于 2024-05-06 21:40:02 发布

Grace_yanyanyan

最新推荐文章于 2024-05-06 21:40:02 发布

阅读量142

点赞数

分类专栏： learn numpy & pandas 文章标签： numpy pandas python

原文链接：https://study.163.com/course/courseMain.htm?courseId=1003240004

版权

learn numpy & pandas 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

网易云课堂：
https://study.163.com/course/courseMain.htm?courseId=1003240004
莫烦主页：
https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/



import pandas as pd
import numpy as np

# Row labels: six consecutive calendar days beginning 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
# 6x4 grid holding 0..23 in row-major order, one column per letter A-D.
df = pd.DataFrame(
    data=np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
# print(df)
'''
             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''

# df.iloc[2,2] = 1111
# df.loc['20130101','B'] = 2222
# print(df)
'''
             A     B     C   D
2013-01-01   0  2222     2   3
2013-01-02   4     5     6   7
2013-01-03   8     9  1111  11
2013-01-04  12    13    14  15
2013-01-05  16    17    18  19
2013-01-06  20    21    22  23
'''

# df.loc[df.A>4, 'B'] = 0   # 原写法 df.B[df.A>4] = 0 属于链式赋值，可能触发 SettingWithCopyWarning，在启用 Copy-on-Write 后不会修改 df
'''
             A  B   C   D
2013-01-01   0  1   2   3
2013-01-02   4  5   6   7
2013-01-03   8  0  10  11
2013-01-04  12  0  14  15
2013-01-05  16  0  18  19
2013-01-06  20  0  22  23
'''
# df.loc[df.A>4, 'A'] = 0   # 原写法 df.A[df.A>4] = 0 属于链式赋值，推荐改用 .loc
'''
            A   B   C   D
2013-01-01  0   1   2   3
2013-01-02  4   5   6   7
2013-01-03  0   9  10  11
2013-01-04  0  13  14  15
2013-01-05  0  17  18  19
2013-01-06  0  21  22  23
'''
# df[df.A>4] = 0  
'''
            A  B  C  D
2013-01-01  0  1  2  3
2013-01-02  4  5  6  7
2013-01-03  0  0  0  0
2013-01-04  0  0  0  0
2013-01-05  0  0  0  0
2013-01-06  0  0  0  0
'''
# df['F'] = np.nan
'''
             A   B   C   D   F
2013-01-01   0   1   2   3 NaN
2013-01-02   4   5   6   7 NaN
2013-01-03   8   9  10  11 NaN
2013-01-04  12  13  14  15 NaN
2013-01-05  16  17  18  19 NaN
2013-01-06  20  21  22  23 NaN
'''
# df['E'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130101',periods=6)) 
#用上面的方法也可以加上 Series 序列（赋值时按 index 对齐，而不是按长度；index 对不上的行会填入 NaN）
'''
             A   B   C   D  E
2013-01-01   0   1   2   3  1
2013-01-02   4   5   6   7  2
2013-01-03   8   9  10  11  3
2013-01-04  12  13  14  15  4
2013-01-05  16  17  18  19  5
2013-01-06  20  21  22  23  6
'''



# Show the frame; every mutation example above is commented out, so this prints the untouched 0..23 grid.
print(df)



import pandas as pd
import numpy as np

# Demo frame for the missing-data examples: 0..23 under columns A-D, indexed by six days.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates, columns=['A','B','C','D'])
# NaN is a float, so B and C are converted to float64 explicitly before the holes
# are written. Relying on the implicit int64 -> float64 upcast during assignment
# emits a FutureWarning on pandas 2.1+ and is slated to become an error.
# A and D stay integer, matching the printed output shown below.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0,1] = np.nan   # row 2013-01-01, column B
df.iloc[1,2] = np.nan   # row 2013-01-02, column C
# print(df)
'''
             A     B     C   D
2013-01-01   0   NaN   2.0   3
2013-01-02   4   5.0   NaN   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
'''
#如果想直接去掉有 NaN 的行或列, 可以使用 dropna
# df.dropna(axis=0,how='any') 
# 0: 对行进行操作; 1: 对列进行操作
# 'any': 只要存在 NaN 就 drop 掉; 'all': 必须全部是 NaN 才 drop 
'''
             A     B     C   D
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
''' 

# df.fillna(value=0)   #将 NaN 的值用其他值代替, 比如代替成 0:
# print(df.isnull())   #判断是否有缺失数据 NaN, 为 True 表示缺失数据
'''
                A      B      C      D
2013-01-01  False   True  False  False
2013-01-02  False  False   True  False
2013-01-03  False  False  False  False
2013-01-04  False  False  False  False
2013-01-05  False  False  False  False
2013-01-06  False  False  False  False
'''
# 如果这个表很大，看不过来，可以用下面这句：
# print(df.isnull().values.any())    #True（原写法 np.any(df.isnull()) == True 里的 "== True" 是多余的）



# Show the frame with its two NaN holes; the dropna/fillna examples above are commented out and do not alter it.
print(df)


#pandas可以读取与存取的资料格式有很多种，像csv、excel、json、html与pickle等…， 详细请看官方说明文件

#范例下载：https://github.com/MorvanZhou/tutorials/blob/master/numpy%26pandas/15_read_to/student.csv

import pandas as pd #加载模块

# Parse the CSV file found at the relative path 'student.csv'.
data = pd.read_csv(filepath_or_buffer='student.csv')

# Echo the parsed table to stdout.
print(data)

# Save the same table in pickle format under the relative path 'student.pickle'.
data.to_pickle(path='student.pickle')



import pandas as pd
import numpy as np

# Three 3x4 float frames sharing columns a-d, filled with 0, 1 and 2 respectively.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
'''
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
'''
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
'''
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))
'''
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
'''
# Vertical concat: the frames are stacked top to bottom (axis=1 / 'columns' would
# place them side by side instead).
res = pd.concat([df1, df2, df3], axis='index')
# axis=0 ('index') is the default, so leaving the argument out gives the same result.
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
'''
# The stacked index repeats 0, 1, 2 three times. The second example rebuilds it:
# same call as above, with ignore_index set to True.
res = pd.concat([df1, df2, df3], ignore_index=True, axis='index')
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
'''

# join: how the labels of the other axis are combined
# The default is join='outer'. For a vertical concat the frames are matched by
# column name: shared columns are stacked on top of each other, columns that
# exist in only one frame get a column of their own, and cells with no source
# value are filled with NaN.
# Define the data sets
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
'''
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
'''
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
'''
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
'''
# Vertical "outer" concat of df1 and df2
# res = pd.concat([df1, df2])   # same thing: axis=0 and join='outer' are the defaults
res = pd.concat([df1, df2], axis=0, join='outer')
'''
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
'''

# Same inputs with join='inner': only the columns present in both frames are
# kept, the rest are dropped.
# Vertical "inner" concat of df1 and df2
res = pd.concat([df1, df2], axis=0, join='inner')
'''
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
'''
# Same again, with the index rebuilt as 0..5
res = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
'''
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0
'''

# Aligning on one frame's index (what the old join_axes argument did)

# Horizontal concat that keeps only df1's row labels. The `join_axes` keyword
# was deprecated in pandas 0.25 and removed in 1.0, so passing it raises a
# TypeError today; reindexing the concatenated result on df1.index produces the
# table shown below.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
'''
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
'''
# Without the reindex step the row labels of both df1 and df2 are kept
res = pd.concat([df1, df2], axis=1)
'''
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
'''

# Appending rows
# Appending only ever stacks vertically; there is no horizontal form.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# example uses pd.concat, which gives the same result.
# Define the data sets
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
'''
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
'''
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
'''
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
'''
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
'''

# Put df2 underneath df1 and rebuild the index as 0..5
res = pd.concat([df1, df2], ignore_index=True)   # append a single frame
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
'''
#合并多个df，将df2与df3合并至df1的下面，以及重置index，并打印出结果
# res = pd.concat([df1, df2, df3], ignore_index=True)   #一次追加两个（DataFrame.append 已在 pandas 2.0 中移除，原写法为 df1.append([df2, df3], ignore_index=True)）
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  1.0  1.0  1.0  1.0
7  1.0  1.0  1.0  1.0
8  1.0  1.0  1.0  1.0
'''
# A fresh 3x4 frame of zeros, plus a four-element Series whose labels match the frame's columns.
df1 = pd.DataFrame(data=np.zeros((3, 4)), columns=list('abcd'))
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
'''
s1 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
'''
a    1
b    2
c    3
d    4
dtype: int64
'''
#合并series，将s1合并至df1，以及重置index，并打印出结果
# res = pd.concat([df1, s1.to_frame().T], ignore_index=True)  #只添加一行数据（DataFrame.append 已在 pandas 2.0 中移除，原写法为 df1.append(s1, ignore_index=True)）
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0
'''


# print(res)



#pandas中的merge和concat类似,但主要是用于两组有key column的数据,统一索引的数据. 通常也被用在Database的处理当中.

#依据一组key合并
import pandas as pd

# Two four-row tables sharing a 'key' column with values K0..K3; `left` carries
# columns A and B, `right` carries C and D.
left = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
right = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})

# print(left)
'''
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
'''
# print(right)
'''
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
'''

#依据key column合并，并打印出
# res = pd.merge(left, right, on='key')
# print(res)
'''
 key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
'''

#依据两组key合并

#合并时有4种方法how = ['left', 'right', 'outer', 'inner']，预设值how='inner'
#定义资料集并打印出
# Inputs for the two-key merge examples: both tables carry key1 and key2, and
# only some (key1, key2) pairs occur on both sides.
left = pd.DataFrame(dict(
    key1=['K0', 'K0', 'K1', 'K2'],
    key2=['K0', 'K1', 'K0', 'K1'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
right = pd.DataFrame(dict(
    key1=['K0', 'K1', 'K1', 'K2'],
    key2=['K0', 'K0', 'K0', 'K0'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))

# print(left)
#    A   B key1 key2
# 0  A0  B0   K0   K0
# 1  A1  B1   K0   K1
# 2  A2  B2   K1   K0
# 3  A3  B3   K2   K1

# print(right)
#    C   D key1 key2
# 0  C0  D0   K0   K0
# 1  C1  D1   K1   K0
# 2  C2  D2   K1   K0
# 3  C3  D3   K2   K0

#依据key1与key2 columns进行合并，并打印出四种结果['left', 'right', 'outer', 'inner']
# res = pd.merge(left, right, on=['key1', 'key2'])  #默认的是 inner
# res = pd.merge(left, right, on=['key1', 'key2'], how='inner')  #只考虑相同的部分，不同的丢弃
# print(res)
#    A   B key1 key2   C   D
# 0  A0  B0   K0   K0  C0  D0
# 1  A2  B2   K1   K0  C1  D1
# 2  A2  B2   K1   K0  C2  D2
# res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
# print(res)
#     A    B key1 key2    C    D
# 0   A0   B0   K0   K0   C0   D0
# 1   A1   B1   K0   K1  NaN  NaN
# 2   A2   B2   K1   K0   C1   D1
# 3   A2   B2   K1   K0   C2   D2
# 4   A3   B3   K2   K1  NaN  NaN
# 5  NaN  NaN   K2   K0   C3   D3

# res = pd.merge(left, right, on=['key1', 'key2'], how='left')  #以left的key1，key2为主
# print(res)
#    A   B key1 key2    C    D
# 0  A0  B0   K0   K0   C0   D0
# 1  A1  B1   K0   K1  NaN  NaN
# 2  A2  B2   K1   K0   C1   D1
# 3  A2  B2   K1   K0   C2   D2
# 4  A3  B3   K2   K1  NaN  NaN

# res = pd.merge(left, right, on=['key1', 'key2'], how='right')  #以right的key1，key2为主
# print(res)
#     A    B key1 key2   C   D
# 0   A0   B0   K0   K0  C0  D0
# 1   A2   B2   K1   K0  C1  D1
# 2   A2   B2   K1   K0  C2  D2
# 3  NaN  NaN   K2   K0  C3  D3


# Inputs for the indicator examples: the two tables overlap only on col1 == 1.
df1 = pd.DataFrame(dict(col1=[0, 1], col_left=['a', 'b']))
df2 = pd.DataFrame(dict(col1=[1, 2, 2], col_right=[2, 2, 2]))

# print(df1)
#   col1  col_left
# 0     0        a
# 1     1        b

# print(df2)
#   col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2

# 依据col1进行合并，并启用indicator=True，最后打印出
#indicator默认的参数是false
# res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# print(res)
#   col1 col_left  col_right      _merge
# 0   0.0        a        NaN   left_only
# 1   1.0        b        2.0        both
# 2   2.0      NaN        2.0  right_only
# 3   2.0      NaN        2.0  right_only

# 自定indicator column的名称，并打印出
# res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
# print(res)
#   col1 col_left  col_right indicator_column
# 0   0.0        a        NaN        left_only
# 1   1.0        b        2.0             both
# 2   2.0      NaN        2.0       right_only
# 3   2.0      NaN        2.0       right_only

# Inputs for the index-based merge examples: the row labels (not a column) act
# as keys. K0 and K2 appear on both sides, K1 only in `left`, K3 only in `right`.
left = pd.DataFrame(
    data=dict(A=['A0', 'A1', 'A2'], B=['B0', 'B1', 'B2']),
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    data=dict(C=['C0', 'C2', 'C3'], D=['D0', 'D2', 'D3']),
    index=['K0', 'K2', 'K3'],
)

# print(left)
#     A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2

# print(right)
#     C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

#依据左右资料集的index进行合并，how='outer',并打印出
# res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
# print(res)
#      A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

#依据左右资料集的index进行合并，how='inner',并打印出
# res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
# print(res)
#     A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2

# Two tables that both have an 'age' column, so a merge on 'k' produces
# overlapping column names.
boys = pd.DataFrame(dict(k=['K0', 'K1', 'K2'], age=[1, 2, 3]))
# print(boys)
'''
    k  age
0  K0    1
1  K1    2
2  K2    3
'''
girls = pd.DataFrame(dict(k=['K0', 'K0', 'K3'], age=[4, 5, 6]))
# print(girls)
'''
    k  age
0  K0    4
1  K0    5
2  K3    6
'''

# suffixes renames the overlapping 'age' columns so each side stays identifiable
# res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
# print(res)
#    age_boy   k  age_girl
# 0        1  K0         4
# 1        1  K0         5

# Outer variant: keys found on only one side are kept and padded with NaN.
res = boys.merge(girls, on='k', how='outer', suffixes=['_boy', '_girl'])
print(res)
'''
   k  age_boy  age_girl
0  K0      1.0       4.0
1  K0      1.0       5.0
2  K1      2.0       NaN
3  K2      3.0       NaN
4  K3      NaN       6.0
'''



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# # 随机生成1000个数据
# data = pd.Series(np.random.randn(1000),index=np.arange(1000))
# #这是一个线性的数据，我们随机生成1000个数据，Series 默认的 index 就是从0开始的整数，但是这里我显式赋值以便让大家看的更清楚
# # print(data)
# # 为了方便观看效果, 我们累加这个数据
# data1 = data.cumsum()
# # print(data1)
# # pandas 数据可以直接观看其可视化形式
# data1.plot()  #因为是随机数字，故每次刷新后图形可能不一样
# #就这么简单，熟悉 matplotlib 的朋友知道如果需要plot一个数据，我们可以使用 plt.plot(x=, y=)，把x,y的数据作为参数存进去，但是data本来就是一个数据，所以我们可以直接plot。
# plt.show()

# 1000 rows x 4 columns of samples drawn with np.random.randn, labelled A-D and
# indexed 0..999.
data = pd.DataFrame(
    data=np.random.randn(1000, 4),
    columns=['A', 'B', 'C', 'D'],
    index=np.arange(1000),
)
# data = data.cumsum()   # cumsum 返回新的对象，不会原地修改；必须重新赋值，否则下面画出的仍是未累加的数据
# data.plot()
# plt.show()
'''
这个就是我们刚刚生成的4个column的数据，因为有4组数据，所以4组数据会分别plot出来。
plot 可以指定很多参数，具体的用法大家可以自己查一下这里
'''
'''
除了plot，我经常会用到还有scatter，这个会显示散点图，首先给大家说一下在 pandas 中有多少种方法

bar
hist
box
kde
area
scatter
hexbin
但是我们今天不会一一介绍，主要说一下 plot 和 scatter. 因为scatter只有x，y两个属性，我们就可以分别给x, y指定数据
'''
# First layer: column B against column A. The value returned by the call is
# kept so the second layer can be drawn onto it.
ax = data.plot.scatter(
    x='A',
    y='B',
    color='Blue',
    label='Class1',
)
# Second layer: column C against column A, passed the same `ax` so both point
# sets share one plot.
data.plot.scatter(
    x='A',
    y='C',
    color='LightGreen',
    label='Class2',
    ax=ax,
)
# Display the figure.
plt.show()

Grace_yanyanyan

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
learn pandas ---学习笔记3

网易云课堂：https://study.163.com/course/courseMain.htm?courseId=1003240004莫烦主页：https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/import pandas as pdimport numpy as npdates = pd.date_r...
复制链接

扫一扫