pandas-学习

pandas基本介绍

import numpy as np
import pandas as pd


s = pd.Series([1, 3, 6, np.nan, 88, 3])  # series 系列
print(s)
dates = pd.date_range('20160101', periods=6)  # periods 周期,时期
print(dates)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
'''
                   a         b         c         d
2016-01-01 -0.558977 -0.741284  0.562322  1.212522
2016-01-02  0.236301  0.069895 -0.945983  1.328624
2016-01-03 -1.319606 -1.071127  1.013776  0.070183
2016-01-04 -0.725431  0.190953  1.538477 -0.212148
2016-01-05 -1.028601  0.302218 -0.857509 -0.630979
2016-01-06  0.309582  0.318177  0.787511  0.244988

'''
print("如果index=, columns= 都是默认")
df1 = pd.DataFrame(np.random.randn(12).reshape(3, 4))
print(df1)
'''
          0         1         2         3
0  0.066480  1.976401  0.151181  1.668614
1 -0.491919  0.815604  1.305159 -0.263863
2 -0.258095  1.345336  1.333973 -1.313072
'''
print("定义DataFrame的第二种方式")
df2 = pd.DataFrame({"A": 1,
                    "B": pd.Timestamp('20130102'),
                    "C": pd.Series(1, index=list(range(4)), dtype='float32'),
                    "D": np.array([3] * 4, dtype='int32'),
                    "E": pd.Categorical(['test', 'train', 'test', 'train']),
                    "F": 'foo'})
print(df2)
print(df2.dtypes)
'''   A          B    C  D      E    F
0  1 2013-01-02  1.0  3   test  foo
1  1 2013-01-02  1.0  3  train  foo
2  1 2013-01-02  1.0  3   test  foo
3  1 2013-01-02  1.0  3  train  foo
A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
'''
print("所有行的序号")
print(df2.index)
print(df2.columns)
'''
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
'''
print("获取值信息")
print(df2.values)
'''
[[1 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
'''
print("describe(), 可以获得以下信息")
print(df2.describe())
'''         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0'''
print("转置, transpose(), 也可以用df2.T")
print(df2.transpose())
'''                     0  ...                    3
A                    1  ...                    1
B  2013-01-02 00:00:00  ...  2013-01-02 00:00:00
C                  1.0  ...                  1.0
D                    3  ...                    3
E                 test  ...                train
F                  foo  ...                  foo

[6 rows x 4 columns]'''
print("排序, axis=0是对行排序,axis=1是对列进行排序")
print(df2.sort_index(axis=1, ascending=False))
'''     F      E  D    C          B  A
0  foo   test  3  1.0 2013-01-02  1
1  foo  train  3  1.0 2013-01-02  1
2  foo   test  3  1.0 2013-01-02  1
3  foo  train  3  1.0 2013-01-02  1
'''
print("用值进行排序")
print(df2.sort_values(by='E'))
'''   A          B    C  D      E    F
0  1 2013-01-02  1.0  3   test  foo
2  1 2013-01-02  1.0  3   test  foo
1  1 2013-01-02  1.0  3  train  foo
3  1 2013-01-02  1.0  3  train  foo
'''

选择数据,数据筛选

import numpy as np
import pandas as pd


print("创建一个pandas日期列表")
dates = pd.date_range('20130101', periods=6)
print(dates)
'''DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')'''
print("创建一个DataFrame")
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
'''
             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''
print("获取列")
print(df['A'], df.A)
'''
2013-01-01     0
2013-01-02     4
2013-01-03     8
2013-01-04    12
2013-01-05    16
2013-01-06    20
Freq: D, Name: A, dtype: int32 
2013-01-01     0
2013-01-02     4
2013-01-03     8
2013-01-04    12
2013-01-05    16
2013-01-06    20
Freq: D, Name: A, dtype: int32'''
print("按照切片来取")
print(df[:3], '\n', df['20130102': '20130104'])
''' 
           A  B   C   D
2013-01-01  0  1   2   3
2013-01-02  4  5   6   7
2013-01-03  8  9  10  11 
              A   B   C   D
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
'''
print("更具体一点的选择,用标签: select by label: loc")
print(df.loc['20130102'])
'''
A    4
B    5
C    6
D    7
'''
print("选择所有行, 指定列的数据")
print(df.loc[:, ['A', 'C']])
'''
             A   C
2013-01-01   0   2
2013-01-02   4   6
2013-01-03   8  10
2013-01-04  12  14
2013-01-05  16  18
2013-01-06  20  22'''
print("也可以这样")
print(df.loc['20130102', ['A', 'C']])
'''
A    4
C    6'''

print("按照位置来选择: select by position: iloc")
print(df.iloc[3:5, 1:3])
'''
             B   C
2013-01-04  13  14
2013-01-05  17  18
'''
print("也可以这样")
print(df.iloc[[1, 3, 5], 1:3])
'''
             B   C
2013-01-02   5   6
2013-01-04  13  14
2013-01-06  21  22'''

# print("标签和位置混合筛选: mixed selection: ix")
# print(df.ix[:3, ['A', 'C']])
print("Boolean indexing方法")
print(df)
print(df[df.A > 8])
'''
             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
             A   B   C   D
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''

修改数据值

import numpy as np
import pandas as pd


print("创建一个pandas日期列表")
dates = pd.date_range('20130101', periods=6)
print(dates)
'''DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')'''
print("创建一个DataFrame")
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
'''
             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''
print("用位置iloc修改值")
df.iloc[2, 2] = 1111
print(df)
print("第二种方法")
df.loc['20130102', 'B'] = 2222
print(df)
'''
             A     B     C   D
2013-01-01   0     1     2   3
2013-01-02   4  2222     6   7
2013-01-03   8     9  1111  11
2013-01-04  12    13    14  15
2013-01-05  16    17    18  19
2013-01-06  20    21    22  23

Process finished with exit code 0
'''
print("第三种方法, ")
# df[df.A > 4] = 0
# print(df)
'''
            A     B  C  D
2013-01-01  0     1  2  3
2013-01-02  4  2222  6  7
2013-01-03  0     0  0  0
2013-01-04  0     0  0  0
2013-01-05  0     0  0  0
2013-01-06  0     0  0  0'''
df.A[df.A > 4] = 0
print(df)
'''
            A     B     C   D
2013-01-01  0     1     2   3
2013-01-02  4  2222     6   7
2013-01-03  0     9  1111  11
2013-01-04  0    13    14  15
2013-01-05  0    17    18  19
2013-01-06  0    21    22  23'''
print("加一个空列")
df['F'] = np.nan
print(df)
'''
            A     B     C   D   F
2013-01-01  0     1     2   3 NaN
2013-01-02  4  2222     6   7 NaN
2013-01-03  0     9  1111  11 NaN
2013-01-04  0    13    14  15 NaN
2013-01-05  0    17    18  19 NaN
2013-01-06  0    21    22  23 NaN'''
print("加入一个有值的列")
df['E'] = pd.Series(range(6), index=dates)
print(df)
'''
            A     B     C   D   F  E
2013-01-01  0     1     2   3 NaN  0
2013-01-02  4  2222     6   7 NaN  1
2013-01-03  0     9  1111  11 NaN  2
2013-01-04  0    13    14  15 NaN  3
2013-01-05  0    17    18  19 NaN  4
2013-01-06  0    21    22  23 NaN  5'''

处理丢失的数据

import numpy as np
import pandas as pd


print("创建一个pandas日期列表")
dates = pd.date_range('20130101', periods=6)
print(dates)
'''DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')'''
print("创建一个DataFrame")
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
'''
             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''
df.iloc[2, 3] = np.nan
df.iloc[1, 2] = np.nan
print(df)
'''
             A   B     C     D
2013-01-01   0   1   2.0   3.0
2013-01-02   4   5   NaN   7.0
2013-01-03   8   9  10.0   NaN
2013-01-04  12  13  14.0  15.0
2013-01-05  16  17  18.0  19.0
2013-01-06  20  21  22.0  23.0'''
print('axis=0 是按行丢掉数据, how=any是只要有nan的就都丢掉 ')
print('axis=1是丢掉列', 'how=all是只有全是nan的时候才丢掉')
print(df.dropna(axis=0, how='any'))
'''
             A   B     C     D
2013-01-01   0   1   2.0   3.0
2013-01-04  12  13  14.0  15.0
2013-01-05  16  17  18.0  19.0
2013-01-06  20  21  22.0  23.0
'''
print(df.dropna(axis=1, how='all'))   # 不变
print('填充数据follna, value=0')
print(df.fillna(value=0))
print('判断数据是否有缺失用df.isnull(),')
print(df.isnull())
print(np.any(df.isnull()) == True)

数据的读取与写入

import numpy as np
import pandas as pd


print("读取数据")
data = pd.read_csv('')
# csv, excel, hdf, sql, json, msgpack, html, gbq, stata, sas, clipboard, pickle等格式
data.to_pickle('')

连接数据1,concat append

import numpy as np
import pandas as pd


# concatenating 连接 合并
print('创建一个3行4列用0填充的Frame')
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])

print('上下合并, 0就是竖向合并, 1是横向')
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)  # ignore 忽视
print(res)
df4 = pd.DataFrame(np.ones((3, 4))*4, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
res1 = pd.concat([df1, df4], axis=0, ignore_index=True, join='outer') # 会用nan填充  outer, inner
print(res1)
'''
     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  NaN  4.0  4.0  4.0  4.0
4  NaN  4.0  4.0  4.0  4.0
5  NaN  4.0  4.0  4.0  4.0
'''
res2 = pd.concat([df1, df4], axis=0, ignore_index=True, join='inner')
print(res2)
'''
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  4.0  4.0  4.0
4  4.0  4.0  4.0
5  4.0  4.0  4.0'''
print('join_axes参数')
res3 = pd.concat([df1, df4], axis=1)
print(res3)
'''
     a    b    c    d    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  4.0  4.0  4.0  4.0
3  NaN  NaN  NaN  NaN  4.0  4.0  4.0  4.0
4  NaN  NaN  NaN  NaN  4.0  4.0  4.0  4.0
'''
# res4 = pd.concat([df1, df4], axis=1, join_axes=[df1.index]) 已经没有这个参数了
print('用append添加数据')
res5 = df1.append([df2, df3], ignore_index=True)
print(res5)
print('添加一行数据')
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res6 = df1.append(s1, ignore_index=True)
print(res6)

合并数据 merge 相同key

import pandas as pd


# merge 合并
left = pd.DataFrame({'key': ['k0', 'k1', 'k2', 'k3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'b': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['k0', 'k1', 'k2', 'k3'],
                      'c': ['c1', 'c2', 'c3', 'c4'],
                      'd': ['d1', 'd2', 'd3', 'd4']})
print(left)
print(right)
res = pd.merge(left, right, on='key')
print(res)
'''
  key   A   b   c   d
0  k0  A0  B0  c1  d1
1  k1  A1  B1  c2  d2
2  k2  A2  B2  c3  d3
3  k3  A3  B3  c4  d4'''

merge 不同key

import numpy as np
import pandas as pd


# merge 合并
left = pd.DataFrame({'key': ['k0', 'k1', 'k2', 'k3'],
                     'key1': ['k0', 'k1', 'k0', 'k1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'b': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['k0', 'k1', 'k2', 'k3'],
                      'key1': ['k0', 'k0', 'k0', 'k0'],
                      'c': ['c1', 'c2', 'c3', 'c4'],
                      'd': ['d1', 'd2', 'd3', 'd4']})
print(left)
print(right)
res = pd.merge(left, right, on=['key', 'key1'], how='inner')   # how默认=inner
print(res) # how= [left, right, outer, inner]
res1 = pd.merge(left, right, on=['key', 'key1'], how='outer')
print(res1)

merge 参数 indicator

import pandas as pd


# indicator 指示器 指示信号
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': ['b', 'c', 'd']})
print(df1)
print(df2)
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
'''
   col1 col_left col_right indicator_column
0     0        a       NaN        left_only
1     1        b         b             both
2     2      NaN         c       right_only
3     2      NaN         d       right_only
'''
print('give the indicator a custom name')
res1 = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res1)
'''
   col1 col_left col_right indicator_column
0     0        a       NaN        left_only
1     1        b         b             both
2     2      NaN         c       right_only
3     2      NaN         d       right_only
'''

merge参数 left_index, right_index

import pandas as pd


left = pd.DataFrame({'a': ['a0', 'a1', 'a2'],
                     'b': ['b0', 'b1', 'b2']},
                     index=['k0', 'k1', 'k2'])
right = pd.DataFrame({'c': ['c0', 'c2', 'c3'],
                      'd': ['d0', 'd2', 'd3']},
                     index=['k0', 'k2', 'k3'])
print(left)
print(right)
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
'''
      a    b    c    d
k0   a0   b0   c0   d0
k1   a1   b1  NaN  NaN
k2   a2   b2   c2   d2
k3  NaN  NaN   c3   d3'''

merge参数suffixes 后缀

import pandas as pd


boys = pd.DataFrame({'k': ['ko', 'k1', 'k2'],
                     'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['ko', 'k0', 'k3'],
                     'age': [4, 5, 6]})
print(boys)
print(girls)
res = pd.merge(boys, girls, on='k', how='outer', suffixes=['_boy', '_girl'])  # suffixes 后缀
print(res)
'''
    k  age_boy  age_girl
0  ko      1.0       4.0
1  k1      2.0       NaN
2  k2      3.0       NaN
3  k0      NaN       5.0
4  k3      NaN       6.0
'''
print('join的用法和merge差不多')

polt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


print('线性的plot')
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
print(data)
data = data.cumsum()
print(data)
data.plot()
plt.show()
print('四组plot DataFrame')
data1 = pd.DataFrame(np.random.randn(1000, 4),
                     index=np.arange(1000),
                     columns=list('abcd'))
print(data1)
data_sum = data1.cumsum()
print(data_sum)
# data_sum.plot()
# plt.show()
print('plot.bar, hist, box, area, scatter, hexbin, pie等方法')
ax = data_sum.plot.scatter(x='a', y='b', color='DarkBlue', label='Class 1')
# plt.show()
data_sum.plot.scatter(x='a', y='c', color='DarkGreen', label='Class 2', ax=ax)
plt.show()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值