Pandas 常见操作(一)
# 导入 pandas 和 numpy 两个常用数据处理包,并缩写。
import numpy as np
import pandas as pd
# 创建一个 Series
s = pd.Series([1, 3, 6, np.nan, 44, 2]) # 默认给出索引
s
result:
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 2.0
dtype: float64
# 用 DataFrame创建表格,给出行索引和列索引
dates = pd.date_range('20200101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c','d'])
result:
a b c d
2020-01-01 -2.003205 -0.386903 -1.964165 1.819738
2020-01-02 -1.312786 -1.822615 1.488771 -1.288448
2020-01-03 -0.128299 -0.523143 0.076266 -0.262308
2020-01-04 0.012361 -0.094343 -0.070053 2.136403
2020-01-05 -1.320334 -0.157720 -0.183065 0.124426
2020-01-06 1.469395 0.506660 0.961168 0.390255
# 按列名选取某一列,返回的 Series 会保留对应的行索引
df['b']
result:
2020-01-01 -0.386903
2020-01-02 -1.822615
2020-01-03 -0.523143
2020-01-04 -0.094343
2020-01-05 -0.157720
2020-01-06 0.506660
Freq: D, Name: b, dtype: float64
#若 DataFrame 不给定行列索引,那么就默认整数索引
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
result:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
# 还可以用字典的形式创建表格
df2 = pd.DataFrame({'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=np.arange(4),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo'})
result:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
# 查看表格的数据类型
df2.dtypes
result:
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
# 默认 index 为行索引
df2.index
result:
Int64Index([0, 1, 2, 3], dtype='int64')
# 列索引
df2.columns
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
# 查看表格中所有的值,不要索引
df2.values
array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
dtype=object)
# 查看表格的总体描述,包括个数、均值、标准差、最小值、四分位数(25%/50%/75%)、最大值;这里的结果只包含数值列 A、C、D
# 如果是某个数据类型的属性则不需要括号,若是它的方法,则需要括号。
df2.describe()
result:
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
# 将表格转置(行列互换);df2.T 返回转置后的新表格,不会修改 df2 本身,所以下面再查看 df2 时仍然是原来的表格
df2.T
df2
result:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
# 将表格按照索引标签排序:axis=0 表示对行索引排序,axis=1 表示对列名排序(这里是按列名降序)
# ascending = False 表示降序,True为升序
df2.sort_index(axis=1, ascending=False)
F E D C B A
0 foo test 3 1.0 2013-01-02 1.0
1 foo train 3 1.0 2013-01-02 1.0
2 foo test 3 1.0 2013-01-02 1.0
3 foo train 3 1.0 2013-01-02 1.0
# 按某一列的值对表格排序(这里按 E 列的值升序排列)
df2.sort_values(by='E')
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
2 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
3 1.0 2013-01-02 1.0 3 train foo