import numpy as np
import pandas as pd
数据结构
# 数据结构
s = pd.Series([i * 2 for i in range(1, 11)])
print(type(s))
dates = pd.date_range("20181111", periods=8)
df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
print(df)
df = pd.DataFrame({"A": 1, "B": pd.Timestamp("20181111"), "C": pd.Series(1, index=list(range(4)), dtype="float32"),
"D": np.array([3] * 4, dtype="float32"),
"E": pd.Categorical(["police", "student", "teacher", "doctor"])})
print(df)
<class 'pandas.core.series.Series'>
A B C D E
2018-11-11 -0.802629 1.761646 -1.595913 -2.611994 1.199651
2018-11-12 -0.080405 1.414423 0.282082 0.441391 0.286743
2018-11-13 1.027058 0.424783 0.505843 2.085103 -0.685436
2018-11-14 0.224657 -1.126532 1.626780 1.099044 -0.571063
2018-11-15 0.837687 -1.010012 -0.116045 -1.511156 1.491553
2018-11-16 2.179503 -0.179622 0.055233 -0.105327 0.413564
2018-11-17 -0.194890 -1.479347 0.634556 0.050228 0.139761
2018-11-18 -0.272830 -0.201564 1.640784 0.326020 -0.023554
A B C D E
0 1 2018-11-11 1.0 3.0 police
1 1 2018-11-11 1.0 3.0 student
2 1 2018-11-11 1.0 3.0 teacher
3 1 2018-11-11 1.0 3.0 doctor
基本操作
# 基本操作
print(df.head(3)) # 前3行
print(df.tail(3)) # 后3行
print(df.index)
print(df.values)
print(df.T) # 转置
print(df.sort_values("C"))
print(df.sort_index(axis=1, ascending=False)) # 第一行 降序排列
print(df.describe())
A B C D E
2018-11-11 0.081990 -1.468464 -0.614239 -1.626961 -0.778364
2018-11-12 0.150434 0.695593 1.143397 -0.230024 0.574705
2018-11-13 0.005828 0.075620 -1.644779 -0.421319 0.475862
A B C D E
2018-11-16 -0.598378 -0.481976 0.344308 0.480649 0.543493
2018-11-17 -0.446692 -0.434767 0.837281 -0.218065 -0.306157
2018-11-18 0.317214 -0.272280 2.487579 0.177236 0.823826
DatetimeIndex(['2018-11-11', '2018-11-12', '2018-11-13', '2018-11-14',
'2018-11-15', '2018-11-16', '2018-11-17', '2018-11-18'],
dtype='datetime64[ns]', freq='D')
[[ 0.08198963 -1.46846441 -0.6142393 -1.62696105 -0.77836394]
[ 0.15043376 0.6955929 1.14339743 -0.23002405 0.57470463]
[ 0.00582813 0.07562016 -1.64477864 -0.42131898 0.47586182]
[ 1.36249741 1.36113274 1.14843582 -1.40476157 1.76153808]
[ 0.80885797 0.10913433 -1.03961654 -0.55401099 -1.11028359]
[-0.59837777 -0.48197602 0.34430759 0.4806492 0.5434928 ]
[-0.44669158 -0.43476696 0.83728064 -0.21806471 -0.30615684]
[ 0.31721363 -0.27228005 2.48757912 0.17723625 0.82382597]]
2018-11-11 2018-11-12 2018-11-13 2018-11-14 2018-11-15 2018-11-16 \
A 0.081990 0.150434 0.005828 1.362497 0.808858 -0.598378
B -1.468464 0.695593 0.075620 1.361133 0.109134 -0.481976
C -0.614239 1.143397 -1.644779 1.148436 -1.039617 0.344308
D -1.626961 -0.230024 -0.421319 -1.404762 -0.554011 0.480649
E -0.778364 0.574705 0.475862 1.761538 -1.110284 0.543493
2018-11-17 2018-11-18
A -0.446692 0.317214
B -0.434767 -0.272280
C 0.837281 2.487579
D -0.218065 0.177236
E -0.306157 0.823826
A B C D E
2018-11-13 0.005828 0.075620 -1.644779 -0.421319 0.475862
2018-11-15 0.808858 0.109134 -1.039617 -0.554011 -1.110284
2018-11-11 0.081990 -1.468464 -0.614239 -1.626961 -0.778364
2018-11-16 -0.598378 -0.481976 0.344308 0.480649 0.543493
2018-11-17 -0.446692 -0.434767 0.837281 -0.218065 -0.306157
2018-11-12 0.150434 0.695593 1.143397 -0.230024 0.574705
2018-11-14 1.362497 1.361133 1.148436 -1.404762 1.761538
2018-11-18 0.317214 -0.272280 2.487579 0.177236 0.823826
E D C B A
2018-11-11 -0.778364 -1.626961 -0.614239 -1.468464 0.081990
2018-11-12 0.574705 -0.230024 1.143397 0.695593 0.150434
2018-11-13 0.475862 -0.421319 -1.644779 0.075620 0.005828
2018-11-14 1.761538 -1.404762 1.148436 1.361133 1.362497
2018-11-15 -1.110284 -0.554011 -1.039617 0.109134 0.808858
2018-11-16 0.543493 0.480649 0.344308 -0.481976 -0.598378
2018-11-17 -0.306157 -0.218065 0.837281 -0.434767 -0.446692
2018-11-18 0.823826 0.177236 2.487579 -0.272280 0.317214
A B C D E
count 8.000000 8.000000 8.000000 8.000000 8.000000
mean 0.210219 -0.052001 0.332796 -0.474657 0.248077
std 0.637486 0.843833 1.358111 0.723612 0.931496
min -0.598378 -1.468464 -1.644779 -1.626961 -1.110284
25% -0.107302 -0.446569 -0.720584 -0.766699 -0.424209
50% 0.116212 -0.098330 0.590794 -0.325672 0.509677
75% 0.440125 0.255749 1.144657 -0.119239 0.636985
max 1.362497 1.361133 2.487579 0.480649 1.761538
选择(切片)
print(df["A"])
print(type(df["A"]))
print(df[:3])
print(df["20181111":"20181115"])
print(df.loc[dates[0]])
print(df.loc["20181111":"20181115", ["B", "D"]])
print(df.at[dates[0], "C"])
print(df.iloc[1:3, 2:4]) # 通过下标选择 第一行到第二行,第二列到第三列
print(df.iloc[1, 4]) # 第一行第四列 (行与列都从0开始)
print(df.iat[1, 4])
print(df[df.B > 0][df.A < 0])
print(df[df > 0])
2018-11-11 0.081990
2018-11-12 0.150434
2018-11-13 0.005828
2018-11-14 1.362497
2018-11-15 0.808858
2018-11-16 -0.598378
2018-11-17 -0.446692
2018-11-18 0.317214
Freq: D, Name: A, dtype: float64
<class 'pandas.core.series.Series'>
A B C D E
2018-11-11 0.081990 -1.468464 -0.614239 -1.626961 -0.778364
2018-11-12 0.150434 0.695593 1.143397 -0.230024 0.574705
2018-11-13 0.005828 0.075620 -1.644779 -0.421319 0.475862
A B C D E
2018-11-11 0.081990 -1.468464 -0.614239 -1.626961 -0.778364
2018-11-12 0.150434 0.695593 1.143397 -0.230024 0.574705
2018-11-13 0.005828 0.075620 -1.644779 -0.421319 0.475862
2018-11-14 1.362497 1.361133 1.148436 -1.404762 1.761538
2018-11-15 0.808858 0.109134 -1.039617 -0.554011 -1.110284
A 0.081990
B -1.468464
C -0.614239
D -1.626961
E -0.778364
Name: 2018-11-11 00:00:00, dtype: float64
B D
2018-11-11 -1.468464 -1.626961
2018-11-12 0.695593 -0.230024
2018-11-13 0.075620 -0.421319
2018-11-14 1.361133 -1.404762
2018-11-15 0.109134 -0.554011
-0.6142393019442665
C D
2018-11-12 1.143397 -0.230024
2018-11-13 -1.644779 -0.421319
0.574704626058187
0.574704626058187
/Users/Mac/pycharm-workspace/pandasTest.py:39: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
print(df[df.B > 0][df.A < 0])
Empty DataFrame
Columns: [A, B, C, D, E]
Index: []
A B C D E
2018-11-11 0.081990 NaN NaN NaN NaN
2018-11-12 0.150434 0.695593 1.143397 NaN 0.574705
2018-11-13 0.005828 0.075620 NaN NaN 0.475862
2018-11-14 1.362497 1.361133 1.148436 NaN 1.761538
2018-11-15 0.808858 0.109134 NaN NaN NaN
2018-11-16 NaN NaN 0.344308 0.480649 0.543493
2018-11-17 NaN NaN 0.837281 NaN NaN
2018-11-18 0.317214 NaN 2.487579 0.177236 0.823826
对dataframe进行操作
s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20181111", periods=8))
df["F"] = s1
print(df)
df.at[dates[0], "A"] = 0
print(df)
df.iat[1, 1] = 1
df.loc[:, "D"] = np.array([4] * len(df))
print(df)
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)
A B C D E F
2018-11-11 0.081990 -1.468464 -0.614239 -1.626961 -0.778364 10
2018-11-12 0.150434 0.695593 1.143397 -0.230024 0.574705 11
2018-11-13 0.005828 0.075620 -1.644779 -0.421319 0.475862 12
2018-11-14 1.362497 1.361133 1.148436 -1.404762 1.761538 13
2018-11-15 0.808858 0.109134 -1.039617 -0.554011 -1.110284 14
2018-11-16 -0.598378 -0.481976 0.344308 0.480649 0.543493 15
2018-11-17 -0.446692 -0.434767 0.837281 -0.218065 -0.306157 16
2018-11-18 0.317214 -0.272280 2.487579 0.177236 0.823826 17
A B C D E F
2018-11-11 0.000000 -1.468464 -0.614239 -1.626961 -0.778364 10
2018-11-12 0.150434 0.695593 1.143397 -0.230024 0.574705 11
2018-11-13 0.005828 0.075620 -1.644779 -0.421319 0.475862 12
2018-11-14 1.362497 1.361133 1.148436 -1.404762 1.761538 13
2018-11-15 0.808858 0.109134 -1.039617 -0.554011 -1.110284 14
2018-11-16 -0.598378 -0.481976 0.344308 0.480649 0.543493 15
2018-11-17 -0.446692 -0.434767 0.837281 -0.218065 -0.306157 16
2018-11-18 0.317214 -0.272280 2.487579 0.177236 0.823826 17
A B C D E F
2018-11-11 0.000000 -1.468464 -0.614239 4 -0.778364 10
2018-11-12 0.150434 1.000000 1.143397 4 0.574705 11
2018-11-13 0.005828 0.075620 -1.644779 4 0.475862 12
2018-11-14 1.362497 1.361133 1.148436 4 1.761538 13
2018-11-15 0.808858 0.109134 -1.039617 4 -1.110284 14
2018-11-16 -0.598378 -0.481976 0.344308 4 0.543493 15
2018-11-17 -0.446692 -0.434767 0.837281 4 -0.306157 16
2018-11-18 0.317214 -0.272280 2.487579 4 0.823826 17
A B C D E F
2018-11-11 0.000000 -1.468464 -0.614239 -4 -0.778364 -10
2018-11-12 -0.150434 -1.000000 -1.143397 -4 -0.574705 -11
2018-11-13 -0.005828 -0.075620 -1.644779 -4 -0.475862 -12
2018-11-14 -1.362497 -1.361133 -1.148436 -4 -1.761538 -13
2018-11-15 -0.808858 -0.109134 -1.039617 -4 -1.110284 -14
2018-11-16 -0.598378 -0.481976 -0.344308 -4 -0.543493 -15
2018-11-17 -0.446692 -0.434767 -0.837281 -4 -0.306157 -16
2018-11-18 -0.317214 -0.272280 -2.487579 -4 -0.823826 -17
缺失值处理
# 缺失值处理
df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
df1.loc[dates[0]:dates[1], "G"] = 1
print(df1)
print(df1.dropna()) # 删除缺失值所在的行
print(df1.fillna(value=2))
A B C D G
2018-11-11 0.000000 -1.468464 -0.614239 4 1.0
2018-11-12 0.150434 1.000000 1.143397 4 1.0
2018-11-13 0.005828 0.075620 -1.644779 4 NaN
2018-11-14 1.362497 1.361133 1.148436 4 NaN
A B C D G
2018-11-11 0.000000 -1.468464 -0.614239 4 1.0
2018-11-12 0.150434 1.000000 1.143397 4 1.0
A B C D G
2018-11-11 0.000000 -1.468464 -0.614239 4 1.0
2018-11-12 0.150434 1.000000 1.143397 4 1.0
2018-11-13 0.005828 0.075620 -1.644779 4 2.0
2018-11-14 1.362497 1.361133 1.148436 4 2.0
统计
# 统计
print(df.mean()) # 均值
print(df.var()) # 方差
s = pd.Series([1, 2, 4, np.nan, 5, 7, 9, 10], index=dates)
print(s)
print(s.shift(2)) # 值往后移动,不是循环移动
print(s.diff()) # 后面一位减去前面一位
print(s.value_counts()) # 每个值出现的次数
print(df)
print(df.apply(np.cumsum)) # 后一行加上前一行
print(df.apply(lambda x: x.max() - x.min()))
A 0.199970
B -0.013950
C 0.332796
D 4.000000
E 0.248077
F 13.500000
dtype: float64
A 0.410232
B 0.788657
C 1.844466
D 0.000000
E 0.867685
F 6.000000
dtype: float64
2018-11-11 1.0
2018-11-12 2.0
2018-11-13 4.0
2018-11-14 NaN
2018-11-15 5.0
2018-11-16 7.0
2018-11-17 9.0
2018-11-18 10.0
Freq: D, dtype: float64
2018-11-11 NaN
2018-11-12 NaN
2018-11-13 1.0
2018-11-14 2.0
2018-11-15 4.0
2018-11-16 NaN
2018-11-17 5.0
2018-11-18 7.0
Freq: D, dtype: float64
2018-11-11 NaN
2018-11-12 1.0
2018-11-13 2.0
2018-11-14 NaN
2018-11-15 NaN
2018-11-16 2.0
2018-11-17 2.0
2018-11-18 1.0
Freq: D, dtype: float64
10.0 1
9.0 1
7.0 1
5.0 1
4.0 1
2.0 1
1.0 1
dtype: int64
A B C D E F
2018-11-11 0.000000 -1.468464 -0.614239 4 -0.778364 10
2018-11-12 0.150434 1.000000 1.143397 4 0.574705 11
2018-11-13 0.005828 0.075620 -1.644779 4 0.475862 12
2018-11-14 1.362497 1.361133 1.148436 4 1.761538 13
2018-11-15 0.808858 0.109134 -1.039617 4 -1.110284 14
2018-11-16 -0.598378 -0.481976 0.344308 4 0.543493 15
2018-11-17 -0.446692 -0.434767 0.837281 4 -0.306157 16
2018-11-18 0.317214 -0.272280 2.487579 4 0.823826 17
A B C D E F
2018-11-11 0.000000 -1.468464 -0.614239 4 -0.778364 10
2018-11-12 0.150434 -0.468464 0.529158 8 -0.203659 21
2018-11-13 0.156262 -0.392844 -1.115621 12 0.272203 33
2018-11-14 1.518759 0.968288 0.032815 16 2.033741 46
2018-11-15 2.327617 1.077423 -1.006801 20 0.923457 60
2018-11-16 1.729239 0.595447 -0.662494 24 1.466950 75
2018-11-17 1.282548 0.160680 0.174787 28 1.160793 91
2018-11-18 1.599762 -0.111600 2.662366 32 1.984619 108
A 1.960875
B 2.829597
C 4.132358
D 0.000000
E 2.871822
F 7.000000
dtype: float64
拼接
# 拼接
pieces = [df[:3], df[-3:]]
print(pd.concat(pieces)) # 拼接前三行和后三行
A B C D E F
2018-11-11 0.000000 -1.468464 -0.614239 4 -0.778364 10
2018-11-12 0.150434 1.000000 1.143397 4 0.574705 11
2018-11-13 0.005828 0.075620 -1.644779 4 0.475862 12
2018-11-16 -0.598378 -0.481976 0.344308 4 0.543493 15
2018-11-17 -0.446692 -0.434767 0.837281 4 -0.306157 16
2018-11-18 0.317214 -0.272280 2.487579 4 0.823826 17