>>> s=pd.Series([1,2,np.nan,4]) # Series: a one-dimensional labeled array (default integer index 0..3)
>>> s
0 1.0
1 2.0
2 NaN
3 4.0
dtype: float64
>>> dates = pd.date_range('20190725',periods=4) # four consecutive daily timestamps, used below as the DataFrame's row index
>>> dates
DatetimeIndex(['2019-07-25', '2019-07-26', '2019-07-27', '2019-07-28'], dtype='datetime64[ns]', freq='D')
>>> df = pd.DataFrame(np.random.randn(4,4),index=dates,columns=['a','b','c','d'])
>>> df
a b c d
2019-07-25 0.837788 0.012600 -1.815884 0.276740
2019-07-26 -1.586108 0.737898 -0.541256 0.449048
2019-07-27 -1.522343 0.264496 -0.285446 1.933708
2019-07-28 1.063021 -0.566470 0.071764 0.243278
>>> df["b"] # select column 'b' (returns a Series)
2019-07-25 0.012600
2019-07-26 0.737898
2019-07-27 0.264496
2019-07-28 -0.566470
Freq: D, Name: b, dtype: float64
>>> df.dtypes # dtype of each column
a float64
b float64
c float64
d float64
dtype: object
>>> df.index # row labels
DatetimeIndex(['2019-07-25', '2019-07-26', '2019-07-27', '2019-07-28'], dtype='datetime64[ns]', freq='D')
>>> df.columns # column labels
Index(['a', 'b', 'c', 'd'], dtype='object')
>>> df.values # underlying data as a NumPy array
array([[ 0.83778807, 0.01259989, -1.81588355, 0.27673983],
[-1.58610777, 0.73789787, -0.54125606, 0.44904759],
[-1.52234345, 0.26449592, -0.28544604, 1.93370844],
[ 1.06302092, -0.56646994, 0.07176394, 0.24327814]])
>>> df.describe()
a b c d
count 4.000000 4.000000 4.000000 4.000000
mean -0.301911 0.112131 -0.642705 0.725694
std 1.449203 0.543202 0.821531 0.810374
min -1.586108 -0.566470 -1.815884 0.243278
25% -1.538285 -0.132168 -0.859913 0.268374
50% -0.342278 0.138548 -0.413351 0.362894
75% 0.894096 0.382846 -0.196144 0.820213
max 1.063021 0.737898 0.071764 1.933708
>>> df.T
2019-07-25 2019-07-26 2019-07-27 2019-07-28
a 0.837788 -1.586108 -1.522343 1.063021
b 0.012600 0.737898 0.264496 -0.566470
c -1.815884 -0.541256 -0.285446 0.071764
d 0.276740 0.449048 1.933708 0.243278
>>> df.sort_index(axis=1,ascending=False) # axis=1: sort by column labels, descending
d c b a
2019-07-25 0.276740 -1.815884 0.012600 0.837788
2019-07-26 0.449048 -0.541256 0.737898 -1.586108
2019-07-27 1.933708 -0.285446 0.264496 -1.522343
2019-07-28 0.243278 0.071764 -0.566470 1.063021
>>> df.sort_values(by='a') # sort rows by the values in column 'a', ascending
a b c d
2019-07-26 -1.586108 0.737898 -0.541256 0.449048
2019-07-27 -1.522343 0.264496 -0.285446 1.933708
2019-07-25 0.837788 0.012600 -1.815884 0.276740
2019-07-28 1.063021 -0.566470 0.071764 0.243278
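>>> df # NOTE: a different DataFrame from the one above (construction not shown); it contains NaN in columns B and C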
A B C D
2019-01-01 0 NaN 2.0 3
2019-01-02 4 5.0 NaN 7
2019-01-03 8 9.0 10.0 11
2019-01-04 12 13.0 14.0 15
2019-01-05 16 17.0 18.0 19
2019-01-06 20 21.0 22.0 23
>>> df.dropna(axis=0,how="any") # axis=0 drops rows; how="any" drops a row containing at least one NaN, how="all" only drops rows that are entirely NaN
A B C D
2019-01-03 8 9.0 10.0 11
2019-01-04 12 13.0 14.0 15
2019-01-05 16 17.0 18.0 19
2019-01-06 20 21.0 22.0 23
>>> df.fillna(value=99) # replace every NaN with 99
A B C D
2019-01-01 0 99.0 2.0 3
2019-01-02 4 5.0 99.0 7
2019-01-03 8 9.0 10.0 11
2019-01-04 12 13.0 14.0 15
2019-01-05 16 17.0 18.0 19
2019-01-06 20 21.0 22.0 23
>>> df.isnull()
A B C D
2019-01-01 False True False False
2019-01-02 False False True False
2019-01-03 False False False False
2019-01-04 False False False False
2019-01-05 False False False False
2019-01-06 False False False False
>>> np.any(df.isnull())
True
>>> df1
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
>>> df2
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
>>> df3
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
>>> res = pd.concat([df1, df2, df3], axis=0, ignore_index=True) # stack rows vertically; ignore_index=True renumbers the result 0..n-1
>>> res
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
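>>> df1 # NOTE: df1 and df2 are redefined here (construction not shown); they now share only columns b, c, d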
b c d e
1 1.0 1.0 1.0 1.0
>>> df2
a b c d
1 0.0 0.0 0.0 0.0
>>> res = pd.concat([df1, df2], axis=0, join='inner') # stack rows vertically, keeping only the columns present in both frames
>>> res
b c d
1 1.0 1.0 1.0
1 0.0 0.0 0.0
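>>> res = pd.concat([df1, df2], axis=0, join_axes=[df1.columns]) # stack rows vertically, keeping df1's columns. NOTE: join_axes was removed in pandas 1.0; use pd.concat([df1, df2], axis=0).reindex(columns=df1.columns)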
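>>> df1.append(df2) # append only stacks rows vertically. NOTE: DataFrame.append was removed in pandas 2.0; use pd.concat([df1, df2])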
a b c d e
1 NaN 1.0 1.0 1.0 1.0
1 0.0 0.0 0.0 0.0 NaN
>>> df1
b c d e
1 1.0 1.0 1.0 1.0
>>> df2
a b c d
1 0.0 0.0 0.0 0.0
# Inner join on b, c, d: keep only rows whose b, c, d values match in both frames; none match here, so the result is empty
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='inner')
>>> res
Empty DataFrame
Columns: [b, c, d, e, a]
Index: []
# Outer join: keep rows from both frames and fill the columns missing on either side with NaN
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='outer')
>>> res
b c d e a
0 1.0 1.0 1.0 1.0 NaN
1 0.0 0.0 0.0 NaN 0.0
# Left join: keep every row of the left frame (df1); columns from df2 are NaN where there is no match
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='left')
>>> res
b c d e a
0 1.0 1.0 1.0 1.0 NaN
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='left', indicator='indicator_column')
>>> res
b c d e a indicator_column
0 1.0 1.0 1.0 1.0 NaN left_only
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='left', indicator=True)
>>> res
b c d e a _merge
0 1.0 1.0 1.0 1.0 NaN left_only
>>> boys
k age
0 K0 1
1 K1 2
2 K2 3
>>> girls
k age
0 K0 4
1 K0 5
2 K3 6
>>> res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
>>> res
k age_boy age_girl
0 K0 1 4
1 K0 1 5
# Line plot: four columns of standard-normal noise, accumulated row by row,
# drawn as one line per column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
)
# cumsum() returns a new DataFrame rather than modifying in place; rebind it,
# otherwise the un-accumulated noise is what gets plotted.
data = data.cumsum()
data.plot()
plt.show()

# Scatter plot of column C against column A.
# `ax` was previously undefined (NameError); create the target Axes explicitly
# so the ax= argument has something to draw on.
# NOTE(review): the 'Class2' label suggests an earlier 'Class1' scatter was
# meant to share these axes — confirm and add it here if so.
_, ax = plt.subplots()
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax)
plt.show()