____tz_zs
参数 axis
axis=0 对每一列进行操作
axis=1 对每一行进行操作
参数 skipna
计算中,pandas 会默认排除NaN值,设置 skipna=False 将不再排除 NaN 值
.
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
"""
@author: tz_zs
"""
import numpy as np
import pandas as pd
# Sample data: 4 rows x 3 columns, with two missing values (NaN).
data = [[1, 2, np.nan], [2, np.nan, 3], [7, 8, 9], [3, 4, 5]]
# Four consecutive dates starting at 2018-07-01, used as the row index.
date_range = pd.date_range(start="20180701", periods=4)
df = pd.DataFrame(data=data, index=date_range, columns=['a', 'b', 'c'])
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# and matches the print(...) style used by the later snippets in this file.
print(df)
# Output:
#             a    b    c
# 2018-07-01  1  2.0  NaN
# 2018-07-02  2  NaN  3.0
# 2018-07-03  7  8.0  9.0
# 2018-07-04  3  4.0  5.0
.
sum
求和 df.sum()
# Sum each column; NaN values are skipped by default.
print(df.sum())
# Output:
# a 13.0
# b 14.0
# c 17.0
# dtype: float64

# Sum each row (axis=1); NaN values are skipped by default.
print(df.sum(axis=1))
# Output:
# 2018-07-01 3.0
# 2018-07-02 5.0
# 2018-07-03 24.0
# 2018-07-04 12.0
# Freq: D, dtype: float64

# With skipna=False NaN is no longer skipped, so any column that contains
# a NaN sums to NaN.
print(df.sum(skipna=False))
# Output:
# a 13.0
# b NaN
# c NaN
# dtype: float64

# Same, row-wise: rows that contain a NaN sum to NaN.
print(df.sum(axis=1, skipna=False))
# Output:
# 2018-07-01 NaN
# 2018-07-02 NaN
# 2018-07-03 24.0
# 2018-07-04 12.0
# Freq: D, dtype: float64
.
mean
求平均 df.mean
# Mean of each row (axis=1); NaN values are skipped by default.
print(df.mean(axis=1))
# Output:
# 2018-07-01 1.5
# 2018-07-02 2.5
# 2018-07-03 8.0
# 2018-07-04 4.0
# Freq: D, dtype: float64

# With skipna=False, rows that contain a NaN yield NaN.
print(df.mean(axis=1, skipna=False))
# Output:
# 2018-07-01 NaN
# 2018-07-02 NaN
# 2018-07-03 8.0
# 2018-07-04 4.0
# Freq: D, dtype: float64
.
max、min
最大最小值 df.max、df.min
# Maximum of each column, followed by the minimum of each column.
print(df.max())
print(df.min())
# Output:
# a 7.0
# b 8.0
# c 9.0
# dtype: float64
# a 1.0
# b 2.0
# c 3.0
# dtype: float64
.
idxmax、idxmin
最大最小值的索引 df.idxmax、df.idxmin
# Index label (here: the date) at which each column reaches its maximum.
print(df.idxmax())
# Index label (here: the date) at which each column reaches its minimum.
print(df.idxmin())
# Output:
# a 2018-07-03
# b 2018-07-03
# c 2018-07-03
# dtype: datetime64[ns]
# a 2018-07-01
# b 2018-07-01
# c 2018-07-02
# dtype: datetime64[ns]
.
cumsum
累加 df.cumsum
# Cumulative sum down each column. A NaN stays NaN in the result, but the
# running total carries on past it (see column b below).
print(df.cumsum())
# Output:
#                a     b     c
# 2018-07-01   1.0   2.0   NaN
# 2018-07-02   3.0   NaN   3.0
# 2018-07-03  10.0  10.0  12.0
# 2018-07-04  13.0  14.0  17.0
.
# Median of each row (axis=1).
print(df.median(axis=1))
# Output:
# 2018-07-01 1.5
# 2018-07-02 2.5
# 2018-07-03 8.0
# 2018-07-04 4.0
# Freq: D, dtype: float64
.
mad
根据平均值计算平均绝对离差 df.mad(注意:DataFrame.mad 已在 pandas 1.5 中弃用,并在 pandas 2.0 中移除;新版本可用 (df - df.mean()).abs().mean() 得到相同结果)
# Mean absolute deviation around the mean, per column.
# DataFrame.mad() was deprecated in pandas 1.5 and removed in pandas 2.0, so
# the equivalent expression is spelled out here; it works on old and new
# pandas alike. NaN values are skipped by both mean() calls.
print((df - df.mean()).abs().mean())
# Output:
# a 1.875000
# b 2.222222
# c 2.222222
# dtype: float64
.
std
标准差 df.std
# Standard deviation of each column.
print(df.std())
# Output:
# a 2.629956
# b 3.055050
# c 3.055050
# dtype: float64
.
var
方差 df.var
# Variance of each column.
print(df.var())
# Output:
# a 6.916667
# b 9.333333
# c 9.333333
# dtype: float64
.
diff
一阶差分 df.diff
# First-order difference down each column (each row minus the previous row).
# The first row has no predecessor, and any difference involving a NaN is NaN.
print(df.diff())
# Output:
#               a    b    c
# 2018-07-01  NaN  NaN  NaN
# 2018-07-02  1.0  NaN  NaN
# 2018-07-03  5.0  NaN  6.0
# 2018-07-04 -4.0 -4.0 -4.0
.
pct_change
百分数变化 df.pct_change
pandas.Series.pct_change 、 pandas.DataFrame.pct_change
# Fractional change from the previous row, computed down each column.
# NOTE(review): the recorded output shows forward-filled inputs (b on
# 2018-07-03 is 8/2 - 1 = 3.0 although the previous value is NaN). That
# matches older pandas, where fill_method defaulted to 'pad'. Newer pandas
# releases deprecate that default, so values next to a NaN may differ --
# verify against the pandas version in use.
print(df.pct_change())
# Output:
#                    a    b         c
# 2018-07-01       NaN  NaN       NaN
# 2018-07-02  1.000000  NaN       NaN
# 2018-07-03  2.500000  3.0  2.000000
# 2018-07-04 -0.571429 -0.5 -0.444444

# Fractional change from the previous column, computed along each row.
print(df.pct_change(axis=1))
# Output:
#              a         b      c
# 2018-07-01 NaN  1.000000    NaN
# 2018-07-02 NaN       NaN  0.500
# 2018-07-03 NaN  0.142857  0.125
# 2018-07-04 NaN  0.333333  0.250
.
corr
计算列与列之间的成对相关性;计算时会排除 NA / null 值(只剔除缺失的观测值,而不是剔除含有缺失值的整列,见下面第二个例子)。
DataFrame.corr(method='pearson', min_periods=1)
参数:
method:
- pearson:皮尔逊相关系数
- kendall:肯德尔等级相关系数
- spearman:斯皮尔曼等级相关系数
min_periods:为获取有效结果,每对列所需的最小观察数据量
返回:
关于原始DataFrame列与列之间相关性的DataFrame对象。
.
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
"""
@author: tz_zs
"""
import pandas as pd
# Example 1: correlation between the columns of a frame with no missing data.
col = ['a', 'b', 'c', 'd']
index = ["2018-07-02", "2018-07-03", "2018-07-04"]
list_l = [
    [1, 3, 3, 5],
    [11, 7, 15, 13],
    [4, 2, 7, 9],
]
df = pd.DataFrame(data=list_l, index=index, columns=col)
print(df)
# Output:
#              a  b   c   d
# 2018-07-02   1  3   3   5
# 2018-07-03  11  7  15  13
# 2018-07-04   4  2   7   9

# corr() returns a new DataFrame holding the column-by-column correlations.
df_corr = df.corr()
print(df_corr)
print(type(df_corr))
# Output:
#           a         b         c         d
# a  1.000000  0.883852  0.999322  0.974355
# b  0.883852  1.000000  0.866025  0.755929
# c  0.999322  0.866025  1.000000  0.981981
# d  0.974355  0.755929  0.981981  1.000000
# <class 'pandas.core.frame.DataFrame'>
.
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
"""
@author: tz_zs
"""
import pandas as pd
# Example 2: same data, but the last value of column d is missing.
col = ['a', 'b', 'c', 'd']
index = ["2018-07-02", "2018-07-03", "2018-07-04"]
list_l = [
    [1, 3, 3, 5],
    [11, 7, 15, 13],
    [4, 2, 7, None],  # None shows up as NaN in the frame (see output below)
]
df = pd.DataFrame(data=list_l, index=index, columns=col)
print(df)
# Output:
#              a  b   c     d
# 2018-07-02   1  3   3   5.0
# 2018-07-03  11  7  15  13.0
# 2018-07-04   4  2   7   NaN

# Column d is still correlated with the others: only the missing observation
# is left out, so each pair involving d is computed from the two remaining
# rows -- hence the 1.0 values in the d row/column.
df_corr = df.corr()
print(df_corr)
print(type(df_corr))
# Output:
#           a         b         c    d
# a  1.000000  0.883852  0.999322  1.0
# b  0.883852  1.000000  0.866025  1.0
# c  0.999322  0.866025  1.000000  1.0
# d  1.000000  1.000000  1.000000  1.0
# <class 'pandas.core.frame.DataFrame'>
.
end