df=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df
Out[155]:
    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
df.sum()
Out[156]:
one    9.25
two   -5.80
dtype: float64
df.sum(axis=1)
Out[157]:
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
df.mean(axis=1)
Out[158]:
a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64
df.mean(axis=1,skipna=False)
Out[159]:
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
df.idxmax()
Out[160]:
one    b
two    d
dtype: object
df.cumsum()
Out[161]:
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
df.describe()
Out[162]:
            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
obj=Series(['a','a','b','c']*4)
obj.describe()
Out[165]:
count     16
unique     3
top        a
freq       8
dtype: object
obj
Out[166]:
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object
#相关系数和协方差
#（本节内容有待之后补充）
#唯一值和统计值
obj=Series(['c','a','d','a','a','b','b','c','c'])
uniques=obj.unique()
uniques
Out[5]:
array(['c', 'a', 'd', 'b'], dtype=object)
a=obj.value_counts()
a
Out[8]:
c    3
a    3
b    2
d    1
dtype: int64
a.sort_index()
Out[11]:
a    3
b    2
c    3
d    1
dtype: int64
a.sort_index(ascending=False)
Out[12]:
d    1
c    3
b    2
a    3
dtype: int64
obj.value_counts()
Out[13]:
c    3
a    3
b    2
d    1
dtype: int64
mask=obj.isin(['b','c']) #判断每个元素是否属于给定的值集合，返回布尔型Series
mask
Out[15]:
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
obj[mask]
Out[16]:
0    c
5    b
6    b
7    c
8    c
dtype: object
data=DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})
data
Out[18]:
   Qu1  Qu2  Qu3
0    1    2    1
1    3    3    5
2    4    1    2
3    3    2    4
4    4    3    4
result=data.apply(pd.value_counts).fillna(0)
result
Out[20]:
   Qu1  Qu2  Qu3
1  1.0  1.0  1.0
2  0.0  2.0  1.0
3  2.0  2.0  0.0
4  2.0  0.0  2.0
5  0.0  0.0  1.0
#处理缺失数据
string_data=Series(['aardvark','artichoke',np.nan,'avocado'])
string_data
Out[22]:
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
string_data.isnull()
Out[23]:
0    False
1    False
2     True
3    False
dtype: bool
string_data[0]=None
string_data.isnull()
Out[25]:
0     True
1    False
2     True
3    False
dtype: bool
#滤除缺失数据
from numpy import nan as NA
data=Series([1,NA,3.5,NA,7])
data
Out[28]:
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
data.dropna()
Out[29]:
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notnull()]
Out[30]:
0    1.0
2    3.5
4    7.0
dtype: float64
data=DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]])
data
Out[32]:
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
data.dropna()
Out[33]:
     0    1    2
0  1.0  6.5  3.0
data.dropna(how='all') #丢弃全为NA的行
Out[34]:
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
data.dropna(axis=1,how='all') #丢弃全为NA的列
Out[35]:
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
df=DataFrame(np.random.randn(7,3))
df
Out[37]:
          0         1         2
0  0.329323 -0.020140 -1.418435
1 -0.801842 -0.327760  2.141541
2 -1.990021 -0.528645  1.976050
3  0.593861  0.877746  0.709352
4 -0.367444  0.363773 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915
df.ix[:4,1]=NA;df.ix[:2,2]=NA
df
Out[39]:
          0         1         2
0  0.329323       NaN       NaN
1 -0.801842       NaN       NaN
2 -1.990021       NaN       NaN
3  0.593861       NaN  0.709352
4 -0.367444       NaN -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915
df.dropna(thresh=3) #只保留至少含有3个非NA值的行（常用于时间序列数据）
Out[40]:
          0         1         2
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915
#填充缺失数据
df.fillna(0)
Out[41]:
          0         1         2
0  0.329323  0.000000  0.000000
1 -0.801842  0.000000  0.000000
2 -1.990021  0.000000  0.000000
3  0.593861  0.000000  0.709352
4 -0.367444  0.000000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915
df.fillna({1:0.5,3:-1})
Out[42]:
          0         1         2
0  0.329323  0.500000       NaN
1 -0.801842  0.500000       NaN
2 -1.990021  0.500000       NaN
3  0.593861  0.500000  0.709352
4 -0.367444  0.500000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915
_=df.fillna(0,inplace=True) #inplace=True会就地修改df；按pandas文档，此时返回值为None，而不是被填充对象的引用
df
Out[45]:
          0         1         2
0  0.329323  0.000000  0.000000
1 -0.801842  0.000000  0.000000
2 -1.990021  0.000000  0.000000
3  0.593861  0.000000  0.709352
4 -0.367444  0.000000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915
df.fillna(0,inplace=True)
df
Out[47]:
          0         1         2
0  0.329323  0.000000  0.000000
1 -0.801842  0.000000  0.000000
2 -1.990021  0.000000  0.000000
3  0.593861  0.000000  0.709352
4 -0.367444  0.000000 -0.503865
5  0.303713 -0.567919 -1.314300
6  1.207189  0.665677 -0.229915
df=DataFrame(np.random.randn(6,3))
df.ix[2:,1]=NA;df.ix[4:,2]=NA
df
Out[50]:
          0         1         2
0  1.266212  0.036003  1.549048
1  2.777833 -0.321865 -0.545796
2 -0.371542       NaN -0.395398
3  1.231016       NaN -1.528668
4 -1.023311       NaN       NaN
5  1.817814       NaN       NaN
df.fillna(method='ffill')
Out[51]:
          0         1         2
0  1.266212  0.036003  1.549048
1  2.777833 -0.321865 -0.545796
2 -0.371542 -0.321865 -0.395398
3  1.231016 -0.321865 -1.528668
4 -1.023311 -0.321865 -1.528668
5  1.817814 -0.321865 -1.528668
df.fillna(method='ffill',limit=2)
Out[52]:
          0         1         2
0  1.266212  0.036003  1.549048
1  2.777833 -0.321865 -0.545796
2 -0.371542 -0.321865 -0.395398
3  1.231016 -0.321865 -1.528668
4 -1.023311       NaN -1.528668
5  1.817814       NaN -1.528668
data=Series([1,NA,3.5,NA,7])
data.fillna(data.mean())
Out[54]:
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64