Pandas中有哪些非值数据
1. NaN 是什么
NaN(Not a Number)表示缺失的数据;它本身是一个float类型的特殊值,与任何值(包括它自身)比较都不相等
# NumPy 2.0 removed the `NaN` alias, so import only `nan` and bind the legacy
# spelling locally; `NaN` and `nan` remain the very same float object.
from numpy import nan
NaN = nan
print(nan)
nan
# NaN compares unequal to every value, itself included, so each comparison
# below prints False.
for other in (True, False, 0, '', NaN, nan):
    print(NaN == other)
False
False
False
False
False
False
import pandas as pd
# pd.isnull reports whether a value is missing; pd.notnull is its negation.
x, y, n = NaN, nan, 20
for flag in (pd.isnull(x), pd.isnull(y), pd.notnull(n)):
    print(flag)
True
True
True
2.数据遗失的原因
2.1数据缺失造成的NaN
import pandas as pd
# Load the survey visit records with the default NA handling and display them.
visited_file = './data/survey_visited.csv'
visited_default = pd.read_csv(visited_file)
print(visited_default)
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
# Do not convert empty values to NaN: switch off the default NA markers and
# treat only a single space as missing.
visited_raw = pd.read_csv(
    visited_file,
    keep_default_na=False,
    na_values=[' '],
)
print(visited_raw)
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
2.2合并数据导致的NaN
# Load the two tables that are merged in the next step and show each of them.
visited = pd.read_csv('./data/survey_visited.csv')
survey = pd.read_csv('./data/survey_survey.csv')
for table in (visited, survey):
    print(table)
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
taken person quant reading
0 619 dyer rad 9.82
1 619 dyer sal 0.13
2 622 dyer rad 7.80
3 622 dyer sal 0.09
4 734 pb rad 8.41
5 734 lake sal 0.05
6 734 pb temp -21.50
7 735 pb rad 7.22
8 735 NaN sal 0.06
9 735 NaN temp -26.00
10 751 pb rad 4.35
11 751 pb temp -18.50
12 751 lake sal 0.10
13 752 lake rad 2.19
14 752 lake sal 0.09
15 752 lake temp -16.00
16 752 roe sal 41.60
17 837 lake rad 1.46
18 837 lake sal 0.21
19 837 roe sal 22.50
20 844 roe rad 11.25
# Join every visit to its survey readings by matching visited.ident against
# survey.taken; NaN already present in either table is carried into the result.
vs = pd.merge(visited, survey, left_on='ident', right_on='taken')
print(vs)
ident site dated taken person quant reading
0 619 DR-1 1927-02-08 619 dyer rad 9.82
1 619 DR-1 1927-02-08 619 dyer sal 0.13
2 622 DR-1 1927-02-10 622 dyer rad 7.80
3 622 DR-1 1927-02-10 622 dyer sal 0.09
4 734 DR-3 1939-01-07 734 pb rad 8.41
5 734 DR-3 1939-01-07 734 lake sal 0.05
6 734 DR-3 1939-01-07 734 pb temp -21.50
7 735 DR-3 1930-01-12 735 pb rad 7.22
8 735 DR-3 1930-01-12 735 NaN sal 0.06
9 735 DR-3 1930-01-12 735 NaN temp -26.00
10 751 DR-3 1930-02-26 751 pb rad 4.35
11 751 DR-3 1930-02-26 751 pb temp -18.50
12 751 DR-3 1930-02-26 751 lake sal 0.10
13 752 DR-3 NaN 752 lake rad 2.19
14 752 DR-3 NaN 752 lake sal 0.09
15 752 DR-3 NaN 752 lake temp -16.00
16 752 DR-3 NaN 752 roe sal 41.60
17 837 MSK-4 1932-01-14 837 lake rad 1.46
18 837 MSK-4 1932-01-14 837 lake sal 0.21
19 837 MSK-4 1932-01-14 837 roe sal 22.50
20 844 DR-1 1932-03-22 844 roe rad 11.25
2.3用户输入导致的NaN
# Build a small two-row table from row records; it has no missing values yet.
scientisits = pd.DataFrame(
    [['Bill', 'Chemist'], ['Mike', 'Statist']],
    columns=['Name', 'Occupation'],
)
print(scientisits)
Name Occupation
0 Bill Chemist
1 Mike Statist
from numpy import nan
# Assigning the scalar nan to a new column fills that column with NaN in
# every row.
scientisits['missing'] = nan
print(scientisits)
Name Occupation missing
0 Bill Chemist NaN
1 Mike Statist NaN
2.4重建索引
# Load the tab-separated gapminder data set and preview its first five rows.
gapminder = pd.read_csv('./data/gapminder.tsv', sep='\t')
preview = gapminder.head(n=5)
print(preview)
country continent year lifeExp pop gdpPercap
0 Afghanistan Asia 1952 28.801 8425333 779.445314
1 Afghanistan Asia 1957 30.332 9240934 820.853030
2 Afghanistan Asia 1962 31.997 10267083 853.100710
3 Afghanistan Asia 1967 34.020 11537966 836.197138
4 Afghanistan Asia 1972 36.088 13079460 739.981106
# Average life expectancy for each year present in the data.
lifeexp_by_year = gapminder.groupby(['year'])['lifeExp']
life_exp = lifeexp_by_year.mean()
print(life_exp)
year
1952 49.057620
1957 51.507401
1962 53.609249
1967 55.678290
1972 57.647386
1977 59.570157
1982 61.533197
1987 63.212613
1992 64.160338
1997 65.014676
2002 65.694923
2007 67.007423
Name: lifeExp, dtype: float64
重建索引后,原索引中不存在的年份对应的值为NaN
# Reindex onto 2000-2002: years absent from the original index come back as
# NaN. `.loc` with a list that contains missing labels raises KeyError in
# pandas >= 1.0, so `reindex` is the supported way to do this.
print(life_exp.reindex(range(2000, 2003)))
year
2000 NaN
2001 NaN
2002 65.694923
Name: lifeExp, dtype: float64
3.处理非值数据
3.1填充NaN
import pandas as pd
# Reload the tab-separated gapminder data set and show the first five rows.
gapminder = pd.read_csv('./data/gapminder.tsv', sep='\t')
first_rows = gapminder.head(n=5)
print(first_rows)
country continent year lifeExp pop gdpPercap
0 Afghanistan Asia 1952 28.801 8425333 779.445314
1 Afghanistan Asia 1957 30.332 9240934 820.853030
2 Afghanistan Asia 1962 31.997 10267083 853.100710
3 Afghanistan Asia 1967 34.020 11537966 836.197138
4 Afghanistan Asia 1972 36.088 13079460 739.981106
# Mean life expectancy per year, reindexed onto 2000-2009 so that the years
# without data show up as NaN. `.loc` with a list that contains missing
# labels raises KeyError in pandas >= 1.0, hence `reindex`.
life_exp = gapminder.groupby(['year'])['lifeExp'].mean()
a = life_exp.reindex(range(2000, 2010))
print(a)
year
2000 NaN
2001 NaN
2002 65.694923
2003 NaN
2004 NaN
2005 NaN
2006 NaN
2007 67.007423
2008 NaN
2009 NaN
Name: lifeExp, dtype: float64
填充指定值
# Replace every NaN with a fixed value, first the number 0 and then the
# string '*'.
for filler in (0, '*'):
    print(a.fillna(filler))
year
2000 0.000000
2001 0.000000
2002 65.694923
2003 0.000000
2004 0.000000
2005 0.000000
2006 0.000000
2007 67.007423
2008 0.000000
2009 0.000000
Name: lifeExp, dtype: float64
year
2000 *
2001 *
2002 65.6949
2003 *
2004 *
2005 *
2006 *
2007 67.0074
2008 *
2009 *
Name: lifeExp, dtype: object
forward 填充
# Forward fill: each NaN takes the last valid value before it, so leading NaN
# stay. `fillna(method=...)` is deprecated since pandas 2.1; `ffill()` is the
# direct equivalent.
print(a.ffill())
year
2000 NaN
2001 NaN
2002 65.694923
2003 65.694923
2004 65.694923
2005 65.694923
2006 65.694923
2007 67.007423
2008 67.007423
2009 67.007423
Name: lifeExp, dtype: float64
backward 填充
# Backward fill: each NaN takes the next valid value after it, so trailing
# NaN stay. `fillna(method=...)` is deprecated since pandas 2.1; `bfill()` is
# the direct equivalent.
print(a.bfill())
year
2000 65.694923
2001 65.694923
2002 65.694923
2003 67.007423
2004 67.007423
2005 67.007423
2006 67.007423
2007 67.007423
2008 NaN
2009 NaN
Name: lifeExp, dtype: float64
先bfill后ffill填充
# Backward fill first, then forward fill the trailing NaN that bfill cannot
# reach. Uses `bfill()`/`ffill()` because `fillna(method=...)` is deprecated
# since pandas 2.1.
print(a.bfill().ffill())
year
2000 65.694923
2001 65.694923
2002 65.694923
2003 67.007423
2004 67.007423
2005 67.007423
2006 67.007423
2007 67.007423
2008 67.007423
2009 67.007423
Name: lifeExp, dtype: float64
线性插值
# Show the series before and after interpolating its gaps.
print(a)
a_interpolated = a.interpolate()
print(a_interpolated)
year
2000 NaN
2001 NaN
2002 65.694923
2003 NaN
2004 NaN
2005 NaN
2006 NaN
2007 67.007423
2008 NaN
2009 NaN
Name: lifeExp, dtype: float64
year
2000 NaN
2001 NaN
2002 65.694923
2003 65.957423
2004 66.219923
2005 66.482423
2006 66.744923
2007 67.007423
2008 67.007423
2009 67.007423
Name: lifeExp, dtype: float64
# A series with leading, interior and trailing gaps, used to show what
# interpolate() does in each position.
gappy_values = [NaN, NaN, 2, NaN, 4, NaN, 6, NaN, 8, NaN, NaN, NaN]
aa = pd.Series(gappy_values)
print(aa.interpolate())
0 NaN
1 NaN
2 2.0
3 3.0
4 4.0
5 5.0
6 6.0
7 7.0
8 8.0
9 8.0
10 8.0
11 8.0
dtype: float64
3.2删除包含NaN的行
# Print the series as-is, then again with every NaN entry dropped.
for view in (a, a.dropna()):
    print(view)
year
2000 NaN
2001 NaN
2002 65.694923
2003 NaN
2004 NaN
2005 NaN
2006 NaN
2007 67.007423
2008 NaN
2009 NaN
Name: lifeExp, dtype: float64
year
2002 65.694923
2007 67.007423
Name: lifeExp, dtype: float64