#载入包
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#这句魔法命令的作用是:在notebook中启动静态图形
plt.rcParams['axes.unicode_minus'] = False # 解决坐标轴刻度负号乱码
plt.rcParams['font.sans-serif'] = ['Simhei'] # 解决中文乱码问题
# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")
#导入数据
data = pd.read_csv("D:\Desktop\数据分析训练营第五期\P5 python数据清洗&案例\【案例1】欧洲人口结构探索性分析\european_cities.csv")
data
Rank | City | State | Population | Date of census/estimate | |
---|---|---|---|---|---|
0 | 1 | London[2] | United Kingdom | 8,615,246 | 1-Jun-14 |
1 | 2 | Berlin | Germany | 3,437,916 | 31-May-14 |
2 | 3 | Madrid | Spain | 3,165,235 | 1-Jan-14 |
3 | 4 | Rome | Italy | 2,872,086 | 30-Sep-14 |
4 | 5 | Paris | France | 2,273,305 | 1-Jan-13 |
... | ... | ... | ... | ... | ... |
100 | 101 | Bonn | Germany | 309,869 | 31-Dec-12 |
101 | 102 | Malmö | Sweden | 309,105 | 31-Mar-13 |
102 | 103 | Nottingham | United Kingdom | 308,735 | 30-Jun-12 |
103 | 104 | Katowice | Poland | 308,269 | 30-Jun-12 |
104 | 105 | Kaunas | Lithuania | 306,888 | 1-Jan-13 |
105 rows × 5 columns
dt = data.copy() #对数据进行备份
1.对数据进行简单观察
dt.head() #查看数据前五行
Rank | City | State | Population | Date of census/estimate | |
---|---|---|---|---|---|
0 | 1 | London[2] | United Kingdom | 8,615,246 | 1-Jun-14 |
1 | 2 | Berlin | Germany | 3,437,916 | 31-May-14 |
2 | 3 | Madrid | Spain | 3,165,235 | 1-Jan-14 |
3 | 4 | Rome | Italy | 2,872,086 | 30-Sep-14 |
4 | 5 | Paris | France | 2,273,305 | 1-Jan-13 |
dt.shape
(105, 5)
dt.columns
Index(['Rank', 'City', 'State', 'Population', 'Date of census/estimate'], dtype='object')
dt.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Rank 105 non-null int64
1 City 105 non-null object
2 State 105 non-null object
3 Population 105 non-null object
4 Date of census/estimate 105 non-null object
dtypes: int64(1), object(4)
memory usage: 4.2+ KB
2.查看数据缺失值和重复值的情况
dt.isnull().sum() #默认按照列进行统计,sum(1)是对行进行统计
Rank 0
City 0
State 0
Population 0
Date of census/estimate 0
dtype: int64
dt.duplicated().sum() #查看重复值
0
dt.drop_duplicates(inplace = True,ignore_index = True) #去重
dt.shape #查看去重之后的
(105, 5)
3.数据清洗
dt.head()
Rank | City | State | Population | Date of census/estimate | |
---|---|---|---|---|---|
0 | 1 | London[2] | United Kingdom | 8,615,246 | 1-Jun-14 |
1 | 2 | Berlin | Germany | 3,437,916 | 31-May-14 |
2 | 3 | Madrid | Spain | 3,165,235 | 1-Jan-14 |
3 | 4 | Rome | Italy | 2,872,086 | 30-Sep-14 |
4 | 5 | Paris | France | 2,273,305 | 1-Jan-13 |
dt["City"].values
array(['London[2]', 'Berlin', 'Madrid', 'Rome', 'Paris', 'Bucharest',
'Vienna', 'Hamburg[10]', 'Budapest', 'Warsaw', 'Barcelona',
'Munich', 'Milan', 'Sofia', 'Prague', 'Brussels[17]', 'Birmingham',
'Cologne', 'Naples', 'Stockholm', 'Turin', 'Marseille',
'Amsterdam', 'Zagreb', 'Valencia', 'Kraków', 'Leeds', 'Łódź',
'Frankfurt', 'Riga', 'Seville', 'Palermo', 'Zaragoza', 'Athens',
'Wrocław', 'Rotterdam', 'Helsinki', 'Stuttgart', 'Glasgow',
'Genoa', 'Düsseldorf', 'Dortmund', 'Essen', 'Málaga', 'Copenhagen',
'Sheffield', 'Lisbon', 'Poznań', 'Bremen', 'Vilnius', 'Leipzig',
'Dresden', 'Gothenburg', 'Dublin', 'Bradford', 'Hanover',
'The Hague', 'Manchester', 'Antwerp', 'Edinburgh', 'Nuremberg',
'Duisburg', 'Lyon', 'Liverpool', 'Gdańsk', 'Toulouse', 'Murcia',
'Tallinn', 'Bristol', 'Bratislava', 'Szczecin',
'Palma de Mallorca', 'Bologna', 'Las Palmas', 'Florence', 'Brno',
'Bydgoszcz', 'Bochum', 'Bilbao', 'Cardiff', 'Lublin', 'Nice',
'Wuppertal', 'Plovdiv', 'Varna', 'Alicante', 'Leicester',
'Utrecht', 'Córdoba', 'Bielefeld', 'Wakefield', 'Aarhus',
'Cluj-Napoca', 'Coventry', 'Bari', 'Thessaloniki', 'Wirral',
'Timișoara', 'Catania', 'Valladolid', 'Bonn', 'Malmö',
'Nottingham', 'Katowice', 'Kaunas'], dtype=object)
dt["City"].str.isalpha() #判断 是否为字母
0 False
1 True
2 True
3 True
4 True
...
100 True
101 True
102 True
103 True
104 True
Name: City, Length: 105, dtype: bool
#将布尔型 放在数据[]后 会返回为True的原数据
dt["City"][~dt["City"].str.isalpha()] #~表示取反
0 London[2]
7 Hamburg[10]
15 Brussels[17]
56 The Hague
71 Palma de Mallorca
73 Las Palmas
92 Cluj-Napoca
Name: City, dtype: object
dt.iloc[[0,7,15],1] #只能用数字
0 London[2]
7 Hamburg[10]
15 Brussels[17]
Name: City, dtype: object
dt.iloc[[0,7,15],1] = ["London","Hamburg","Brussels"]
dt.iloc[[0,7,15],1]
0 London
7 Hamburg
15 Brussels
Name: City, dtype: object
dt["Population"].values
array(['8,615,246', '3,437,916', '3,165,235', '2,872,086', '2,273,305',
'1,883,425', '1,794,770', '1,746,342', '1,744,665', '1,729,119',
'1,602,386', '1,407,836', '1,332,516', '1,291,895', '1,246,780',
'1,175,831', '1,092,330', '1,034,175', '989,845', '909,976',
'898,095', '852,516', '813,562', '790,017', '786,424', '760,700',
'757,655', '709,757', '701,350', '701,185', '696,676', '677,015',
'666,058', '664,046', '632,432', '616,528', '605,523', '604,297',
'596,550', '594,774', '593,682', '575,944', '569,884', '566,913',
'559,440', '557,382', '547,631', '547,161', '546,451', '537,152',
'531,562', '530,754', '528,014', '527,612', '524,619', '514,137',
'510,909', '510,772', '510,610', '495,360', '495,121', '486,816',
'484,344', '469,690', '460,354', '441,802', '441,354', '434,810',
'432,451', '417,389', '409,211', '407,648', '384,202', '382,296',
'377,207', '378,327', '362,286', '362,213', '351,629', '348,493',
'348,120', '343,304', '342,885', '341,041', '335,819', '334,678',
'331,606', '330,772', '328,841', '328,314', '327,627', '326,676',
'324,576', '323,132', '322,751', '322,240', '320,229', '319,279',
'315,576', '311,501', '309,869', '309,105', '308,735', '308,269',
'306,888'], dtype=object)
dt.Population.str.replace(",","") #用空值 取代 ,
0 8615246
1 3437916
2 3165235
3 2872086
4 2273305
...
100 309869
101 309105
102 308735
103 308269
104 306888
Name: Population, Length: 105, dtype: object
dt.Population.str.replace(",","") .astype(int)
0 8615246
1 3437916
2 3165235
3 2872086
4 2273305
...
100 309869
101 309105
102 308735
103 308269
104 306888
Name: Population, Length: 105, dtype: int32
dt["Population"] = dt.Population.str.replace(",","") .astype(int)
dt.Population
0 8615246
1 3437916
2 3165235
3 2872086
4 2273305
...
100 309869
101 309105
102 308735
103 308269
104 306888
Name: Population, Length: 105, dtype: int32
dt["Date of census/estimate"]
0 1-Jun-14
1 31-May-14
2 1-Jan-14
3 30-Sep-14
4 1-Jan-13
...
100 31-Dec-12
101 31-Mar-13
102 30-Jun-12
103 30-Jun-12
104 1-Jan-13
Name: Date of census/estimate, Length: 105, dtype: object
dt["Date of census/estimate"] = pd.to_datetime(dt["Date of census/estimate"])
dt.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Rank 105 non-null int64
1 City 105 non-null object
2 State 105 non-null object
3 Population 105 non-null int32
4 Date of census/estimate 105 non-null datetime64[ns]
dtypes: datetime64[ns](1), int32(1), int64(1), object(2)
memory usage: 3.8+ KB
4.重置索引 index
dt.set_index(["Rank"],inplace=True)
dt.head()
City | State | Population | Date of census/estimate | |
---|---|---|---|---|
Rank | ||||
1 | London | United Kingdom | 8615246 | 2014-06-01 |
2 | Berlin | Germany | 3437916 | 2014-05-31 |
3 | Madrid | Spain | 3165235 | 2014-01-01 |
4 | Rome | Italy | 2872086 | 2014-09-30 |
5 | Paris | France | 2273305 | 2013-01-01 |
5.探索性分析
- 5.1 查看那个国家的城市最多
dt.State
Rank
1 United Kingdom
2 Germany
3 Spain
4 Italy
5 France
...
101 Germany
102 Sweden
103 United Kingdom
104 Poland
105 Lithuania
Name: State, Length: 105, dtype: object
dt.duplicated("State").sum()
81
dt.drop_duplicates("State",ignore_index = True) #去重 查看 国家多少个
City | State | Population | Date of census/estimate | |
---|---|---|---|---|
0 | London | United Kingdom | 8615246 | 2014-06-01 |
1 | Berlin | Germany | 3437916 | 2014-05-31 |
2 | Madrid | Spain | 3165235 | 2014-01-01 |
3 | Rome | Italy | 2872086 | 2014-09-30 |
4 | Paris | France | 2273305 | 2013-01-01 |
5 | Bucharest | Romania | 1883425 | 2011-10-20 |
6 | Vienna | Austria | 1794770 | 2015-01-01 |
7 | Budapest | Hungary | 1744665 | 2014-01-01 |
8 | Warsaw | Poland | 1729119 | 2014-03-31 |
9 | Sofia | Bulgaria | 1291895 | 2014-12-14 |
10 | Prague | Czech Republic | 1246780 | 2013-01-01 |
11 | Brussels | Belgium | 1175831 | 2014-01-01 |
12 | Stockholm | Sweden | 909976 | 2014-01-31 |
13 | Amsterdam | Netherlands | 813562 | 2014-05-31 |
14 | Zagreb | Croatia | 790017 | 2011-03-31 |
15 | Riga | Latvia | 701185 | 2014-01-01 |
16 | Athens | Greece | 664046 | 2011-05-24 |
17 | Helsinki | Finland | 605523 | 2013-02-28 |
18 | Copenhagen | Denmark | 559440 | 2013-01-01 |
19 | Lisbon | Portugal | 547631 | 2011-01-01 |
20 | Vilnius | Lithuania | 537152 | 2013-01-01 |
21 | Dublin | Ireland | 527612 | 2011-04-10 |
22 | Tallinn | Estonia | 434810 | 2015-02-01 |
23 | Bratislava | Slovakia Slovak Republic | 417389 | 2013-12-31 |
dt.State.nunique() #查看State列 不一样的个数
24
方法一:使用sort_values进行排序
dt.groupby("State") #对国家进行分组
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001780DD17D30>
dt.groupby("State")["City"].count() #对city进行计数
State
Austria 1
Belgium 2
Bulgaria 3
Croatia 1
Czech Republic 2
Denmark 2
Estonia 1
Finland 1
France 5
Germany 19
Greece 2
Hungary 1
Ireland 1
Italy 10
Latvia 1
Lithuania 2
Netherlands 4
Poland 10
Portugal 1
Romania 3
Spain 13
Sweden 3
United Kingdom 16
Slovakia Slovak Republic 1
Name: City, dtype: int64
dt.groupby("State")["City"].count().sort_values(ascending = False)[:5] #ascending表示升序 [:5] 表示显示前五行
State
Germany 19
United Kingdom 16
Spain 13
Poland 10
Italy 10
Name: City, dtype: int64
方法二:用.nlargest()提取值最大的n个记录
dt.groupby("State")["City"].count().nlargest(6) #前6个最大的
State
Germany 19
United Kingdom 16
Spain 13
Italy 10
Poland 10
France 5
Name: City, dtype: int64
- 5.2 查看那个国家人口最多
dt.head()
City | State | Population | Date of census/estimate | |
---|---|---|---|---|
Rank | ||||
1 | London | United Kingdom | 8615246 | 2014-06-01 |
2 | Berlin | Germany | 3437916 | 2014-05-31 |
3 | Madrid | Spain | 3165235 | 2014-01-01 |
4 | Rome | Italy | 2872086 | 2014-09-30 |
5 | Paris | France | 2273305 | 2013-01-01 |
dt.groupby("State")["Population"].sum().nlargest()
State
United Kingdom 16011877
Germany 15119548
Spain 10041639
Italy 8764067
Poland 6267409
Name: Population, dtype: int32
popu = dt.groupby("State")["Population"].sum().sort_values( ascending = False)
popu
State
United Kingdom 16011877
Germany 15119548
Spain 10041639
Italy 8764067
Poland 6267409
France 4395271
Romania 2527280
Netherlands 2271771
Bulgaria 1968755
Austria 1794770
Sweden 1747095
Hungary 1744665
Belgium 1686441
Czech Republic 1625107
Greece 986286
Denmark 886116
Lithuania 844040
Croatia 790017
Latvia 701185
Finland 605523
Portugal 547631
Ireland 527612
Estonia 434810
Slovakia Slovak Republic 417389
Name: Population, dtype: int32
#设置画布
plt.figure(figsize=(15,5)) #设置大小
sns.barplot(popu.index,popu) #绘制条形图,popu.index为横坐标,popu为纵坐标 也可以写为popu.values
plt.xticks(rotation = 45,ha = "right") #旋转坐标轴标签 ha是对齐方式
# 设置刻度文本的大小
plt.yticks(fontsize=10)
plt.xticks(fontsize=10)
# 设置轴标签的文本以及大小
plt.ylabel('国家名',fontsize=12)
plt.xlabel('主要城市人口(千万)',fontsize=12)
plt.show()
# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ZzWolWaJ-1684310503578)(output_47_0.png)]
#画图
plt.figure(figsize=(15,5)) #设置大小
sns.barplot(popu.values, popu.index, orient = "h") #orient表示转项
plt.xticks(rotation = 45, ha = "right") #旋转坐标轴标签 ha是对齐方式
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pIfTRJOr-1684310503579)(output_48_0.png)]
- 5.3 查看欧洲每年人口变化情况
#使用apply循环进行 提取x的year
dt["year"] = dt["Date of census/estimate"].apply(lambda x: x.year) #map映射也行
dt.head()
City | State | Population | Date of census/estimate | year | |
---|---|---|---|---|---|
Rank | |||||
1 | London | United Kingdom | 8615246 | 2014-06-01 | 2014 |
2 | Berlin | Germany | 3437916 | 2014-05-31 | 2014 |
3 | Madrid | Spain | 3165235 | 2014-01-01 | 2014 |
4 | Rome | Italy | 2872086 | 2014-09-30 | 2014 |
5 | Paris | France | 2273305 | 2013-01-01 | 2013 |
dt.groupby("year")["Population"].sum().plot(kind = "line",color = "r",title = "欧洲人口变化情况",xlabel = "年份",ylabel = "人口数量");
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3Qi1jJoW-1684310503579)(output_51_0.png)]
#设置画布
plt.figure(figsize=(10,5)) #设置大小
sns.lineplot(dt.year,dt.Population) #绘制条形图,popu.index为横坐标,popu为纵坐标 也可以写为popu.values
plt.show()
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cLJ8F3Dp-1684310503579)(output_52_0.png)]
dt.pivot_table(values="Population",index="State",columns="year",aggfunc='sum',margins=True)
year | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | All |
---|---|---|---|---|---|---|---|
State | |||||||
Austria | NaN | NaN | NaN | NaN | NaN | 1794770.0 | 1794770 |
Belgium | NaN | NaN | NaN | NaN | 1686441.0 | NaN | 1686441 |
Bulgaria | NaN | NaN | NaN | 676860.0 | 1291895.0 | NaN | 1968755 |
Croatia | NaN | 790017.0 | NaN | NaN | NaN | NaN | 790017 |
Czech Republic | NaN | NaN | NaN | 1625107.0 | NaN | NaN | 1625107 |
Denmark | NaN | NaN | NaN | 559440.0 | 326676.0 | NaN | 886116 |
Estonia | NaN | NaN | NaN | NaN | NaN | 434810.0 | 434810 |
Finland | NaN | NaN | NaN | 605523.0 | NaN | NaN | 605523 |
France | 1269450.0 | NaN | 852516.0 | 2273305.0 | NaN | NaN | 4395271 |
Germany | NaN | NaN | 3979488.0 | 7702144.0 | 3437916.0 | NaN | 15119548 |
Greece | NaN | 986286.0 | NaN | NaN | NaN | NaN | 986286 |
Hungary | NaN | NaN | NaN | NaN | 1744665.0 | NaN | 1744665 |
Ireland | NaN | 527612.0 | NaN | NaN | NaN | NaN | 527612 |
Italy | NaN | NaN | NaN | 1399736.0 | 7364331.0 | NaN | 8764067 |
Latvia | NaN | NaN | NaN | NaN | 701185.0 | NaN | 701185 |
Lithuania | NaN | NaN | NaN | 844040.0 | NaN | NaN | 844040 |
Netherlands | NaN | NaN | NaN | 616528.0 | 1655243.0 | NaN | 2271771 |
Poland | NaN | NaN | 1888240.0 | NaN | 4379169.0 | NaN | 6267409 |
Portugal | NaN | 547631.0 | NaN | NaN | NaN | NaN | 547631 |
Romania | NaN | 2527280.0 | NaN | NaN | NaN | NaN | 2527280 |
Spain | NaN | NaN | 2557947.0 | NaN | 7483692.0 | NaN | 10041639 |
Sweden | NaN | NaN | NaN | 837119.0 | 909976.0 | NaN | 1747095 |
United Kingdom | NaN | 495360.0 | 5212391.0 | 1688880.0 | 8615246.0 | NaN | 16011877 |
Slovakia Slovak Republic | NaN | NaN | NaN | 417389.0 | NaN | NaN | 417389 |
All | 1269450.0 | 5874186.0 | 14490582.0 | 19246071.0 | 39596435.0 | 2229580.0 | 82706304 |