案例一：欧洲城市人口

最新推荐文章于 2024-06-24 21:34:04 发布

烤糊的可乐

最新推荐文章于 2024-06-24 21:34:04 发布

阅读量149

点赞数

文章标签： python matplotlib 开发语言

本文链接：https://blog.csdn.net/qq_52378102/article/details/130728251

版权

#载入包
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline      
#这句魔法命令的作用是：在notebook中启动静态图形

plt.rcParams['axes.unicode_minus'] = False    # 解决坐标轴刻度负号乱码
plt.rcParams['font.sans-serif'] = ['Simhei']  # 解决中文乱码问题

# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")

#导入数据
data = pd.read_csv("D:\Desktop\数据分析训练营第五期\P5 python数据清洗&案例\【案例1】欧洲人口结构探索性分析\european_cities.csv")
data

	Rank	City	State	Population	Date of census/estimate
0	1	London[2]	United Kingdom	8,615,246	1-Jun-14
1	2	Berlin	Germany	3,437,916	31-May-14
2	3	Madrid	Spain	3,165,235	1-Jan-14
3	4	Rome	Italy	2,872,086	30-Sep-14
4	5	Paris	France	2,273,305	1-Jan-13
...	...	...	...	...	...
100	101	Bonn	Germany	309,869	31-Dec-12
101	102	Malmö	Sweden	309,105	31-Mar-13
102	103	Nottingham	United Kingdom	308,735	30-Jun-12
103	104	Katowice	Poland	308,269	30-Jun-12
104	105	Kaunas	Lithuania	306,888	1-Jan-13

105 rows × 5 columns

dt = data.copy()   #对数据进行备份

1.对数据进行简单观察

dt.head()    #查看数据前五行

	Rank	City	State	Population	Date of census/estimate
0	1	London[2]	United Kingdom	8,615,246	1-Jun-14
1	2	Berlin	Germany	3,437,916	31-May-14
2	3	Madrid	Spain	3,165,235	1-Jan-14
3	4	Rome	Italy	2,872,086	30-Sep-14
4	5	Paris	France	2,273,305	1-Jan-13

dt.shape

(105, 5)

dt.columns

Index(['Rank', 'City', 'State', 'Population', 'Date of census/estimate'], dtype='object')

dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Rank                     105 non-null    int64 
 1   City                     105 non-null    object
 2   State                    105 non-null    object
 3   Population               105 non-null    object
 4   Date of census/estimate  105 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.2+ KB

2.查看数据缺失值和重复值的情况

dt.isnull().sum()   #默认按照列进行统计，sum（1）是对行进行统计

Rank                       0
City                       0
State                      0
Population                 0
Date of census/estimate    0
dtype: int64

dt.duplicated().sum()   #查看重复值

dt.drop_duplicates(inplace = True,ignore_index = True)   #去重

dt.shape     #查看去重之后的

(105, 5)

3.数据清洗

dt.head()

	Rank	City	State	Population	Date of census/estimate
0	1	London[2]	United Kingdom	8,615,246	1-Jun-14
1	2	Berlin	Germany	3,437,916	31-May-14
2	3	Madrid	Spain	3,165,235	1-Jan-14
3	4	Rome	Italy	2,872,086	30-Sep-14
4	5	Paris	France	2,273,305	1-Jan-13

dt["City"].values

array(['London[2]', 'Berlin', 'Madrid', 'Rome', 'Paris', 'Bucharest',
       'Vienna', 'Hamburg[10]', 'Budapest', 'Warsaw', 'Barcelona',
       'Munich', 'Milan', 'Sofia', 'Prague', 'Brussels[17]', 'Birmingham',
       'Cologne', 'Naples', 'Stockholm', 'Turin', 'Marseille',
       'Amsterdam', 'Zagreb', 'Valencia', 'Kraków', 'Leeds', 'Łódź',
       'Frankfurt', 'Riga', 'Seville', 'Palermo', 'Zaragoza', 'Athens',
       'Wrocław', 'Rotterdam', 'Helsinki', 'Stuttgart', 'Glasgow',
       'Genoa', 'Düsseldorf', 'Dortmund', 'Essen', 'Málaga', 'Copenhagen',
       'Sheffield', 'Lisbon', 'Poznań', 'Bremen', 'Vilnius', 'Leipzig',
       'Dresden', 'Gothenburg', 'Dublin', 'Bradford', 'Hanover',
       'The Hague', 'Manchester', 'Antwerp', 'Edinburgh', 'Nuremberg',
       'Duisburg', 'Lyon', 'Liverpool', 'Gdańsk', 'Toulouse', 'Murcia',
       'Tallinn', 'Bristol', 'Bratislava', 'Szczecin',
       'Palma de Mallorca', 'Bologna', 'Las Palmas', 'Florence', 'Brno',
       'Bydgoszcz', 'Bochum', 'Bilbao', 'Cardiff', 'Lublin', 'Nice',
       'Wuppertal', 'Plovdiv', 'Varna', 'Alicante', 'Leicester',
       'Utrecht', 'Córdoba', 'Bielefeld', 'Wakefield', 'Aarhus',
       'Cluj-Napoca', 'Coventry', 'Bari', 'Thessaloniki', 'Wirral',
       'Timișoara', 'Catania', 'Valladolid', 'Bonn', 'Malmö',
       'Nottingham', 'Katowice', 'Kaunas'], dtype=object)

dt["City"].str.isalpha()       #判断   是否为字母

0      False
1       True
2       True
3       True
4       True
       ...  
100     True
101     True
102     True
103     True
104     True
Name: City, Length: 105, dtype: bool

#将布尔型  放在数据[]后   会返回为True的原数据
dt["City"][~dt["City"].str.isalpha()]   #~表示取反

0             London[2]
7           Hamburg[10]
15         Brussels[17]
56            The Hague
71    Palma de Mallorca
73           Las Palmas
92          Cluj-Napoca
Name: City, dtype: object

dt.iloc[[0,7,15],1]    #只能用数字

0        London[2]
7      Hamburg[10]
15    Brussels[17]
Name: City, dtype: object

dt.iloc[[0,7,15],1] = ["London","Hamburg","Brussels"]

dt.iloc[[0,7,15],1]

0       London
7      Hamburg
15    Brussels
Name: City, dtype: object

dt["Population"].values

array(['8,615,246', '3,437,916', '3,165,235', '2,872,086', '2,273,305',
       '1,883,425', '1,794,770', '1,746,342', '1,744,665', '1,729,119',
       '1,602,386', '1,407,836', '1,332,516', '1,291,895', '1,246,780',
       '1,175,831', '1,092,330', '1,034,175', '989,845', '909,976',
       '898,095', '852,516', '813,562', '790,017', '786,424', '760,700',
       '757,655', '709,757', '701,350', '701,185', '696,676', '677,015',
       '666,058', '664,046', '632,432', '616,528', '605,523', '604,297',
       '596,550', '594,774', '593,682', '575,944', '569,884', '566,913',
       '559,440', '557,382', '547,631', '547,161', '546,451', '537,152',
       '531,562', '530,754', '528,014', '527,612', '524,619', '514,137',
       '510,909', '510,772', '510,610', '495,360', '495,121', '486,816',
       '484,344', '469,690', '460,354', '441,802', '441,354', '434,810',
       '432,451', '417,389', '409,211', '407,648', '384,202', '382,296',
       '377,207', '378,327', '362,286', '362,213', '351,629', '348,493',
       '348,120', '343,304', '342,885', '341,041', '335,819', '334,678',
       '331,606', '330,772', '328,841', '328,314', '327,627', '326,676',
       '324,576', '323,132', '322,751', '322,240', '320,229', '319,279',
       '315,576', '311,501', '309,869', '309,105', '308,735', '308,269',
       '306,888'], dtype=object)

dt.Population.str.replace(",","")  #用空值 取代  ，

0      8615246
1      3437916
2      3165235
3      2872086
4      2273305
        ...   
100     309869
101     309105
102     308735
103     308269
104     306888
Name: Population, Length: 105, dtype: object

dt.Population.str.replace(",","") .astype(int)

0      8615246
1      3437916
2      3165235
3      2872086
4      2273305
        ...   
100     309869
101     309105
102     308735
103     308269
104     306888
Name: Population, Length: 105, dtype: int32

dt["Population"] = dt.Population.str.replace(",","") .astype(int)

dt.Population

0      8615246
1      3437916
2      3165235
3      2872086
4      2273305
        ...   
100     309869
101     309105
102     308735
103     308269
104     306888
Name: Population, Length: 105, dtype: int32

dt["Date of census/estimate"]

0       1-Jun-14
1      31-May-14
2       1-Jan-14
3      30-Sep-14
4       1-Jan-13
         ...    
100    31-Dec-12
101    31-Mar-13
102    30-Jun-12
103    30-Jun-12
104     1-Jan-13
Name: Date of census/estimate, Length: 105, dtype: object

dt["Date of census/estimate"] = pd.to_datetime(dt["Date of census/estimate"])

dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Rank                     105 non-null    int64         
 1   City                     105 non-null    object        
 2   State                    105 non-null    object        
 3   Population               105 non-null    int32         
 4   Date of census/estimate  105 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int32(1), int64(1), object(2)
memory usage: 3.8+ KB

4.重置索引 index

dt.set_index(["Rank"],inplace=True)

dt.head()

	City	State	Population	Date of census/estimate
Rank
1	London	United Kingdom	8615246	2014-06-01
2	Berlin	Germany	3437916	2014-05-31
3	Madrid	Spain	3165235	2014-01-01
4	Rome	Italy	2872086	2014-09-30
5	Paris	France	2273305	2013-01-01

5.探索性分析

5.1 查看那个国家的城市最多

dt.State

Rank
1       United Kingdom
2              Germany
3                Spain
4                Italy
5               France
            ...       
101            Germany
102             Sweden
103     United Kingdom
104             Poland
105          Lithuania
Name: State, Length: 105, dtype: object

dt.duplicated("State").sum()

dt.drop_duplicates("State",ignore_index = True)   #去重  查看 国家多少个

	City	State	Population	Date of census/estimate
0	London	United Kingdom	8615246	2014-06-01
1	Berlin	Germany	3437916	2014-05-31
2	Madrid	Spain	3165235	2014-01-01
3	Rome	Italy	2872086	2014-09-30
4	Paris	France	2273305	2013-01-01
5	Bucharest	Romania	1883425	2011-10-20
6	Vienna	Austria	1794770	2015-01-01
7	Budapest	Hungary	1744665	2014-01-01
8	Warsaw	Poland	1729119	2014-03-31
9	Sofia	Bulgaria	1291895	2014-12-14
10	Prague	Czech Republic	1246780	2013-01-01
11	Brussels	Belgium	1175831	2014-01-01
12	Stockholm	Sweden	909976	2014-01-31
13	Amsterdam	Netherlands	813562	2014-05-31
14	Zagreb	Croatia	790017	2011-03-31
15	Riga	Latvia	701185	2014-01-01
16	Athens	Greece	664046	2011-05-24
17	Helsinki	Finland	605523	2013-02-28
18	Copenhagen	Denmark	559440	2013-01-01
19	Lisbon	Portugal	547631	2011-01-01
20	Vilnius	Lithuania	537152	2013-01-01
21	Dublin	Ireland	527612	2011-04-10
22	Tallinn	Estonia	434810	2015-02-01
23	Bratislava	Slovakia Slovak Republic	417389	2013-12-31

dt.State.nunique()   #查看State列 不一样的个数

方法一：使用sort_values进行排序

dt.groupby("State")     #对国家进行分组

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001780DD17D30>

dt.groupby("State")["City"].count()       #对city进行计数

State
 Austria                     1
 Belgium                     2
 Bulgaria                    3
 Croatia                     1
 Czech Republic              2
 Denmark                     2
 Estonia                     1
 Finland                     1
 France                      5
 Germany                    19
 Greece                      2
 Hungary                     1
 Ireland                     1
 Italy                      10
 Latvia                      1
 Lithuania                   2
 Netherlands                 4
 Poland                     10
 Portugal                    1
 Romania                     3
 Spain                      13
 Sweden                      3
 United Kingdom             16
Slovakia Slovak Republic     1
Name: City, dtype: int64

dt.groupby("State")["City"].count().sort_values(ascending = False)[:5]   #ascending表示升序   [:5] 表示显示前五行

State
 Germany           19
 United Kingdom    16
 Spain             13
 Poland            10
 Italy             10
Name: City, dtype: int64

方法二：用.nlargest()提取值最大的n个记录

dt.groupby("State")["City"].count().nlargest(6)   #前6个最大的

State
 Germany           19
 United Kingdom    16
 Spain             13
 Italy             10
 Poland            10
 France             5
Name: City, dtype: int64

5.2 查看那个国家人口最多

dt.head()

	City	State	Population	Date of census/estimate
Rank
1	London	United Kingdom	8615246	2014-06-01
2	Berlin	Germany	3437916	2014-05-31
3	Madrid	Spain	3165235	2014-01-01
4	Rome	Italy	2872086	2014-09-30
5	Paris	France	2273305	2013-01-01

dt.groupby("State")["Population"].sum().nlargest()

State
 United Kingdom    16011877
 Germany           15119548
 Spain             10041639
 Italy              8764067
 Poland             6267409
Name: Population, dtype: int32

popu = dt.groupby("State")["Population"].sum().sort_values( ascending = False)
popu

State
 United Kingdom             16011877
 Germany                    15119548
 Spain                      10041639
 Italy                       8764067
 Poland                      6267409
 France                      4395271
 Romania                     2527280
 Netherlands                 2271771
 Bulgaria                    1968755
 Austria                     1794770
 Sweden                      1747095
 Hungary                     1744665
 Belgium                     1686441
 Czech Republic              1625107
 Greece                       986286
 Denmark                      886116
 Lithuania                    844040
 Croatia                      790017
 Latvia                       701185
 Finland                      605523
 Portugal                     547631
 Ireland                      527612
 Estonia                      434810
Slovakia Slovak Republic      417389
Name: Population, dtype: int32

#设置画布
plt.figure(figsize=(15,5))               #设置大小
sns.barplot(popu.index,popu)             #绘制条形图，popu.index为横坐标，popu为纵坐标   也可以写为popu.values
plt.xticks(rotation = 45,ha = "right")   #旋转坐标轴标签  ha是对齐方式

# 设置刻度文本的大小
plt.yticks(fontsize=10)
plt.xticks(fontsize=10)

# 设置轴标签的文本以及大小
plt.ylabel('国家名',fontsize=12)
plt.xlabel('主要城市人口（千万)',fontsize=12)
plt.show()

# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ZzWolWaJ-1684310503578)(output_47_0.png)]

#画图
plt.figure(figsize=(15,5))                                 #设置大小
sns.barplot(popu.values, popu.index, orient = "h")         #orient表示转项
plt.xticks(rotation = 45, ha = "right")                    #旋转坐标轴标签  ha是对齐方式
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pIfTRJOr-1684310503579)(output_48_0.png)]

5.3 查看欧洲每年人口变化情况

#使用apply循环进行  提取x的year
dt["year"] = dt["Date of census/estimate"].apply(lambda x: x.year)     #map映射也行
dt.head()

	City	State	Population	Date of census/estimate	year
Rank
1	London	United Kingdom	8615246	2014-06-01	2014
2	Berlin	Germany	3437916	2014-05-31	2014
3	Madrid	Spain	3165235	2014-01-01	2014
4	Rome	Italy	2872086	2014-09-30	2014
5	Paris	France	2273305	2013-01-01	2013

dt.groupby("year")["Population"].sum().plot(kind = "line",color = "r",title = "欧洲人口变化情况",xlabel = "年份",ylabel = "人口数量");

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3Qi1jJoW-1684310503579)(output_51_0.png)]

#设置画布
plt.figure(figsize=(10,5))               #设置大小
sns.lineplot(dt.year,dt.Population)             #绘制条形图，popu.index为横坐标，popu为纵坐标   也可以写为popu.values
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cLJ8F3Dp-1684310503579)(output_52_0.png)]

dt.pivot_table(values="Population",index="State",columns="year",aggfunc='sum',margins=True)

year	2010	2011	2012	2013	2014	2015	All
State
Austria	NaN	NaN	NaN	NaN	NaN	1794770.0	1794770
Belgium	NaN	NaN	NaN	NaN	1686441.0	NaN	1686441
Bulgaria	NaN	NaN	NaN	676860.0	1291895.0	NaN	1968755
Croatia	NaN	790017.0	NaN	NaN	NaN	NaN	790017
Czech Republic	NaN	NaN	NaN	1625107.0	NaN	NaN	1625107
Denmark	NaN	NaN	NaN	559440.0	326676.0	NaN	886116
Estonia	NaN	NaN	NaN	NaN	NaN	434810.0	434810
Finland	NaN	NaN	NaN	605523.0	NaN	NaN	605523
France	1269450.0	NaN	852516.0	2273305.0	NaN	NaN	4395271
Germany	NaN	NaN	3979488.0	7702144.0	3437916.0	NaN	15119548
Greece	NaN	986286.0	NaN	NaN	NaN	NaN	986286
Hungary	NaN	NaN	NaN	NaN	1744665.0	NaN	1744665
Ireland	NaN	527612.0	NaN	NaN	NaN	NaN	527612
Italy	NaN	NaN	NaN	1399736.0	7364331.0	NaN	8764067
Latvia	NaN	NaN	NaN	NaN	701185.0	NaN	701185
Lithuania	NaN	NaN	NaN	844040.0	NaN	NaN	844040
Netherlands	NaN	NaN	NaN	616528.0	1655243.0	NaN	2271771
Poland	NaN	NaN	1888240.0	NaN	4379169.0	NaN	6267409
Portugal	NaN	547631.0	NaN	NaN	NaN	NaN	547631
Romania	NaN	2527280.0	NaN	NaN	NaN	NaN	2527280
Spain	NaN	NaN	2557947.0	NaN	7483692.0	NaN	10041639
Sweden	NaN	NaN	NaN	837119.0	909976.0	NaN	1747095
United Kingdom	NaN	495360.0	5212391.0	1688880.0	8615246.0	NaN	16011877
Slovakia Slovak Republic	NaN	NaN	NaN	417389.0	NaN	NaN	417389
All	1269450.0	5874186.0	14490582.0	19246071.0	39596435.0	2229580.0	82706304