案例一:欧洲城市人口

#载入包
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline      
#这句魔法命令的作用是:在notebook中启动静态图形

plt.rcParams['axes.unicode_minus'] = False    # 解决坐标轴刻度负号乱码
plt.rcParams['font.sans-serif'] = ['Simhei']  # 解决中文乱码问题

# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")
# Load the data set.
# The Windows path is written as a raw string (r"...") so that the backslashes
# are taken literally; in a normal string literal sequences such as "\D", "\P"
# or "\e" are invalid escapes that trigger a SyntaxWarning on recent Python
# versions and would silently change the path if a folder started with e.g.
# "n" or "t".
data = pd.read_csv(r"D:\Desktop\数据分析训练营第五期\P5 python数据清洗&案例\【案例1】欧洲人口结构探索性分析\european_cities.csv")
data
RankCityStatePopulationDate of census/estimate
01London[2]United Kingdom8,615,2461-Jun-14
12BerlinGermany3,437,91631-May-14
23MadridSpain3,165,2351-Jan-14
34RomeItaly2,872,08630-Sep-14
45ParisFrance2,273,3051-Jan-13
..................
100101BonnGermany309,86931-Dec-12
101102MalmöSweden309,10531-Mar-13
102103NottinghamUnited Kingdom308,73530-Jun-12
103104KatowicePoland308,26930-Jun-12
104105KaunasLithuania306,8881-Jan-13

105 rows × 5 columns

dt = data.copy()   #对数据进行备份

1.对数据进行简单观察

dt.head()    #查看数据前五行
RankCityStatePopulationDate of census/estimate
01London[2]United Kingdom8,615,2461-Jun-14
12BerlinGermany3,437,91631-May-14
23MadridSpain3,165,2351-Jan-14
34RomeItaly2,872,08630-Sep-14
45ParisFrance2,273,3051-Jan-13
dt.shape
(105, 5)
dt.columns
Index(['Rank', 'City', 'State', 'Population', 'Date of census/estimate'], dtype='object')
dt.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Rank                     105 non-null    int64 
 1   City                     105 non-null    object
 2   State                    105 non-null    object
 3   Population               105 non-null    object
 4   Date of census/estimate  105 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.2+ KB

2.查看数据缺失值和重复值的情况

dt.isnull().sum()   #默认按照列进行统计,sum(1)是对行进行统计
Rank                       0
City                       0
State                      0
Population                 0
Date of census/estimate    0
dtype: int64
dt.duplicated().sum()   #查看重复值
0
dt.drop_duplicates(inplace = True,ignore_index = True)   #去重
dt.shape     #查看去重之后的
(105, 5)

3.数据清洗

dt.head()
RankCityStatePopulationDate of census/estimate
01London[2]United Kingdom8,615,2461-Jun-14
12BerlinGermany3,437,91631-May-14
23MadridSpain3,165,2351-Jan-14
34RomeItaly2,872,08630-Sep-14
45ParisFrance2,273,3051-Jan-13
dt["City"].values
array(['London[2]', 'Berlin', 'Madrid', 'Rome', 'Paris', 'Bucharest',
       'Vienna', 'Hamburg[10]', 'Budapest', 'Warsaw', 'Barcelona',
       'Munich', 'Milan', 'Sofia', 'Prague', 'Brussels[17]', 'Birmingham',
       'Cologne', 'Naples', 'Stockholm', 'Turin', 'Marseille',
       'Amsterdam', 'Zagreb', 'Valencia', 'Kraków', 'Leeds', 'Łódź',
       'Frankfurt', 'Riga', 'Seville', 'Palermo', 'Zaragoza', 'Athens',
       'Wrocław', 'Rotterdam', 'Helsinki', 'Stuttgart', 'Glasgow',
       'Genoa', 'Düsseldorf', 'Dortmund', 'Essen', 'Málaga', 'Copenhagen',
       'Sheffield', 'Lisbon', 'Poznań', 'Bremen', 'Vilnius', 'Leipzig',
       'Dresden', 'Gothenburg', 'Dublin', 'Bradford', 'Hanover',
       'The Hague', 'Manchester', 'Antwerp', 'Edinburgh', 'Nuremberg',
       'Duisburg', 'Lyon', 'Liverpool', 'Gdańsk', 'Toulouse', 'Murcia',
       'Tallinn', 'Bristol', 'Bratislava', 'Szczecin',
       'Palma de Mallorca', 'Bologna', 'Las Palmas', 'Florence', 'Brno',
       'Bydgoszcz', 'Bochum', 'Bilbao', 'Cardiff', 'Lublin', 'Nice',
       'Wuppertal', 'Plovdiv', 'Varna', 'Alicante', 'Leicester',
       'Utrecht', 'Córdoba', 'Bielefeld', 'Wakefield', 'Aarhus',
       'Cluj-Napoca', 'Coventry', 'Bari', 'Thessaloniki', 'Wirral',
       'Timișoara', 'Catania', 'Valladolid', 'Bonn', 'Malmö',
       'Nottingham', 'Katowice', 'Kaunas'], dtype=object)
dt["City"].str.isalpha()       #判断   是否为字母
0      False
1       True
2       True
3       True
4       True
       ...  
100     True
101     True
102     True
103     True
104     True
Name: City, Length: 105, dtype: bool
# Indexing a Series with a boolean mask returns only the elements where the mask is True.
# str.isalpha() is False for any name containing a non-letter character, so besides the
# footnote markers ("London[2]") names with spaces or hyphens are listed as well.
dt["City"][~dt["City"].str.isalpha()]   # ~ negates the mask: keep the names that are NOT purely alphabetic
0             London[2]
7           Hamburg[10]
15         Brussels[17]
56            The Hague
71    Palma de Mallorca
73           Las Palmas
92          Cluj-Napoca
Name: City, dtype: object
dt.iloc[[0,7,15],1]    #只能用数字
0        London[2]
7      Hamburg[10]
15    Brussels[17]
Name: City, dtype: object
# Strip trailing footnote markers such as "[2]" or "[10]" from the city names.
# A pattern-based replacement cleans every affected row without hard-coding row
# positions or retyping the names, so it keeps working when rows are added,
# removed or reordered. Names without a marker are left unchanged.
dt["City"] = dt["City"].str.replace(r"\s*\[\d+\]\s*$", "", regex=True)
dt.iloc[[0,7,15],1]  
0       London
7      Hamburg
15    Brussels
Name: City, dtype: object
dt["Population"].values
array(['8,615,246', '3,437,916', '3,165,235', '2,872,086', '2,273,305',
       '1,883,425', '1,794,770', '1,746,342', '1,744,665', '1,729,119',
       '1,602,386', '1,407,836', '1,332,516', '1,291,895', '1,246,780',
       '1,175,831', '1,092,330', '1,034,175', '989,845', '909,976',
       '898,095', '852,516', '813,562', '790,017', '786,424', '760,700',
       '757,655', '709,757', '701,350', '701,185', '696,676', '677,015',
       '666,058', '664,046', '632,432', '616,528', '605,523', '604,297',
       '596,550', '594,774', '593,682', '575,944', '569,884', '566,913',
       '559,440', '557,382', '547,631', '547,161', '546,451', '537,152',
       '531,562', '530,754', '528,014', '527,612', '524,619', '514,137',
       '510,909', '510,772', '510,610', '495,360', '495,121', '486,816',
       '484,344', '469,690', '460,354', '441,802', '441,354', '434,810',
       '432,451', '417,389', '409,211', '407,648', '384,202', '382,296',
       '377,207', '378,327', '362,286', '362,213', '351,629', '348,493',
       '348,120', '343,304', '342,885', '341,041', '335,819', '334,678',
       '331,606', '330,772', '328,841', '328,314', '327,627', '326,676',
       '324,576', '323,132', '322,751', '322,240', '320,229', '319,279',
       '315,576', '311,501', '309,869', '309,105', '308,735', '308,269',
       '306,888'], dtype=object)
dt.Population.str.replace(",","")  # replace every "," thousands separator with an empty string (the result is still a string column)
0      8615246
1      3437916
2      3165235
3      2872086
4      2273305
        ...   
100     309869
101     309105
102     308735
103     308269
104     306888
Name: Population, Length: 105, dtype: object
dt.Population.str.replace(",","") .astype(int)
0      8615246
1      3437916
2      3165235
3      2872086
4      2273305
        ...   
100     309869
101     309105
102     308735
103     308269
104     306888
Name: Population, Length: 105, dtype: int32
dt["Population"] = dt.Population.str.replace(",","") .astype(int)
dt.Population
0      8615246
1      3437916
2      3165235
3      2872086
4      2273305
        ...   
100     309869
101     309105
102     308735
103     308269
104     306888
Name: Population, Length: 105, dtype: int32
dt["Date of census/estimate"]
0       1-Jun-14
1      31-May-14
2       1-Jan-14
3      30-Sep-14
4       1-Jan-13
         ...    
100    31-Dec-12
101    31-Mar-13
102    30-Jun-12
103    30-Jun-12
104     1-Jan-13
Name: Date of census/estimate, Length: 105, dtype: object
# Convert the census/estimate column from strings to datetime64.
# The values look like "1-Jun-14" (day, abbreviated English month name,
# two-digit year), so the format is stated explicitly instead of relying on
# pandas' per-column format inference: parsing is deterministic and a
# malformed value raises instead of being guessed.
dt["Date of census/estimate"] = pd.to_datetime(dt["Date of census/estimate"], format="%d-%b-%y")
dt.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Rank                     105 non-null    int64         
 1   City                     105 non-null    object        
 2   State                    105 non-null    object        
 3   Population               105 non-null    int32         
 4   Date of census/estimate  105 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int32(1), int64(1), object(2)
memory usage: 3.8+ KB

4.设置索引 index(将 Rank 列设为索引)

dt.set_index(["Rank"],inplace=True)
dt.head()
CityStatePopulationDate of census/estimate
Rank
1LondonUnited Kingdom86152462014-06-01
2BerlinGermany34379162014-05-31
3MadridSpain31652352014-01-01
4RomeItaly28720862014-09-30
5ParisFrance22733052013-01-01

5.探索性分析

  • 5.1 查看哪个国家的城市最多
dt.State
Rank
1       United Kingdom
2              Germany
3                Spain
4                Italy
5               France
            ...       
101            Germany
102             Sweden
103     United Kingdom
104             Poland
105          Lithuania
Name: State, Length: 105, dtype: object
dt.duplicated("State").sum()
81
dt.drop_duplicates("State",ignore_index = True)   #去重  查看 国家多少个
CityStatePopulationDate of census/estimate
0LondonUnited Kingdom86152462014-06-01
1BerlinGermany34379162014-05-31
2MadridSpain31652352014-01-01
3RomeItaly28720862014-09-30
4ParisFrance22733052013-01-01
5BucharestRomania18834252011-10-20
6ViennaAustria17947702015-01-01
7BudapestHungary17446652014-01-01
8WarsawPoland17291192014-03-31
9SofiaBulgaria12918952014-12-14
10PragueCzech Republic12467802013-01-01
11BrusselsBelgium11758312014-01-01
12StockholmSweden9099762014-01-31
13AmsterdamNetherlands8135622014-05-31
14ZagrebCroatia7900172011-03-31
15RigaLatvia7011852014-01-01
16AthensGreece6640462011-05-24
17HelsinkiFinland6055232013-02-28
18CopenhagenDenmark5594402013-01-01
19LisbonPortugal5476312011-01-01
20VilniusLithuania5371522013-01-01
21DublinIreland5276122011-04-10
22TallinnEstonia4348102015-02-01
23BratislavaSlovakia Slovak Republic4173892013-12-31
dt.State.nunique()   #查看State列 不一样的个数
24

方法一:使用sort_values进行排序

dt.groupby("State")     #对国家进行分组
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001780DD17D30>
# Count the number of listed cities for each country.
# NOTE(review): in the printed result most State labels start with a space
# (" Austria", " Germany", ...) while "Slovakia Slovak Republic" does not, which
# is why that entry sorts last — consider normalising the column first, e.g.
# with dt["State"].str.strip(); confirm against the raw CSV.
dt.groupby("State")["City"].count()       # number of City entries per State group
State
 Austria                     1
 Belgium                     2
 Bulgaria                    3
 Croatia                     1
 Czech Republic              2
 Denmark                     2
 Estonia                     1
 Finland                     1
 France                      5
 Germany                    19
 Greece                      2
 Hungary                     1
 Ireland                     1
 Italy                      10
 Latvia                      1
 Lithuania                   2
 Netherlands                 4
 Poland                     10
 Portugal                    1
 Romania                     3
 Spain                      13
 Sweden                      3
 United Kingdom             16
Slovakia Slovak Republic     1
Name: City, dtype: int64
dt.groupby("State")["City"].count().sort_values(ascending = False)[:5]   # ascending=False sorts in DESCENDING order (largest first); [:5] keeps the first five rows
State
 Germany           19
 United Kingdom    16
 Spain             13
 Poland            10
 Italy             10
Name: City, dtype: int64

方法二:用.nlargest()提取值最大的n个记录

dt.groupby("State")["City"].count().nlargest(6)   #前6个最大的
State
 Germany           19
 United Kingdom    16
 Spain             13
 Italy             10
 Poland            10
 France             5
Name: City, dtype: int64
  • 5.2 查看哪个国家人口最多
dt.head()
CityStatePopulationDate of census/estimate
Rank
1LondonUnited Kingdom86152462014-06-01
2BerlinGermany34379162014-05-31
3MadridSpain31652352014-01-01
4RomeItaly28720862014-09-30
5ParisFrance22733052013-01-01
dt.groupby("State")["Population"].sum().nlargest()
State
 United Kingdom    16011877
 Germany           15119548
 Spain             10041639
 Italy              8764067
 Poland             6267409
Name: Population, dtype: int32
popu = dt.groupby("State")["Population"].sum().sort_values( ascending = False)
popu
State
 United Kingdom             16011877
 Germany                    15119548
 Spain                      10041639
 Italy                       8764067
 Poland                      6267409
 France                      4395271
 Romania                     2527280
 Netherlands                 2271771
 Bulgaria                    1968755
 Austria                     1794770
 Sweden                      1747095
 Hungary                     1744665
 Belgium                     1686441
 Czech Republic              1625107
 Greece                       986286
 Denmark                      886116
 Lithuania                    844040
 Croatia                      790017
 Latvia                       701185
 Finland                      605523
 Portugal                     547631
 Ireland                      527612
 Estonia                      434810
Slovakia Slovak Republic      417389
Name: Population, dtype: int32
# Set up the canvas
plt.figure(figsize=(15, 5))

# Vertical bar chart: countries (popu.index) on the x axis, summed city
# population (popu.values) on the y axis. x and y are passed by keyword because
# seaborn >= 0.12 no longer accepts the data vectors as positional arguments.
sns.barplot(x=popu.index, y=popu.values)

# Tick labels: rotate the country names so they do not overlap
# (ha = horizontal alignment) and use one font size on both axes.
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.yticks(fontsize=10)

# Axis titles: the x axis carries the country names and the y axis the
# population (the two labels were swapped before).
plt.xlabel('国家名', fontsize=12)
plt.ylabel('主要城市人口(千万)', fontsize=12)
plt.show()

# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ZzWolWaJ-1684310503578)(output_47_0.png)]

# Horizontal version of the same bar chart
plt.figure(figsize=(15, 5))
# orient="h" draws horizontal bars: population on the x axis, countries on the
# y axis. x and y are passed by keyword because seaborn >= 0.12 no longer
# accepts the data vectors as positional arguments.
sns.barplot(x=popu.values, y=popu.index, orient="h")
plt.xticks(rotation=45, ha="right")   # rotate the x tick labels; ha is the horizontal alignment
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pIfTRJOr-1684310503579)(output_48_0.png)]

  • 5.3 查看欧洲每年人口变化情况
# Extract the calendar year of each census/estimate date into a new "year" column.
# The vectorized .dt accessor of the datetime column (unrelated to the DataFrame
# that happens to be named dt) replaces the element-wise apply/map with a
# lambda: same values, no Python-level loop.
dt["year"] = dt["Date of census/estimate"].dt.year
dt.head()
CityStatePopulationDate of census/estimateyear
Rank
1LondonUnited Kingdom86152462014-06-012014
2BerlinGermany34379162014-05-312014
3MadridSpain31652352014-01-012014
4RomeItaly28720862014-09-302014
5ParisFrance22733052013-01-012013
# Sum the population per census/estimate year and draw the totals as a red line chart.
# NOTE(review): each city row carries a single census/estimate date, so this looks
# like the total population of the cities surveyed in each year rather than a
# year-over-year trend for the same set of cities — confirm before interpreting.
dt.groupby("year")["Population"].sum().plot(kind = "line",color = "r",title = "欧洲人口变化情况",xlabel = "年份",ylabel = "人口数量");

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3Qi1jJoW-1684310503579)(output_51_0.png)]

# Set up the canvas
plt.figure(figsize=(10, 5))
# Line plot (not a bar chart) of city population against census year.
# With several cities per year, seaborn's default is to aggregate them into a
# mean line with a confidence band. x and y are passed by keyword because
# seaborn >= 0.12 no longer accepts the data vectors as positional arguments.
sns.lineplot(x=dt["year"], y=dt["Population"])
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cLJ8F3Dp-1684310503579)(output_52_0.png)]

dt.pivot_table(values="Population",index="State",columns="year",aggfunc='sum',margins=True)
year201020112012201320142015All
State
AustriaNaNNaNNaNNaNNaN1794770.01794770
BelgiumNaNNaNNaNNaN1686441.0NaN1686441
BulgariaNaNNaNNaN676860.01291895.0NaN1968755
CroatiaNaN790017.0NaNNaNNaNNaN790017
Czech RepublicNaNNaNNaN1625107.0NaNNaN1625107
DenmarkNaNNaNNaN559440.0326676.0NaN886116
EstoniaNaNNaNNaNNaNNaN434810.0434810
FinlandNaNNaNNaN605523.0NaNNaN605523
France1269450.0NaN852516.02273305.0NaNNaN4395271
GermanyNaNNaN3979488.07702144.03437916.0NaN15119548
GreeceNaN986286.0NaNNaNNaNNaN986286
HungaryNaNNaNNaNNaN1744665.0NaN1744665
IrelandNaN527612.0NaNNaNNaNNaN527612
ItalyNaNNaNNaN1399736.07364331.0NaN8764067
LatviaNaNNaNNaNNaN701185.0NaN701185
LithuaniaNaNNaNNaN844040.0NaNNaN844040
NetherlandsNaNNaNNaN616528.01655243.0NaN2271771
PolandNaNNaN1888240.0NaN4379169.0NaN6267409
PortugalNaN547631.0NaNNaNNaNNaN547631
RomaniaNaN2527280.0NaNNaNNaNNaN2527280
SpainNaNNaN2557947.0NaN7483692.0NaN10041639
SwedenNaNNaNNaN837119.0909976.0NaN1747095
United KingdomNaN495360.05212391.01688880.08615246.0NaN16011877
Slovakia Slovak RepublicNaNNaNNaN417389.0NaNNaN417389
All1269450.05874186.014490582.019246071.039596435.02229580.082706304


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值