2019-nCOV——人口统计

import pandas as pd
import numpy as np
from datetime import datetime
# Timestamp layout: day, abbreviated month, year, then 24-hour time.
time_format = "%d%b%Y %H:%M"
# Notebook cell expression: shows the moment this cell was executed.
datetime.strftime(datetime.now(), time_format)
'02Apr2020 09:13'
import os
# Print the path of every file found below the current working directory.
for root, _subdirs, names in os.walk("."):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
.\covid-19-data-with-sir-model.ipynb
.\Thumbs.db
.\Untitled.ipynb
.\人口金字塔.png
.\.ipynb_checkpoints\covid-19-data-with-sir-model-checkpoint.ipynb
.\.ipynb_checkpoints\Untitled-checkpoint.ipynb
.\COVID-19 containment and mitigation measures\COVID 19 Containment measures data.csv
.\covid19_global_forecasting_location\locations_population.csv
.\Novel Corona Virus 2019 Dataset\COVID19_line_list_data.csv
.\Novel Corona Virus 2019 Dataset\COVID19_open_line_list.csv.zip
.\Novel Corona Virus 2019 Dataset\covid_19_data.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_confirmed.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_deaths.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_recovered.csv
Total population
# Load the raw population table and preview its first five rows.
population_raw = pd.read_csv(
    filepath_or_buffer="./covid19_global_forecasting_location/locations_population.csv"
)
population_raw.head(n=5)
  Province.State       Country.Region  Population Provenance
0            NaN          Afghanistan    35530000        NaN
1            NaN              Albania     2877000        NaN
2            NaN              Algeria    41320000        NaN
3            NaN              Andorra       78000        NaN
4            NaN  Antigua and Barbuda      102012        NaN
# Print a summary of the raw table: index range, columns, non-null counts, dtypes.
population_raw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 4 columns):
Province.State    130 non-null object
Country.Region    284 non-null object
Population        284 non-null int64
Provenance        8 non-null object
dtypes: int64(1), object(3)
memory usage: 5.6+ KB
# Number of missing values in each column (summed down the rows).
population_raw.isnull().sum(axis=0)
Province.State    154
Country.Region      0
Population          0
Provenance        276
dtype: int64
# Same missing-value counts, laid out as a single-row table.
population_raw.isnull().sum().to_frame().T
   Province.State  Country.Region  Population  Provenance
0             154               0           0         276
# Count of non-missing values per column, laid out as a single-row table.
population_raw.notnull().sum().to_frame().T
   Province.State  Country.Region  Population  Provenance
0             130             284         284           8
# Build `population_df`: one row per (Country, Province) with its population.
# A Province of "-" marks a country-level or aggregate record.
df = population_raw.copy()
# Shorten the column names.
df = df.rename({"Province.State": "Province", "Country.Region": "Country"}, axis=1)
cols = ["Country", "Province", "Population"]
# Keep only the needed columns and replace missing values with "-".
df = df.loc[:, cols].fillna("-")
# Rows such as "Denmark / Denmark" are country-level records, so mark them with "-".
df.loc[df["Country"] == df["Province"], "Province"] = "-"

# Add total records
# Sum only the numeric Population column; the string Province column must not
# be aggregated (it is replaced by "-" for these country totals anyway).
_total_df = df.loc[df["Province"] != "-", :].groupby("Country")["Population"].sum()
# reset_index turns the "Country" group keys back into an ordinary column.
_total_df = _total_df.reset_index().assign(Province="-")
df = pd.concat([df, _total_df], axis=0, sort=True)
# keep="first": a country-level row already present in the data wins over
# the total computed from that country's provinces.
df = df.drop_duplicates(subset=["Country", "Province"], keep="first")

# Global
global_value = df.loc[df["Province"] == "-", "Population"].sum()
# pd.concat is used to add the row because DataFrame.append no longer exists
# in pandas 2.x.
_global_row = pd.DataFrame([["Global", "-", global_value]], columns=cols)
df = pd.concat([df, _global_row], axis=0, ignore_index=True, sort=True)

# Global except China
china_value = df.loc[(df["Country"] == "China") & (df["Province"] == "-"), "Population"].sum()
# World total minus China; the opposite order would give a negative population.
_except_china_row = pd.DataFrame(
    [["Except China", "-", global_value - china_value]], columns=cols
)
df = pd.concat([df, _except_china_row], axis=0, ignore_index=True, sort=True)

# Sorting
df = df.sort_values("Population", ascending=False).reset_index(drop=True)
# Restore the intended column order (concat with sort=True sorts the columns).
df = df.loc[:, cols]
population_df = df.copy()
population_df.head()
     Country Province  Population
0     Global        -  7067093478
1      China        -  1376807262
2      India        -  1339000000
3         US        -   327200000
4  Indonesia        -   264000000
# Keep only the country-level / aggregate rows (Province == "-").
df = population_df.loc[population_df["Province"] == "-", :]
# Map each Country name to its Population value.
population_dict = df.set_index("Country")["Population"].to_dict()
_age_bins = [
    "0-4", "5-9", "10-14", "15-19", "20-24", "25-29",
    "30-34", "35-39", "40-44", "45-49", "50-54", "55-59",
    "60-64", "65-69", "70-74", "75-79", "80-84", "85-89",
    "90-94", "95-99", "100+"
]
_pyramid_df = pd.DataFrame({"Age_bin": _age_bins})
_pyramid_df
   Age_bin
0      0-4
1      5-9
2    10-14
3    15-19
4    20-24
5    25-29
6    30-34
7    35-39
8    40-44
9    45-49
10   50-54
11   55-59
12   60-64
13   65-69
14   70-74
15   75-79
16   80-84
17   85-89
18   90-94
19   95-99
20    100+
# Global (WORLD)
# Adds a "Global" column to _pyramid_df: male + female figures per row.
# Each list has 21 entries, assigned by position to the rows of _pyramid_df.
# NOTE(review): presumably head counts per age bin, youngest first; the data
# source is not stated here - confirm.
_name = "Global"
_male = [
    349432556,
    342927576,
    331497486,
    316642222,
    308286775,
    306059387,
    309236984,
    276447037,
    249389688,
    241232876,
    222609691,
    192215395,
    157180267,
    128939392,
    87185982,
    54754941,
    33648953,
    15756942,
    5327866,
    1077791,
    124144,
]
_female = [
    328509234,
    321511867,
    309769906,
    295553758,
    289100903,
    288632766,
    296293748,
    268371754,
    244399176,
    238133281,
    223162982,
    195633743,
    164961323,
    140704320,
    101491347,
    69026831,
    48281201,
    26429329,
    11352182,
    3055845,
    449279,
]
# Force 64-bit integers: np.array() otherwise uses the platform default
# integer, which can be 32-bit, and the sum of this column (about 7.2e9)
# does not fit in a 32-bit integer.
_pyramid_df[_name] = np.array(_male, dtype=np.int64) + np.array(_female, dtype=np.int64)
_pyramid_df[_name]
0     677941790
1     664439443
2     641267392
3     612195980
4     597387678
5     594692153
6     605530732
7     544818791
8     493788864
9     479366157
10    445772673
11    387849138
12    322141590
13    269643712
14    188677329
15    123781772
16     81930154
17     42186271
18     16680048
19      4133636
20       573423
Name: Global, dtype: int32
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值