import pandas as pd
import numpy as np
from datetime import datetime
# Timestamp layout used for display: zero-padded day, abbreviated month name,
# four-digit year, then 24-hour clock time (for example "02Apr2020 09:13").
time_format = "%d%b%Y %H:%M"
# Show the current local time in that layout.
format(datetime.now(), time_format)
'02Apr2020 09:13'
import os
# Print the path of every file below the current working directory, so the
# available data files can be seen at a glance.
for dirname, _, filenames in os.walk("."):
    for filepath in (os.path.join(dirname, filename) for filename in filenames):
        print(filepath)
.\covid-19-data-with-sir-model.ipynb
.\Thumbs.db
.\Untitled.ipynb
.\人口金字塔.png
.\.ipynb_checkpoints\covid-19-data-with-sir-model-checkpoint.ipynb
.\.ipynb_checkpoints\Untitled-checkpoint.ipynb
.\COVID-19 containment and mitigation measures\COVID 19 Containment measures data.csv
.\covid19_global_forecasting_location\locations_population.csv
.\Novel Corona Virus 2019 Dataset\COVID19_line_list_data.csv
.\Novel Corona Virus 2019 Dataset\COVID19_open_line_list.csv.zip
.\Novel Corona Virus 2019 Dataset\covid_19_data.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_confirmed.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_deaths.csv
.\Novel Corona Virus 2019 Dataset\time_series_covid_19_recovered.csv
Total population
# Load the per-location population table.
population_raw = pd.read_csv(
    filepath_or_buffer="./covid19_global_forecasting_location/locations_population.csv",
)
# Preview the first five rows.
population_raw.head(5)
| Province.State | Country.Region | Population | Provenance |
---|
0 | NaN | Afghanistan | 35530000 | NaN |
1 | NaN | Albania | 2877000 | NaN |
2 | NaN | Algeria | 41320000 | NaN |
3 | NaN | Andorra | 78000 | NaN |
4 | NaN | Antigua and Barbuda | 102012 | NaN |
# Print the column names, non-null counts and dtypes of the raw table.
population_raw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 4 columns):
Province.State 130 non-null object
Country.Region 284 non-null object
Population 284 non-null int64
Provenance 8 non-null object
dtypes: int64(1), object(3)
memory usage: 5.6+ KB
# Number of missing values in each column.
population_raw.isna().sum()
Province.State 154
Country.Region 0
Population 0
Provenance 276
dtype: int64
# Missing-value counts laid out as a single row, one column per field.
population_raw.isna().sum().to_frame().T
| Province.State | Country.Region | Population | Provenance |
---|
0 | 154 | 0 | 0 | 276 |
# Counts of present (non-missing) values laid out as a single row.
population_raw.notna().sum().to_frame().T
| Province.State | Country.Region | Population | Provenance |
---|
0 | 130 | 284 | 284 | 8 |
# Build `population_df`: one row per (Country, Province) pair with its
# population, plus country-level totals and two aggregate rows ("Global" and
# "Except China").  A Province of "-" marks a row holding a whole-country value.
df = population_raw.copy()
df = df.rename({"Province.State": "Province", "Country.Region": "Country"}, axis=1)
cols = ["Country", "Province", "Population"]
# Missing provinces become "-" so they can be compared as plain strings.
df = df.loc[:, cols].fillna("-")
# A row whose province repeats the country name is treated as country-level.
df.loc[df["Country"] == df["Province"], "Province"] = "-"
# Sum the provinces of each country into a country-level total.  Only the
# Population column is aggregated so the text column is never "summed".
_total_df = df.loc[df["Province"] != "-", :].groupby("Country")["Population"].sum()
_total_df = _total_df.reset_index().assign(Province="-").loc[:, cols]
df = pd.concat([df, _total_df], axis=0, ignore_index=True)
# If the raw data already had a country-level row it comes first and wins
# over the computed total.
df = df.drop_duplicates(subset=["Country", "Province"], keep="first")
global_value = df.loc[df["Province"] == "-", "Population"].sum()
china_value = df.loc[(df["Country"] == "China") & (df["Province"] == "-"), "Population"].sum()
# "Except China" is the world total minus China; the operands must be in this
# order for the population to be positive.  The rows are added with pd.concat
# because DataFrame.append no longer exists in pandas 2.0 and later.
_aggregate_df = pd.DataFrame(
    {
        "Country": ["Global", "Except China"],
        "Province": ["-", "-"],
        "Population": [global_value, global_value - china_value],
    },
    columns=cols,
)
df = pd.concat([df, _aggregate_df], axis=0, ignore_index=True)
df = df.sort_values("Population", ascending=False).reset_index(drop=True)
df = df.loc[:, cols]
population_df = df.copy()
population_df.head()
| Country | Province | Population |
---|
0 | Global | - | 7067093478 |
1 | China | - | 1376807262 |
2 | India | - | 1339000000 |
3 | US | - | 327200000 |
4 | Indonesia | - | 264000000 |
# Keep only the country-level rows (Province == "-") and turn them into a
# {country name: population} lookup.
df = population_df[population_df["Province"] == "-"]
population_dict = df.set_index("Country")["Population"].to_dict()
# Labels of the population-pyramid age groups: five-year bins from "0-4" up
# to "95-99", followed by an open-ended "100+" bin.
_age_bins = ["{}-{}".format(start, start + 4) for start in range(0, 100, 5)]
_age_bins.append("100+")
# One row per age bin; population columns are added to this frame later.
_pyramid_df = pd.Series(_age_bins, name="Age_bin").to_frame()
_pyramid_df
| Age_bin |
---|
0 | 0-4 |
1 | 5-9 |
2 | 10-14 |
3 | 15-19 |
4 | 20-24 |
5 | 25-29 |
6 | 30-34 |
7 | 35-39 |
8 | 40-44 |
9 | 45-49 |
10 | 50-54 |
11 | 55-59 |
12 | 60-64 |
13 | 65-69 |
14 | 70-74 |
15 | 75-79 |
16 | 80-84 |
17 | 85-89 |
18 | 90-94 |
19 | 95-99 |
20 | 100+ |
# Add the "Global" column to `_pyramid_df`: male plus female head-counts per
# age bin.  The values are listed in the same order as the rows of
# `_pyramid_df` (from "0-4" up to "100+").
_name = "Global"
_male = [
    349432556,  # 0-4
    342927576,  # 5-9
    331497486,  # 10-14
    316642222,  # 15-19
    308286775,  # 20-24
    306059387,  # 25-29
    309236984,  # 30-34
    276447037,  # 35-39
    249389688,  # 40-44
    241232876,  # 45-49
    222609691,  # 50-54
    192215395,  # 55-59
    157180267,  # 60-64
    128939392,  # 65-69
    87185982,  # 70-74
    54754941,  # 75-79
    33648953,  # 80-84
    15756942,  # 85-89
    5327866,  # 90-94
    1077791,  # 95-99
    124144,  # 100+
]
_female = [
    328509234,  # 0-4
    321511867,  # 5-9
    309769906,  # 10-14
    295553758,  # 15-19
    289100903,  # 20-24
    288632766,  # 25-29
    296293748,  # 30-34
    268371754,  # 35-39
    244399176,  # 40-44
    238133281,  # 45-49
    223162982,  # 50-54
    195633743,  # 55-59
    164961323,  # 60-64
    140704320,  # 65-69
    101491347,  # 70-74
    69026831,  # 75-79
    48281201,  # 80-84
    26429329,  # 85-89
    11352182,  # 90-94
    3055845,  # 95-99
    449279,  # 100+
]
# Force 64-bit integers.  Without an explicit dtype NumPy picks the platform
# default integer, which can be 32 bits wide; the per-bin values fit in that,
# but later NumPy arithmetic on the column (for example a grand total of
# several billion) could wrap around silently.
_pyramid_df[_name] = np.array(_male, dtype=np.int64) + np.array(_female, dtype=np.int64)
_pyramid_df[_name]
0 677941790
1 664439443
2 641267392
3 612195980
4 597387678
5 594692153
6 605530732
7 544818791
8 493788864
9 479366157
10 445772673
11 387849138
12 322141590
13 269643712
14 188677329
15 123781772
16 81930154
17 42186271
18 16680048
19 4133636
20 573423
Name: Global, dtype: int32