Wuhan Coronavirus Data Analytics
"""
Created on Mon Mar 2 11:00:00 2020
@author: xiaoyao
"""
部分输出内容省略
# Show the current working directory (IPython magic).
%pwd
# Import the core numerical / data-handling libraries.
import numpy as np
import pandas as pd
# Visualization libraries.
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
import pycountry
py.init_notebook_mode(connected=True)
import folium
from folium import plugins
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
# Enlarge the default figure size; the colour-map override below is left disabled.
plt.rcParams['figure.figsize'] = 8, 5
#plt.rcParams['image.cmap'] = 'viridis'
# Print the path of every file found under the dataset directory.
import os

dataset_paths = (
    os.path.join(folder, leaf)
    for folder, _subdirs, leaves in os.walk('./2019_nCoV_data')
    for leaf in leaves
)
for dataset_path in dataset_paths:
    print(dataset_path)
# Silence every warning category for the rest of the session.
import warnings

warnings.simplefilter('ignore')
# Read the case dataset from the local CSV file.
data= pd.read_csv("./2019_nCoV_data/2019_nCoV_data.csv")
# Preview the first rows (rendered only when this is the last expression of a notebook cell).
data.head()
# Print the column summary produced by DataFrame.info().
data.info()
# Print the Python type of the loaded object.
print(type(data))
# Profile the dataset with pandas-profiling.
import pandas_profiling

# Build the report a single time and reuse it for both display and export;
# previously two separate ProfileReport objects were constructed from the
# same frame.
report = pandas_profiling.ProfileReport(data)
# Bare expression: rendered inline when it ends a notebook cell.
report
# Store the generated analysis report as a local HTML file.
report.to_file("./reportfile.html")
# Show how the Date column is stored before conversion. The dtype is printed
# because type(data["Date"]) only ever reports the Series class, whatever the
# column holds.
print(data["Date"].dtype)
# Convert the Date column to datetime values.
# NOTE(review): parsing is kept element-wise via apply(); a single vectorized
# pd.to_datetime call may be stricter about rows in differing formats — confirm
# against the CSV before changing it.
data['Date'] = data['Date'].apply(pd.to_datetime)
# Drop the serial-number column.
data.drop(columns=['Sno'], inplace=True)
data.head()
截止目前有哪些被新冠病毒影响的国家
# Collect the distinct values of the Country column (first-seen order) as a
# plain list and print them.
countries = data['Country'].drop_duplicates().tolist()
print(countries)
# Print how many distinct country labels were found.
print("\n共计受影响的国家数量: ",len(countries))
['China', 'US', 'Japan', 'Thailand', 'South Korea', 'Mainland China', 'Hong Kong', 'Macau', 'Taiwan', 'Singapore', 'Philippines', 'Malaysia', 'Vietnam', 'Australia', 'Mexico', 'Brazil', 'France', 'Nepal', 'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast', 'Germany', 'Finland', 'United Arab Emirates', 'India', 'Italy', 'Sweden', 'Russia', 'Spain', 'UK', 'Belgium', 'Others', 'Egypt']
共计受影响的国家数量: 34
# The list printed earlier contains both "China" and "Mainland China";
# fold the latter into "China".
# The result is assigned back to the column: an inplace replace() on the
# intermediate data['Country'] selection is a chained assignment and is not
# guaranteed to write through to `data` (it does not under pandas
# Copy-on-Write).
data['Country'] = data['Country'].replace({'Mainland China': 'China'})
countries = data['Country'].unique().tolist()
print(countries)
print("\n共计受影响的国家数量: ",len(countries))
['China', 'US', 'Japan', 'Thailand', 'South Korea', 'Hong Kong', 'Macau', 'Taiwan', 'Singapore', 'Philippines', 'Malaysia', 'Vietnam', 'Australia', 'Mexico', 'Brazil', 'France', 'Nepal', 'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast', 'Germany', 'Finland', 'United Arab Emirates', 'India', 'Italy', 'Sweden', 'Russia', 'Spain', 'UK', 'Belgium', 'Others', 'Egypt']
共计受影响的国家数量: 33
# Work out the calendar day of the last row of `data`.
# NOTE(review): this presumes the rows are in chronological order so that the
# last row carries the most recent date — confirm against the CSV.
from datetime import date

last_stamp = data['Date'].iloc[-1]
year, month, day = last_stamp.year, last_stamp.month, last_stamp.day

# Keep every row stamped on that day. The comparison is inclusive so that a
# record timestamped exactly at midnight is not dropped.
data_latest = data[data['Date'] >= pd.Timestamp(date(year, month, day))]
data_latest.head()

# Confirmed cases summed per country for the selected rows, exposed as a
# Country/Confirmed table with a 1-based integer index.
Number_of_countries = data_latest['Country'].nunique()
cases = pd.DataFrame(data_latest.groupby('Country')['Confirmed'].sum())
cases['Country'] = cases.index
cases.index = np.arange(1, Number_of_countries + 1)
global_cases = cases[['Country', 'Confirmed']]
global_cases
# Load the world_coordinates dataset.
world_coordinates = pd.read_csv('./2019_nCoV_data/world_coordinates.csv')
world_coordinates.head()
# Join the coordinates with the per-country case counts on the Country column.
world_data = world_coordinates.merge(global_cases, on='Country')
world_data.head()
当前世界各地疫情的可视化
# Draw the world map with folium: one red circle marker per country in
# world_data, with a popup showing the country and its confirmed count.
# NOTE(review): newer folium releases may no longer accept the built-in
# 'Stamen Toner' tile name without an attribution — confirm for the installed
# version.
world_map = folium.Map(location=[10, -20], zoom_start=2.3, tiles='Stamen Toner')
for lat, lon, value, name in zip(world_data['latitude'], world_data['longitude'],
                                 world_data['Confirmed'], world_data['Country']):
    # The country name is shown as stored: str.capitalize() lower-cases every
    # character after the first ("US" -> "Us", "South Korea" -> "South korea").
    folium.CircleMarker([lat, lon],
                        radius=10,
                        popup=('<strong>Country</strong>: ' + str(name) + '<br>'
                               '<strong>Confirmed Cases</strong>: ' + str(value) + '<br>'),
                        color='red',
                        fill_color='red',
                        fill_opacity=0.7).add_to(world_map)
world_map
# Print the confirmed, death and recovered totals over the latest rows.
for total_label, total_column in (
    ('Globally Confirmed Cases: ', 'Confirmed'),
    ('Global Deaths: ', 'Deaths'),
    ('Globally Recovered Cases: ', 'Recovered'),
):
    print(total_label, data_latest[total_column].sum())
Globally Confirmed Cases: 71226
Global Deaths: 1770
Globally Recovered Cases: 10865
# Totals per (Country, Province/State). Only numeric columns are summed:
# the frame also carries a datetime Date column, which cannot be summed.
data_latest.groupby(['Country','Province/State']).sum(numeric_only=True)
# The five countries with the highest death totals in the latest rows.
data_latest.groupby('Country')['Deaths'].sum().sort_values(ascending=False)[:5]
Country
China 1765
Taiwan 1
France 1
Hong Kong 1
Philippines 1
Name: Deaths, dtype: int64
# The five countries with the highest recovered totals in the latest rows.
data_latest.groupby('Country')['Recovered'].sum().sort_values(ascending=False).head(5)
Country
China 10748
Singapore 18
Thailand 14
Japan 12
South Korea 9
Name: Recovered, dtype: int64
# Rows of the latest snapshot whose Country is 'China'. An explicit copy is
# taken so the column assignment below modifies an independent frame rather
# than a slice of data_latest.
China = data_latest[data_latest['Country'] == 'China'].copy()
# Lower-case the province names. The vectorized .str accessor leaves missing
# values as NaN, whereas calling x.lower() on each element raises on NaN.
China['Province/State'] = China['Province/State'].str.lower()
China
# Overlaid horizontal bar chart of confirmed vs. recovered counts per province.
f, ax = plt.subplots(figsize=(12, 8))

# Every row of China except the first is plotted.
# NOTE(review): presumably the first row is skipped to keep one dominant
# province from dwarfing the rest — confirm.
plotted_provinces = China[1:]

sns.set_color_codes("pastel")
sns.barplot(data=plotted_provinces, x="Confirmed", y="Province/State",
            color="r", label="Confirmed", ax=ax)

sns.set_color_codes("muted")
sns.barplot(data=plotted_provinces, x="Recovered", y="Province/State",
            color="g", label="Recovered", ax=ax)

# Legend, fixed x-range and axis labels.
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set_xlim(0, 400)
ax.set_ylabel("")
ax.set_xlabel("Stats")
sns.despine(left=True, bottom=True)
# Centre point used for the China maps.
latitude, longitude = 39.91666667, 116.383333
china_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# Load the province coordinates and rename the 'name3' column to
# 'Province/State', the name used by the China frame.
china_coordinates = pd.read_csv(
    "./2019_nCoV_data/china_Province_coordinates.csv"
).rename(columns={'name3': 'Province/State'})
china_coordinates.head()

# Join the case rows with the coordinates on the column names they share.
df_china_virus = pd.merge(China, china_coordinates)
df_china_virus.head()
# Build the compact frame used by the maps below from selected columns of
# df_china_virus.
# NOTE(review): this rebinds `data`, which earlier held the full dataset.
map_columns = ['name', 'lat', 'lon', 'Confirmed', 'Recovered', 'Deaths']
data = pd.DataFrame(
    {column: list(df_china_virus[column]) for column in map_columns}
)
data.head()
# Map of confirmed cases: one red circle marker per row of `data`, with a
# popup showing the province name and its confirmed count.
china_map1 = folium.Map(location=[latitude, longitude], zoom_start=4, tiles='Stamen Toner')
for lat, lon, value, name in zip(data['lat'], data['lon'], data['Confirmed'], data['name']):
    folium.CircleMarker([lat, lon],
                        radius=13,
                        popup=('Province: ' + str(name).capitalize() + '<br>'
                               'Confirmed: ' + str(value) + '<br>'),
                        color='red',
                        fill_color='red',
                        fill_opacity=0.7).add_to(china_map1)
# The stray folium.Map(titles='jj', attr="attribution") call was removed: it
# built a second map that was never assigned, added to, or displayed.
china_map1
# Map of deaths: one black-outlined, red-filled circle marker per row of
# `data`, with a popup showing the province name and its death count.
china_map = folium.Map(location=[latitude, longitude], zoom_start=4, tiles='Stamen Toner')
for lat, lon, value, name in zip(data['lat'], data['lon'], data['Deaths'], data['name']):
    folium.CircleMarker([lat, lon],
                        radius=13,
                        popup=('Province: ' + str(name).capitalize() + '<br>'
                               'Deaths: ' + str(value) + '<br>'),
                        color='black',
                        fill_color='red',
                        fill_opacity=0.7).add_to(china_map)
# The stray folium.Map(titles='jj', attr="attribution") call was removed: it
# built a second map that was never assigned, added to, or displayed.
china_map
# Map of recoveries: one green circle marker per row of `data`, with a popup
# showing the province name and its recovered count.
china_map = folium.Map(tiles='Stamen Toner', zoom_start=4, location=[latitude, longitude])
recovered_rows = zip(data['lat'], data['lon'], data['Recovered'], data['name'])
for lat, lon, value, name in recovered_rows:
    popup_html = ''.join([
        'Province: ', str(name).capitalize(), '<br>',
        'Recovered: ', str(value), '<br>',
    ])
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=10,
        popup=popup_html,
        color='green',
        fill_color='green',
        fill_opacity=0.7,
    )
    marker.add_to(china_map)
china_map