# Import dependencies
import csv
import time
import requests
from bs4 import BeautifulSoup
# Target site to scrape
# Address of the page that the scraper requests.
url = "https://ncov2019.live/"
# Request headers (including the User-Agent)
# HTTP headers sent with the page request. The user-agent literal is split
# across lines only for readability; adjacent literals are concatenated into
# one string.
header = {
    'content-type': 'text/html;charset=UTF-8',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.66 Safari/537.36 Edg/87.0.664.41'
    ),
}
# Data fields
# One entry per scraped column, in the order the cells appear in a table row.
# The keys double as the CSV header; every value starts out as an empty
# string and is overwritten for each row.
data = dict.fromkeys(
    (
        'Name',
        'Confirmed',
        'Confirmed Per Million',
        'Confirmed Changes Today',
        'Confirmed Percentage Day Change',
        'Critical',
        'Deceased',
        'Deceased Per Million',
        'Deceased Changes Today',
        'Death Percentage Day Change',
        'Tests',
        'Active',
        'Recovered',
        'Recovered Per Million',
        'Population',
    ),
    '',
)
# Configure output, download and parse the page
# Download the page, keep a raw HTML copy, and write one CSV row per table row.
#
# The page is fetched and parsed BEFORE the CSV is opened, so a failed request
# or an unexpected page layout no longer leaves behind a truncated,
# header-only CSV for today's date.
r = requests.get(url=url, headers=header, timeout=30)  # timeout: never hang forever
r.raise_for_status()  # do not save and parse an HTTP error page as if it were data

# Keep a raw copy of the downloaded page, stamped to the minute.
with open('htmlFile/page' + time.strftime("%Y-%m-%d-%H-%M") + '.html', 'w', encoding='utf-8') as f:
    f.write(r.text)

soup = BeautifulSoup(r.text, 'html5lib')
table = soup.find('table', id='sortable_table_world')
tbody = table.find('tbody') if table is not None else None
if tbody is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError("table 'sortable_table_world' with a <tbody> was not found in the page from " + url)
items = tbody.find_all('tr')

# Columns where a literal '0' is replaced by 'NA'. Per the original author's
# note this covers countries with a population under one million.
per_million_keys = ('Confirmed Per Million', 'Deceased Per Million', 'Recovered Per Million')

# The context manager closes the CSV even if a row fails half-way through.
with open('csvFile/Covid19Data' + time.strftime("%Y-%m-%d") + '.csv', "w", newline='',
          encoding="utf_8_sig") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=data.keys())
    csv_writer.writeheader()

    # Loop over the table rows and extract every field.
    for item in items:
        cells = item.select('td')  # select once per row, not once per column
        if len(cells) < len(data):
            # A row with fewer cells than fields cannot be mapped onto the
            # columns (indexing it would raise IndexError); skip it.
            continue

        for key, cell in zip(data, cells):
            s = cell.text
            # Special character: drop the leading star token.
            if '★' in s:
                s = ' '.join(s.split()[1:])
            s = s.strip()
            # Missing data.
            if s == 'Unknown':
                s = 'NA'
            # Per-million figures reported as '0' are treated as missing.
            if s == '0' and key in per_million_keys:
                s = 'NA'
            data[key] = s

        csv_writer.writerow(data)
        print(data['Name'])
# Overall plotting part
# Overall trend of the global COVID-19 pandemic over 15 days
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
def _plot_daily_series(axis, days, values, ylabel, title, label_offset=None):
    """Draw one daily series as a marked line on ``axis``.

    Args:
        axis: Matplotlib axes to draw on.
        days: X positions, one per day; also used as the x tick locations.
        values: Y values, same length as ``days``.
        ylabel: Text of the y-axis label.
        title: Text of the panel title.
        label_offset: When given, the first and last values are annotated in
            scientific notation, shifted up by this amount (in data units).
            When ``None`` no annotations are drawn.
    """
    axis.plot(days, values, linewidth=2.0, marker='o')
    if label_offset is not None:
        for pos in (0, -1):  # annotate only the first and the last day
            axis.text(days[pos] - 1.5, values[pos] + label_offset, '{:.2e}'.format(values[pos]),
                      color='r', size=13, weight='bold')
    axis.set_xticks(days)
    axis.set_xlabel('日期')
    axis.set_ylabel(ylabel)
    axis.set_title(title, y=1.1, size=13)


plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # font used so Chinese labels render correctly
plt.rcParams['savefig.dpi'] = 300  # DPI of saved images
plt.rcParams['figure.dpi'] = 300  # DPI of the figure itself
plt.style.use('Solarize_Light2')

# One column per day (labelled '1'..'15'), one row per CSV field.
df = DataFrame()
for i in range(1, 16):
    str_num = '{:02d}'.format(i)  # zero-padded day of month, as used in the file names
    # Read only the first data row (the global totals) and strip the
    # thousands separators so the counts parse as numbers.
    df[str(i)] = \
        pd.read_csv('csvFile/Covid19Data2020-12-' + str_num + '.csv',
                    encoding='utf-8', thousands=',', nrows=1).loc[0]
print(df)
print(df.at['Confirmed', '1'])
print(df.loc['Confirmed'].to_list())

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
day_list = list(range(1, 16))

# 1. Cumulative confirmed cases worldwide.
Confirmed_list = df.loc['Confirmed'].to_list()
_plot_daily_series(ax[0, 0], day_list, Confirmed_list, '确诊人数',
                   '12月1日至15日全球新冠累计确诊人数变化', label_offset=3e5)

# 2. Daily growth rate of confirmed cases.
Confirmed_Percentage_list = df.loc['Confirmed Percentage Day Change'].to_list()
# Values look like '0.25%'. Going through str() lets a missing value (NaN)
# become float('nan') instead of raising on the slice.
Confirmed_Percentage_list = [float(str(v).rstrip('%')) for v in Confirmed_Percentage_list]
_plot_daily_series(ax[0, 1], day_list, Confirmed_Percentage_list, '确诊人数日增长率',
                   '12月1日至15日全球新冠总确诊人数增长率')
ax[0, 1].set_yticks([0.10, 0.15, 0.20, 0.25, 0.30, 0.35])
ax[0, 1].set_yticklabels(['0.10%', '0.15%', '0.20%', '0.25%', '0.30%', '0.35%'])

# 3. Active (currently confirmed) cases.
Active_list = df.loc['Active'].to_list()
_plot_daily_series(ax[0, 2], day_list, Active_list, '现存确诊人数',
                   '12月1日至15日全球新冠现存确诊人数净变化', label_offset=8e4)

# 4. Deaths worldwide.
Deceased_list = df.loc['Deceased'].to_list()
_plot_daily_series(ax[1, 0], day_list, Deceased_list, '死亡人数',
                   '12月1日至15日全球新冠死亡人数变化', label_offset=6e3)

# 5. Recoveries worldwide.
Recovered_list = df.loc['Recovered'].to_list()
_plot_daily_series(ax[1, 1], day_list, Recovered_list, '康复人数',
                   '12月1日至15日全球新冠康复人数变化', label_offset=3e5)

# 6. Tests performed worldwide.
Tests_list = df.loc['Tests'].to_list()
_plot_daily_series(ax[1, 2], day_list, Tests_list, '检测人数',
                   '12月1日至15日全球新冠检测人数变化', label_offset=3e6)

plt.tight_layout()
plt.subplots_adjust()  # called without arguments, so the spacing is left as it is
plt.savefig('imgResult/总体变化趋势.png')
plt.show()
# With a non-blocking backend show() returns while the figure is still the
# current one; close it so later pyplot calls do not draw onto these axes.
plt.close(fig)
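# Recovery speed: ranking of the highest ratios of recovered to confirmed cases
# The 10 countries with the highest recovery rate
# (recovered cases / confirmed cases).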
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # font used so Chinese labels render correctly
plt.rcParams['savefig.dpi'] = 300  # DPI of saved images
plt.rcParams['figure.dpi'] = 300  # DPI of the figure itself
plt.style.use('Solarize_Light2')

# skiprows=[1] drops the first data row (the global totals, not a country).
# usecols keeps columns 0, 1 and 12: Name, Confirmed and Recovered.
df = pd.read_csv('csvFile/Covid19Data2020-12-15.csv', encoding='utf-8', skiprows=[1], thousands=',',
                 usecols=[0, 1, 12])
# Recovery rate as a fraction (recovered / confirmed), not as a percentage.
df['Recovered rate'] = df['Recovered'] / df['Confirmed']
print(df)
df.sort_values(by='Recovered rate', inplace=True, ascending=False)
# Take the 10 countries with the highest recovery rate.
df_res = df[0:10]
df_res = df_res.reset_index(drop=True)  # renumber the rows 0..n-1
print(df_res)

names = df_res['Name'].to_list()
rates = df_res['Recovered rate'].to_list()
# One bar every 5 x-units. Derived from the row count so the chart still
# works when fewer than 10 countries are available.
positions = list(range(0, 5 * len(rates), 5))

# Start a fresh figure so the bars never land on axes left over from an
# earlier plot.
fig = plt.figure()
plt.bar(positions, rates, width=2, alpha=0.5, color='orange')
plt.xticks(positions, labels=names, rotation=35)
# The rate is a fraction, so the tick at 0.2 means 20%. This matches the
# percentages written above the bars.
plt.yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], ['0', '20%', '40%', '60%', '80%', '100%'])
plt.tick_params(labelsize=9)
for a, b in zip(positions, rates):  # write each bar's value just above it
    plt.text(a, b + 0.008, '%.2f%%' % (b * 100), ha='center', va='bottom', fontsize=9, color='black')
plt.title('康复率最高的 10 个国家')
plt.xlabel("国家")
plt.ylabel("康复率")
plt.tight_layout()
plt.savefig('imgResult/康复率最高的10个国家.png')
plt.show()
plt.close(fig)  # release the figure once it has been saved and shown
df_res.to_csv('csvResult/康复率最高的10个国家.csv', index=False)