import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
class CoronaVirusSpider(object):
    """Crawl per-country epidemic statistics starting from the DXY home page.

    The crawl has two stages:

    1. ``crawl_last_day_corona_virus`` downloads the home page, extracts the
       JSON array embedded in one of its ``<script>`` tags and stores it.
    2. ``crawl_corona_virus`` reads that stored list, follows every entry's
       ``statisticsData`` URL and stores the merged per-day records.
    """

    # Output files written by the two crawl stages.
    LAST_DAY_PATH = 'json/last_day_corona_virus.json'
    CORONA_VIRUS_PATH = 'json/corona_virus.json'
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.home_url = 'http://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url, timeout=None):
        """Fetch ``url`` and return the response body as a string.

        :param url: URL to request.
        :param timeout: seconds to wait for the server; defaults to
            ``REQUEST_TIMEOUT`` so a stalled connection cannot hang the crawl.
        :return: the response body decoded with ``bytes.decode()`` defaults.
        :raises requests.RequestException: on connection failure, timeout or
            a 4xx/5xx status code.
        """
        if timeout is None:
            timeout = self.REQUEST_TIMEOUT
        response = requests.get(url, timeout=timeout)
        # Fail here rather than feeding an error page to the parsers.
        response.raise_for_status()
        return response.content.decode()

    def parse_home_page(self, home_page):
        """Extract the JSON array embedded in the home page.

        :param home_page: HTML text of the home page.
        :return: the Python object decoded from the first ``[...]`` span in
            the element with id ``getListByCountryTypeService2true``.
        :raises ValueError: if that element or a JSON array inside it cannot
            be found, or if the matched text is not valid JSON.
        """
        soup = BeautifulSoup(home_page, 'html5lib')
        script = soup.find(id='getListByCountryTypeService2true')
        if script is None:
            raise ValueError(
                "element 'getListByCountryTypeService2true' not found in page")
        # The element's text wraps a JSON array; grab the bracketed part.
        match = re.search(r'\[.+\]', script.text)
        if match is None:
            raise ValueError(
                "no JSON array found in 'getListByCountryTypeService2true'")
        return json.loads(match.group())

    def save(self, data, path):
        """Serialize ``data`` as JSON into the file at ``path``.

        Missing parent directories are created first, so a fresh checkout
        without the output directory still works.

        :param data: JSON-serializable object.
        :param path: destination file path; an existing file is overwritten.
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(data, fp)

    def crawl_last_day_corona_virus(self):
        """Download the home page and store the list embedded in it."""
        home_page = self.get_content_from_url(self.home_url)
        last_day_corona_virus = self.parse_home_page(home_page)
        self.save(last_day_corona_virus, self.LAST_DAY_PATH)

    def crawl_corona_virus(self):
        """Collect the per-day statistics of every stored country entry.

        Reads the list written by ``crawl_last_day_corona_virus``, requests
        each entry's ``statisticsData`` URL, tags every returned daily record
        with the entry's ``provinceName`` and ``countryShortCode``, and saves
        the concatenated records.
        """
        with open(self.LAST_DAY_PATH, encoding='utf-8') as fp:
            last_day_corona_virus = json.load(fp)

        corona_virus = []
        for country in tqdm(last_day_corona_virus, '采集各国统计数据'):
            statistics_data_url = country['statisticsData']
            statistics_data_json_str = self.get_content_from_url(
                statistics_data_url)
            statistics_data = json.loads(statistics_data_json_str)['data']
            # Daily records do not say which entry they belong to; copy the
            # identifying fields onto each one before merging.
            for one_day in statistics_data:
                one_day['provinceName'] = country['provinceName']
                one_day['countryShortCode'] = country['countryShortCode']
            corona_virus.extend(statistics_data)

        self.save(corona_virus, self.CORONA_VIRUS_PATH)

    def run(self):
        """Run the crawl, fetching the home-page list first if it is absent.

        An existing list file is reused as-is; it is only downloaded when the
        file is missing, which previously made a first run crash.
        """
        if not os.path.exists(self.LAST_DAY_PATH):
            self.crawl_last_day_corona_virus()
        self.crawl_corona_virus()
if __name__ == '__main__':
    # Script entry point: build a spider and start the crawl.
    CoronaVirusSpider().run()