2019 Epidemic Visualization Project

Scraping epidemic data from provincial and municipal health commissions

  1. Send the request.

    Use an HTTP library to send a request (a Request) to the target sites, i.e. the provincial and municipal health commission websites. The request can carry extra information such as headers: set a browser User-Agent so the request looks like it comes from a real browser, and optionally rotate a proxy pool or several User-Agent strings to avoid being blocked.

    Request method: GET or POST

    Request URL

    Request headers: User-Agent, Host, Cookies, etc.

    Request body

  2. Get the response.

    If the server responds normally, it returns a Response whose content is the data we want, typically an HTML page or a JSON string.

    Response status

    Response headers

    Response body: the data to extract

  3. Parse the content.

    Parse the returned content with the BeautifulSoup library and extract the target fields with regular expressions.

    Tools: JSON, XPath, BeautifulSoup, regular expressions

  4. Save the data.

    Save the extracted data as JSON or CSV files; a minimal end-to-end sketch of these four steps follows below.

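A minimal sketch of the four steps above, using the Shanghai Health Commission news page that the later script also scrapes; the regular expression and output columns are illustrative only and are not part of the original project code:

import csv
import re

import requests
from bs4 import BeautifulSoup

# 1. Send the request with a browser-like User-Agent so the site accepts it
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'}
url = 'http://wsjkw.sh.gov.cn/xwfb/index.html'
resp = requests.get(url, headers=headers, timeout=10)

# 2. Get the response and make sure the Chinese text decodes correctly
resp.encoding = 'utf-8'

# 3. Parse the HTML with BeautifulSoup and pull numbers out with a regular expression
soup = BeautifulSoup(resp.text, 'html.parser')
text = soup.get_text()
confirmed = re.search(r'确诊病例(\d+)例', text)

# 4. Save the result as a CSV file
with open('demo.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['source', 'confirmed'])
    writer.writerow([url, confirmed.group(1) if confirmed else ''])
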
# import json
# from urllib.request import Request, urlopen
# import time
# # 2-20,4-20
# for i in range(9):
#     date="2020-04-0"+str(1+i)
#     print(date)
#     foreign_data = "http://api.tianapi.com/txapi/ncovabroad/index?key=c3b1d267b0d5a944cb93715dd8b72101&date="+date
#     print(foreign_data)
#     request_data = Request(foreign_data)  # build the request for the API endpoint
#     html_area = urlopen(request_data).read()  # read the raw response body
#     foreignData = json.loads(html_area.decode('utf-8'))  # parse the JSON string into a dict
#     print(foreignData)
#     with open('./'+date+'.json','w',encoding = 'utf-8') as jfile:  # write the dict to a JSON file
#           json.dump(foreignData,jfile,ensure_ascii=False,indent = 4)  # ensure_ascii=False keeps Chinese characters readable (it defaults to True)
#     time.sleep(5)

# import requests
# from bs4 import BeautifulSoup
# import datetime
# import re
# from selenium import webdriver
# import time
# import xlwings as xw
#
# def get_sh_data(url):
#     '''Fetch the data from the Shanghai Municipal Health Commission'''
#     r = requests.get(url=url, headers=sh_headers)
#     sh_dict = {}
#     soup = BeautifulSoup(r.text, 'lxml')
#     # print(soup)
#     ivs_content = soup.find(name='div', attrs={'id':'ivs_content', 'class':'Article_content'})
#     new_text = ivs_content.get_text()
#     # print(new_text)
#     sh_dict['累计排除疑似'] = re.search('已累计排除疑似病例(\d+)例', new_text).group(1)
#     sh_dict['累计确诊'] = re.search('发现确诊病例(\d+)例', new_text).group(1)
#     style2 = '(\d+)例病情危重,(\d+)例重症,(\d+)例治愈出院,(\d+)例死亡'
#     sh_dict['累计重症'] = int(re.search(style2, new_text).group(1)) + int(re.search(style2, new_text).group(2))
#     sh_dict['累计治愈'] = re.search(style2, new_text).group(3)
#     sh_dict['累计死亡'] = re.search(style2, new_text).group(4)
#     sh_dict['累计疑似'] = re.search('尚有(\d+)例疑似病例正在排查中', new_text).group(1)
#     return sh_dict
#
# def get_sh_today_news():
#     '''Fetch the news list from the Shanghai Municipal Health Commission'''
#     url = r'http://wsjkw.sh.gov.cn/xwfb/index.html'
#     r = requests.get(url=url, headers=sh_headers)
#     soup = BeautifulSoup(r.text, 'lxml')
#     # print(soup)
#     today_format = datetime.datetime.today().strftime('%Y-%m-%d')
#     today_sh_news = soup.find_all(name='span', text=today_format)
#     today_counts = len(today_sh_news)
#     for i in range(today_counts-1, -1, -1):
#         title = today_sh_news[i].find_previous_sibling(name='a').attrs['title']  # headline
#         href = 'http://wsjkw.sh.gov.cn' + today_sh_news[i].find_previous_sibling(name='a').attrs['href']  # link
#         if title.startswith('上海新增'):
#             # print(title)
#             return get_sh_data(href)
#
# def get_all_today_news():
#     '''Fetch the news list from the National Health Commission'''
#     url = 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml'
#     r = requests.get(url, headers=quanguo_headers)
#     soup = BeautifulSoup(r.text, 'lxml')
#     # print(soup)
#     today_format = datetime.datetime.today().strftime('%Y-%m-%d')
#     latest_news_title = soup.find(name='span', text=today_format).find_previous_sibling(name='a').attrs['title']
#     latest_news_href = 'http://www.nhc.gov.cn' + soup.find(name='span', text=today_format).find_previous_sibling(name='a').attrs['href']
#     # print(latest_news_href)
#     return get_all_today_data(latest_news_href)
#
# def get_all_today_data(url):
#     '''Fetch the data from the National Health Commission'''
#     r = requests.get(url, headers=quanguo_headers)
#     all_dict = {}
#     hubei_dict = {}
#     soup = BeautifulSoup(r.text, 'lxml')
#     news = soup.find(name='p').get_text()
#     # print(news)
#     all_dict['新增疑似'] = re.search('新增疑似病例(\d+)例', news).group(1)
#     all_dict['累计疑似'] = re.search('现有疑似病例(\d+)例', news).group(1)
#     all_dict['累计确诊'] = re.search('累计报告确诊病例(\d+)例', news).group(1)
#     all_dict['累计重症'] = re.search('其中重症病例(\d+)例', news).group(1)
#     all_dict['累计死亡'] = re.search('累计死亡病例(\d+)例', news).group(1)
#     all_dict['累计治愈'] = re.search('累计治愈出院病例(\d+)例', news).group(1)
#
#     hubei_dict['新增疑似'] = re.search('新增疑似病例(\d+)例.*?(武汉(\d+)例', news).group(1)
#     hubei_dict['新增确诊'] = re.search('湖北新增确诊病例(\d+)例.*?(武汉(\d+)例', news).group(1)
#     hubei_dict['新增死亡'] = re.search('新增死亡病例(\d+)例.*?(武汉(\d+)例', news).group(1)
#     hubei_dict['新增治愈'] = re.search('新增治愈出院病例(\d+)例(武汉(\d+)例)', news).group(1)
#     hubei_dict['累计重症'] = re.search('其中重症病例(\d+)例.*?(武汉(\d+)例', news).group(1)
#     # print(all_dict, hubei_dict)
#     return all_dict, hubei_dict
#
# def get_cookie(url):
#     driver = webdriver.Chrome()
#     driver.get(url)
#     time.sleep(3)
#     cookies = driver.get_cookies()
#     driver.quit()
#     items = []
#     for i in range(len(cookies)):
#         cookie_value = cookies[i]
#         item = cookie_value['name'] + '=' + cookie_value['value']
#         items.append(item)
#     cookiestr = '; '.join(a for a in items)
#     return cookiestr
#
# def get_into_excel():
#     '''Paste the data into the Excel workbook'''
#     app = xw.App(visible=True, add_book=False)
#     app.display_alerts = False
#     app.screen_updating = False
#
#     wb = app.books.open('新型冠状病毒每日数据.xlsx')
#     ws = wb.sheets['all']
#     max_row = ws.api.UsedRange.Rows.count
#     ws.range('C' + str(max_row)).value = hubei_data['新增确诊']
#     ws.range('K' + str(max_row)).value = hubei_data['新增死亡']
#     ws.range('O' + str(max_row)).value = hubei_data['新增治愈']
#     ws.range('S' + str(max_row)).value = hubei_data['新增疑似']
#     ws.range('AA' + str(max_row)).value = hubei_data['累计重症']
#
#     ws.range('R' + str(max_row)).value = all_data['新增疑似']
#     ws.range('AL' + str(max_row)).value = all_data['累计疑似']
#     ws.range('V' + str(max_row)).value = all_data['累计确诊']
#     ws.range('Z' + str(max_row)).value = all_data['累计重症']
#     ws.range('AD' + str(max_row)).value = all_data['累计死亡']
#     ws.range('AH' + str(max_row)).value = all_data['累计治愈']
#
#     ws.range('AN' + str(max_row)).value = sh_data['累计排除疑似']
#     ws.range('Y' + str(max_row)).value = sh_data['累计确诊']
#     ws.range('AC' + str(max_row)).value = sh_data['累计重症']
#     ws.range('AK' + str(max_row)).value = sh_data['累计治愈']
#     ws.range('AG' + str(max_row)).value = sh_data['累计死亡']
#     ws.range('AM' + str(max_row)).value = sh_data['累计疑似']
#
#     wb.save()
#     wb.close()
#     app.quit()
#
#
#
# if __name__ == "__main__":
#     sh_headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
#         'Cookie': get_cookie('http://wsjkw.sh.gov.cn/xwfb/index.html'),
#         # 'Cookie': 'zh_choose=s; zh_choose=s; _gscu_2010802395=80620430ie0po683; yd_cookie=12f170fc-e368-4a662db5220af2d434160e259b2e31585efb; _ydclearance=2cd0a8873fd311efcda1c1aa-05fc-4001-a108-0e86b80b3fee-1580700296; _gscbrs_2010802395=1; _pk_ref.30.0806=%5B%22%22%2C%22%22%2C1580693101%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DDVUbOETLyMZLC5c_V7RJRbAYPvyqaU3f2PCBi2-E6KC2QEFltdrKWGmhgA5NbC3c%26wd%3D%26eqid%3Df38b30250015e1c5000000045e365a8d%22%5D; _pk_ses.30.0806=*; _pk_id.30.0806=35b481da38abb562.1580620431.6.1580694952.1580693101.; _gscs_2010802395=80693100qds57e17|pv:6; AlteonP=ALa1BGHbHKyWUqcNUGRETw$$',
#         'Host': 'wsjkw.sh.gov.cn'
#     }
#     quanguo_headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
#         'Cookie': 'oHAcoULcWCQb80S=pxzexGFCvyGV4xDkaMHSyjBmzwXn5O4vfCbxFCgMDfcBaKqsFU9FHstqjFY6wJt9; yfx_c_g_u_id_10006654=_ck20020209283417867964364567575; _gscu_2059686908=81579037nbf5xc58; insert_cookie=67313298; yfx_f_l_v_t_10006654=f_t_1580606914774__r_t_1581643181169__v_t_1581678949269__r_c_14; security_session_verify=a2efd6893c3ad08675db9b0f5c365ecf; oHAcoULcWCQb80T=4Ywh2qE8IiJP44ThdpW0fs7Yqi1Hwlh9RhJHrW2WVl536y4eCIgXxGh9M8IuYUqGUCCtBO5kBc2DB6Kewd3naLK_O2bK5W3w3pcqT.uX3asTXxC2SGBqy9eV2DoGB0ZXb4uTPzPGbXebmT6xIYxbAmGbm_kZVX_nUvBL4nkAuFAVvcGLBmXr8nsdEToXztqZUlYnTjn9niwHMcg3th7XhJvFS_tckqRq5bLpvS_IKPuYn2JLraIIejlErBhA5IQhyHXFekNynv5PYgpzu2PguGccrP3c_bcg1MFViQjKVhgs_B22Nv4NxdHdiIk9GdZDZBjQ',
#         'Host': 'www.nhc.gov.cn'
#     }
#     # # 1. National and Hubei data
#     # all_data, hubei_data, sh_data = {}, {}, {}
#     # try:
#     #     all_data, hubei_data = get_all_today_news()
#     #     print('全国数据:{}\n'
#     #       '湖北数据:{}'.format(all_data, hubei_data))
#     # except:
#     #     print('全国数据未更新')
#     # 2. Shanghai data
#     try:
#         sh_data = get_sh_today_news()
#         print('上海数据:{}'.format(sh_data))
#     except:
#         print('上海数据未更新')
#     # 3. Export to Excel
#     if sh_data != {} and all_data != {}:
#         get_into_excel()
#         print('Excel刷新成功!')

import requests
from bs4 import BeautifulSoup
import re
import json
import csv
import os


headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'
    }
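
# The notes at the top also suggest rotating User-Agents (and optionally proxies) to
# avoid being blocked. A small optional sketch of that idea; the second UA string is
# only an illustrative example and nothing below calls this helper:
import random

USER_AGENT_POOL = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
]

def rotating_get(url):
    """GET a page with a randomly chosen User-Agent from the pool."""
    return requests.get(url, headers={'User-Agent': random.choice(USER_AGENT_POOL)}, timeout=10)
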
u1 = "http://wsjkw.gd.gov.cn/zwyw_yqxx/index.html"
u2 = "http://wsjkw.cq.gov.cn/ztzl_242/qlzhxxgzbdfyyqfkgz/yqtb"
def page(url):
    # Fetch a detail page with a browser User-Agent and return all of its <p> tags
    req = requests.get(url, headers=headers)
    # Force UTF-8 so the Chinese text decodes correctly
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'html.parser')
    data = soup.find_all("p")
    return data
# 重庆
def ch_parse(url):
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    for item in soup.find_all("a"):
        data = item.get("href")
        # Extract the relative link to an individual bulletin page
        d = re.compile(r'\.(.*?html)').findall(str(data))
        if d:
            page_url = 'http://wsjkw.cq.gov.cn/ztzl_242/qlzhxxgzbdfyyqfkgz/yqtb' + d[0]
            data = page(page_url)
            # with open('./重庆.json', 'a', encoding='utf-8') as jfile:  # 将字典写入json文件
            #     for item in data:
            #         item = str(item)
            #         d1 = re.compile('<p class=\"tit\">(.*?)</p>').findall(item)
            #         d2 = re.compile('(.*?境外.*)</p>').findall(item)
            #         if d1:
            #             json.dump(d1[0], jfile, ensure_ascii=False)
            #         if d2:
            #             json.dump(d2[0], jfile, ensure_ascii=False)
            # Open in append mode so rows from every bulletin and listing page accumulate
            # (opening with 'w' here would overwrite the file for each article)
            write_header = not os.path.exists('重庆.csv')
            f = open('重庆.csv', 'a', encoding='utf-8', newline='')
            # 2. Build a csv writer on top of the file object
            csv_writer = csv.writer(f)

            # 3. Write the header row once: date, countryName, provinceName, count
            if write_header:
                csv_writer.writerow(["date", "countryName", "provinceName", "count"])

            # 4. Extract the report date, the source country of the imported cases and
            #    the imported-case count from every <p> on the bulletin page
            date = ''
            countryName = ''
            count = ''
            provinceName = '重庆'
            for item in data:
                item = str(item)
                d = re.compile(r'<p class="tit">(.*?)重庆市新冠肺炎疫情情况</p>').findall(item)
                if d:
                    date = d[0]
                c = re.compile(r'为(.*?)输入').findall(item)
                if c:
                    countryName = c[0]
                co = re.compile(r'境外输入新冠肺炎确诊病例(.*?)例').findall(item)
                if co:
                    count = co[0]
            print(date)
            print(countryName)
            print(count)
            if count:
                csv_writer.writerow([date, countryName, provinceName, count])

            # 5. Close the file
            f.close()
# 广东
def gd_parse(url):

    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    for item in soup.find_all("a"):
        data = item.get("href")
        d = re.compile('(http://wsjkw.gd.gov.cn/zwyw_yqxx/.*?html)').findall(str(data))
        print(d)
        if d:
            page_url = d[0]
            data = page(page_url)
            with open('./广东.json', 'a', encoding='utf-8') as jfile:  # append the extracted text to a JSON file
                for item in data:
                    item = str(item)
                    print(item)
                    d2 = re.compile('<p indenttext="  " noextractcontent="true" style="text-align: left;">(.*?)</p>').findall(item)
                    d1 = re.compile('<p class="margin_top15 c999999 text_cencer">(.*?)</p>').findall(item)
                    if d1:
                        json.dump(d1[0], jfile, ensure_ascii=False)
                    if d2:
                        json.dump(d2[0], jfile, ensure_ascii=False)
def hn_parse(url):
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    for item in soup.find_all("a"):
        data = item.get("href")
        d = re.compile('(/wjw/xxgk.*?html)').findall(str(data))
        # print(d)
        if d:
            page_url = 'http://wjw.hunan.gov.cn'+d[0]
            print(page_url)
            data = page(page_url)

            with open('./湖南.json', 'a', encoding='utf-8') as jfile:  # append the extracted text to a JSON file
                json.dump(page_url, jfile, ensure_ascii=False)
                for item in data:
                    item = str(item)
                    # print(item)
                    # d2 = re.compile(
                    #     '<p indenttext="  " noextractcontent="true" style="text-align: left;">(.*?)</p>').findall(item)
                    d1 = re.compile('[0-9_\u4e00-\u9fa5]{,}').findall(item)
                    print(item)
                    print(d1)
                    for i in range(len(d1)):
                        if d1[i] != '':
                            json.dump(d1[i], jfile, ensure_ascii=False)
                            print(d1[i])
                    # if d2:
                    #     json.dump(d2[0], jfile, ensure_ascii=False)
# Generic list-page parser shared by 河南 / 山东 / 江苏 / 黑龙江 / 浙江 / 广东
# (note: the base URL and output file below are currently hard-coded for 浙江)
def parse1(url):
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    # print(soup)
    # for item in soup.find_all("a"):
    #     data = item.get("href")
    #     print(data)
    d = re.compile('/(.*?html)').findall(str(soup))
    print(d)
    for i in range(len(d)):
        if d[i]:
            page_url = "http://www.zjwjw.gov.cn/"+d[i]
            # print(page_url)
            data = page(page_url)

            with open('./浙江2.json', 'a', encoding='utf-8') as jfile:  # append the extracted text to a JSON file
                json.dump(page_url, jfile, ensure_ascii=False)
                for item in data:
                    item = str(item)
                    # print(item)
                    # d2 = re.compile(
                    #     '<p indenttext="  " noextractcontent="true" style="text-align: left;">(.*?)</p>').findall(item)
                    d1 = re.compile('[0-9_\u4e00-\u9fa5]{,}').findall(item)
                    print(item)
                    print(d1)
                    for i in range(len(d1)):
                        if d1[i] != '':
                            json.dump(d1[i], jfile, ensure_ascii=False)
                            print(d1[i])
                    # if d2:
                    #     json.dump(d2[0], jfile, ensure_ascii=False)

# 四川
def sc_parse(url):
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    for item in soup.find_all("a"):
        # data = item.get("href")
        d = re.compile('(".*?)</a>').findall(str(item))
        print(d)
        with open('./四川.json', 'a', encoding='utf-8') as jfile:  # append the extracted link text to a JSON file
            json.dump(d, jfile, ensure_ascii=False)
# Parser for the 上海 and 北京 list pages (the output file below is hard-coded to 北京.json)
def parse2(url):
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    # print(soup)
    # for item in soup.find_all("a"):
    #     data = item.get("href")
    #     print(data)
    d = re.compile('(http:.*?)"').findall(str(soup))
    print(d)
    for i in range(len(d)):
        if d[i]:
            page_url = d[i]
            # print(page_url)
            data = page(page_url)

            with open('./北京.json', 'a', encoding='utf-8') as jfile:  # append the extracted text to a JSON file
                json.dump(page_url, jfile, ensure_ascii=False)
                for item in data:
                    item = str(item)
                    # print(item)
                    # d2 = re.compile(
                    #     '<p indenttext="  " noextractcontent="true" style="text-align: left;">(.*?)</p>').findall(item)
                    d1 = re.compile('[0-9_\u4e00-\u9fa5]{,}').findall(item)
                    print(item)
                    print(d1)
                    for i in range(len(d1)):
                        if d1[i] != '':
                            json.dump(d1[i], jfile, ensure_ascii=False)
                            print(d1[i])
def main():
    # 重庆
    url=["http://wsjkw.cq.gov.cn/ztzl_242/qlzhxxgzbdfyyqfkgz/yqtb/index.html",
         "http://wsjkw.cq.gov.cn/ztzl_242/qlzhxxgzbdfyyqfkgz/yqtb/index_1.html",
         "http://wsjkw.cq.gov.cn/ztzl_242/qlzhxxgzbdfyyqfkgz/yqtb/index_2.html",
         "http://wsjkw.cq.gov.cn/ztzl_242/qlzhxxgzbdfyyqfkgz/yqtb/index_3.html"]
    # for i in range(len(url)):
    #     ch_parse(url[i])

    # 广东
    url = ["http://wsjkw.gd.gov.cn/zwyw_yqxx/index.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_1.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_2.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_3.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_4.html"]
    # for i in range(len(url)):
    #     gd_parse(url[i])
    # 湖南
    url = ["http://wjw.hunan.gov.cn/wjw/qwfb/yqfkgz_list.html",
           "http://wjw.hunan.gov.cn/wjw/qwfb/yqfkgz_list__1.html",
           "http://wjw.hunan.gov.cn/wjw/qwfb/yqfkgz_list_2.html",
           "http://wjw.hunan.gov.cn/wjw/qwfb/yqfkgz_list_3.html",
           "http://wjw.hunan.gov.cn/wjw/qwfb/yqfkgz_list_4.html",
           "http://wjw.hunan.gov.cn/wjw/qwfb/yqfkgz_list_5.html"]
    # for i in range(len(url)):
    #     hn_parse(url[i])
    # 河南
    url=["http://wsjkw.henan.gov.cn/ztzl/xxgzbdfyyqfk/yqtb/index.html",
         "http://wsjkw.henan.gov.cn/ztzl/xxgzbdfyyqfk/yqtb/index_1.html",
         "http://wsjkw.henan.gov.cn/ztzl/xxgzbdfyyqfk/yqtb/index_2.html",
         "http://wsjkw.henan.gov.cn/ztzl/xxgzbdfyyqfk/yqtb/index_3.html"]
    # for i in range(len(url)):
    #     parse1(url[i])
    # 四川
    url = ["http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list.shtml",
           "http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list_1.shtml",
           "http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list_2.shtml",
           "http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list_3.shtml",
           "http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list_4.shtml",
           "http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list_5.shtml",
           "http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list_6.shtml",
           "http://www.sc.gov.cn/10462/c102248/fkxxhzbdone_list_7.shtml"]
    # for i in range(len(url)):
    #     sc_parse(url[i])
    # 山东
    url = ["http://wsjkw.shandong.gov.cn/ztzl/rdzt/qlzhfkgz/tzgg/index.html",
            "http://wsjkw.shandong.gov.cn/ztzl/rdzt/qlzhfkgz/tzgg/index_1.html",
           "http://wsjkw.shandong.gov.cn/ztzl/rdzt/qlzhfkgz/tzgg/index_2.html",
           "http://wsjkw.shandong.gov.cn/ztzl/rdzt/qlzhfkgz/tzgg/index_3.html",
           "http://wsjkw.shandong.gov.cn/ztzl/rdzt/qlzhfkgz/tzgg/index_4.html"]
    # for i in range(len(url)):
    #     parse1(url[i])
    # 江苏
    url = ["http://wjw.jiangsu.gov.cn/col/col7290/index.html?uid=222741&pageNum=1",
           "http://wjw.jiangsu.gov.cn/col/col7290/index.html?uid=222741&pageNum=2",
           "http://wjw.jiangsu.gov.cn/col/col7290/index.html?uid=222741&pageNum=3",
           "http://wjw.jiangsu.gov.cn/col/col7290/index.html?uid=222741&pageNum=4",
           "http://wjw.jiangsu.gov.cn/col/col7290/index.html?uid=222741&pageNum=5"]
    # for i in range(len(url)):
    #     parse1(url[i])
    # 黑龙江
    url = ["http://wsjkw.hlj.gov.cn/index.php/Home/Zwgk/all.shtml?typeid=42&p=1",
           "http://wsjkw.hlj.gov.cn/index.php/Home/Zwgk/all.shtml?typeid=42&p=2",
           "http://wsjkw.hlj.gov.cn/index.php/Home/Zwgk/all.shtml?typeid=42&p=3",
           "http://wsjkw.hlj.gov.cn/index.php/Home/Zwgk/all.shtml?typeid=42&p=4",
           "http://wsjkw.hlj.gov.cn/index.php/Home/Zwgk/all.shtml?typeid=42&p=5",
           "http://wsjkw.hlj.gov.cn/index.php/Home/Zwgk/all.shtml?typeid=42&p=6"]
    # for i in range(len(url)):
    #     parse1(url[i])
    # 浙江
    url = ["http://www.zjwjw.gov.cn/col/col1202101/index.html?uid=4978845&pageNum=1",
           "http://www.zjwjw.gov.cn/col/col1202101/index.html?uid=4978845&pageNum=2",
           "http://www.zjwjw.gov.cn/col/col1202101/index.html?uid=4978845&pageNum=3",
           "http://www.zjwjw.gov.cn/col/col1202101/index.html?uid=4978845&pageNum=4",
           "http://www.zjwjw.gov.cn/col/col1202101/index.html?uid=4978845&pageNum=5",
           "http://www.zjwjw.gov.cn/col/col1202101/index.html?uid=4978845&pageNum=6",
           "http://www.zjwjw.gov.cn/col/col1202101/index.html?uid=4978845&pageNum=7"]
    for i in range(len(url)):
        parse1(url[i])
# 广东
    url = [
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_2.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_3.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_4.html",
           "http://wsjkw.gd.gov.cn/zwyw_yqxx/index_5.html"]
    # for i in range(len(url)):
    #     parse1(url[i])
    # 上海
    # parse2("http://wsjkw.sh.gov.cn/xwfb/index.html")
    # for i in range(1,16):
    #     parse2("http://wsjkw.sh.gov.cn/xwfb/index_"+str(i)+".html")
    # 北京
    # parse2("http://wjw.beijing.gov.cn/xwzx_20031/mtjj/index.html")
    # for i in range(1,13):
    #     parse2("http://wjw.beijing.gov.cn/xwzx_20031/mtjj/index_"+str(i)+".html")
if __name__ == "__main__":
    main()

Data processing for epidemic prevention and control measures

import csv
import json
# Data note: these scores were compiled by hand from the Economic Observer article
# "疫情响应哪个省市更果断有效,我们做了些数据分析" ("Which provinces responded to the epidemic more decisively
# and effectively? We did some data analysis", https://baijiahao.baidu.com/s?id=1662225757963480864&wfr=spider&for=pc),
# so they may contain errors.
# Features per province: cure rate, death rate, newly-cured rate (strength of medical intervention),
# newly-confirmed count (degree of outbreak control), provincial decision speed, provincial
# risk-response execution, all-round execution on major events, and epidemic prevention effectiveness.
# Three snapshots per province: 2020-02-23, 2020-03-28, 2020-04-29.
# Provincial decision-speed scores: Hubei and Xizang are missing, so only 29 provinces are covered:
# ['安徽 80 80 100 80', '北京60 100 100 100', '重庆80 100 100 100', '福建 100 80 100 80', '甘肃80 100 100 80', '广东60 60 80 100', '广西100 100 80 80', '贵州100 80 100 80', '海南60 60 60 60', '河北80 80 100 80', '黑龙江60 100 80 60', '河南100 80 100 100', '湖北', '湖南80 60 100 80', '内蒙古60 60 100 80', '江苏80 80 100 100', '江西80 100 100 60', '吉林60 60 60 80', '辽宁80 100 80 100', '宁夏100 80 100 60', '青海80 100 100 80', '陕西80 60 100 80', '山东100 80 100 100', '上海100 100 100 100', '山西60 60 80 80', '四川100 100 100 100', '天津80 100 100 100',  '新疆60 60 80 80', '云南60 80 80 80', '浙江100 60 80 100']
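
# As a hedged reference for the processing below: one snapshot's feature vector could be
# built like this (the column names Confirmed / Recovered / Deaths and the score values are
# the ones used by the script below; this helper is illustrative and is not called anywhere):
def snapshot_features(row, prev_confirmed, prev_recovered, population, scores):
    confirmed = float(row['Confirmed'])
    cured_rate = float(row['Recovered']) / confirmed        # cure rate
    death_rate = float(row['Deaths']) / confirmed           # death rate
    new_confirmed = (int(row['Confirmed']) - prev_confirmed) / population   # outbreak control
    new_cured = (int(row['Recovered']) - prev_recovered) / confirmed        # medical intervention
    # scores: (decision speed, risk response, major-event execution, prevention effect)
    return [cured_rate, death_rate, new_confirmed, new_cured] + list(scores)
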
def write(jlist):
    json_data = json.dumps(jlist, ensure_ascii=False)
    with open('./yiqing_data.json', 'a+', encoding='utf-8') as f_obj:
        f_obj.write(json_data)
with open('./provinceData (1).csv', 'r', encoding='utf-8', errors='ignore') as thefile:
    reader = csv.DictReader(thefile)
    list1 = []
    list_province = []  # province list, excluding 台湾, 澳门, 香港, 西藏 and 湖北
    for row in reader:  # iterate over every row
        if row['Province'] not in ('台湾', '澳门', '香港', '西藏', '湖北'):
            list1.append(row['Province'])
    for i in list1:
        if i not in list_province:
            list_province.append(i)
    print(list_province)
    write(list_province)
with open('./省份人数.csv', 'r', encoding='utf-8', errors='ignore') as file:
    reader = csv.DictReader(file)
    # Population of each province, aligned index-by-index with list_province
    list_province_person = []
    for i in range(len(list_province)):
        list_province_person.append(0)
    for row in reader:  # iterate over every row
        print(row['Province'])

        for i in range(len(list_province)):
            if(list_province[i] == row['Province']):
                list_province_person[i]=row['Person']
    print(list_province_person)
# Per-province feature lists: three snapshots per province (2020-02-23, 2020-03-28, 2020-04-29),
# eight features per snapshot (cure rate, death rate, new-confirmed ratio, new-cured ratio,
# decision speed, risk-response execution, major-event execution, prevention effectiveness)
province_list = {}
for i in range(len(list_province)):
    province_list['province_list' + str(i)] = []

with open('./provinceData (1).csv', 'r', encoding='utf-8', errors='ignore') as thefile:
    reader = csv.DictReader(thefile)
    for row in reader:  # iterate over every row
        print(row['Province'])
        for i in range(len(list_province)):
            if row['Province'] == list_province[i]:
                # Provincial decision speed (same 29-province order as list_province)
                decide_speed = [80, 60, 80, 100, 80, 60, 100, 100, 60, 80, 60, 100, 80, 60, 80, 80, 60, 80, 100, 80, 80,
                                100, 100, 60, 100, 80, 60, 60, 100]
                # Provincial risk-response execution
                risk_speed = [80, 100, 100, 80, 100, 60, 100, 80, 60, 80, 100, 80, 60, 60, 80, 100, 60, 100, 80, 100,
                              60, 80, 100, 60, 100, 100, 60, 80, 60]
                # All-round execution on major events
                event_speed = [100, 100, 100, 100, 100, 80, 80, 100, 60, 100, 80, 100, 100, 100, 100, 100, 60, 80, 100,
                               100, 100, 100, 100, 100, 80, 100, 100, 80, 80]
                # Epidemic prevention and control effectiveness
                effect_speed = [80, 100, 100, 80, 80, 100, 80, 80, 60, 80, 60, 100, 80, 80, 100, 60, 80, 100, 60, 80,
                                80, 100, 100, 80, 100, 100, 80, 80, 100]

                if row['Date'] == '2020-02-23':
                    print(row['Date'])
                    last_confirmed = 0
                    last_recoverd = 0
                    # Death rate and cure rate
                    death_rate = float(row['Deaths']) / float(row['Confirmed'])
                    cured_rate = float(row['Recovered']) / float(row['Confirmed'])
                    # Newly confirmed cases relative to the province population (outbreak control)
                    i_confirmed = (int(row['Confirmed']) - int(last_confirmed)) / int(list_province_person[i])
                    # Newly cured cases relative to confirmed cases (medical intervention)
                    i_recoverd = (int(row['Recovered']) - int(last_recoverd)) / float(row['Confirmed'])
                    last_confirmed = row['Confirmed']
                    last_recoverd = row['Recovered']
                    detail = [cured_rate, death_rate, i_confirmed, i_recoverd,
                              decide_speed[i], risk_speed[i], event_speed[i], effect_speed[i]]
                    province_list['province_list' + str(i)].append(detail)

                elif row['Date'] == '2020-03-28':
                    print(row['Date'])
                    # Death rate and cure rate
                    death_rate = float(row['Deaths']) / float(row['Confirmed'])
                    cured_rate = float(row['Recovered']) / float(row['Confirmed'])
                    # Newly confirmed cases relative to the province population
                    i_confirmed = (int(row['Confirmed']) - int(last_confirmed)) / int(list_province_person[i])
                    # Newly cured cases relative to confirmed cases
                    i_recoverd = (int(row['Recovered']) - int(last_recoverd)) / float(row['Confirmed'])
                    last_confirmed = row['Confirmed']
                    last_recoverd = row['Recovered']
                    detail = [cured_rate, death_rate, i_confirmed, i_recoverd,
                              decide_speed[i], risk_speed[i], event_speed[i], effect_speed[i]]
                    province_list['province_list' + str(i)].append(detail)
                elif row['Date'] == '2020-04-29':
                    print(row['Date'])
                    # Death rate and cure rate
                    death_rate = float(row['Deaths']) / float(row['Confirmed'])
                    cured_rate = float(row['Recovered']) / float(row['Confirmed'])
                    # Newly confirmed cases relative to the province population
                    i_confirmed = (int(row['Confirmed']) - int(last_confirmed)) / int(list_province_person[i])
                    # Newly cured cases relative to confirmed cases
                    i_recoverd = (int(row['Recovered']) - int(last_recoverd)) / float(row['Confirmed'])
                    detail = [cured_rate, death_rate, i_confirmed, i_recoverd,
                              decide_speed[i], risk_speed[i], event_speed[i], effect_speed[i]]
                    province_list['province_list' + str(i)].append(detail)
                    # After the last snapshot, write this province's three records out
                    write(province_list['province_list' + str(i)])
                    # for j in range(32):
                    #     # 治愈率cured_rate
                    #     if row['Confirmed'] == 0.0:
                    #         cured_rate = 1.0
                    #         death_rate = 0.0
                    #     else:
                    #         # 死亡率death_rate
                    #         death_rate += float(row['Deaths']) / float(row['Confirmed'])
                    #         # 治愈率
                    #         cured_rate+=float(row['Recovered'])/float(row['Confirmed'])
                    #         #疫情控制程度
                    #         # 新增死亡人数比例
                    #         i_death=int(row['Deaths'])-int(last_death)
                    #         # 新增治愈人数比例
                    #         i_recoverd=int(row['Recovered'])-int(last_recoverd)
                    #         i_death_rate=i_death/int(list_province_person[i])
                    #         i_recoverd_rate=int(i_recoverd)/int(list_province_person[i])



                # print(cured_rate)
                # add_confirmed=



            # print(len(province_list['province_list' + str(i)]))
            # write(dict)
            # print(province_list['province_list' + str(i)])