python疫情数据爬取与可视化展示

最新推荐文章于 2024-07-09 14:43:42 发布

ch1762

最新推荐文章于 2024-07-09 14:43:42 发布

阅读量4.1k

点赞数 6

分类专栏： technology 文章标签： python

本文链接：https://blog.csdn.net/weixin_43150428/article/details/108077622

版权

本文介绍如何使用Python的requests和BeautifulSoup进行疫情数据爬取，并通过正则表达式处理数据，最后将数据导入数据库进行可视化展示。

摘要由CSDN通过智能技术生成

python疫情数据爬取与可视化展示

爬虫&正则

requests&Beautiful Soup

requests爬取网页(练手)

import requests

# Fetch one page with requests and inspect the response (warm-up exercise).
url = 'http://wsjkw.sc.gov.cn/scwsjkw/gzbd/fyzt.shtml'  # URL of the page to fetch
# Some sites use anti-scraping measures and answer a plain request like the
# one below with HTTP 403. Sending a browser-like User-Agent header can get
# around that:
#   header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'}
#   res = requests.get(url, headers=header, timeout=10)
# The timeout keeps the script from waiting forever on a server that never
# answers; requests applies no timeout by default.
res = requests.get(url, timeout=10)
print(res.encoding)              # encoding requests picked for the response
res.encoding = 'utf-8'           # decode the body as UTF-8 from here on
print(res.headers)               # response headers
print(res.url)                   # URL the response came from
print(res.status_code)           # HTTP status code
print(res.text)                  # page content, decoded with the encoding set above

####Beautiful Soup解析网页数据

# Parse the page fetched above, follow its first link, and parse the page
# that link points to.
from urllib.parse import urljoin

from bs4 import BeautifulSoup

html = res.text
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html, 'html.parser')
g_url = soup.find('a').attrs['href']             # href of the first <a> tag on the page
# e.g. /scwsjkw/gzbd01/2020/8/16/2ad3f5e305ee4d1c8bbd56cef4bbaa84.shtml
# urljoin resolves the root-relative href against the host; plain string
# concatenation would produce "//" between host and path.
new_url = urljoin('http://wsjkw.sc.gov.cn/', g_url)
res = requests.get(new_url, timeout=10)          # timeout: do not hang on an unresponsive server
res.encoding = 'utf-8'
html = res.text
s = BeautifulSoup(html, 'html.parser')
s.find_all('p')[1]                               # second <p> tag; a bare expression, only displayed in an interactive session

正则

re

正则表达式速查表:https://www.jb51.net/tools/regexsc.htm

re.search(regex,str)

import re

# Pull the "newly confirmed cases" figure out of the second <p> tag's text.
html = s.find_all('p')[1].text
# Parenthesised groups are what the match captures; text outside a group
# still has to match but is not captured.
#   r"新增(.*?)确诊病例(\d+)"  captures 新型冠状病毒肺炎 and 1
#   r"新增.*?确诊病例(\d+)"    captures 1 only
# Raw string so that \d reaches the regex engine instead of being treated as
# an (invalid) string escape sequence.
confirm_add_patten = r"新增(.*?)确诊病例(\d+)"
# re.search returns a Match object, or None when the pattern is not found.
confirm_add = re.search(confirm_add_patten, html)
if confirm_add is None:
    # Page wording changed or the wrong paragraph was selected; report it
    # instead of failing with AttributeError on None.
    print('no match for pattern:', confirm_add_patten)
else:
    print(confirm_add.groups())                       # tuple of all captured groups
    print(confirm_add.group(0))                       # the entire matched text
    print(confirm_add.group(1))                       # first captured group
    print(confirm_add.group(2))                       # second captured group
# Sample output (depends on the text of the fetched page):
# ('新型冠状病毒肺炎', '1')
# 新增新型冠状病毒肺炎确诊病例1
# 新型冠状病毒肺炎
# 1

数据搜集

爬取腾讯数据

import datetime
import json
import requests
from bs4 import BeautifulSoup
import time
### 封装函数 返回各省市疫情情况
def get_data():
    today = str(datetime.date.today())
    
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_other'
    res = requests.get(url)
    d = json.loads(res.text)
    data = json.loads(d['data'])
    province = data['provinceCompare']
    
    list1 = []
    for pro in province.keys(