Python爬取疫情实战

Python爬取疫情实战

项目环境

环境:python 3.7
工具:pycharm

爬取疫情数据

用到的包

import requests, time, json
from datetime import datetime

爬取使用的网站

# Endpoint handed to run(): nationwide totals and day-by-day history.
url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"

# Endpoint handed to detail(): per-province / per-city breakdown.
url2 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"

# Request headers: a desktop Firefox User-Agent string.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) "
        "Gecko/20100101 Firefox/78.0"
    ),
}

获取中国疫情总信息

def _to_iso_date(item):
    """Turn one day entry's 'MM.DD' ``date`` field into 'YYYY-MM-DD'."""
    # The year defaults to 2020, which the original code hard-coded.
    # NOTE(review): a per-entry 'y' (year) field is honoured when present;
    # confirm the feed actually provides it for data after 2020.
    year = str(item.get('y', '2020'))
    parsed = datetime.strptime(f"{year}.{item['date']}", '%Y.%m.%d')
    return parsed.strftime('%Y-%m-%d')


def run(url, header):
    """Fetch the nationwide epidemic history, merged into one dict per day.

    Args:
        url: Endpoint returning a JSON envelope whose ``data`` field is itself
            a JSON string containing ``chinaDayList`` and ``chinaDayAddList``.
        header: HTTP request headers passed to ``requests.get``.

    Returns:
        dict mapping 'YYYY-MM-DD' to a dict with the cumulative keys
        ``confirm``, ``suspect``, ``heal``, ``dead`` and the daily-increase
        keys ``confirm_add``, ``suspect_add``, ``heal_add``, ``dead_add``.
        A day present in only one of the two lists carries only that list's
        keys.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    # The timeout keeps a stalled server from hanging the scraper forever.
    r = requests.get(url, headers=header, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    # The payload is double-encoded: decode the envelope, then its 'data' string.
    data_all = json.loads(json.loads(r.text)['data'])

    history = {}
    # Cumulative figures per day.
    for day in data_all['chinaDayList']:
        history[_to_iso_date(day)] = {
            'confirm': day['confirm'],
            'suspect': day['suspect'],
            'heal': day['heal'],
            'dead': day['dead'],
        }
    # Daily increases; setdefault avoids a KeyError when a day is missing
    # from chinaDayList.
    for day in data_all['chinaDayAddList']:
        history.setdefault(_to_iso_date(day), {}).update({
            'confirm_add': day['confirm'],
            'suspect_add': day['suspect'],
            'heal_add': day['heal'],
            'dead_add': day['dead'],
        })
    return history

获取中国各省市的详细信息

def detail(detail_url, header):
    """Fetch per-city figures for every province.

    Args:
        detail_url: Endpoint returning a JSON envelope whose ``data`` field is
            a JSON string containing ``areaTree`` and ``lastUpdateTime``.
        header: HTTP request headers passed to ``requests.get``.

    Returns:
        list of rows ``[update_time, province, city, confirm, new_confirm,
        heal, dead]``, one per city.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    # The original accepted `header` but never sent it; pass it through.
    r = requests.get(detail_url, headers=header, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    # The payload is double-encoded: decode the envelope, then its 'data' string.
    data_all = json.loads(json.loads(r.text)['data'])
    date = data_all['lastUpdateTime']
    # areaTree[0] is China; its children are the provinces.
    provinces = data_all['areaTree'][0]['children']

    details = []
    for province in provinces:
        province_name = province['name']
        for city in province['children']:
            total = city['total']
            details.append([
                date,
                province_name,
                city['name'],
                total['confirm'],
                city['today']['confirm'],  # newly confirmed today
                total['heal'],
                total['dead'],
            ])
    return details

向数据库中插入、更新数据

创建数据库

创建数据库,包含两个表details,history

-- Per-city snapshot rows; one batch is inserted per scrape by update_details().
CREATE TABLE `details` (
  `id` int NOT NULL AUTO_INCREMENT,
  `update_time` datetime DEFAULT NULL COMMENT '数据最后更新时间',
  `province` varchar(50) DEFAULT NULL COMMENT '省',
  `city` varchar(50) DEFAULT NULL COMMENT '市',
  `confirm` int DEFAULT NULL COMMENT '累计确诊',
  `confirm_add` int DEFAULT NULL COMMENT '新增确诊',
  `heal` int DEFAULT NULL COMMENT '累计治愈',
  `dead` int DEFAULT NULL COMMENT '累计死亡',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=923 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

-- Nationwide figures, one row per day, keyed by the date.
CREATE TABLE `history` (
  `ds` datetime NOT NULL COMMENT '日期',
  `confirm` int DEFAULT NULL COMMENT '累计确诊',
  `confirm_add` int DEFAULT NULL COMMENT '当日新增确诊',
  `suspect` int DEFAULT NULL COMMENT '剩余疑似',
  `suspect_add` int DEFAULT NULL COMMENT '当日新增疑似',
  `heal` int DEFAULT NULL COMMENT '累计治愈',
  `heal_add` int DEFAULT NULL COMMENT '当日新增治愈',
  `dead` int DEFAULT NULL COMMENT '累计死亡',
  `dead_add` int DEFAULT NULL COMMENT '当日新增死亡',
  PRIMARY KEY (`ds`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

用到的包

import time, pymysql, traceback

编写连接、断开连接数据库函数

def get_conn():
    """Open a MySQL connection and a cursor on it.

    Returns:
        tuple: ``(conn, cursor)``.
    """
    # The '#' values are placeholders: fill in the database user, its
    # password and the name of an existing database.
    credentials = {'user': '#', 'password': '#', 'db': '#'}
    conn = pymysql.connect(**credentials)
    # Default cursor type (the original note says rows come back as tuples).
    return conn, conn.cursor()


def close_conn(conn, cursor):
    """Close the cursor first, then the connection; falsy arguments are skipped."""
    for resource in (cursor, conn):
        if resource:
            resource.close()

插入以及更新各省市详细信息

def update_details():
    """Insert the latest per-city rows into ``details`` unless already stored.

    Scrapes ``url2`` via ``detail``, compares the scraped update time with the
    ``update_time`` of the newest stored row, and inserts the whole batch only
    when they differ. Errors are printed as a traceback rather than raised;
    the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        d = detail(url2, header)
        conn, cursor = get_conn()
        sql = "insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s,%s)"
        # Yields 1 when the newest stored row has the scraped timestamp,
        # 0 when it differs and NULL when the table is empty.
        sql_query = 'select %s=(select update_time from details order by id desc limit 1)'
        cursor.execute(sql_query, (d[0][0],))
        if not cursor.fetchone()[0]:
            print(f"{time.asctime()}开始更新最新数据")
            # Send the batch in one call instead of one execute per row.
            cursor.executemany(sql, d)
            conn.commit()
            print(f"{time.asctime()}更新最新数据完毕")
        else:
            print(f"{time.asctime()}已是最新数据")
    except Exception:
        # Exception, not a bare except, so Ctrl-C and SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

插入、更新中国疫情总信息

def insert_history():
    """Bulk-insert every day returned by ``run`` into the ``history`` table.

    Intended for the initial load: rows are inserted unconditionally.
    Errors are printed as a traceback rather than raised; the connection is
    always closed.
    """
    cursor = None
    conn = None
    try:
        h = run(url, header)
        print(f"{time.asctime()}开始插入历史数据")
        conn, cursor = get_conn()
        sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # Column order follows the table definition; missing keys become NULL.
        rows = [
            [k, v.get('confirm'), v.get('confirm_add'), v.get('suspect'),
             v.get('suspect_add'), v.get('heal'), v.get('heal_add'),
             v.get('dead'), v.get('dead_add')]
            for k, v in h.items()
        ]
        cursor.executemany(sql, rows)
        conn.commit()
        print(f"{time.asctime()}插入历史数据完毕")
    except Exception:
        # Exception, not a bare except, so Ctrl-C and SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
def update_history():
    """Insert the days returned by ``run`` that ``history`` does not hold yet.

    Existing days are left untouched. Errors are printed as a traceback
    rather than raised; the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        dic = run(url, header)
        print(f"{time.asctime()}开始更新历史数据")
        conn, cursor = get_conn()
        sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        sql_query = "select confirm from history where ds=%s"
        for k, v in dic.items():
            # execute() returns the number of matched rows; 0 means the day is new.
            if not cursor.execute(sql_query, k):
                cursor.execute(sql, [k, v.get('confirm'), v.get('confirm_add'), v.get('suspect'),
                                     v.get('suspect_add'), v.get('heal'), v.get('heal_add'),
                                     v.get('dead'), v.get('dead_add')])
        # Commit and report once, after the loop. The original did both on
        # every iteration.
        conn.commit()
        print(f"{time.asctime()}历史数据更新完毕")
    except Exception:
        # Exception, not a bare except, so Ctrl-C and SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

利用selenium爬取热搜

def get_baidu():
    """Scrape the epidemic hot-search entries from Baidu with headless Firefox.

    Loads the page, clicks the element that expands the list, waits one second
    and collects the text of each matching span.

    Returns:
        list of str: one string per hot-search entry.

    NOTE(review): ``Firefox`` and ``FirefoxOptions`` are not imported in the
    visible source; presumably ``from selenium.webdriver import Firefox,
    FirefoxOptions`` is required. Confirm.
    """
    option = FirefoxOptions()
    option.add_argument("--headless")  # run without a visible browser window
    option.add_argument("--no-sandbox")
    driver = Firefox(executable_path='C:\\Users\\84572\\Desktop\\geckodriver.exe', options=option)

    news_url = "https://voice.baidu.com/act/virussearch/virussearch?from=osari_map&tab=0&infomore=1"

    try:
        driver.get(news_url)
        # Click the control that expands the list.
        but = driver.find_element_by_css_selector('.VirusHot_1-5-6_1Fqxy-')
        but.click()
        time.sleep(1)  # give the expanded list time to render
        c = driver.find_elements_by_xpath(
            '/html/body/div[2]/div/div/div/section/div[2]/div[1]/div/div[1]/section/a/div/span[2]')
        return [i.text for i in c]
    finally:
        # quit() ends the whole session, also when a step above raised.
        # The original close() ran only on success.
        driver.quit()

将热搜插入数据库中

创建数据表

-- One row per scraped hot-search entry; insert_baidu() supplies dt and content.
CREATE TABLE `hotsearch` (
  `id` int NOT NULL AUTO_INCREMENT,
  -- dt is also refreshed automatically whenever a row is updated
  `dt` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `content` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=21 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

插入数据库

def insert_baidu():
    """Scrape the hot-search list and store it in the ``hotsearch`` table.

    Every entry of one scrape gets the same timestamp. Errors are printed as
    a traceback rather than raised; the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        context = get_baidu()
        print(f"{time.asctime()}开始更新热搜数据")
        conn, cursor = get_conn()
        sql = "insert into hotsearch(dt,content) values(%s,%s)"
        # Explicit %H:%M:%S instead of %X: %X follows the process locale and
        # may not be a valid DATETIME literal.
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        cursor.executemany(sql, [(ts, i) for i in context])
        conn.commit()
        print(f"{time.asctime()}数据更新完毕")
    except Exception:
        # Exception, not a bare except, so Ctrl-C and SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

制作网页

可视化布局

HTML文件

<!DOCTYPE html>
<!-- The page content is Chinese, so declare that language instead of "en". -->
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link href="../static/css/main.css" rel="stylesheet"/>
</head>
<body>
    <!-- Dashboard skeleton: each div is positioned absolutely by main.css. -->
    <div id="title">全国疫情实时追踪</div>
    <div id="time">我是时间</div>
    <!-- left column -->
    <div id="l1">我是左1</div>
    <div id="l2">我是左2</div>
    <!-- centre column -->
    <div id="c1">我是中1</div>
    <div id="c2">我是中2</div>
    <!-- right column -->
    <div id="r1">我是右1</div>
    <div id="r2">我是右2</div>
</body>
</html>

CSS文件

/* Page background */
body {
    margin: 0;
    background: #333;
}

/* Header strip: title centred in the middle 40% of the page */
#title {
    position: absolute;
    width: 40%;
    height: 10%;
    top: 0;
    left: 30%;
    right: 30%;
    /* no background on purpose */
    color: white;
    font-size: 30px;
    display: flex;
    align-items: center;
    justify-content: center;
}

/* Clock in the top-right corner; width is left to the content */
#time {
    position: absolute;
    height: 10%;
    top: 5%;
    right: 2%;
    color: #fff;
    font-size: 20px;
}

/* Left column, upper and lower panels */
#l1 {
    position: absolute;
    width: 30%;
    height: 45%;
    top: 10%;
    left: 0;
    right: 70%;
    background: #777;
}

#l2 {
    position: absolute;
    width: 30%;
    height: 45%;
    top: 55%;
    left: 0;
    background: #666;
}

/* Centre column, upper panel: the four headline numbers */
#c1 {
    position: absolute;
    width: 40%;
    height: 30%;
    top: 10%;
    left: 30%;
    right: 30%;
    color: white;
    /* no background on purpose */
}

/* One quarter-width cell per headline number */
.num {
    width: 25%;
    float: left;
    display: flex;
    align-items: center;
    justify-content: center;
    color: gold;
    font-size: 20px;
}

/* One quarter-width cell per caption */
.txt {
    width: 25%;
    float: left;
    font-family: "幼圆";
    display: flex;
    align-items: center;
    justify-content: center;
}

.txt h2 {
    margin: 0;
}

/* Centre column, lower panel */
#c2 {
    position: absolute;
    width: 40%;
    height: 60%;
    top: 40%;
    left: 30%;
    right: 30%;
    background: #666;
}

/* Right column, upper and lower panels */
#r1 {
    position: absolute;
    width: 30%;
    height: 45%;
    top: 10%;
    left: 70%;
    right: 0;
    background: #666;
}

#r2 {
    position: absolute;
    width: 30%;
    height: 45%;
    top: 55%;
    left: 70%;
    right: 0;
    background: #666;
}

可视化布局界面如下

在这里插入图片描述

时间实时更新

前台操作

新建一个contorller.js文件,写一个函数获取本地时间

// Request the formatted time from the backend and show it in #time.
function gettime() {
    var showTime = function (data) {
        $('#time').html(data);
    };
    // Failures are ignored.
    var ignoreError = function (xhr, type, errorThrown) {};

    $.ajax({
        url: "/time",
        timeout: 10000,
        success: showTime,
        error: ignoreError
    });
}

// Refresh the clock every 1000 ms.
setInterval(gettime, 1000);

在html中引入js文件,实现局部刷新

<head>
<!-- jQuery is loaded first: contorller.js uses $.ajax -->
<script src="../static/js/jquery-1.11.1.min.js"></script>
</head>
<body>
<!-- Script holding gettime(), which refreshes #time once a second -->
<script src="../static/js/contorller.js"></script>
</body>
后台操作

新建一个utils.py文件,定义一个获取时间的函数

def get_time():
    """Return the local time formatted as 'YYYY年MM月DD日 HH:MM:SS'."""
    # The Chinese unit characters are filled in after strftime.
    # Presumably this keeps the strftime format ASCII-only, because non-ASCII
    # formats fail on some platforms.
    # %H:%M:%S replaces %X so the result does not depend on the process locale.
    time_str = time.strftime("%Y{}%m{}%d{} %H:%M:%S")
    return time_str.format("年", "月", "日")

在后台页面引用该函数

from flask import Flask, request, render_template, jsonify
import utils, string
from jieba.analyse import extract_tags

app = Flask(__name__)


@app.route('/time')
def get_time():
    """Serve the formatted current time as the response body of /time."""
    current = utils.get_time()
    return current

由于需要一直使用sql语句查询,所以使用封装函数进行查询

def get_conn():
    """Open a connection to the local MySQL server and a cursor on it.

    Returns:
        tuple: ``(conn, cursor)``.
    """
    # user / password / db are left blank here and must be filled in.
    settings = dict(
        host="localhost",
        user="",
        password="",
        db="",
        charset="utf8",
    )
    conn = pymysql.connect(**settings)
    return conn, conn.cursor()


def close_conn(conn, cursor):
    """Close the cursor first, then the connection; falsy arguments are skipped."""
    for handle in (cursor, conn):
        if handle:
            handle.close()


def query(sql, *args):
    """Run one parameterised SQL statement and return all result rows.

    Args:
        sql: SQL text with ``%s`` placeholders.
        *args: Values bound to the placeholders.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the statement.
    """
    conn, cursor = None, None
    try:
        conn, cursor = get_conn()
        cursor.execute(sql, args)
        return cursor.fetchall()
    finally:
        # Close also when execute/fetchall raises. The original leaked the
        # connection in that case.
        close_conn(conn, cursor)

更新关键数据

后台操作

在utils.py文件中,定义一个获取数据库数据的函数

def get_cl_data():
    """Return the headline totals for the most recent update.

    Returns:
        One row ``(confirm, suspect, heal, dead)``: the sums over the newest
        ``details`` batch, with ``suspect`` taken from the newest ``history``
        row.
    """
    # Each fragment ends with a space so the pieces cannot run together.
    # The original produced "sum(dead)from".
    sql = (
        "select sum(confirm), "
        "(select suspect from history order by ds desc limit 1), "
        "sum(heal), "
        "sum(dead) "
        "from details "
        "where update_time=(select update_time from details order by update_time desc limit 1)"
    )
    res = query(sql)
    return res[0]

定义路由

@app.route('/c1', methods=["get","post"])
def get_cl_data():
    """Return the four headline numbers as JSON for the #c1 panel.

    Response keys: ``confirm``, ``suspect``, ``heal``, ``dead``.
    """
    data = utils.get_cl_data()
    # Convert to plain ints, as the /c2 and /r1 routes do for their sums.
    # Presumably the SUM() columns arrive as Decimal; confirm against the driver.
    # None (empty table) is passed through as JSON null.
    confirm, suspect, heal, dead = (
        None if value is None else int(value) for value in data[:4]
    )
    return jsonify({"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead})

前台操作

更改HTML文件代码

<div id="c1">
        <!-- Value cells: get_cl_data() in the js file fills them by index
             (0 confirm, 1 suspect, 2 heal, 3 dead) -->
        <div class="num"><h1></h1></div>
        <div class="num"><h1></h1></div>
        <div class="num"><h1></h1></div>
        <div class="num"><h1></h1></div>
        <!-- Captions, in the same order as the value cells -->
        <div class="txt"><h2>累计确诊</h2></div>
        <div class="txt"><h2>剩余疑似</h2></div>
        <div class="txt"><h2>累计治愈</h2></div>
        <div class="txt"><h2>累计死亡</h2></div>
    </div>

在js文件中

// Fetch the four headline numbers from /c1 and write them into the
// .num cells in order: confirm, suspect, heal, dead.
function get_cl_data() {
    var fillCells = function (data) {
        var cells = $(".num h1");
        cells.eq(0).text(data.confirm);
        cells.eq(1).text(data.suspect);
        cells.eq(2).text(data.heal);
        cells.eq(3).text(data.dead);
    };
    // Failures are ignored.
    var ignoreError = function (xhr, type, errorThrown) {};

    $.ajax({
        url: "/c1",
        success: fillCells,
        error: ignoreError
    });
}

中国地图实现

前台操作

在html文件中引入js文件

 <head>
    <!-- ECharts is loaded before the scripts that call echarts.init -->
    <script src="../static/js/echarts.min.js"></script>
    <!-- presumably registers the 'china' map referenced by ec_center.js (confirm) -->
    <script src="../static/js/china.js"></script>
</head>
<body>
<!-- Loaded in the body: ec_center.js looks up the #c2 element when it runs -->
<script src="../static/js/ec_center.js"></script>
</body>

编写ec_center.js文件

// Map of China in #c2, coloured by cumulative confirmed cases per province.
var ec_center = echarts.init(document.getElementById('c2'), "dark");

// Placeholder data shown by the initial setOption call below.
var mydata = [
    {'name': '上海', 'value': 318},
    {'name': '云南', 'value': 162}
];

var ec_center_Option = {
    title: {text: '', subtext: '', x: 'left'},
    tooltip: {trigger: 'item'},
    // Value-band legend in the bottom-left corner
    visualMap: {
        show: true,
        x: 'left',
        y: 'bottom',
        textStyle: {fontSize: 8},
        splitList: [
            {start: 1, end: 9},
            {start: 10, end: 99},
            {start: 100, end: 999},
            {start: 1000, end: 9999},
            {start: 10000}
        ],
        color: ['#8A3310', '#C64918', '#E55B25', '#F2AD92', '#F9DCD1']
    },
    series: [{
        name: '累计确诊人数',
        type: 'map',
        mapType: 'china',
        roam: false,  // dragging and zooming disabled
        itemStyle: {
            // region border width / border colour / fill colour
            normal: {borderWidth: 0.5, borderColor: '#009fe8', areaColor: '#ffefd5'},
            // same properties while the mouse hovers over a region
            emphasis: {borderWidth: 0.5, borderColor: '#4b0082', areaColor: "#fff"}
        },
        label: {
            normal: {show: true, fontSize: 8},  // province names
            emphasis: {show: true, fontSize: 8}
        },
        data: mydata
    }]
};
ec_center.setOption(ec_center_Option);

后台操作

在utils.py中获取数据库的数据

def get_c2_data():
    """Return ``(province, total confirmed)`` rows for the newest update batch."""
    latest = "select update_time from details order by update_time desc limit 1"
    sql = f"select province,sum(confirm) from details where update_time=({latest}) group by province"
    return query(sql)

在app.py中定义路由

@app.route('/c2')
def get_c2_data():
    """Return per-province confirmed totals as ``{"data": [{name, value}, ...]}``."""
    rows = utils.get_c2_data()
    payload = [{'name': row[0], 'value': int(row[1])} for row in rows]
    return jsonify({"data": payload})

前台获取后台数据

// Load the per-province totals from /c2 and redraw the map with them.
function get_c2_data() {
    var redraw = function (resp) {
        ec_center_Option.series[0].data = resp.data;
        ec_center.setOption(ec_center_Option);
    };
    // Failures are ignored.
    var ignoreError = function (xhr, type, errorThrown) {};

    $.ajax({
        url: "/c2",
        success: redraw,
        error: ignoreError
    });
}
get_c2_data();

全国累计趋势折线图

前台操作

写一个ec_left1.js,在HTML页面引入该js

// Line chart of the nationwide cumulative trend, drawn into #l1.
var ec_left1 = echarts.init(document.getElementById('l1'), "dark");
var ec_left1_Option = {
    // Chart title
    title: {
        text: "全国累计趋势",
        textStyle: {},
        left: 'left'
    },
    tooltip: {
        trigger: 'axis',
        // Pointer drawn along the hovered axis position
        axisPointer: {
            type: 'line',
            lineStyle: {color: '#7171C6'}
        }
    },
    legend: {
        data: ['累计确诊', '现有疑似', '累计治愈', '累计死亡'],
        left: "right"
    },
    // Plot area placement
    grid: {left: '4%', right: '6%', bottom: '4%', top: 50, containLabel: true},
    xAxis: [{
        type: 'category',
        data: ['01.20', '01.21', '01.22']
    }],
    yAxis: [{
        type: 'value',
        // Tick labels: values of 1000 and above are shown in thousands ("k")
        axisLabel: {
            show: true,
            color: 'white',
            fontSize: 12,
            formatter: function (value) {
                return value >= 1000 ? value / 1000 + 'k' : value;
            }
        },
        axisLine: {show: true},
        // Horizontal grid lines
        splitLine: {
            show: true,
            lineStyle: {color: '#17273B', width: 1, type: 'solid'}
        }
    }],
    // [series name, placeholder data]; every series is a smoothed line
    series: [
        ["累计确诊", [260, 406, 529]],
        ["现有疑似", [54, 37, 3935]],
        ["累计治愈", [25, 25, 25]],
        ["累计死亡", [6, 9, 17]]
    ].map(function (entry) {
        return {name: entry[0], type: 'line', smooth: true, data: entry[1]};
    })
};

ec_left1.setOption(ec_left1_Option);

后台操作

在utils.py中,获取数据库数据

def get_l1_data():
    """Return ``(ds, confirm, suspect, heal, dead)`` rows of ``history`` by date.

    Rows are ordered by ``ds`` ascending.
    """
    # Explicit ordering: without ORDER BY the row order is not guaranteed,
    # and the /l1 route slices the result by position.
    sql = "select ds,confirm,suspect,heal,dead from history order by ds"
    res = query(sql)
    return res

在app.py中,写一个路由

@app.route('/l1')
def get_l1_data():
    """Return the cumulative-trend series as JSON for the #l1 chart.

    Only rows 20 to 25 of the history result are sent. Response keys:
    ``day`` ('MM-DD' strings), ``confirm``, ``suspect``, ``heal``, ``dead``.
    """
    window = utils.get_l1_data()[20:26]
    return jsonify({
        "day": [row[0].strftime("%m-%d") for row in window],
        "confirm": [row[1] for row in window],
        "suspect": [row[2] for row in window],
        "heal": [row[3] for row in window],
        "dead": [row[4] for row in window],
    })

获取后台数据

// Load the cumulative-trend series from /l1 and redraw the left-top chart.
function get_l1_data() {
    var redraw = function (resp) {
        var series = ec_left1_Option.series;
        ec_left1_Option.xAxis[0].data = resp.day;
        series[0].data = resp.confirm;
        series[1].data = resp.suspect;
        series[2].data = resp.heal;
        series[3].data = resp.dead;
        ec_left1.setOption(ec_left1_Option);
    };
    // Failures are ignored.
    var ignoreError = function (xhr, type, errorThrown) {};

    $.ajax({
        url: "/l1",
        success: redraw,
        error: ignoreError
    });
}


get_l1_data();

折线图效果图
在这里插入图片描述

全国新增趋势折线图

与全国累计趋势折线图步骤相同

条形统计图

创建js文件

// Bar chart drawn into #r1.
var ec_right1 = echarts.init(document.getElementById('r1'), "dark");

var ec_right1_Option = {
    // Chart title
    title: {
        text: "非湖北地区城市确诊TOP5",
        textStyle: {color: 'white'},
        left: 'left'
    },
    color: ['#3398DB'],
    tooltip: {
        trigger: 'axis',
        // Axis pointer for axis-triggered tooltips. The original note says
        // the default is a line and the options are 'line' | 'shadow'.
        axisPointer: {type: 'shadow'}
    },
    // Placeholder categories and values
    xAxis: {
        type: 'category',
        data: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    },
    yAxis: {type: 'value'},
    series: [{
        data: [10, 52, 200, 334, 390, 330, 220],
        type: 'bar',
        barMaxWidth: "50%"
    }]
};
ec_right1.setOption(ec_right1_Option);

后台获取数据库数据

def get_r1_data():
    """Return the six provinces with the most confirmed cases, largest first.

    Rows are ``(province, total confirmed)`` taken from the newest update batch.
    """
    latest = "select update_time from details order by update_time desc limit 1"
    sql = (
        f"select province,sum(confirm) from details where update_time=({latest}) "
        "group by province ORDER BY sum(confirm) DESC LIMIT 6"
    )
    return query(sql)

@app.route('/r1')
def get_r1_data():
    """Return the bar-chart data as ``{"province": [...], "confirm": [...]}``.

    The first (largest) row of the utils result is skipped. Going by the
    chart title, presumably that row is Hubei.
    """
    rows = utils.get_r1_data()[1:]
    province = [row[0] for row in rows]
    confirm = [int(row[1]) for row in rows]
    return jsonify({"province": province, "confirm": confirm})

获取后台数据

// Load the ranking from /r1 and redraw the bar chart with it.
function get_r1_data() {
    var redraw = function (resp) {
        ec_right1_Option.xAxis.data = resp.province;
        ec_right1_Option.series[0].data = resp.confirm;
        ec_right1.setOption(ec_right1_Option);
    };
    // Failures are ignored.
    var ignoreError = function (xhr, type, errorThrown) {};

    $.ajax({
        url: '/r1',
        success: redraw,
        error: ignoreError
    });
}
get_r1_data();

热搜文字图

引入js文件

<!-- Word-cloud add-on, needed for the 'wordCloud' series type used in #r2 -->
<script src="../static/js/echarts-wordcloud.min.js"></script>

编写js文件

// Word cloud of today's hot searches, drawn into #r2.
var ec_right2 = echarts.init(document.getElementById('r2'), "dark");

// Sample entries; the option below starts with an empty data array.
var d = [
    {'name': '肺炎', 'value': '12734670'},
    {'name': '实时', 'value': '12734670'},
    {'name': '新型', 'value': '12734670'}
];

var ec_right2_Option = {
    title: {
        text: "今日疫情热搜",
        textStyle: {color: 'white'},
        left: 'left'
    },
    tooltip: {show: false},
    series: [{
        type: 'wordCloud',
        gridSize: 1,
        sizeRange: [12, 55],              // font size range
        rotationRange: [-45, 0, 45, 90],  // rotation angles
        textStyle: {
            normal: {
                // Random RGB colour for each word
                color: function () {
                    var channels = [0, 0, 0].map(function () {
                        return Math.round(Math.random() * 255);
                    });
                    return 'rgb(' + channels.join(', ') + ')';
                }
            }
        },
        right: null,
        bottom: null,
        data: []
    }]
};
ec_right2.setOption(ec_right2_Option);

后台获取数据库数据

def get_r1_data():
    """Return the six provinces with the most confirmed cases, largest first.

    Rows are ``(province, total confirmed)`` taken from the newest update batch.
    """
    sql = "select province,sum(confirm) from details " \
          "where update_time=(select update_time from details order by update_time desc limit 1) group by province "\
          "ORDER BY sum(confirm) DESC LIMIT 6"
    res = query(sql)
    return res


def get_r2_data():
    """Return the newest hot-search entries as 1-tuples ``(content,)``.

    The /r2 route calls ``utils.get_r2_data()``, which this section never
    defined; the snippet repeated ``get_r1_data`` instead.
    """
    # NOTE(review): the limit of 20 assumes one scrape stores about 20 rows,
    # as the hotsearch table's AUTO_INCREMENT=21 suggests. Adjust if needed.
    sql = "select content from hotsearch order by id desc limit 20"
    res = query(sql)
    return res

@app.route('/r2')
def get_r2_data():
    """Return word-cloud entries as ``{"data": [{name, value}, ...]}``.

    Each stored entry is split into its text and its trailing digits. The
    text's extracted keywords become ``name`` (all-digit keywords are
    dropped); the trailing digits become ``value``.
    """
    words = []
    for row in utils.get_r2_data():
        entry = row[0]
        text = entry.rstrip(string.digits)  # entry without its trailing digits
        heat = entry[len(text):]            # the trailing digits themselves
        words.extend(
            {'name': keyword, 'value': heat}
            for keyword in extract_tags(text)
            if not keyword.isdigit()
        )
    return jsonify({"data": words})

从后台获取数据

// Load the word-cloud entries from /r2 and redraw the chart with them.
function get_r2_data() {
    var redraw = function (resp) {
        ec_right2_Option.series[0].data = resp.data;
        ec_right2.setOption(ec_right2_Option);
    };

    $.ajax({
        url: '/r2',
        success: redraw
    });
}
get_r2_data();

最终效果图

在这里插入图片描述

©️2020 CSDN 皮肤主题: 数字20 设计师:CSDN官方博客 返回首页