python爬虫01 -- 小小爬虫概述-CSDN博客

本文链接：https://blog.csdn.net/qq_34536604/article/details/106997753

爬虫概述

爬虫就是给网站发起请求，并从响应中提取需要的数据的自动化程序

爬虫原理

发起请求，获取响应

通过http库，对目标站点进行请求，等同于自己打开浏览器，输入网址
常用库：urllib（标准库）, urllib3, requests
服务器会返回请求的内容，一般为：html，二进制文件（音频/视频），文档，json字符串等

解析内容

寻找需要的信息，即利用正则表达式或者其他解析库提取目标信息

F12 查看网页源码: Elements：源码 Console: 页面后台输出 Network：监控网络状况（有我们需要的一些关键信息，如头信息，User-Agent,cookies，响应response等）

常用库：re, beautifulsoup4

保存数据

将解析得到的数据持久化到文件或者数据库中

爬虫–请求

使用urllib发送请求

from urllib import request

url = 'http://www.baidu.com'

# Open the URL; the context manager closes the HTTP response afterwards.
with request.urlopen(url) as res:
    print(res.geturl())  # URL the response was retrieved from
    print(res.getcode())  # HTTP status code
    print(res.info())  # response headers

    # read() returns bytes; decode() returns a NEW str, so the result has to
    # be kept (the bytes object itself is never modified in place).
    html = res.read().decode("utf-8")

print(html)

状态码

2xx：正常访问

3xx：发生了重定向，访问A内部转发到B网站

4xx：404，页面不存在 403，请求被禁止(反爬)

5xx：服务器内部错误

出现403则存在反爬，需要添加User-Agent HTTP Error 403: Forbidden
反爬措施：将自己的User-agent信息加入到header中（基本）

from urllib import request

url = 'http://www.dianping.com'
# Browser-like User-Agent. Adjacent string literals are concatenated, which
# keeps the header value on one logical line without embedding newlines.
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel "
                  "Mac OS X 10_15_4) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36"
}
req = request.Request(url, headers=header)  # request object carrying the headers
res = request.urlopen(req)  # send it and receive the response

使用requests发送请求

安装：pip install requests
发起请求requests.get()

import requests

url = "http://www.dianping.com"

# Anti-anti-crawler basics: send browser-like headers.
header = {
    "Host": "www.dianping.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/81.0.4044.122 Safari/537.36"
}
# Unlike urllib, no separate Request object is needed.
resp = requests.get(url, headers=header)
print(resp.encoding)  # encoding requests inferred for the body
print(resp.status_code)  # HTTP status code
# resp.content would give the raw bytes; resp.text is the body decoded with
# resp.encoding, so set the encoding before reading resp.text.
resp.encoding = "utf-8"
html = resp.text
print(html)

爬虫–解析

使用beautifulsoup4解析内容

beautifulsoup4将复杂的HTML文档转换成一个树形结构，每个节点都是python对象
安装：pip install beautifulsoup4
BeautifulSoup(html)

获取节点：find()、find_all()/select()

获取属性：attrs

获取文本：text

from bs4 import BeautifulSoup
import requests

# Request the listing page
url = 'http://wsjkw.sc.gov.cn/scwsjkw/gzbd/fyzt.shtml'
res =requests.get(url)
res.encoding = "utf-8"
html = res.text

# Parse; the parser has to be named explicitly via `features`
soup = BeautifulSoup(html, features="html.parser")
soup.find("h2")  # find() looks up the first <h2> only; the result is discarded here
a = soup.find("a")  # first <a> tag object
# print(a.attrs)  # attributes of that tag
u = a.attrs['href']

# Prefix the site root to the extracted href to build the next URL
url_new = "http://wsjkw.sc.gov.cn" + u
print(url_new)

# Second request, this time for the linked page
res_new = requests.get(url_new)
res_new.encoding = "utf-8"

# Second parse
soup_new = BeautifulSoup(res_new.text, features="html.parser")
s = soup_new.find("p") 
print(s)

使用re解析内容

re是python自带的正则表达式模块
re.search(regex, str)

在str中查找满足条件的字符串，匹配不上返回None

对返回的结果分组，可以在正则表达式内添加小括号分离数据：groups() 返回所有分组组成的元组，group(index) 返回指定分组内容（group() 或 group(0) 返回整个匹配）

import re

# Raw string so that "\d" reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
confirm_add_patten = r"确诊病例(\d+)例"
confirm_add = re.search(confirm_add_patten, res_new.text)
print(confirm_add)
# re.search() returns None when nothing matches; only read groups on a match.
if confirm_add is not None:
    print(confirm_add.group())  # whole match
    print(confirm_add.group(0))  # same as group(): the whole match
    print(confirm_add.group(1))  # text captured by the first (...) group

爬取腾讯疫情数据

部分网站反爬手段较高，需要较好的反反爬策略

数据抓取

分析网站
F12 查看网站信息，然后在network中查看js的数据信息找到需要的json数据，在headers中复制相关的request url(注意其url取舍)

import json

from bs4 import BeautifulSoup
import requests

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'

res = requests.get(url)
# xx = res.text # JSON text of the response
res.encoding = 'utf-8'
xx = res.text
data = json.loads(xx)  # parse the outer JSON document
print(type(data['data']))  # str: the "data" field is itself a JSON string

data_all = json.loads(data["data"])  # parse the nested JSON string into a dict
print(type(data_all))  # dict
for k, v in data_all.items():
    print(k, v)

分析处理

lastUpdateTime #最后更新时间

chinaTotal #总数

chinaAdd # 新增

areaTree :

-name #areaTree[0]: 中国数据

-today/-total

-children：name # 市级数据，列表 today/total

from datetime import time

from bs4 import BeautifulSoup
import requests
import time
import pymysql
import traceback  # 追踪异常


def get_tencent_data():
    """
    Fetch the nationwide day-by-day history from the Tencent endpoint.

    :return: dict keyed by date string ``"YYYY.MM.DD"``. Each value holds
        ``confirm``/``suspect``/``heal``/``dead`` for the days listed in
        ``chinaDayList`` and ``confirm_add``/``suspect_add``/``heal_add``/
        ``dead_add`` for the days listed in ``chinaDayAddList``.
    """

    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_other'

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/81.0.4044.122 Safari/537.36"
    }

    # Pass headers by keyword: the second positional argument of
    # requests.get() is `params`, which would put them in the query string.
    r = requests.get(url, headers=headers)
    res = json.loads(r.text)  # outer JSON -> dict
    data_all = json.loads(res['data'])  # "data" is a nested JSON string

    def normalize(day):
        # Entries carry the date without a year; 2020 is hard-coded here.
        tup = time.strptime("2020." + day, "%Y.%m.%d")
        return time.strftime("%Y.%m.%d", tup)

    history = {}  # per-day totals, later merged with the per-day increments
    for i in data_all['chinaDayList']:
        history[normalize(i['date'])] = {
            "confirm": i['confirm'],
            "suspect": i['suspect'],
            "heal": i['heal'],
            "dead": i['dead'],
        }

    for i in data_all['chinaDayAddList']:
        # setdefault: a day that only appears in the "add" list must not
        # raise KeyError.
        history.setdefault(normalize(i['date']), {}).update({
            "confirm_add": i['confirm'],
            "suspect_add": i['suspect'],
            "heal_add": i['heal'],
            "dead_add": i['dead'],
        })
    return history


def get_day_details():
    """
    Fetch the latest per-city figures from the Tencent endpoint.

    :return: list of rows
        ``[update_time, province, city, confirm, confirm_add, heal, dead]``
        built from ``areaTree[0]["children"]`` and each entry's
        ``children`` list.
    """
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/81.0.4044.122 Safari/537.36"
    }

    # Pass headers by keyword: the second positional argument of
    # requests.get() is `params`, not `headers`.
    r = requests.get(url, headers=headers)
    res = json.loads(r.text)  # outer JSON -> dict
    data_all = json.loads(res['data'])  # "data" is a nested JSON string

    update_time = data_all["lastUpdateTime"]
    data_provice = data_all["areaTree"][0]["children"]  # first areaTree entry's children
    return [
        [
            update_time,
            pro_infos["name"],
            city_infos['name'],
            city_infos['total']["confirm"],
            city_infos['today']['confirm'],
            city_infos['total']['heal'],
            city_infos['total']['dead'],
        ]
        for pro_infos in data_provice
        for city_infos in pro_infos['children']
    ]


# 建立数据库连接
def get_conn():
    """
    Open a connection to the local ``cov`` database and create a cursor.

    The connection is configured with ``pymysql.cursors.DictCursor`` as its
    cursor class.
    :return: ``(conn, cursor)``
    """
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='123456',
        cursorclass=pymysql.cursors.DictCursor,
        db='cov',
        charset="utf8",
    )
    cursor = conn.cursor()
    print('数据库连接开启')
    return conn, cursor


def close_conn(conn, cursor):
    """
    Close the cursor first, then the connection; falsy handles are skipped.

    :param conn: connection object or None
    :param cursor: cursor object or None
    """
    for handle in (cursor, conn):
        if handle:
            handle.close()
    print('数据库连接关闭')


def updata_details():
    """
    Insert the latest per-city rows into the ``details`` table.

    Rows are written only when the newest ``update_time`` already stored
    differs from the fetched one (or the table is empty). Errors are printed
    with a traceback instead of being raised.
    :return: None
    """

    cursor = None
    conn = None
    try:
        # rows: [update_time, province, city, confirm, confirm_add, heal, dead]
        li = get_day_details()
        if not li:
            return  # nothing fetched, so there is no timestamp to compare
        conn, cursor = get_conn()
        sql = ("insert into details(update_time, province, city, "
               "confirm, confirm_add, heal, dead) "
               "values(%s,%s,%s,%s,%s,%s,%s)")
        # Compares the fetched timestamp with the newest stored one.
        sql_query = ("select %s=(select update_time from details "
                     "order by id desc limit 1)")
        cursor.execute(sql_query, li[0][0])
        row = cursor.fetchone()
        # The fetched row object is truthy whatever it contains, so testing
        # the row itself would never trigger an update. Unwrap its single
        # value instead; dict rows (DictCursor) and tuple rows both work.
        if isinstance(row, dict):
            is_current = next(iter(row.values()), None)
        else:
            is_current = row[0] if row else None
        if not is_current:
            print(f"{time.asctime()}开始更新数据")
            for item in li:
                cursor.execute(sql, item)
            conn.commit()
            print(f"{time.asctime()}更新数据完毕")
        else:
            print(f"{time.asctime()}已经是最新数据")
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def insert_history():
    """
    Insert every fetched history day into the ``history`` table.

    Errors are printed with a traceback instead of being raised.
    :return: None
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()
        print(f"{time.asctime()}开始插入数据")
        conn, cursor = get_conn()
        # Columns are named explicitly so the statement does not depend on
        # the table's column order.
        sql = ('insert into history(ds, confirm, confirm_add, suspect, '
               'suspect_add, heal, dead, dead_add) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s)')
        for k, v in dic.items():
            # item shape: {'2020-1-1': {"confirm": 41, 'suspect': 0 ...}}
            cursor.execute(sql, [k,
                                 v.get("confirm"),
                                 v.get("confirm_add"),
                                 v.get("suspect"),
                                 v.get("suspect_add"),
                                 v.get('heal'),
                                 v.get('dead'),
                                 v.get('dead_add')])
        # Commit and report once, after all rows have been queued.
        conn.commit()
        print(f"{time.asctime()}插入历史数据完毕")
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def update_history():
    """
    Insert history days that are not yet present in the ``history`` table.

    Errors are printed with a traceback instead of being raised.
    :return: None
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()
        print(f"{time.asctime()}开始更新数据")
        conn, cursor = get_conn()
        # The `history` DDL in this file has eight columns and no `heal_add`,
        # so the insert names exactly those columns (same set that
        # insert_history() writes) instead of passing nine positional values.
        sql = ("insert into history(ds, confirm, confirm_add, suspect, "
               "suspect_add, heal, dead, dead_add) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s)")
        sql_query = "select confirm from history where ds = %s"
        for k, v in dic.items():
            # execute() returns the number of rows found; 0 means the day
            # has not been stored yet.
            if not cursor.execute(sql_query, k):
                cursor.execute(sql, [k,
                                     v.get("confirm"),
                                     v.get("confirm_add"),
                                     v.get("suspect"),
                                     v.get("suspect_add"),
                                     v.get('heal'),
                                     v.get('dead'),
                                     v.get('dead_add')])
        conn.commit()
        print(f"{time.asctime()}历史数据更新完毕")
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

存储数据

数据库建表

-- Nationwide daily figures, one row per date.
CREATE TABLE `history` (
    `ds`          DATETIME NOT NULL     COMMENT '日期',
    `confirm`     INT(11)  DEFAULT NULL COMMENT '累计确诊',
    `confirm_add` INT(11)  DEFAULT NULL COMMENT '当日新增确诊',
    `suspect`     INT(11)  DEFAULT NULL COMMENT '剩余疑似',
    `suspect_add` INT(11)  DEFAULT NULL COMMENT '当日新增疑似',
    `heal`        INT(11)  DEFAULT NULL COMMENT '累计治愈',
    `dead`        INT(11)  DEFAULT NULL COMMENT '累计死亡',
    `dead_add`    INT(11)  DEFAULT NULL COMMENT '当日新增死亡',
    PRIMARY KEY (`ds`) USING BTREE
) ENGINE = INNODB DEFAULT CHARSET = utf8mb4;

-- Per-city snapshot rows; `id` is an auto-increment surrogate key.
CREATE TABLE `details` (
    `id`          INT(11)     NOT NULL AUTO_INCREMENT,
    `update_time` DATETIME    DEFAULT NULL COMMENT '数据最后更新时间',
    `province`    VARCHAR(50) DEFAULT NULL COMMENT '省',
    `city`        VARCHAR(50) DEFAULT NULL COMMENT '市',
    `confirm`     INT(11)     DEFAULT NULL COMMENT '累计确诊',
    `confirm_add` INT(11)     DEFAULT NULL COMMENT '当日新增确诊',
    `heal`        INT(11)     DEFAULT NULL COMMENT '累计治愈',
    `dead`        INT(11)     DEFAULT NULL COMMENT '累计死亡',
    PRIMARY KEY (`id`)
) ENGINE = INNODB DEFAULT CHARSET = utf8mb4;

数据存储 –pymysql


# Open the database connection
config = dict(host='localhost', 
              user='root', 
              password='123456',
              cursorclass=pymysql.cursors.DictCursor, 
              db='cov')

conn = pymysql.Connect(**config)

cursor = conn.cursor()  # create a cursor; cursorclass above selects DictCursor

sql = "select * from history"
cursor.execute(sql)

# conn.commit() # commit the transaction (left disabled in this demo)
res = cursor.fetchall()  # fetch every row of the result set
print(res)

cursor.close()
conn.close()

抓取百度热搜数据

抓取数据

数据都是通过js动态加载的
安装浏览器（谷歌）打开浏览器设置-关于谷歌-可以查看版本号
下载对应版本浏览器驱动：http://npm.taobao.org/mirrors/chromedriver/

mac下载的驱动需要放在/usr/local/bin，通过 chromedriver --version 查看版本；首次运行前需要在“安全性与隐私”中解除系统对chromedriver的拦截才能使用

或者直接放在你当前执行的目录下

创建浏览器对象

浏览器.get()

浏览器.find()

实现代码

def get_baidu_hot():
    """
    Scrape the Baidu epidemic hot-search entries with headless Chrome.

    :return: list with the text of every hot-search entry found
    """

    option = ChromeOptions()
    option.add_argument("--headless")  # run without a visible browser window
    option.add_argument("--no-sandbox")  # original note: required when deploying on Linux

    # executable_path= could point at the driver binary if it is not on PATH
    brower = Chrome(options=option)
    try:
        brower.get(url="https://voice.baidu.com/act"
                       "/virussearch/virussear"
                       "ch?from=osari_map&tab=0&infomore=1")
        # print(brower.page_source)   # uncomment to inspect the page source

        # find_element(by, value) with the plain locator-strategy strings is
        # used instead of the find_element_by_* helpers, which newer
        # Selenium releases no longer provide.
        # Selector of the "expand" button:
        buttur = brower.find_element(
            "css selector",
            '#ptab-0 > div > div.VirusHot_1-5-6_32'
            'AY4F.VirusHot_1-5-6_2RnRvg > section > div')
        buttur.click()
        time.sleep(1)  # give the page a moment after the click

        c = brower.find_elements(
            "xpath",
            '//*[@id="ptab-0"]/div/div[1]/section/a/div/span[2]')
        context = [i.text for i in c]
    finally:
        # Always shut the browser down, otherwise every call leaves a
        # Chrome/chromedriver process behind.
        brower.quit()
    print(context)
    return context


def update_hotsearch():
    """
    Store the current hot-search entries in the ``hotsearch`` table.

    Every entry is written with the same timestamp. Errors are printed with
    a traceback instead of being raised.
    :return: None
    """

    cursor = None
    conn = None
    try:
        contect = get_baidu_hot()
        print(f"{time.asctime()}：开始更新数据")
        conn, cursor = get_conn()
        sql = 'insert into hotsearch(dt, content) values(%s, %s)'
        # Four-digit year and explicit H:M:S: "%y" is ambiguous and "%X"
        # depends on the process locale.
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        for i in contect:
            cursor.execute(sql, (ts, i))
        conn.commit()
        print(f"{time.asctime()}：数据更新完毕")
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

Flask

基础

Flask 是一个使用python编写的轻量级web应用框架。其WSGI（Python Web Server Gateway Interface）工具包采用Werkzeug，模版引擎则使用Jinja2，目前比较流行

小应用

安装：pip install flask

from flask import Flask
from flask import request
# Application object on which the @app.route decorators register views.
app = Flask(__name__)


@app.route('/')   # the decorator binds this view to the root URL
def hello_world():
    """Return a fixed greeting string as the response body."""
    greeting = 'Hello World!'
    return greeting


@app.route('/login')  # read the submitted form values and echo them back
def hello_world2():
    """Echo the ``name`` and ``pwd`` request values in the response body."""
    from html import escape  # function-scope import keeps this snippet self-contained

    name = request.values.get("name")
    pwd = request.values.get('pwd')
    # The values come straight from the client and the returned string is
    # sent as the response body, so escape them to prevent markup injection.
    return f"name={escape(str(name))}, pwd={escape(str(pwd))}"


@app.route('/abc')
def hello_world1():  # take an id from the request and render a login form
    """Return an HTML login form whose account field is pre-filled with ``id``."""
    from html import escape  # function-scope import keeps this snippet self-contained

    # Named user_id to avoid shadowing the builtin id(); a missing parameter
    # yields an empty field.
    user_id = request.values.get("id", "")
    # escape() also escapes quotes, so the value cannot break out of the
    # value="..." attribute.
    return (
        '<form action="/login">'
        f'账号：<input name="name" value="{escape(user_id)}"><br>'
        '密码：<input name="pwd">'
        '<input type="submit"></form>'
    )


# Call app.run() only when this file is executed as a script.
if __name__ == '__main__':
    app.run()

template模版使用

模版就是事先写好的页面，里面可以使用特殊语法引入变量
使用render_template返回模版页面就是前端页面
return的有三种：字符串、模版、json

from flask import render_template
# Application object on which the @app.route decorators register views.
app = Flask(__name__)


@app.route('/')   # the decorator binds this view to the root URL
def hello_world():
    """Return a fixed greeting string as the response body."""
    greeting = 'Hello World!'
    return greeting

@app.route("/mypage")
def moban():
    """Return the rendered ``new.html`` template."""
    page = render_template("new.html")
    return page

# Call app.run() only when this file is executed as a script.
if __name__ == '__main__':
    app.run()

使用ajax局部刷新页面

ajax 是Asynchronous Javascript and XML 的简称，通过Ajax向服务器发送请求，接收服务器返回的json数据，然后使用javascript修改网页来实现页面局部数据更新
使用jquery框架可方便的编写ajax代码，需要jquery.js文件
使用时需要在jquery官网下载相关jquery.js,将其文件放入static/js文件夹中，在相关html调用，就是在head头信息中加入<script src="../static/js/jquery-3.5.0.js"></script>
异步请求–局部刷新代码

<body>

<h1>疫情追踪</h1>
<h2>实时报道</h2>
<button>点击</button>


<script>
    $("button").click(function () {
        // Click handler for the <button>: fires the ajax request below
        $.ajax({
            url:"/ajax",  // route to request; must be defined by the backend
            type:"post",  // HTTP method
            data:{
                "name":"xxx",
                "pwd":"asadwqdw"
            }, // payload sent with the request
            success:function (d) {  
            // success callback; d is the data returned by the backend
                $("h1").html("实时报道替换成了"+d)
            },error:function () {  // called when the request fails
                alert("发送ajax请求失败")
            }
        })
    })
</script>

</body>

@app.route("/ajax", methods=["get", "post"])
# Methods are listed explicitly so the route also accepts POST; the path
# matches the `url` used by the front-end ajax call.
def ajax_ceshi():
    """Log the ``name``/``pwd`` values sent by the ajax call and reply "10000"."""
    # Values from the `data` object of the front-end request
    name, pwd = (request.values.get(key) for key in ("name", "pwd"))
    print(f"{name, pwd}")
    return "10000"  # received by the front end as d == '10000'


@app.route("/index")  # the page that contains the ajax script
def index_ap():
    """Return the rendered ``index.html`` template."""
    page = render_template("index.html")
    return page

# Call app.run() only when this file is executed as a script.
if __name__ == '__main__':
    app.run()

可视化大屏模版制作

HBuilder–前端软件

使用绝对定位划分板块

#.css 采用id选择器
/* Title panel: absolutely positioned, 40% wide, starting 30% from the left */
#c0{
	position: absolute; /* absolute positioning */
	width: 40%;
	height: 10%;
	top: 0;
	left: 30%;
	/* background: #666666; */
	color: white;  /* text colour */
	font-size: 30px;
	
	display: flex;  /* flexbox layout */
	align-items: center;   /* centre on the cross axis (vertical for the default row direction) */
	justify-content: center; /* centre on the main axis (horizontal for the default row direction) */
}

# .html 引用前面的选择器
	<!-- Page head: charset, title and the stylesheet that defines #c0 -->
	<head>
		<meta charset="utf-8">
		<title>疫情监控</title>
		<link rel="stylesheet" type="text/css" href="../static/css/main.css"/>
		<style></style>
	</head>
	<body>
		<!-- Title block, styled through the #c0 id selector -->
		<div id="c0">全国疫情实时追踪</div>
	</body>

flask – 后端

自定义工具，utils.py

import time
import pymysql


def get_time():
    """Return the current local time as ``YYYY年MM月DD日 <locale time>``."""
    now = time.localtime()
    # Format the numeric parts first and insert the CJK unit characters
    # afterwards, so strftime() only ever sees an ASCII format string.
    year, month, day = time.strftime("%Y %m %d", now).split()
    clock = time.strftime("%X", now)
    return f"{year}年{month}月{day}日 {clock}"


def get_conn():
    """
    Open a connection to the local ``cov`` database and create a cursor.

    :return: ``(conn, cursor)``
    """
    settings = {
        "host": "localhost",
        "user": "root",
        "password": "123456",
        "db": "cov",
        "charset": "utf8",
    }
    conn = pymysql.connect(**settings)
    return conn, conn.cursor()


def close_conn(conn, cursor):
    """Close the cursor first, then the connection."""
    for handle in (cursor, conn):
        handle.close()


def query(sql, *args):
    """
    Run a query and return all of its rows.

    :param sql: SQL statement, optionally containing ``%s`` placeholders
    :param args: values bound to the placeholders
    :return: result of ``cursor.fetchall()``, e.g. ``((...), (...))``
    """

    conn, cursor = get_conn()
    try:
        cursor.execute(sql, args)
        return cursor.fetchall()
    finally:
        # Release the connection even when execute()/fetchall() raises.
        close_conn(conn, cursor)


def get_cl_data():
    """
    Return the figures for the panel with id ``c1``.

    :return: first result row: summed ``confirm`` from the newest ``details``
        snapshot, the ``suspect`` value of the newest ``history`` row, then
        summed ``heal`` and summed ``dead`` from the same snapshot
    """
    # Adjacent literals inside parentheses keep the statement on one logical
    # line; a plain string literal cannot span physical lines.
    sql = ("SELECT SUM(confirm), "
           "(SELECT suspect from history "
           "ORDER BY ds DESC LIMIT 1),"
           "SUM(heal),SUM(dead) from details "
           "WHERE update_time=(SELECT update_time "
           "from details ORDER BY update_time desc LIMIT 1)")
    res = query(sql)
    return res[0]

app.py

from flask import Flask, jsonify
from flask import render_template
# Application object on which the @app.route decorators register views.
app = Flask(__name__)


@app.route("/")
def index_ap():
    """Return the rendered ``main.html`` template."""
    page = render_template("main.html")
    return page

import utils


@app.route("/time")
def get_time():
    """Return the formatted time string produced by ``utils.get_time()``."""
    now_text = utils.get_time()
    return now_text


@app.route("/c1")
def get_c1_data():
    """
    Return the ``c1`` panel figures as JSON.

    :return: JSON object with integer ``confirm``, ``suspect``, ``heal``
        and ``dead`` fields
    """
    data = utils.get_cl_data()
    keys = ("confirm", "suspect", "heal", "dead")
    # A column is None when the tables hold no rows yet (SUM over nothing);
    # report 0 in that case instead of failing inside int().
    return jsonify({key: int(value or 0) for key, value in zip(keys, data)})