Python爬虫之Selenium爬取途牛全国的酒店数据进行地图可视化

因为没有找到途牛 url 的规律,只能用 Selenium 一个城市一个城市地慢慢爬取数据。由于页面加载很慢,用一台电脑爬取 4000+ 条数据可能需要数小时。这里只是简单实现了爬取每个城市第一页的数据,可以在这个基础上实现多个页面同时爬取,并增加翻页功能。

爬取全国酒店数据+可视化

爬取数据

在这里插入图片描述

一个py文件和一个文本文件就可以爬取了

首先是py文件

import json
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import re
import pymysql

# Hotel list page used as the crawl's starting point
# (the cityName query value is the URL-encoded form of "广州").
START_URL = "https://hotel.tuniu.com/list/602p0s0b0?cityName=%E5%B9%BF%E5%B7%9E"
# Seconds to wait for the first page to load.
PAGE_LOAD_WAIT = 3

# Start Chrome through the "chromedriver.exe" driver executable.
driver = webdriver.Chrome("chromedriver.exe")
# Open the Tuniu hotel list page.
driver.get(START_URL)
# Maximize the window.
# NOTE(review): getData() clicks at a fixed pixel offset, which presumably
# assumes this window size — confirm before changing.
driver.maximize_window()
# Give the page time to finish loading.
time.sleep(PAGE_LOAD_WAIT)

# 判断一下数据为不为空 为空就将字符串"null"返回去
def judgeLen(temp):
    """Return the first item of *temp*, or the string "null" when it is empty.

    Used on ``re.findall`` results so that a field missing from the page is
    stored as the placeholder text "null" instead of raising an IndexError.
    """
    return temp[0] if len(temp) > 0 else "null"

# Regular expressions that extract the individual fields from the HTML of one
# "hotel-item" <div>. Compiled once here instead of once per hotel.
_NAME_PATT = re.compile(r'span.*?hotel-name f-m.*?>(.*?)</span>')
# Diamond icons are counted the same way as star icons.
_DIAMOND_PATT = re.compile(r'(icon icon-diamond)')
_STAR_PATT = re.compile(r'(icon icon-star)')
_RATING_PATT = re.compile(
    r'"hotel-score f-b f-DINA" data-v-74d0f10f="" style="background: rgb.*?;">(.*?)</div')
_COMMENT_PATT = re.compile(r'</span><span class="comment-amount f-r" data-v-74d0f10f="">(.*?)条评论')
_PRICE_PATT = re.compile(
    r'<span class="amount f-b f-DINA" data-v-74d0f10f="">(.*?)</span><span class="qi')

# CSS selector of the city search box on the hotel list page.
_CITY_INPUT_SELECTOR = ".city-div > input:nth-child(1)"

# Parameterized INSERT: the driver escapes the values, so hotel names that
# contain quotes can neither break the statement nor inject SQL.
_INSERT_SQL = (
    "insert into `TC_hotel` "
    "(hname,hbrand,province,city,starlevel,rating,comment_count,price) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _parseHotel(html):
    """Extract one hotel's fields from the HTML text of its "hotel-item" div.

    Returns a tuple ``(hname, hbrand, star, rating, comm, price)`` of strings.
    Fields that cannot be found are the placeholder "null" (see judgeLen).
    """
    name = judgeLen(_NAME_PATT.findall(html))
    if name.find("(") > 0:
        # "brand(name)": text before the first "(" is stored as the brand,
        # the text after it (minus the last character) as the hotel name.
        parts = name.split("(")
        hname = parts[1][:-1]
        hbrand = parts[0]
    else:
        hname = name
        hbrand = "其他"

    # The level is the number of diamond icons, or of star icons when the
    # hotel shows no diamonds.
    diamonds = len(_DIAMOND_PATT.findall(html))
    if diamonds > 0:
        star = str(diamonds) + "星"
    else:
        star = str(len(_STAR_PATT.findall(html))) + "星"

    rating = judgeLen(_RATING_PATT.findall(html))
    comm = judgeLen(_COMMENT_PATT.findall(html))
    price = judgeLen(_PRICE_PATT.findall(html))
    return hname, hbrand, star, rating, comm, price


def getData():
    """Crawl the first result page of every city and store the hotels in MySQL.

    Reads the province -> ["code|city name", ...] mapping from "AllCity.txt",
    types each city name into the search box of the already opened ``driver``
    page, parses the resulting hotel list with BeautifulSoup and inserts one
    row per hotel into the `TC_hotel` table, which must already exist.

    The cursor and the connection are closed when the crawl ends, including
    when it ends with an exception.
    """
    # Load the city list first so the file is not kept open during the crawl.
    with open("AllCity.txt", mode="r", encoding="utf-8") as file:
        jsondata = json.load(file)

    connect = pymysql.connect(host="xxxxx", port=12345, user="xxx", passwd="xxxx",database="mydata",charset="utf8")
    try:
        cursor = connect.cursor()
        try:
            # pro is the province (dict key); its value is that province's cities.
            for pro, tempList in jsondata.items():
                for city in tempList:
                    # Each entry is "code|name"; keep the Chinese city name.
                    place = str(city).split("|")[1]

                    # ---------- switch the page to the next city ----------
                    # The input is looked up again after clear(), as the
                    # original code did, instead of reusing the element.
                    driver.find_element_by_css_selector(_CITY_INPUT_SELECTOR).clear()
                    driver.find_element_by_css_selector(_CITY_INPUT_SELECTOR).send_keys(place)
                    time.sleep(2)
                    # Click the first suggestion in the dropdown, which
                    # navigates to that city.
                    ActionChains(driver).move_by_offset(226, 263).click().perform()
                    # Move the pointer back so the next relative move starts
                    # from the same origin.
                    ActionChains(driver).move_by_offset(-226, -263).perform()
                    time.sleep(5)

                    # Parse the rendered page and walk every hotel <div>.
                    bs = BeautifulSoup(driver.page_source, "html.parser")
                    for div in bs.find_all("div", class_="hotel-item"):
                        hname, hbrand, star, rating, comm, price = _parseHotel(str(div))
                        cursor.execute(
                            _INSERT_SQL,
                            (str(hname), str(hbrand), str(pro), str(place),
                             str(star), str(rating), str(comm), str(price)),
                        )
                        # Commit per row so finished work survives a crash.
                        connect.commit()
                        print("插入数据   "+str(pro), str(place), str(hname), str(hbrand), str(star), str(rating), str(comm), str(price))
        finally:
            cursor.close()
    finally:
        connect.close()

# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    getData()

还有一个文本文件 AllCity.txt(代码里按这个文件名读取,要和 py 文件放在同一个目录下)
把下面的内容 copy 进去即可

{
    "北京": ["bj|北京"],

    "天津": ["tj|天津"],

    "上海": ["sh|上海"],

    "台湾": ["tw|台湾"],

    "香港": ["hk|香港"],

    "澳门": ["am|澳门"],

    "河北": ["bd|保定", "cangzhou|沧州", "chengde|承德", "dingzhou|定州", "gt|馆陶", "hd|邯郸", "hs|衡水", "lf|廊坊", "qhd|秦皇岛", "sjz|石家庄", "ts|唐山", "xt|邢台", "zjk|张家口", "zd|正定", "zx|赵县", "zhangbei|张北"],

    "河南": ["ay|安阳", "changge|长葛", "hb|鹤壁", "jiaozuo|焦作", "jiyuan|济源", "kaifeng|开封", "luoyang|洛阳", "luohe|漯河", "mg|明港", "ny|南阳", "pds|平顶山", "puyang|濮阳", "sq|商丘", "smx|三门峡", "xx|新乡", "xc|许昌", "xy|信阳", "yuzhou|禹州", "yanling|鄢陵", "zz|郑州", "zk|周口", "zmd|驻马店"],

    "黑龙江": ["dq|大庆","dxal|大兴安岭", "hrb|哈尔滨", "hegang|鹤岗", "heihe|黑河", "jms|佳木斯", "jixi|鸡西", "mdj|牡丹江", "qqhr|齐齐哈尔", "qth|七台河", "suihua|绥化", "sys|双鸭山", "yich|伊春"],

    "吉林": ["bc|白城", "baishan|白山", "cc|长春", "jl|吉林", "liaoyuan|辽源", "songyuan|松原", "sp|四平", "th|通化", "yanbian|延边"],

    "辽宁" : ["as|鞍山", "benxi|本溪", "cy|朝阳", "dl|大连", "dandong|丹东", "fushun|抚顺", "fx|阜新", "hld|葫芦岛", "jinzhou|锦州", "liaoyang|辽阳", "pj|盘锦", "sy|沈阳", "tl|铁岭", "wfd|瓦房店", "yk|营口", "pld|庄河"],

    "山东": ["bz|滨州", "dz|德州", "dy|东营", "heze|菏泽", "jn|济南", "jining|济宁", "kl|垦利", "linyi|临沂", "lc|聊城", "lw|莱芜", "qd|青岛", "rizhao|日照", "shouguang|寿光", "longkou|龙口", "ta|泰安", "wf|潍坊", "weihai|威海", "yt|烟台", "zb|淄博", "zaozhuang|枣庄", "zhangqiu|章丘", "zc|诸城"],

    "内蒙古": ["alsm|阿拉善盟", "bt|包头", "bycem|巴彦淖尔", "chifeng|赤峰", "erds|鄂尔多斯", "hu|呼和浩特", "hlbe|呼伦贝尔", "hlr|海拉尔", "tongliao|通辽", "wuhai|乌海", "wlcb|乌兰察布", "xl|锡林郭勒", "xam|兴安盟"],

    "江苏": ["cz|常州", "dafeng|大丰", "danyang|丹阳", "dongtai|东台", "donghai|东海", "ha|淮安", "haimen|海门", "haian|海安", "jingjiang|靖江", "jianhu|建湖", "liyang|溧阳", "lyg|连云港", "nj|南京", "nt|南通", "pizhou|邳州", "qidong|启东", "rugao|如皋", "rudong|如东", "su|苏州", "shuyang|沭阳", "suqian|宿迁", "taizhou|泰州", "taixing|泰兴", "wx|无锡", "xinghuashi|兴化", "xinyishi|新沂", "xz|徐州", "xzpeixian|沛县", "yangzhong|扬中", "yz|扬州", "yancheng|盐城", "zj|镇江"],

    "安徽": ["anqing|安庆", "bengbu|蚌埠", "bozhou|亳州", "ch|巢湖", "chizhou|池州", "chuzhou|滁州", "fy|阜阳", "hf|合肥", "hn|淮南", "huaibei|淮北", "huangshan|黄山", "hexian|和县", "hq|霍邱", "la|六安", "mas|马鞍山", "ningguo|宁国", "suzhou|宿州", "tianchang|天长", "tongling|铜陵", "tongcheng|桐城", "wuhu|芜湖", "xuancheng|宣城"],

    "山西": ["changzhi|长治", "dt|大同", "jincheng|晋城", "jz|晋中", "lvliang|吕梁", "linfen|临汾", "linyixian|临猗", "qingxu|清徐", "shuozhou|朔州", "ty|太原", "xinzhou|忻州", "yuncheng|运城", "yq|阳泉"],

    "陕西": ["ankang|安康", "baoji|宝鸡", "hanzhong|汉中", "sl|商洛", "tc|铜川", "wn|渭南", "xa|西安", "xianyang|咸阳", "yanan|延安", "yl|榆林"],

    "甘肃": ["by|白银", "dx|定西", "gn|甘南", "jinchang|金昌", "jyg|嘉峪关", "jq|酒泉", "lz|兰州", "linxia|临夏", "ln|陇南", "pl|平凉", "qingyang|庆阳", "tianshui|天水", "wuwei|武威", "zhangye|张掖"],

    "浙江": ["hz|杭州", "cixi|慈溪", "changxing|长兴", "deqing|德清", "dongyang|东阳", "haining|海宁", "huzhou|湖州", "jiashanx|嘉善", "jx|嘉兴", "jh|金华", "lishui|丽水", "nb|宁波", "quzhou|衢州", "ruiancity|瑞安", "sx|绍兴", "tongxiang|桐乡", "tz|台州", "wenling|温岭", "wz|温州", "xiangshanxian|象山", "yiwu|义乌", "yueqingcity|乐清", "yuyao|余姚", "zhoushan|舟山", "zhuji|诸暨"],

    "江西": ["fuzhou|抚州", "ganzhou|赣州", "jj|九江", "ja|吉安", "jdz|景德镇", "nc|南昌", "px|萍乡", "sr|上饶", "xinyu|新余", "yingtan|鹰潭", "yichun|宜春", "yxx|永新"],

    "湖北": ["es|恩施", "ez|鄂州", "hshi|黄石", "hg|黄冈", "jingzhou|荆州", "jingmen|荆门", "qianjiang|潜江", "shiyan|十堰", "snj|神农架", "suizhou|随州", "tm|天门", "wh|武汉", "xf|襄阳", "xiaogan|孝感", "xiantao|仙桃", "xianning|咸宁", "yc|宜昌", "yidou|宜都"],

    "湖南": ["cs|长沙", "changde|常德", "chenzhou|郴州", "hy|衡阳", "hh|怀化", "ld|娄底", "shaoyang|邵阳", "xiangtan|湘潭", "xiangxi|湘西", "yy|岳阳", "yongzhou|永州", "yiyang|益阳", "zhuzhou|株洲", "zjj|张家界"],

    "贵州": ["anshun|安顺", "bijie|毕节", "gy|贵阳", "lps|六盘水", "qdn|黔东南", "qn|黔南", "qxn|黔西南", "tr|铜仁", "zunyi|遵义"],

    "四川": ["ab|阿坝", "bazhong|巴中", "cd|成都", "deyang|德阳", "dazhou|达州", "ga|广安", "guangyuan|广元", "ganzi|甘孜", "ls|乐山", "luzhou|泸州", "liangshan|凉山", "mianyang|绵阳", "ms|眉山", "scnj|内江", "nanchong|南充", "panzhihua|攀枝花", "suining|遂宁", "yb|宜宾", "ya|雅安", "zg|自贡", "zy|资阳"],

    "云南": ["bs|保山", "cx|楚雄", "dali|大理", "diqing|迪庆", "dh|德宏", "honghe|红河", "km|昆明", "lj|丽江", "lincang|临沧", "nujiang|怒江", "pe|普洱", "qj|曲靖", "ws|文山", "bn|西双版纳", "yx|玉溪", "zt|昭通"],

    "新疆":  ["aks|阿克苏", "ale|阿拉尔", "bygl|巴音郭楞", "betl|博尔塔拉", "changji|昌吉", "hami|哈密", "ht|和田", "klmy|克拉玛依", "kel|库尔勒", "ks|喀什", "kzls|克孜勒苏", "shz|石河子", "tlf|吐鲁番", "tmsk|图木舒克", "xj|乌鲁木齐", "wjq|五家渠", "yili|伊犁", "alt|阿勒泰", "tac|塔城"],

    "宁夏": ["guyuan|固原", "szs|石嘴山", "wuzhong|吴忠", "yinchuan|银川", "zw|中卫"],

    "青海":  ["guoluo|果洛", "huangnan|黄南", "hx|海西", "haidong|海东", "haibei|海北", "hainan|海南", "xn|西宁", "ys|玉树"],

    "西藏": ["al|阿里", "changdu|昌都", "lasa|拉萨", "linzhi|林芝", "nq|那曲", "rkz|日喀则", "sn|山南", "rituxian|日土", "gaizexian|改则"],

    "广西": ["baise|百色", "bh|北海", "chongzuo|崇左", "fcg|防城港", "gl|桂林", "gg|贵港", "hc|河池", "hezhou|贺州", "liuzhou|柳州", "lb|来宾", "nn|南宁", "qinzhou|钦州", "wuzhou|梧州", "yulin|玉林"],

    "广东": ["chaozhou|潮州", "dg|东莞", "fs|佛山", "gz|广州", "huidong|惠东", "huizhou|惠州", "heyuan|河源", "jm|江门", "jy|揭阳", "mm|茂名", "mz|梅州", "qingyuan|清远", "sd|顺德", "sz|深圳", "st|汕头", "sg|韶关", "sw|汕尾", "taishan|台山", "yj|阳江", "yangchun|阳春", "yf|云浮", "zh|珠海", "zs|中山", "zhanjiang|湛江", "zq|肇庆", "boluo|博罗"],

    "福建": ["fz|福州", "jinjiangshi|晋江", "ly|龙岩", "nd|宁德", "np|南平", "nananshi|南安", "pt|莆田", "qz|泉州", "sm|三明", "shishi|石狮", "wuyishan|武夷山", "xm|厦门", "zhangzhou|漳州"],

    "海南": ["haikou|海口", "sansha|三沙", "sanya|三亚", "wzs|五指山", "qh|琼海", "wenchang|文昌", "wanning|万宁", "tunchang|屯昌", "qiongzhong|琼中", "lingshui|陵水", "df|东方", "da|定安", "cm|澄迈", "baoting|保亭", "baish|白沙", "tanzhou|儋州"]


}

爬出来的数据表(星钻可以不作区分)
在这里插入图片描述
设计表(方便插入数据就全部varchar,见谅)
在这里插入图片描述

可视化Flask+Echarts

在这里插入图片描述

圈起来的就是用到的
在这里插入图片描述
首先是app.py文件

from flask import Flask, render_template
from flask_sqlalchemy import SQLAlchemy

# Flask application object; the routes in this file are registered on it.
app = Flask(__name__)
# MySQL connection URL using the pymysql driver. The Chinese parts are
# placeholders to fill in: user name, password, host, port and database name.
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql+pymysql://用户名:密码@域名:端口/数据库?charset=utf8'
# setdefault only stores the value when the key is not configured yet.
# NOTE(review): nothing in this file appears to use modification tracking —
# confirm whether False would be the better value here.
app.config.setdefault('SQLALCHEMY_TRACK_MODIFICATIONS', True)
# SQLAlchemy handle bound to the app; provides db.Model and db.session.
db = SQLAlchemy(app)

"""
1)	编写程序,计算每个酒店的综合得分

先对星级(starlevel)、评价(rating)、评论数(comment_count)3个字段做以下转换:

对评价(rating)和评论数(comment_count)两个字段做归一化,调整到[0, 1]区间,得到评价得分和评论得分;
星级得分为: 星数  *   0.2 。

综合得分为:  星级得分(30%)、评价得分(50%)、评论得分(20%)的加权平均

2)	统计每个省份酒店的平均总得分

3)	主标题为“全国各省酒店综合得分”(红色,加粗)

4)	输出全国各省综合得分情况地图


"""
# SQLAlchemy映射
class yang_Table(db.Model):
    """ORM mapping of the hotel table read by the visualization.

    Every column is declared as a 50-character string, so numeric fields
    must be converted by the code that reads them.
    """

    # NOTE(review): the scraper's INSERT statement spells the table
    # `TC_hotel`; MySQL table names can be case-sensitive depending on the
    # server configuration — confirm both names resolve to the same table.
    __tablename__ = 'tc_hotel'

    # NOTE(review): hname is the only primary-key column; hotel names may
    # repeat across cities — confirm this matches the real table design.
    hname = db.Column(db.String(50), primary_key=True)
    hbrand = db.Column(db.String(50))
    province = db.Column(db.String(50))
    city = db.Column(db.String(50))
    starlevel = db.Column(db.String(50))
    rating = db.Column(db.String(50))
    comment_count = db.Column(db.String(50))
    price = db.Column(db.String(50))

def _parseHotelRow(row):
    """Return ``(province, stars, rating, comment_count)`` for one query row.

    *row* is indexed like the query in china(): 2 = province, 4 = starlevel
    (text such as "3星"), 5 = rating, 6 = comment_count. Returns None when a
    field is missing or not numeric, e.g. the "null" placeholder the scraper
    stores for values it could not find.
    """
    try:
        stars = int(row[4][:-1])  # drop the trailing "星"
        rating = float(row[5])
        comments = float(row[6])
    except (TypeError, ValueError):
        return None
    return row[2], stars, rating, comments


def _normalized(value, low, span):
    """Min-max normalize *value* into [0, 1], rounded to 2 decimals.

    Returns 0.0 when *span* is 0 (all values equal) instead of dividing by
    zero.
    """
    if not span:
        return 0.0
    return round((value - low) / span, 2)


def _provinceAverages(rows):
    """Return ``{province: average composite score}`` for the hotel rows.

    Composite score (0-100) = star score * 30 + rating score * 50 +
    comment score * 20, where the star score is stars * 0.2 and the other two
    are min-max normalized over all usable rows. Rows that cannot be parsed
    are skipped; an empty input yields an empty dict.
    """
    parsed = [p for p in (_parseHotelRow(r) for r in rows) if p is not None]
    if not parsed:
        return {}

    ratings = [p[2] for p in parsed]
    counts = [p[3] for p in parsed]
    ratingMin = min(ratings)
    ratingSpan = max(ratings) - ratingMin
    countMin = min(counts)
    countSpan = max(counts) - countMin

    # Group by province name; rows of one province need not be contiguous.
    scoresByProvince = {}
    for province, stars, rating, count in parsed:
        starScore = round(float(stars * 0.2), 2)
        ratingScore = _normalized(rating, ratingMin, ratingSpan)
        commentScore = _normalized(count, countMin, countSpan)
        total = round(float(starScore * 30 + ratingScore * 50 + commentScore * 20), 2)
        scoresByProvince.setdefault(province, []).append(total)

    return {
        province: round(float(sum(scores) / len(scores)), 2)
        for province, scores in scoresByProvince.items()
    }


@app.route("/")
@app.route("/china")
def china():
    """Render the map of the average composite hotel score per province.

    Loads every hotel row, computes the per-province averages and passes
    them to the "china.html" template as a list of ``{"name", "value"}``
    dicts, the shape the ECharts map series expects.
    """
    # Column order matters: _parseHotelRow reads indexes 2, 4, 5 and 6.
    ds = db.session.query(yang_Table.hname, yang_Table.hbrand, yang_Table.province, yang_Table.city,yang_Table.starlevel,yang_Table.rating,yang_Table.comment_count,yang_Table.price).all()
    avgdata = _provinceAverages(ds)
    # Convert to the list-of-dicts form ECharts accepts.
    result = [{"name": key, "value": value} for key, value in avgdata.items()]
    title = "全国各省酒店综合得分"
    tips = '综合得分'
    return render_template("china.html", data=result, title=title, tips=tips)

# Start Flask's built-in server on 127.0.0.1:5222 with debug mode enabled
# when this file is executed as a script.
if __name__ == "__main__":
    app.run(host='127.0.0.1', port=5222, debug=True)

然后就是html文件

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>数据可视化</title>
    <style>
        #map {
            width : 1000px;
            height: 600px;
            margin : 50px auto;
        }
    </style>
</head>
<body>
    <!-- Container the ECharts map is drawn into -->
    <div id="map">
    </div>
    <!-- ECharts library and the China map definition; kept inside <body>
         because markup after </body> is not valid HTML -->
    <script src="../static/js/echarts.js" charset="utf-8"></script>
    <script src="../static/js/china.js" charset="utf-8"></script>
    <script>
    // Values passed in by the Flask view. The tojson filter emits valid,
    // escaped JavaScript literals, so quotes in the text or None/True/False
    // in the data cannot break the script.
    var chartTitle = {{ title|tojson }};
    var seriesName = {{ tips|tojson }};
    var provinceData = {{ data|tojson }};

    var myChart = echarts.init(document.getElementById('map'));
    var option = {
            title: {
                text: chartTitle,
                textStyle: {
                    color: 'red' ,
                    fontSize: 16 ,
                    fontWeight: 'bolder',
                },
                left: '40%'
            },
            tooltip: {
                    // Show "<series name><br/><province>:<score>"
                    formatter:function(params,ticket, callback){
                        return params.seriesName+'<br />'+params.name+':'+params.value
                    }
                },
            // Colour scale for the 0-100 composite score
            visualMap: {
                min: 0,
                max: 100,
                left: 'left',
                top: 'bottom',
                text: ['高','低'],
                inRange: {
                    color:  ['#00FF00', '#FFFF00', '#FF0000']
                },
                show:true
            },
            geo: {
                map: 'china',
                roam: false,
                zoom:1.23,
                label: {
                    normal: {
                        show: true,
                        fontSize:'10',
                        color: 'rgba(0,0,0,0.7)'
                    }
                },
                itemStyle: {
                    normal:{
                        borderColor: 'rgba(0, 0, 0, 0.2)'
                    },
                    emphasis:{
                        areaColor: '#F3B329',
                        shadowOffsetX: 0,
                        shadowOffsetY: 0,
                        shadowBlur: 20,
                        borderWidth: 0,
                        shadowColor: 'rgba(0, 0, 0, 0.5)'
                    }
                }
            },
            series : [
                {
                    name: seriesName,
                    type: 'map',
                    geoIndex: 0,
                    data: provinceData
                }
            ]
        }

    myChart.setOption(option);
    </script>
</body>
</html>

js文件如果没有的话可以私信我

在这里插入图片描述
原创不易,请给博主一个小小的赞吧~

  • 21
    点赞
  • 85
    收藏
    觉得还不错? 一键收藏
  • 12
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 12
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值