Dianping (大众点评) Scraper

The code scrapes Dianping shop listings: shop name, shop URL, tags, location, review count, average spend per person, overall rating, flavor/environment/service scores, address, and phone number. The May Day holiday is here, so take the chance to relax.

# Load libraries

# Data analysis library
import pandas as pd

# Scraping libraries
import re
import time
import json
import requests

Define the helper functions

# '商家名称' (shop name) + '商家链接' (shop URL)
def get_name_url(res_text):
    
    name_url = re.findall('title="(.+?)" target="_blank" href="(.+?)"  >\n',res_text)
    
    name = []
    url = []
    
    for v in name_url:
        name.append(v[0])
        url.append(v[1])
    
    return name, url
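
A quick sanity check with a fabricated list-page fragment (illustrative only; the real markup may vary, but this matches the regex above):

# Fabricated <a ...> fragment shaped the way get_name_url() expects
sample = '<a title="示例餐厅" target="_blank" href="https://www.dianping.com/shop/123456"  >\n'
print(get_name_url(sample))
# (['示例餐厅'], ['https://www.dianping.com/shop/123456'])
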
# '标签' (tags) + '位置' (location)
def get_tag_loc(res_text):
    
    # Raw HTML fragments for the tag and location blocks
    tag_temp = re.findall('<span class="tag">.+</span></a>\n        <em class="sep">',res_text)
    tag_temp = [v.replace('</a>\n        <em class="sep">','') for v in tag_temp]
    loc_temp = re.findall('</em>\n        .+(<span class="tag">.+</span>)',res_text)
    
    # Pull the text between tags and strip the <svgmtsi> obfuscation markers
    tag = [[v.replace('<svgmtsi class="tagName">','') for v in n] for n in [re.findall('>(.+?)<',v) for v in tag_temp]]
    loc = [[v.replace('<svgmtsi class="tagName">','') for v in n] for n in [re.findall('>(.+?)<',v) for v in loc_temp]]
    
    # Keep the first entry for each shop
    tag = [v[0] for v in tag]
    loc = [v[0] for v in loc]
    
    return tag, loc
# '评论数量' (review count)
def get_comment(res_text):
    
    comment_list = []
    comment = re.findall('<b>.+\n条评价',res_text)
    
    for a in comment:
        r = [v.replace('<svgmtsi class="shopNum">','') for v in re.findall('>(.+?)<',a)]
        comment_list.append(r)
    
    comment_list = [v[0] for v in comment_list]
    
    return comment_list
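
Again a fabricated fragment for illustration (on the real page some digits sit inside <svgmtsi> tags, which the replace above strips):

# Fabricated review-count fragment
sample = '<b>4876</b>\n条评价'
print(get_comment(sample))
# ['4876']
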
# '人均消费' (average spend per person)
def get_per(res_text):
    per_temp = re.findall('人均\n            (<b>¥.+</b>)', res_text)
    per = [[v.replace('<svgmtsi class="shopNum">','').replace('¥','') for v in re.findall('>(.+?)<', n)] for n in per_temp]
    per = [v[0] for v in per]
    return per
# '评分' (overall rating)
def get_score(res_text):
    # Parse the JSON response with json.loads; ast.literal_eval would
    # reject JSON literals such as true/false/null
    score = json.loads(res_text).get('fiveScore')
    return score
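
A trimmed, made-up example of the payload shape this endpoint appears to return, with only the fields the code reads (field names inferred from the functions here, not from any official documentation):

# Hypothetical payload containing just the fields used below
sample_json = '{"fiveScore": "4.8", "shopRefinedScoreValueList": ["4.7", "4.8", "4.6"]}'
print(get_score(sample_json))
# 4.8
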

# '口味' (flavor)
def get_score_flavor(res_text):
    score_flavor = json.loads(res_text).get('shopRefinedScoreValueList')[0]
    return score_flavor

# '环境' (environment)
def get_score_env(res_text):
    score_env = json.loads(res_text).get('shopRefinedScoreValueList')[1]
    return score_env

# '服务' (service)
def get_score_service(res_text):
    score_service = json.loads(res_text).get('shopRefinedScoreValueList')[2]
    return score_service
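
The three functions above differ only in the list index; a single parameterized helper (a sketch, not part of the original code) would remove the duplication:

# Sketch: one accessor for the refined scores; order assumed from the
# functions above: 0 = flavor, 1 = environment, 2 = service
def get_refined_score(res_text, index):
    return json.loads(res_text).get('shopRefinedScoreValueList')[index]
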
# '地址' (address)
def get_address(res_text):
    tag = re.findall('<div class="expand-info address" itemprop="street-address">.+?</div>', res_text)[0]
    address_temp = re.findall('>(.+?)<', tag)
    address = [re.sub('<e class="address">|<d class="num">| ','',v) for v in address_temp if v != ' '][1]
    return address
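
As above, a fabricated detail-page fragment to show what the function extracts (real pages wrap parts of the address in <e>/<d> tags, which the re.sub strips):

# Fabricated address block
sample = '<div class="expand-info address" itemprop="street-address">地址:<span>某某路100号</span></div>'
print(get_address(sample))
# 某某路100号
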
# '电话' (phone number)
def get_tel(res_text):
    tag = re.findall('<p class="expand-info tel">.+?</p>', res_text)[0]
    tel_temp = re.findall('>(.+?)<', tag)
    tel_temp_1 = [re.sub('<d class="num">| ','',v) for v in tel_temp if v != ' ']
    tel = [v.replace('&nbsp;',' ') for v in tel_temp_1][1]
    return tel
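
One more fabricated fragment; note the &nbsp; entity, which the function converts back to a space before indexing:

# Fabricated phone block
sample = '<p class="expand-info tel">电话:&nbsp;<span>021-12345678</span></p>'
print(get_tel(sample))
# 021-12345678
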

Scrape the data
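
Everything below assumes User_Agent, Cookie, token, and uuid already exist as variables; a placeholder sketch (copy the real values from your own logged-in browser session via F12 → Network):

# Placeholders only; fill in values captured from your own browser session
User_Agent = 'Mozilla/5.0 ...'  # your browser's User-Agent string
Cookie = '...'   # Cookie header from a logged-in dianping.com request
token = '...'    # _token query parameter seen in the reviewAndStar request
uuid = '...'     # uuid query parameter from the same request
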

# Fetch the shop listing pages

# Container for the fetched page sources
res_text = []

# Number of pages to scrape
page = 3

# Start scraping
for i in range(page):
    
    # Current page number
    page_num = str(i+1)
    
    # Build the listing URL
    url = 'https://www.dianping.com/shanghai/ch10/g117p' + page_num
    
    # Set headers
    # Replace User_Agent and Cookie with your own values
    headers = {
        'User-Agent': User_Agent,
        'Cookie': Cookie
    }
    
    # Send the request
    res = requests.get(url, headers=headers)
    
    # On a non-200 response, wait 5 s and retry
    while res.status_code != 200:
        print('                  ', end='\r')
        print('Please wait...', end='\r')
        time.sleep(5)
        res = requests.get(url, headers=headers)
    
    # Save the page source
    res_text.append(res.text)
    
    # Pause 5 s
    time.sleep(5)
    
    # Print progress
    print(str(i+1)+'/'+str(page)+' pages scraped', end='\r')
# Output: 3/3 pages scraped
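
The request-plus-retry pattern above is repeated verbatim for every endpoint below; a small helper (a sketch, not in the original code) would keep it in one place:

# Sketch: GET with the same naive retry-on-non-200 loop used throughout,
# minus the progress prints
def fetch(url, headers, params=None, wait=5):
    res = requests.get(url, headers=headers, params=params)
    while res.status_code != 200:
        time.sleep(wait)
        res = requests.get(url, headers=headers, params=params)
    return res
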
# Full scrape

# Final DataFrame
df_final = pd.DataFrame()

# Start scraping
for i in range(len(res_text)):

    # Initialize per-page containers
    score_list = []
    score_list_flavor = []
    score_list_env = []
    score_list_service = []
    address_list = []
    tel_list = []
    df = pd.DataFrame()
    df_score = pd.DataFrame()
    
    # Listing-page columns: ['商家名称', '商家链接', '标签', '位置', '评论数量', '人均消费']
    
    name_temp, url_temp = get_name_url(res_text[i])                  # shop name, shop URL
    tag_temp, loc_temp = get_tag_loc(res_text[i])                    # tags, location
    comment_temp = get_comment(res_text[i])                          # review count
    per_temp = get_per(res_text[i])                                  # average spend per person
    
    # Assemble the listing-page data
    df_columns = ['商家名称', '商家链接', '标签', '位置', '评论数量', '人均消费']
    df_temp = pd.DataFrame([name_temp, url_temp, tag_temp, loc_temp, comment_temp, per_temp], index = df_columns).T
    
    # Append to the page DataFrame
    df = pd.concat([df, df_temp], ignore_index = True)
    
    # Detail columns: ['评分', '口味', '环境', '服务', '地址', '电话']
    
    # Shop names (reuse the values parsed above)
    title = name_temp
    # Shop IDs parsed from the shop URLs
    shopid = [v.split('/')[-1] for v in url_temp]

    # Scrape each shop's detail data
    for j, v in enumerate(shopid):

        # ['评分', '口味', '环境', '服务'] (rating, flavor, environment, service)
        
        # Rating endpoint
        url = 'https://www.dianping.com/ajax/json/shopDynamic/reviewAndStar'

        # Set params
        # Open DevTools (F12), find the request to 'https://www.dianping.com/ajax/json/shopDynamic/reviewAndStar', and copy the matching values
        # Replace token and uuid with your own values
        params = {
            'shopId': v,
            'cityId': '1',
            'mainCategoryId': '34245',
            '_token': token,
            'uuid': uuid,
            'platform': '1',
            'partner': '150',
            'optimusCode': '10',
            'originUrl': 'https://www.dianping.com/shop/'+v
        }

        # Set headers
        # Replace User_Agent and Cookie with your own values
        headers = {
            'User-Agent': User_Agent,
            'Cookie': Cookie
        }

        # Send the request
        res = requests.get(url, headers = headers, params = params)
        while res.status_code != 200:
            print('Taking a short break.')
            time.sleep(5)
            res = requests.get(url, headers = headers, params = params)

        # Response body
        res_text_score = res.text

        # Shop rating ['评分']
        score = get_score(res_text_score)

        # Flavor, environment, and service scores ['口味', '环境', '服务'];
        # direct calls instead of the fragile exec() string-building
        score_flavor = get_score_flavor(res_text_score)
        score_env = get_score_env(res_text_score)
        score_service = get_score_service(res_text_score)

        # Collect the scores
        score_list.append(score)
        score_list_flavor.append(score_flavor)
        score_list_env.append(score_env)
        score_list_service.append(score_service)

        # ['地址', '电话'] (address, phone)
        
        # Shop detail page
        url = 'https://www.dianping.com/shop/'+v

        # Set headers
        # Replace User_Agent and Cookie with your own values
        headers = {
            'User-Agent': User_Agent,
            'Cookie': Cookie
        }

        # Send the request
        res = requests.get(url, headers = headers)

        # Shop address ['地址']
        address = get_address(res.text)
        
        # Shop phone number ['电话']
        tel = get_tel(res.text)

        # Collect address and phone
        address_list.append(address)
        tel_list.append(tel)

        # Pause 5 s
        time.sleep(5)
        
        # Print progress (j from enumerate avoids the O(n) shopid.index(v) lookup)
        print(str(i+1)+'/'+str(len(res_text))+' pages, '+str(j+1)+'/'+str(len(shopid))+' shops scraped', end='\r')
    
    # Assemble the rating data
    df_score_columns = ['评分', '口味', '环境', '服务', '地址', '电话']
    df_score_temp = pd.DataFrame([score_list, score_list_flavor, score_list_env, score_list_service, address_list, tel_list], index = df_score_columns).T
    df_score = pd.concat([df_score, df_score_temp], ignore_index=True)
    
    # Merge into the final DataFrame
    df_final_temp = pd.concat([df, df_score], axis=1)
    df_final = pd.concat([df_final, df_final_temp], ignore_index= True)
    
    # Pause 5 s
    time.sleep(5)
    
    # Print progress
    print('                             ', end='\r')
    print(str(i+1)+'/'+str(len(res_text))+' pages scraped', end='\r')
# Output: 3/3 pages scraped

Export the data

# Export to Excel
df_final.to_excel('./大众点评爬取.xlsx', index=False)
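
pandas' to_excel needs an Excel engine such as openpyxl installed; if it is not available, a CSV export works as a fallback (utf-8-sig keeps the Chinese text readable when opened in Excel):

# Fallback export if no Excel engine is installed
df_final.to_csv('./大众点评爬取.csv', index=False, encoding='utf-8-sig')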