本文代码爬取大众点评商铺信息,字段包括:商家名称、商家链接、标签、位置、评论数量、人均消费、评分、口味、环境、服务、地址、电话。五一假期来了,小伙伴们可以好好放松一下啦。
# 加载库
# 数据分析库
import pandas as pd
import numpy as np
import pickle
import os
# 爬虫库
import re
import time
import requests
import json
import ast
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from lxml import etree
设置函数
# Shop name + shop link
def get_name_url(res_text):
    """Extract shop names and their detail-page URLs from a listing page.

    Matches anchors of the form title="..." target="_blank" href="..."
    and returns two parallel lists: (names, urls).
    """
    pairs = re.findall('title="(.+?)" target="_blank" href="(.+?)" >\n', res_text)
    names = [shop_name for shop_name, _ in pairs]
    urls = [shop_url for _, shop_url in pairs]
    return names, urls
# Category tag + location
def get_tag_loc(res_text):
    """Extract the category tag and the location tag for each shop.

    Both values sit in <span class="tag"> elements: the category span is
    followed by the <em class="sep"> separator, the location span comes after.
    """
    raw_tags = re.findall('<span class="tag">.+</span></a>\n <em class="sep">', res_text)
    raw_tags = [chunk.replace('</a>\n <em class="sep">', '') for chunk in raw_tags]
    raw_locs = re.findall('</em>\n .+(<span class="tag">.+</span>)', res_text)

    def _first_text(chunk):
        # Text nodes between tags, with the anti-scraping obfuscation tag
        # stripped; the first piece is the visible label.
        pieces = [p.replace('<svgmtsi class="tagName">', '') for p in re.findall('>(.+?)<', chunk)]
        return pieces[0]

    tag = [_first_text(chunk) for chunk in raw_tags]
    loc = [_first_text(chunk) for chunk in raw_locs]
    return tag, loc
# Review count
def get_comment(res_text):
    """Extract the review count shown for each shop on the listing page."""
    per_shop_pieces = []
    for block in re.findall('<b>.+\n条评价', res_text):
        # Text nodes between tags, with the digit-obfuscation tag stripped.
        pieces = [p.replace('<svgmtsi class="shopNum">', '') for p in re.findall('>(.+?)<', block)]
        per_shop_pieces.append(pieces)
    # The first piece of each block is the visible count.
    return [pieces[0] for pieces in per_shop_pieces]
# Average spend per person
def get_per(res_text):
    """Extract the average-spend ("人均") figure for each shop."""
    per = []
    for block in re.findall('人均\n (<b>¥.+</b>)', res_text):
        # Strip the digit-obfuscation tag and the currency sign.
        cleaned = [p.replace('<svgmtsi class="shopNum">', '').replace('¥', '')
                   for p in re.findall('>(.+?)<', block)]
        per.append(cleaned[0])
    return per
# Overall rating
def get_score(res_text):
    """Parse the overall rating ('fiveScore') from the reviewAndStar JSON response.

    Uses json.loads — consistent with get_score_flavor/env/service, which parse
    the same endpoint's response — instead of ast.literal_eval, which cannot
    handle JSON literals such as true/false/null.
    """
    return json.loads(res_text).get('fiveScore')
# Taste score
def get_score_flavor(res_text):
    """Return the taste ('口味') score: entry 0 of shopRefinedScoreValueList."""
    refined = json.loads(res_text).get('shopRefinedScoreValueList')
    return refined[0]
# Environment score
def get_score_env(res_text):
    """Return the environment ('环境') score: entry 1 of shopRefinedScoreValueList."""
    refined = json.loads(res_text).get('shopRefinedScoreValueList')
    return refined[1]
# Service score
def get_score_service(res_text):
    """Return the service ('服务') score: entry 2 of shopRefinedScoreValueList."""
    refined = json.loads(res_text).get('shopRefinedScoreValueList')
    return refined[2]
# Street address
def get_address(res_text):
    """Extract the street address from a shop detail page."""
    address_div = re.findall('<div class="expand-info address" itemprop="street-address">.+?</div>', res_text)[0]
    pieces = re.findall('>(.+?)<', address_div)
    # Drop pure-space nodes, strip obfuscation tags and spaces; the second
    # remaining piece is the address text.
    cleaned = [re.sub('<e class="address">|<d class="num">| ', '', p) for p in pieces if p != ' ']
    return cleaned[1]
# Phone number
def get_tel(res_text):
    """Extract the phone number from a shop detail page."""
    tel_p = re.findall('<p class="expand-info tel">.+?</p>', res_text)[0]
    pieces = re.findall('>(.+?)<', tel_p)
    cleaned = [re.sub('<d class="num">| ', '', piece) for piece in pieces if piece != ' ']
    # NOTE(review): this replace reads as a space-for-space no-op; the first
    # argument was presumably a non-breaking space in the original — verify.
    normalized = [piece.replace(' ', ' ') for piece in cleaned]
    return normalized[1]
爬取数据
# Fetch the shop-listing pages.
# Raw HTML of each fetched listing page.
res_text = []
# Number of listing pages to scrape.
page = 3
for page_index in range(page):
    # Page numbers in the URL are 1-based.
    current = str(page_index + 1)
    url = 'https://www.dianping.com/shanghai/ch10/g117p' + current
    # User_Agent and Cookie are placeholders — substitute your own values.
    headers = {
        'User-Agent': User_Agent,
        'Cookie': Cookie
    }
    res = requests.get(url, headers=headers)
    # On a non-200 response, wait 5 s and retry until it succeeds.
    while res.status_code != 200:
        print(' ', end='\r')
        print('请等一下', end='\r')
        time.sleep(5)
        res = requests.get(url, headers=headers)
    # Keep the page HTML for parsing later.
    res_text.append(res.text)
    # Be polite: pause 5 s between pages.
    time.sleep(5)
    # Progress indicator.
    print(current + '/' + str(page) + ' 页爬取完成', end='\r')
# 3/3 pages fetched
# Full scrape: per-shop details for every listing page fetched above.
# Accumulates everything into df_final.
df_final = pd.DataFrame()
for i in range(len(res_text)):
    # Per-page accumulators.
    score_list = []
    score_list_flavor = []
    score_list_env = []
    score_list_service = []
    address_list = []
    tel_list = []
    df = pd.DataFrame()
    df_score = pd.DataFrame()
    # ['商家名称', '商家链接', '标签', '位置', '评论数量', '人均消费']
    # Each parser is called once (the original called get_name_url/get_tag_loc
    # twice per page for the same results).
    name_temp, url_temp = get_name_url(res_text[i])  # shop name + link
    tag_temp, loc_temp = get_tag_loc(res_text[i])    # tag + location
    comment_temp = get_comment(res_text[i])          # review count
    per_temp = get_per(res_text[i])                  # average spend
    df_columns = ['商家名称', '商家链接', '标签', '位置', '评论数量', '人均消费']
    df_temp = pd.DataFrame([name_temp, url_temp, tag_temp, loc_temp, comment_temp, per_temp], index=df_columns).T
    df = pd.concat([df, df_temp], ignore_index=True)
    # ['评分', '口味', '环境', '服务', '地址', '电话']
    title = name_temp
    # shopid is the trailing path segment of each shop URL.
    shopid = [v.split('/')[-1] for v in url_temp]
    for v in shopid:
        # Ratings come from the reviewAndStar ajax endpoint.
        url = 'https://www.dianping.com/ajax/json/shopDynamic/reviewAndStar'
        # token and uuid are placeholders — copy them from the matching
        # request in the browser dev tools (F12).
        params = {
            'shopId': v,
            'cityId': '1',
            'mainCategoryId': '34245',
            '_token': token,
            'uuid': uuid,
            'platform': '1',
            'partner': '150',
            'optimusCode': '10',
            'originUrl': 'https://www.dianping.com/shop/' + v
        }
        # User_Agent and Cookie are placeholders — substitute your own values.
        headers = {
            'User-Agent': User_Agent,
            'Cookie': Cookie
        }
        res = requests.get(url, headers=headers, params=params)
        # On a non-200 response, wait 5 s and retry.
        while res.status_code != 200:
            print('休息一下。')
            time.sleep(5)
            res = requests.get(url, headers=headers, params=params)
        res_text_score = res.text
        # Overall rating.
        score = get_score(res_text_score)
        # Taste / environment / service scores — direct calls replace the
        # original exec() string-building: same results, no dynamic code.
        score_flavor = get_score_flavor(res_text_score)
        score_env = get_score_env(res_text_score)
        score_service = get_score_service(res_text_score)
        score_list.append(score)
        score_list_flavor.append(score_flavor)
        score_list_env.append(score_env)
        score_list_service.append(score_service)
        # ['地址', '电话'] come from the shop detail page.
        url = 'https://www.dianping.com/shop/' + v
        headers = {
            'User-Agent': User_Agent,
            'Cookie': Cookie
        }
        res = requests.get(url, headers=headers)
        address = get_address(res.text)
        tel = get_tel(res.text)
        address_list.append(address)
        tel_list.append(tel)
        # Be polite: pause 5 s between shops.
        time.sleep(5)
        # Progress indicator.
        print(str(i+1)+'/'+str(len(res_text))+' 页 '+str(shopid.index(v)+1)+'/'+str(len(shopid))+' 爬取完成', end='\r')
    # Assemble the per-shop detail columns for this page.
    df_score_columns = ['评分', '口味', '环境', '服务', '地址', '电话']
    df_score_temp = pd.DataFrame([score_list, score_list_flavor, score_list_env, score_list_service, address_list, tel_list], index=df_score_columns).T
    df_score = pd.concat([df_score, df_score_temp], ignore_index=True)
    # Join listing columns with detail columns, then append to the final frame.
    df_final_temp = pd.concat([df, df_score], axis=1)
    df_final = pd.concat([df_final, df_final_temp], ignore_index=True)
    time.sleep(5)
    print(' ', end='\r')
    print(str(i+1)+'/'+str(len(res_text))+' 页爬取完成', end='\r')
# 3/3 pages scraped
导出数据
# Export the scraped data to an Excel workbook.
output_path = './大众点评爬取.xlsx'
df_final.to_excel(output_path, index=False)