1. Dianping's anti-crawling measures are quite strict (for example, a valid Cookie must be sent in the request headers). My IP was banned while testing, and I had to switch to a new IP before I could continue, so I later added some small crash-protection measures to the crawler (a rough sketch of such a retry helper follows this list).
2. Do not crawl too fast; the number of requests allowed from a single IP is limited.
3. Many of the scraping approaches found online are outdated or no longer work; it took a considerable amount of time to get this crawler running.
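Below is a rough sketch of the kind of crash-protection / rate-limiting helper referred to above. It is not part of the original scripts; safe_get and its parameters are illustrative names, and the delays simply mirror the randomized sleeps used later in the code.

import random
import time

def safe_get(driver, url, max_retries=3):
    # Try to load a page a bounded number of times, sleeping a random,
    # slowly growing interval between attempts to stay under the per-IP limit.
    for attempt in range(1, max_retries + 1):
        try:
            driver.get(url)
            return True
        except Exception as e:
            print('Attempt', attempt, 'failed:', e)
            time.sleep(random.randint(2, 3) + random.random() + attempt)
    return False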
First, scrape each restaurant's basic information (name, link, star rating, price, address, etc.) and write it to a CSV file.
import csv
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
# Disable image and JavaScript loading up front, and run Chrome headless
chrome_options = Options()
prefs = {
    'profile.default_content_setting_values': {
        'images': 2,
        'javascript': 2
    }
}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)
wait = WebDriverWait(driver, 10)
def insert_csv(output_list):
    with open('restaurant_list.csv', 'a+', newline='', encoding='UTF-8') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerows(output_list)
def page_search(i, output_list):
    try:
        print('Page ' + str(i))
        driver.get('http://www.dianping.com/shanghai/ch10/p' + str(i))
        driver.implicitly_wait(6)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return outputOneCommentResult(i, soup, output_list)
    except Exception as e:
        print('Error:', e)
        time.sleep(random.randint(2, 3) + random.random())
        return page_search(i, output_list)
def outputOneCommentResult(page_id, soup, output_list):
    try:
        wait.until(EC.presence_of_element_located((By.ID, 'shop-all-list')))
        for item in soup.find(id='shop-all-list').ul:
            try:
                title = item.find(class_='tit').a.text.strip()
            except:
                title = ''
            try:
                link = item.find(class_='tit').a['href'] + '/review_more'
            except:
                link = ''
            try:
                star = item.find(class_='comment').span['title']
            except:
                star = ''
            try:
                comment_link = item.find(class_='review-num')['href']
            except:
                comment_link = ''
            try:
                comment = item.find(class_='review-num').b.text.strip()
            except:
                comment = ''
            try:
                price = item.find(class_='mean-price').b.text.strip()
            except:
                price = ''
            try:
                tag = item.find(class_='tag').text.strip()
            except:
                tag = ''
            try:
                addr = item.find(class_='addr').text.strip()
            except:
                addr = ''
            # The list page carries no review text, so this column stays empty here
            commentlist = ''
            if title != '':
                output_list.append([str(page_id), title, link, star, comment_link, comment, price, tag, addr, commentlist])
        # print(output_list)
        return output_list
    except TimeoutError as e:
        print('Error:', e)
        return outputOneCommentResult(page_id, soup, output_list)
if __name__ == '__main__':
    print('Scraping restaurant data from dianping.com:')
    for i in range(1, 3):
        output_list = []
        output_list = page_search(i, output_list)
        insert_csv(output_list)
        time.sleep(random.randint(2, 3) + random.random())
    driver.close()
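Note that the script above only appends data rows, so restaurant_list.csv ends up without a header row. If you want labelled columns, a one-off snippet like the following can write a header before the first run; the column names are just descriptive labels I chose to match the order used in outputOneCommentResult, not something the original script defines. If you do add a header, remember to skip the first row when the file is read back by the second script.

import csv

# Write a header row once, before the scraper starts appending data rows.
with open('restaurant_list.csv', 'w', newline='', encoding='UTF-8') as csvfile:
    writer = csv.writer(csvfile, dialect='excel')
    writer.writerow(['page_id', 'title', 'link', 'star', 'comment_link',
                     'comment_count', 'mean_price', 'tag', 'address', 'comment_list'])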
The data collected from the two pages:
Next, read each shop's link from restaurant_list.csv, then load and parse the corresponding page to collect the shop's detailed information.
import csv
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
# Disable image and JavaScript loading up front, and run Chrome headless
chrome_options = Options()
prefs = {
    'profile.default_content_setting_values': {
        'images': 2,
        'javascript': 2
    }
}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)
# driver = webdriver.Chrome()
driver.implicitly_wait(6)
wait = WebDriverWait(driver, 10)
link_list = []
def get_csv():
    with open('restaurant_list.csv', encoding='UTF-8') as f:
        csv_file = csv.reader(f)
        link_list = [[row[1], row[2], row[3]] for row in csv_file]
    return link_list
def insert_csv(output_list):
    with open('restaurant_detail.csv', 'a+', newline='', encoding='UTF-8') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerows(output_list)
def dishes_detail(eachone, dishes_list):
    try:
        dishes = eachone.text
        dishes_list.append(dishes)
        return dishes_list
    except Exception as e:
        print('Dishes Error:', e)

def comment_detail(eachone, comment_list):
    try:
        comment_tag = eachone.text.strip().replace(' ', '').replace('\n', '')
        comment_list.append(comment_tag)
        return comment_list
    except Exception as e:
        print('Comment Error:', e)
def link_detail(eachone, output_list):
    name = eachone[0]
    star = eachone[2]
    print('Scraping:', name, star)
    # Go to the review page; the link stored in the CSV already ends with
    # '/review_more', so strip it before appending '/review_all'
    link = eachone[1].replace('/review_more', '') + '/review_all'
    driver.get(link)
    # Wait for the review tags and the recommended-dish list to load
    # before parsing the page source
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#review-list > div.review-list-container > div.review-list-main > div.reviews-wrapper > div.reviews-tags > div.content')))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#review-list > div.review-list-container > div.review-list-aside > div.shop-dish > div.dish-list.clearfix')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Collect the highlighted review tags
    comment_list = []
    for ea in soup.select('.tag'):
        comment_list = comment_detail(ea, comment_list)
    comment_list = '|'.join(comment_list)
    # Collect the recommended dishes
    dishes_list = []
    for ea in soup.select('.dish-list .dish-name'):
        dishes_list = dishes_detail(ea, dishes_list)
    dishes_list = '|'.join(dishes_list)
    output_list.append([name, eachone[1], dishes_list, comment_list])
    return output_list
if __name__ == '__main__':
    link_list = get_csv()
    print('Scraping popular dianping.com restaurants:')
    for eachone in link_list:
        output_list = []
        output_list = link_detail(eachone, output_list)
        insert_csv(output_list)
        time.sleep(random.randint(2, 3) + random.random())
    driver.close()
Finally, convert the CSV output to xlsx format:
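The conversion code itself is not shown in the post; a minimal sketch using pandas (which needs the openpyxl package installed to write .xlsx files) could look like the following. The column names and the output file name restaurant_detail.xlsx are my own choices, not something the scripts above define.

import pandas as pd

# The scraper wrote no header row, so supply column names when reading.
df = pd.read_csv('restaurant_detail.csv', header=None,
                 names=['name', 'link', 'dishes', 'comment_tags'],
                 encoding='UTF-8')
# Write the data out as an Excel workbook (requires openpyxl).
df.to_excel('restaurant_detail.xlsx', index=False)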
Scraping complete.