Web Scraping Dianping: Collecting Restaurant Review Tags and Recommended Dishes into a Local Excel Spreadsheet

1. Dianping's anti-scraping measures are strict (for example, a valid Cookie must be included in the request header). My IP was banned during testing and I had to switch to a new one to continue, so I later built a small crash-protection mechanism into the scraper (see the sketch after this list).

2. Do not scrape too fast; the number of requests allowed from a single IP is limited.

3. Many of the scraping methods posted online have stopped working or never worked well; it took a lot of time to get this running.
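A minimal sketch of the crash-protection and throttling idea from points 1 and 2: retry a failed fetch a few times, sleeping a random interval between attempts so requests from one IP stay spread out. The decorator name and delay values are my own illustration, not code from the scripts below.

import time
import random
from functools import wraps

def polite_retry(max_retries=3, base_delay=2.0):
    """Retry a scraping step, pausing a random interval between attempts."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print('Attempt', attempt, 'failed:', e)
                    time.sleep(base_delay + random.random() * 2)
            return None  # give up after max_retries so the run keeps going
        return wrapper
    return decorator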

 

First, scrape each restaurant's basic information (name, link, star rating, price, address, and so on) and write it to a CSV file:

import csv
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC


# Disable image and JavaScript loading up front, and run headless.
# (The original code set these prefs on a second options object that
# was never passed to the driver; they are merged here.)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_experimental_option('prefs', {
    'profile.default_content_setting_values': {
        'images': 2,
        'javascript': 2,
    }
})
driver = webdriver.Chrome(options=chrome_options)

wait = WebDriverWait(driver, 10)

def insert_csv(output_list):
    # Append rows; the file stays usable even if a later page fails
    with open('restaurant_list.csv', 'a+', newline='', encoding='UTF-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerows(output_list)

def page_search(i, output_list):
    try:
        print('Page ' + str(i))
        driver.get('http://www.dianping.com/shanghai/ch10/p' + str(i))
        driver.implicitly_wait(6)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return outputOneCommentResult(i, soup, output_list)
    except Exception as e:
        print('Error:', e)
        time.sleep(random.randint(2, 3) + random.random())
        return page_search(i, output_list)  # retry the same page

def safe(getter):
    # Return '' when a field is missing so one incomplete shop card
    # does not abort the whole page
    try:
        return getter()
    except Exception:
        return ''

def outputOneCommentResult(page_id, soup, output_list):
    try:
        wait.until(EC.presence_of_element_located((By.ID, 'shop-all-list')))
        for item in soup.find(id='shop-all-list').ul.find_all('li'):
            title = safe(lambda: item.find(class_='tit').a.text.strip())
            link = safe(lambda: item.find(class_='tit').a['href'] + '/review_more')
            star = safe(lambda: item.find(class_='comment').span['title'])
            comment_link = safe(lambda: item.find(class_='review-num')['href'])
            comment = safe(lambda: item.find(class_='review-num').b.text.strip())
            price = safe(lambda: item.find(class_='mean-price').b.text.strip())
            tag = safe(lambda: item.find(class_='tag').text.strip())
            addr = safe(lambda: item.find(class_='addr').text.strip())
            # Per-shop review tags are collected by the second script;
            # keep the column as an empty placeholder here
            commentlist = ''

            if title != '':
                output_list.append([str(page_id), title, link, star, comment_link,
                                    comment, price, tag, addr, commentlist])

        return output_list
    except TimeoutException as e:
        print('Error:', e)
        return outputOneCommentResult(page_id, soup, output_list)


if __name__ == '__main__':
    print('Scraping restaurant listings from Dianping:')
    for i in range(1, 3):
        output_list = page_search(i, [])
        if output_list:
            insert_csv(output_list)
        time.sleep(random.randint(2, 3) + random.random())

    driver.quit()

The data collected from the two pages (screenshot omitted):

Next, read each shop's link from restaurant_list.csv, then open and parse its review page to collect the shop's details:

import csv
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

# Disable image and JavaScript loading up front, and run headless
# (prefs merged onto the one options object actually passed to the driver)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_experimental_option('prefs', {
    'profile.default_content_setting_values': {
        'images': 2,
        'javascript': 2,
    }
})
driver = webdriver.Chrome(options=chrome_options)
# driver = webdriver.Chrome()  # non-headless variant, useful for debugging
driver.implicitly_wait(6)
wait = WebDriverWait(driver, 10)

def get_csv():
    # Each restaurant_list.csv row: page, title, link, star, ...
    with open('restaurant_list.csv', encoding='UTF-8') as f:
        return [[row[1], row[2], row[3]] for row in csv.reader(f)]

def insert_csv(output_list):
    with open('restaurant_detail.csv', 'a+', newline='', encoding='UTF-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerows(output_list)

def dishes_detail(eachone, dishes_list):
    try:
        dishes_list.append(eachone.text)
    except Exception as e:
        print('Dishes Error:', e)
    return dishes_list  # always return the list so callers can keep appending

def comment_detail(eachone, comment_list):
    try:
        comment_list.append(eachone.text.strip().replace(' ', '').replace('\n', ''))
    except Exception as e:
        print('Comment Error:', e)
    return comment_list

def link_detail(eachone, output_list):
    name = eachone[0]
    star = eachone[2]  # renamed from `str`, which shadowed the builtin
    print('Scraping:', name, star)

    # Open the shop's full review page
    link = eachone[1] + '/review_all'
    driver.get(link)

    # Wait for the review-tag block and the recommended-dish block
    # to load before parsing the page source
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#review-list > div.review-list-container > div.review-list-main > div.reviews-wrapper > div.reviews-tags > div.content')))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#review-list > div.review-list-container > div.review-list-aside > div.shop-dish > div.dish-list.clearfix')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Collect the highlighted review tags
    comment_list = []
    for ea in soup.select('.tag'):
        comment_list = comment_detail(ea, comment_list)
    comment_list = '|'.join(comment_list)

    # Collect the recommended dishes
    dishes_list = []
    for ea in soup.select('.dish-list .dish-name'):
        dishes_list = dishes_detail(ea, dishes_list)
    dishes_list = '|'.join(dishes_list)

    output_list.append([name, eachone[1], dishes_list, comment_list])
    return output_list

if __name__ == '__main__':
    link_list = get_csv()
    print('Scraping popular Dianping restaurants:')
    for eachone in link_list:
        try:
            output_list = link_detail(eachone, [])
            insert_csv(output_list)
        except Exception as e:
            # Skip shops whose review page fails to load
            print('Error:', e)
        time.sleep(random.randint(2, 3) + random.random())
    driver.quit()

Finally, convert the CSV files to XLSX format:
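One way to do the conversion is with pandas; this sketch is mine, not from the original post. It assumes pandas and openpyxl are installed, and the column labels are names I made up to match the fields the two scripts write.

import pandas as pd

# Assumed column layouts, in the order the scripts above write them
list_columns = ['page', 'title', 'link', 'star', 'comment_link',
                'comment_count', 'mean_price', 'tag', 'addr', 'comment_tags']
detail_columns = ['title', 'link', 'recommended_dishes', 'review_tags']

for csv_name, xlsx_name, columns in [
        ('restaurant_list.csv', 'restaurant_list.xlsx', list_columns),
        ('restaurant_detail.csv', 'restaurant_detail.xlsx', detail_columns)]:
    df = pd.read_csv(csv_name, header=None, names=columns)
    df.to_excel(xlsx_name, index=False)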


Scraping complete.
