Scraping Dianping (大众点评) Data Protected by CSS Anti-Scraping (Part 2)

This post gives the working scraper code for Dianping; all of the underlying principles were explained in detail in Part 1. What I did not expect is that Dianping not only uses CSS/font anti-scraping, it is also brutal about IP rate limiting, so I had no choice but to spend 10 yuan on a day's worth of proxy IPs.

 

Just how nasty is Dianping's anti-scraping?

1. Requests are rate-limited by IP

Workaround: pay for a proxy service and pull fresh IPs from its API

2. The browser is fingerprinted; request too often and a CAPTCHA appears

Workaround: collect a few dozen User-Agent strings from the web and rotate them randomly

3. Cookies are tracked as well; once a threshold is reached, scraping is cut off automatically (this one is truly nasty)

Workaround: asked a few friends to log in and rotated their Cookies randomly (a minimal rotation sketch follows this list)
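Countermeasures 2 and 3 both come down to rotating request headers. Here is a minimal sketch of what per-request rotation looks like; the pool contents are placeholders (in the full script below, random.choice runs only once, when the headers dict is built):

import random

# Placeholder pools -- in practice the User-Agent list from the script below and a
# handful of Cookies collected from logged-in friends' browsers would go here.
USER_AGENTS = ['UA-1', 'UA-2', 'UA-3']
COOKIES = ['cookie-from-friend-1', 'cookie-from-friend-2']

def build_headers():
    """Pick a fresh User-Agent / Cookie pair for every single request."""
    return {
        'User-Agent': random.choice(USER_AGENTS),
        'Cookie': random.choice(COOKIES),
    }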

Come on, I just need some data for my thesis, there is no need to be this fierce. After writing this many scrapers, this was the first time I was nearly beaten to tears; there is always more to learn. The full code follows (it is long), including the call to the proxy-IP API. If you need the data, grab it soon: this annoying site updates its anti-scraping every few days!
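Because even paid proxies drop connections now and then, it also helps to wrap each GET in a small retry loop that picks a new random proxy on every attempt. The helper below is only a sketch (the fetch name, timeout, and retry count are my own choices); the script keeps the same idea as a commented-out "while True" block:

import random
import time
import requests

def fetch(url, headers, proxies, retries=5):
    """Retry a GET with a fresh random proxy on each failure."""
    requests.packages.urllib3.disable_warnings()
    for _ in range(retries):
        try:
            return requests.get(url, headers=headers, verify=False,
                                proxies=random.choice(proxies), timeout=10).text
        except requests.RequestException:
            time.sleep(1)
    raise RuntimeError('all retries failed for ' + url)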


from fontTools.ttLib import TTFont
from lxml import etree
import requests
import random
import time
import json
import csv
import os
import re

os.chdir("E:\\pythonwork\\大众点评")

user_agent = ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
              'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
              'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
              'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)']

# Note: random.choice runs only once here, at import time, so every request in this
# run reuses the same User-Agent / Cookie pair (cf. the per-request rotation sketch above).
headers = {
    'User-Agent': random.choice(user_agent),
    'Cookie':'_lxsdk_cuid=16fc3c57f39c8-0f56d131537ae6-b383f66-100200-16fc3c57f39c8; _lxsdk=16fc3c57f39c8-0f56d131537ae6-b383f66-100200-16fc3c57f39c8; _hc.v=14e8f7af-d069-6e77-9518-a77bcaec53c5.1579537498; s_ViewType=10; ctu=572330a0ea41badf8bf669a97c491bed041c6679a38dfa80b8fd11e69ffa9cab; aburl=1; __utma=205923334.2138749831.1584008233.1584008233.1584008233.1; __utmz=205923334.1584008233.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); looyu_id=c268fc6888d234def2c92f9c5710805232_51868%3A1; cy=224; cye=nanning; _dp.ac.v=c108f647-d852-413d-b71f-293d0ca87f9b; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; dplet=095342bde535b6cb0942b2ebee7d8242; dper=db7e2db9c4b21702e103cf5ed1b3b7fa79c223e47a8e89c36dbce62e8c28bfa6af42c434610d94e8f73cc176b4dd5ff4a322d1bb5739e5b08c3bdbb6130fa8fa77b3bdd99d8013e02c5bbf68393a0c406a9af94f0f6f410bf4c4766fd2350034; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_4536684833; _lxsdk_s=170d36cde80-71f-c35-517%7C%7C97'
    }

def get_font():
    """
    解密数字
    :return:
    """
    font = TTFont('数字.woff')
    font_names = font.getGlyphOrder()
    # 这些文字就是在FontEditor软件打开字体文件后看到的文字名字
    texts = ['','','1','2','3','4','5','6','7','8','9','0']
    font_name = {}
    # 将字体名字和它们所对应的乱码构成一个字典
    for index,value in enumerate(texts):
        a = font_names[index].replace('uni', '&#x').lower() + ";"
        font_name[a] = value
    return font_name

def get_comment_count(html):
    """
    解析出:好中差评数
    :param html:  大页的源代码

    """
    global proxies

    sole_urls = html.xpath("//div[@class='tit']/a[@data-hippo-type='shop']/@href")
    good = []
    middle = []
    bad = []
    for i in sole_urls:
        sole_url = i + '/review_all'   # the shop's full-review page carries the good/medium/bad counts
        requests.packages.urllib3.disable_warnings()
        response = requests.get(sole_url, headers=headers,verify=False, proxies=random.choice(proxies)).text

        # while True:
        #     try:
        #         response = requests.get(sole_url,headers=headers,proxies=random.choice(proxies)).text
        #         break
        #     except:
        #         time.sleep(1)
        html_review = etree.HTML(response)
        # good / medium / bad review counts
        try:
            good_count = int(html_review.xpath("//label[@class='filter-item filter-good']/span/text()")[0].split('(')[1].split(')')[0])
        except:
            good_count = 0
        try:
            middle_count = int(html_review.xpath("//label[@class='filter-item filter-middle']/span/text()")[0].split('(')[1].split(')')[0])
        except:
            middle_count = 0
        try:
            bad_count = int(html_review.xpath("//label[@class='filter-item filter-bad']/span/text()")[0].split('(')[1].split(')')[0])
        except:
            bad_count = 0
        good.append(good_count)
        middle.append(middle_count)
        bad.append(bad_count)

        time.sleep(4)
    # If every one of the 15 shops came back with 0 good reviews, the IP/Cookie has
    # probably been blocked -- print a loud warning.
    zero = 0
    for i in good:
        if i == 0:
            zero += 1
    if zero == 15:
        print("===================== Warning! ======================")

    return good,middle,bad


def get_detail(big_url):
    """
    解析出:店名、星级、人均价格、口味环境服务评分
    :param big_url:  一页的url

    """
    global proxies

    business_detail = {}
    business = []

    requests.packages.urllib3.disable_warnings()
    response = requests.get(big_url, headers=headers,verify=False, proxies=random.choice(proxies)).text

    # while True:
    #     try:
    #         response = requests.get(big_url,headers=headers,proxies=random.choice(proxies)).text
    #         break
    #     except:
    #         time.sleep(1)

    num = get_font()      # load the font mapping
    # Replace every obfuscated digit entity in the page source with the real digit
    for key in num:
        if key in response:
            response = response.replace(key, str(num[key]))
    html = etree.HTML(response)
    good_middle_bad = get_comment_count(html)

    good_count = good_middle_bad[0]   # each of these is a list, one entry per shop
    middle_count = good_middle_bad[1]
    bad_count = good_middle_bad[2]

    title = []   # 1. shop name
    titles = html.xpath("//div[@class='tit']//h4/text()")
    for t in titles:
        title.append(t)

    star = []    # 2. star rating
    stars = html.xpath("//div[@class='nebula_star']//text()")
    for i in stars:
        linshi = re.findall(r'\b\d+\b', i)
        if len(linshi) >= 2:                           # e.g. ['4', '5'] -> '4.5'
            star.append(linshi[0] + '.' + linshi[1])

    taste_rating = []         # 3. taste score
    environment_rating = []   # 4. environment score
    service_rating = []       # 5. service score

    comment_rating = html.xpath("//span[@class='comment-list']//text()")
    string = ''
    for i in comment_rating:
        string += i
    string = string.strip().replace(' ','').replace('\n',';')
    string_list = string.replace(';;',';').split(';')

    for i in string_list:          # items look like '口味X.X' / '环境X.X' / '服务X.X'
        if '口味' in i:
            taste_rating.append(i.split('味')[1])
        elif '环境' in i:
            environment_rating.append(i.split('境')[1])
        else :
            service_rating.append(i.split('务')[1])
    # 6. average price per person
    avg_string = ''
    avg = []
    rj = html.xpath("//a[@data-click-name='shop_avgprice_click']//text()")
    for i in rj:
        avg_string += i.replace('\n','').replace(' ','')
    avg_list = avg_string.split('人均')
    del avg_list[0]

    for j in avg_list:
        if j == '-':
            avg.append('无人均价格')   # the shop lists no average price
        else:
            avg.append(j.split('¥')[1])
            
    for q in range(15):            # Dianping list pages show 15 shops per page
        try:
            business_detail['title'] = title[q]
            business_detail['star'] = star[q]
            business_detail['avg'] = avg[q]
            business_detail['taste_rating'] = taste_rating[q]
            business_detail['environment_rating'] = environment_rating[q]
            business_detail['service_rating'] = service_rating[q]
            business_detail['good_count'] = good_count[q]
            business_detail['middle_count'] = middle_count[q]
            business_detail['bad_count'] = bad_count[q]
        except:
            business_detail['title'] = '无'
            business_detail['star'] = '无'
            business_detail['avg'] = '无'
            business_detail['taste_rating'] = '无'
            business_detail['environment_rating'] = '无'
            business_detail['service_rating'] = '无'
            business_detail['good_count'] = '无'
            business_detail['middle_count'] = '无'
            business_detail['bad_count'] = '无'
        business.append(business_detail.copy())    # without .copy() every entry would point to the same dict

    for bus in business:
        save(bus)

def save(big_information):
    # Append one row per shop; note that no header row is ever written to the CSV.
    with open('BeiH.csv', 'a', newline='', encoding='utf-8-sig') as f:
        fieldnames = ['title','star','avg','taste_rating','environment_rating','service_rating','good_count','middle_count','bad_count']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerow(big_information)
        
def get_ip():
    # Pull a comma-separated list of proxy IPs from the paid proxy provider's API
    url = 'https://dev.kdlapi.com/api/getproxy/?orderid=918401564263990&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=%2C'
    ip_str = requests.get(url, headers=headers).text
    ultimate_ips = []
    dic = {}
    ips = ip_str.split(',')

    for i in ips:
        # Register the proxy for both schemes; the Dianping URLs above are plain http,
        # so an 'https'-only entry would otherwise never be used by requests.
        dic['http'] = i
        dic['https'] = i
        ultimate_ips.append(dic.copy())
    return ultimate_ips

if __name__ == '__main__':
    proxies = get_ip()             # module-level, so the `global proxies` declarations above can see it
    for i in range(1, 31):         # 30 list pages for Beihai (北海)
        big_url = 'http://www.dianping.com/beihai/ch10/p{}'.format(i)
        print(big_url)
        get_detail(big_url)
        time.sleep(1)
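One more note on get_font(): the hard-coded texts list assumes a fixed glyph order inside 数字.woff. Since Dianping swaps its font files every few days, it is worth re-checking the mapping before each crawl, for example by dumping the glyph order:

from fontTools.ttLib import TTFont

# Print index and glyph name so you can confirm the order still matches the
# hard-coded texts list in get_font() above.
font = TTFont('数字.woff')
for index, name in enumerate(font.getGlyphOrder()):
    print(index, name)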

 

 

For more original articles, follow my WeChat public account: DataUser

Just a data-analysis enthusiast~
