A Failed Dianping (大众点评) Scraper

For learning purposes only. Don't use it for anything else.

A few days ago I woke up to a message like this in a group chat:

Doesn't seem that hard, right? Just get it done.

I opened the site and captured the login request (I can't write a simulated login, I'm just that bad), and found the shop names are easy enough to grab.

But the phone number only shows up after clicking through to the individual shop page.

So the idea is clear: first scrape a table of shops and their links, then visit each shop link to grab its phone number (roughly the skeleton sketched below).
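
The function names and bodies in this sketch are just placeholders for the real code further down, and in practice the captured cookies and headers shown later are needed for the requests to go through:

import requests
from bs4 import BeautifulSoup

def fetch_shops(page_url):
    # Step 1: one search-result page -> a list of (shop name, shop link) pairs.
    soup = BeautifulSoup(requests.get(page_url).text, 'lxml')
    shops = []
    for li in soup.find(id='shop-all-list').find_all('li'):
        img, a = li.find('img'), li.find('a')
        if img and a:
            shops.append((img.get('alt'), a.get('href')))
    return shops

def fetch_phone(shop_url):
    # Step 2: open the shop page and read the phone number.
    # (This is where the font obfuscation gets in the way, see below.)
    ...

for name, link in fetch_shops('http://www.dianping.com/search/keyword/23/0_%E5%89%A7%E6%9C%AC%E6%9D%80/p1'):
    print(name, link, fetch_phone(link))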

But I was still careless.

The phone number clearly has anti-scraping applied.

Heh. Now I'm stuck.

A quick Baidu search says it's CSS/font obfuscation. Fine, I followed along with 爬虫实例—爬取大众点评信息_Migrant workers-CSDN博客.
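
The gist of the font trick: the page renders the digits with a custom woff font, and the HTML only carries hex code points, so the woff has to be downloaded and its glyphs matched to digits. A quick way to poke at a downloaded woff with the same fontTools library used below (the filename here is just an example):

from fontTools.ttLib import TTFont

# Dump the glyph names of a downloaded woff to see what the digits map to.
# 'num.woff' is just an example filename; the real file comes from the page's CSS.
font = TTFont('num.woff')
print(font.getGlyphOrder())
# The decoding code below skips the first two entries (they aren't digits)
# and strips the 'uni' prefix from the remaining glyph names.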

Nothing gets written to a database (because I don't know how); everything is just printed (a minimal SQLite sketch follows below, for reference).
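
For what it's worth, swapping the print for a SQLite insert would only take a few lines. A minimal sketch, with made-up database, table, and column names:

import sqlite3

# Minimal sketch: store (name, link, phone) rows instead of printing them.
conn = sqlite3.connect('dianping.db')
conn.execute('CREATE TABLE IF NOT EXISTS shops (name TEXT, url TEXT, phone TEXT)')

def save_shop(name, url, phone):
    conn.execute('INSERT INTO shops VALUES (?, ?, ?)', (name, url, phone))
    conn.commit()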

I then poked at this on and off for two days. Damn, I'm terrible at this (the cookies and headers were copied straight from a captured request).

# Author: juju
# Date: 2021/9/17 15:12
import requests
from bs4 import BeautifulSoup
from d import get_phone_num  # get_phone_num is defined in the second script (d.py) below

def get_i(i):
    # Scrape page i of the search results and print each shop's name, link, and phone number.
    cookies = {
        'fspop': 'test',
        'cy': '23',
        'cye': 'haikou',
        '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
        '_lxsdk_cuid': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        '_lxsdk_s': '17bf28ccf06-635-7d1-3c5%7C%7C116',
        '_lxsdk': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1631861920',
        'Hm_lpvt_602b80cf8079ae6591966cc70a3940e7': '1631862378',
        '_hc.v': '607f25f8-ad7c-38b7-4a73-79eca8f1af81.1631861921',
        's_ViewType': '10',
        '_dp.ac.v': '54123a93-bcb5-4efe-bc74-d7dfb6d2e261',
        'thirdtoken': '880720a3-5dd2-4ed2-b111-4cb3f1b35c8f',
        '_thirdu.c': '8f9673ecb6879d3d8c297364f1afe74d',
        'dplet': '4b0a0fecb04ab08f834e3afc03343b98',
        'dper': 'ca0efb15c2f15e7bbd955326e9340a7212e533b24c7f96f9729e8d7c3ad96af72dd5f47a846fdb0d65411f810dfb3306e400aefa2d21043b7b416480df5bf3dd0db110c97897e340595b05a36061591c0257998c8da9dd0003fac97840efd5ce',
        'll': '7fd06e815b796be3df069dec7836c3df',
        'ua': '%E5%BF%A0%E5%8E%9A%E8%80%81%E5%AE%9E%E7%9A%84%E7%AB%A5%E5%93%A5',
        'ctu': '9689bea245262631f40d14a2e74522f9549075c19e859184c077cdae34473da0',
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # Search-result page i for the keyword "剧本杀" (URL-encoded in the path); 23 is the city id from the cookies (Haikou)
    url_shop_list = 'http://www.dianping.com/search/keyword/23/0_%E5%89%A7%E6%9C%AC%E6%9D%80/p' + str(i)
    response = requests.get(url_shop_list, headers=headers, cookies=cookies)
    response_r = BeautifulSoup(response.text, 'lxml')

    # Grab the container that holds the shop list
    res = response_r.find(id="shop-all-list")
    # Walk through each shop entry
    list_shop = res.find_all_next('li')
    for shop in list_shop:
        # Shop name comes from the alt text of the thumbnail
        shop_img = shop.find('img')
        # Shop link comes from the first <a> tag
        shop_a = shop.find('a')
        # Print the shop name, link, and phone number
        shop_url = shop_a.get_attribute_list('href')[0]
        print('Shop: %s; Link: %s; Phone: %s' % (shop_img.get_attribute_list('alt')[0], shop_url, get_phone_num(url_shop_list, shop_url)))
    # font_css = response_r.findAll('link', rel="stylesheet")

if __name__ == '__main__':
    # Only pages 2 and 3 of the search results are scraped here
    for i in range(2, 4):
        get_i(i)

# Author: juju
# Date: 2021/9/17 17:22
# This is d.py, imported above as `from d import get_phone_num`.
import requests
from bs4 import BeautifulSoup
from fontTools.ttLib import TTFont

def get_phone_num(refer, shop_url):
    # Fetch a shop page, download its anti-scraping woff font, build a glyph-to-digit
    # mapping, and return the phone number as a plain string.
    cookies = {
        'fspop': 'test',
        'cy': '23',
        'cye': 'haikou',
        '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
        '_lxsdk_cuid': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        '_lxsdk': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1631861920',
        'Hm_lpvt_602b80cf8079ae6591966cc70a3940e7': '1631870247',
        '_hc.v': '607f25f8-ad7c-38b7-4a73-79eca8f1af81.1631861921',
        's_ViewType': '10',
        '_dp.ac.v': '54123a93-bcb5-4efe-bc74-d7dfb6d2e261',
        'thirdtoken': '880720a3-5dd2-4ed2-b111-4cb3f1b35c8f',
        '_thirdu.c': '8f9673ecb6879d3d8c297364f1afe74d',
        'dplet': '4b0a0fecb04ab08f834e3afc03343b98',
        'dper': 'ca0efb15c2f15e7bbd955326e9340a7212e533b24c7f96f9729e8d7c3ad96af72dd5f47a846fdb0d65411f810dfb3306e400aefa2d21043b7b416480df5bf3dd0db110c97897e340595b05a36061591c0257998c8da9dd0003fac97840efd5ce',
        'll': '7fd06e815b796be3df069dec7836c3df',
        'ua': '%E5%BF%A0%E5%8E%9A%E8%80%81%E5%AE%9E%E7%9A%84%E7%AB%A5%E5%93%A5',
        'ctu': '9689bea245262631f40d14a2e74522f9549075c19e859184c077cdae34473da0',
        '_lxsdk_s': '17bf2f8e96d-b81-d29-f00%7C%7C126',
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Referer': refer,
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
    }

    response = requests.get(shop_url, headers=headers, cookies=cookies)
    # Strip the '&#x' entity prefix so the obfuscated hex codes survive parsing as plain text
    res = BeautifulSoup(response.text.replace('&#x', ''), 'lxml')
    # Find the stylesheet that loads the anti-scraping font (the href is protocol-relative, hence the [2:])
    font_css = res.findAll('link', rel="stylesheet")[1]['href'][2:]
    css_text = requests.get('http://' + font_css)
    # Pull the woff URL used for the phone-number (.num) digits out of that CSS
    font_url = css_text.text.split('.num')
    font_woff = font_url[0].split('url("//')[-1][:-5]
    # Download the woff file
    headers_woff = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    url_woff = 'http://' + font_woff
    res_woff = requests.get(url_woff, headers=headers_woff)
    with open(r'C:\Users\juju\Desktop\ffff.woff', "wb+") as f:
        f.write(res_woff.content)

    # Build the glyph-name -> digit mapping
    font = TTFont(r'C:\Users\juju\Desktop\ffff.woff')
    # Skip the first two glyphs (not digits) and strip the 'uni' prefix from the rest;
    # the remaining glyph order lines up with the digits 1-9 followed by 0
    glyph_to_digit = {}
    num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    glyph_names = font.getGlyphOrder()[2:]
    for key, value in enumerate(num):
        a = glyph_names[key][3:] + ';'
        glyph_to_digit[a] = value

    # Re-parse the page with the '&#x' prefix stripped, so each obfuscated digit
    # appears as a plain 'xxxx;' hex code that can be looked up in the mapping
    res_replace = BeautifulSoup(response.text.replace('&#x', ''), 'lxml')

    # Grab the tag that holds the phone number
    tel = res_replace.find('p', class_='expand-info tel').contents[3:]
    # Phone numbers generally start with 1 anyway
    phone_num = '1'
    for i in tel:
        if i.string[-1] == ';':
            # Obfuscated digit: look it up in the glyph mapping
            phone_num = phone_num + glyph_to_digit[i.string]
        else:
            # Plain-text digits: keep as-is
            phone_num = phone_num + i.string
    # Trim the trailing character
    return phone_num[:-1]
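
To make the decoding step easier to follow, here is the same replace-and-map idea run on a made-up snippet. The 'expand-info tel' class and the '&#x' trick match the code above, but the inner markup, glyph codes, and mapping are invented for illustration; the real values come from the woff handling:

from bs4 import BeautifulSoup

# Toy version of the decode loop: obfuscated digits sit in &#x....; entities,
# plain digits sit in literal text. Every value below is invented.
html = '<p class="expand-info tel">电话:<d class="num">&#xf0a1;</d><d class="num">&#xf0b2;</d>38<d class="num">&#xf0c3;</d></p>'
glyph_to_digit = {'f0a1;': '3', 'f0b2;': '5', 'f0c3;': '9'}

# Stripping '&#x' turns the entities into plain 'f0a1;'-style text nodes.
soup = BeautifulSoup(html.replace('&#x', ''), 'lxml')
pieces = soup.find('p', class_='expand-info tel').contents[1:]  # skip the label
phone = ''.join(glyph_to_digit[p.string] if p.string.endswith(';') else p.string for p in pieces)
print(phone)  # -> 35389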

I still haven't figured out that warning, though. Any experts willing to show me the way?

Scraping really is a lot of fun.