python关于大众点评详情页核心数据抓取

大众点评页面的手机号等信息并不怕你看,也没有做中间号,而是做了前端 SVG 混淆。破解它有两个方法:

1,截图,用OCR识别

2,代码判断(本篇使用)

直接上代码

"""
author:yaoye
date : 2019-03-20

"""

import requests
from random import choice
from bs4 import BeautifulSoup
from lxml import etree
import re
import numpy as np

class Dazhongdp(object):
    """Scrape a Dianping shop detail page and decode its obfuscated phone number.

    Dianping hides digits on the front end: some digits are replaced by
    ``<d class="..."/>`` tags whose CSS class maps (via a linked stylesheet)
    to x/y background offsets into a glyph SVG.  Decoding looks up each
    class in the CSS, converts the offsets to coordinates, and reads the
    matching character out of the SVG's rows of digits.
    """

    def __init__(self):
        # Pool of desktop user agents; one is picked at random per instance
        # to make the requests look less uniform.
        ua = [
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)']
        self.headers = {}
        self.headers['User-Agent'] = choice(ua)
        self.headers['Connection'] = 'keep-alive'
        self.headers['Host'] = 'www.dianping.com'
        # Cache of parsed glyph SVGs keyed by URL so get_svg() does not
        # re-download the same asset once per decoded digit.
        self._svg_cache = {}

    def get_url(self, shop_id='3500059'):
        """Return the detail-page URL for *shop_id* (defaults to the demo shop)."""
        return 'http://www.dianping.com/shop/' + shop_id

    def parse_url(self, url):
        """Fetch the shop page at *url* and decode its phone number(s).

        Returns the list produced by :meth:`get_phone`.
        """
        print('@@@')
        req = requests.get(url, headers=self.headers)
        html = req.text

        soup = BeautifulSoup(html, 'lxml')

        # name = soup.select('h1.shop-name')[0].get_text().strip()
        # print(name)

        phone = self.get_phone(html, url)
        return phone

    def get_phone(self, html, url):
        """Extract the obfuscated phone markup from *html* and decode it.

        Prints each step for debugging and returns a list with one decoded
        phone-number string per number found on the page.
        """
        r = etree.HTML(html)
        item = r.xpath('//p[@class="expand-info tel"]')[0]
        item = etree.tostring(item).decode('utf-8')
        # Strip the "info-name" label span.  NOTE: the original code passed
        # re.S as re.sub's positional *count* argument; it must be flags=.
        items = [re.sub(r'<span class="info-name">[\s\S]*</span>', '', item,
                        flags=re.S)]
        # Multiple phone numbers are separated by &#160; (non-breaking space).
        if re.findall('&#160;', item):
            items = items[0].split('&#160;')
        print(items)
        code = self.get_css_code(url)
        decoded = []
        for item in items:
            # Each token is either an obfuscated <d class="..."/> glyph
            # (group 1) or a run of plain digits (group 2).
            item_list = re.findall(r'<d class="([\s\S]*?)"/>|([\d]+)', item, re.S)
            print(item_list)
            rel_list = []
            for i in item_list:
                # Keep whichever alternation group matched.
                if i[0]:
                    rel_list.append(i[0])
                else:
                    rel_list.append(i[1])
            print(rel_list)
            matchlist = self.match_code(rel_list, code)
            number = ''.join(matchlist)
            print(number)
            decoded.append(number)
        return decoded

    def match_code(self, rel_list, code):
        """Map each token in *rel_list* to its digit.

        Tokens longer than one character are CSS class names that must be
        resolved through the stylesheet *code*; single characters are
        already plain digits and pass through unchanged.
        """
        matchlist = []
        for i in rel_list:
            if len(i) > 1:
                matchlist.append(self.css_xy(i, code))
            else:
                matchlist.append(i)
        return matchlist

    def get_css(self, url):
        """Return the absolute URL of the page's obfuscation stylesheet.

        Picks the second <link rel="stylesheet"> on the page — presumably
        the svgtextcss asset; TODO confirm the index against the live page.
        """
        response = requests.get(url, headers=self.headers).text
        html = etree.HTML(response)

        r = html.xpath('//link[@rel="stylesheet"]/@href')
        print(r)
        return 'http:' + r[1]

    def get_css_code(self, url):
        """Download and return the stylesheet text for the page at *url*."""
        css_url = self.get_css(url)
        # The stylesheet is hosted on s3plus.meituan.net, so it needs its
        # own Host header rather than self.headers.
        css_header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9',
                      'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 's3plus.meituan.net',
                      'If-Modified-Since': 'Wed, 06 Mar 2019 14', 'If-None-Match': '"4684b1c3dee8f4d172b2ac0c11e827a1"',
                      'Upgrade-Insecure-Requests': '1',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
        css = requests.get(css_url, headers=css_header)
        css.encoding = 'utf-8'
        return css.text

    def css_xy(self, i, code):
        """Resolve CSS class *i* to its digit via its background offsets."""
        svg_url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/f8350660159e938ca81d948ca9d0d555.svg'
        result = re.search(i + r'{background:([\s\S]*?)px ([\s\S]*?)px;}',
                           code, re.S).groups()
        # The CSS offsets are negative; the +6 / +30 shifts recenter onto
        # this particular SVG's glyph grid — TODO confirm against current
        # assets, the site rotates them.
        x = abs(int(float(result[0]))) + 6
        y = abs(int(float(result[1]))) + 30
        print(i, result, x, y)
        return self.get_svg(svg_url, x, y)

    def get_svg(self, svg_url, x_, y):
        """Return the glyph character at pixel position (*x_*, *y*) in the SVG.

        The SVG holds rows of digits as <text> elements; pick the row whose
        y attribute is nearest the requested y, then map that row's x
        positions onto its characters.
        """
        if svg_url not in self._svg_cache:
            response = requests.get(svg_url)
            response.encoding = 'utf-8'
            self._svg_cache[svg_url] = etree.HTML(response.content)
        html = self._svg_cache[svg_url]

        y_list = np.array(html.xpath('//text/@y')).astype(np.int64)
        print(y_list)
        # Nearest row by vertical distance.
        y_ = y_list[np.abs(y_list - y).argsort()[0]]
        x = html.xpath('//text[@y="{y}"]/@x'.format(y=y_))
        num = html.xpath('//text[@y="{y}"]/text()'.format(y=y_))

        dict_x = dict(zip(x[0].split(), list(num[0])))
        return dict_x[str(x_)]

    def parse_item(self):
        """Entry point: build the demo URL and scrape its phone number."""
        url = self.get_url()

        return self.parse_url(url)
        # self.get_css_code(url)


if __name__ == '__main__':
    # Run the demo scrape only when executed as a script, not on import.
    A = Dazhongdp()
    A.parse_item()

 

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值