Python爬虫实战——Svg映射型爬虫(大众点评)

一、svg爬虫简介

SVG 是用于描述二维矢量图形的一种图形格式。它基于 XML 描述图形,对图形进行放大或缩小操作都不会影响图形质量。矢量图形的这个特点使得它被广泛应用在 Web 网站中。

二、svg的具体表现

在这里插入图片描述
css文件
在这里插入图片描述
svg文件
在这里插入图片描述
在这里插入图片描述

三、举例详解

已知:
类名:vhkjj4
坐标:(-316px -141px)----取正整数则为(316,141)
在这里插入图片描述
在这里插入图片描述

四、爬取大众点评评论数据

①下载网页源代码

网站链接: http://www.dianping.com/shop/130096343/review_all
 def down_data(url, cookie):
    headers = {
        "Cookie": cookie,
        "Referer": "http://www.dianping.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    ret = requests.get(url, headers=headers).text
    with open('dazhong.html', 'w', encoding='utf-8') as f:
        f.write(ret)
 url = 'http://www.dianping.com/shop/130096343/review_all'
 cookie = ''
 down_data(url, cookie)

②下载网站css文件

# Pull the obfuscation stylesheet URL out of the page source; the mapping CSS
# is hosted on s3plus.meituan.net.  NOTE: `ret` and `headers` come from the
# download step in ① above.
css_url = re.findall('<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">', ret)
css_url = 'https:' + css_url[0]  # the href is protocol-relative, so prepend https:
css_response = requests.get(css_url, headers=headers).text
with open('dazhong.css', 'w', encoding='utf-8') as f:
    f.write(css_response)

③匹配所有对应css的映射文件svg

# Scan the CSS for every class-prefix rule that carries a background-image,
# i.e. one (class-prefix, svg-url) pair per glyph-atlas SVG.
# Fix: read the CSS with a context manager instead of a bare open()/close()
# pair, so the handle is released even if the regex scan raises.
with open('dazhong.css', encoding='utf-8') as css:
    svg_urls = re.findall(r'.*?\[class\^="(.*?)"\]\{.*?background-image: url\((.*?)\);', css.read())
for name, svg in svg_urls:
    svg_url = 'https:' + svg  # the url() value is protocol-relative
    print(svg_url)
    svg_response = requests.get(svg_url).text
    with open(f'{name}_dazhong.svg', 'w', encoding='utf-8') as f:
        f.write(svg_response)

④读取评论相对应的svg文件并解析成相对应字典

# Parse the glyph atlas: each <textPath> is one row of characters, each
# <path> carries that row's y coordinate.  (Indentation of the original
# snippet was broken by copy-paste; fixed here.)
with open('be_dazhong.svg', 'r', encoding='utf-8') as f:
    svg_html = f.read()
sel = parsel.Selector(svg_html)
texts = sel.css('textPath')
paths = sel.css('path')
path_dict = {}
for path in paths:
    # d="M0 Y ..." -> the second whitespace token is the row's y coordinate
    path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]

⑤y坐标与字符串进行相对应

# textPath N rides on path N, and path ids are 1-based, so a running counter
# links each text row to its y coordinate from path_dict.
count = 1
zpd_svg_dict = {}  # y coordinate -> the glyph string drawn at that row
for text in texts:
    zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
    count += 1

⑥读取css中样式、x轴坐标、y坐标

# Extract (class-name, x, y) from each rule like
# ".vhkjj4{background:-316.0px -141.0px;}".  Fixes: the original snippet's
# indentation was broken by copy-paste, and the pattern is now a raw string
# ('\.', '\d' in a plain literal are invalid escape sequences).
with open('dazhong.css', 'r', encoding='utf-8') as f:
    css_html = f.read()
css_paths = re.findall(r'\.(.*?)\{background:-(\d+)\.0px -(\d+)\.0px;\}.*?', css_html)  # 正则表达式条件根据css文件类标签更换

⑦将源文件中的css属性转换为对应文字

# Resolve each CSS class to its character: x/14 gives the column (font-size
# is 14px), y selects the first atlas row at or below the class's y offset.
last_map = {}
for css_name, x, y in css_paths:
    index = int(int(x) / 14)  # font-size:14px;fill:#333;}
    for i in zpd_svg_dict:
        if int(y) > int(i):
            continue  # row is above the glyph's y offset; try the next row
        try:
            last_map[css_name] = zpd_svg_dict[i][index]
            break
        except IndexError as e:
            # column falls outside this row's string; keep scanning lower rows
            print(e)

⑧将html源码中的属性进行替换解析

# Re-read the saved page and swap every <svgmtsi> placeholder for the real
# character recovered in last_map.
with open('dazhong.html', 'r', encoding='utf-8') as f:
    ret = f.read()
svg_list = re.findall('<svgmtsi class="(.*?)"></svgmtsi>', ret)
for svg in svg_list:
    try:
        ret = ret.replace(f'<svgmtsi class="{svg}"></svgmtsi>', last_map[svg])
    except KeyError:
        print('KeyError', svg)  # class not in the map (e.g. glyph from another atlas)

⑨循环得到评价内容

 etre = etree.HTML(ret)
li_list = etre.xpath('//div[@class="reviews-items"]/ul/li')
 for i in li_list:
     print(i.xpath('div[@class="main-review"]/div[@class="review-words Hide"]/text()'))

五、完整Python源码

import requests
from lxml import etree
import re
import parsel


def down_data(url, cookie):
    """Download the review page, its obfuscation CSS, and every glyph-atlas SVG.

    Side effects: writes ``dazhong.html``, ``dazhong.css`` and one
    ``{prefix}_dazhong.svg`` file per atlas referenced by the CSS.

    :param url: review page URL on dianping.com
    :param cookie: logged-in Cookie header value (required by the site)
    """
    headers = {
        "Cookie": cookie,
        "Referer": "http://www.dianping.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    ret = requests.get(url, headers=headers).text
    with open('dazhong.html', 'w', encoding='utf-8') as f:
        f.write(ret)
    # The mapping stylesheet lives on s3plus.meituan.net; its href is
    # protocol-relative, hence the 'https:' prefix.
    css_url = re.findall('<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">', ret)
    css_url = 'https:' + css_url[0]
    css_response = requests.get(css_url, headers=headers).text
    with open('dazhong.css', 'w', encoding='utf-8') as f:
        f.write(css_response)
    # Fix: read the CSS back via a context manager instead of a bare
    # open()/close() pair, so the handle is released even on error.
    with open('dazhong.css', encoding='utf-8') as css:
        svg_urls = re.findall(r'.*?\[class\^="(.*?)"\]\{.*?background-image: url\((.*?)\);', css.read())
    print(svg_urls)
    # One atlas SVG per class prefix; save each as {prefix}_dazhong.svg.
    for name, svg in svg_urls:
        svg_url = 'https:' + svg
        print(svg_url)
        svg_response = requests.get(svg_url).text
        with open(f'{name}_dazhong.svg', 'w', encoding='utf-8') as f:
            f.write(svg_response)


def crack_data():
    """Build the CSS-class -> character mapping from the SVG atlas and CSS.

    Reads ``be_dazhong.svg`` (rows of glyphs with y offsets) and
    ``dazhong.css`` (per-class x/y background offsets).

    :return: dict mapping a CSS class name to the single character it renders
    """
    with open('be_dazhong.svg', 'r', encoding='utf-8') as f:
        svg_html = f.read()
    # with open('gs_dazhong.svg', 'r', encoding='utf-8') as f:
    #     svg_html += f.read()
    # with open('rq_dazhong.svg', 'r', encoding='utf-8') as f:
    #     svg_html += f.read()
    sel = parsel.Selector(svg_html)
    texts = sel.css('textPath')
    paths = sel.css('path')
    # <path id=N d="M0 Y ..."> -> map path id to its y coordinate
    # (the second whitespace token of the d attribute).
    path_dict = {}
    for path in paths:
        path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]
    print(path_dict)
    # textPath N rides on path N and ids are 1-based, so a running counter
    # links each text row to its y coordinate.
    count = 1
    zpd_svg_dict = {}  # y coordinate -> glyph string of that row
    for text in texts:
        zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
        count += 1
    print('zpd_svg_dict', zpd_svg_dict)
    with open('dazhong.css', 'r', encoding='utf-8') as f:
        css_html = f.read()
    # Fix: raw string — '\.' and '\d' in a plain literal are invalid escape
    # sequences (SyntaxWarning on modern Python).  Adjust the pattern when
    # the site rotates its CSS format.
    css_paths = re.findall(r'\.(.*?)\{background:-(\d+)\.0px -(\d+)\.0px;\}.*?', css_html)
    print(css_paths)
    last_map = {}
    for css_name, x, y in css_paths:
        index = int(int(x) / 14)  # font-size:14px;fill:#333;} -> x/14 is the column
        # Rows iterate in insertion order; take the first row whose y
        # coordinate is >= the class's y offset.
        for i in zpd_svg_dict:
            if int(y) > int(i):
                continue
            try:
                last_map[css_name] = zpd_svg_dict[i][index]
                break
            except IndexError as e:
                # column outside this row's string; keep scanning lower rows
                print(e)

    return last_map


def decode_html(last_map):
    """Replace every ``<svgmtsi class="...">`` placeholder in the saved page
    with its decoded character from *last_map* and return the resulting HTML.
    Classes missing from the map are reported and left in place."""
    with open('dazhong.html', 'r', encoding='utf-8') as f:
        page = f.read()

    placeholders = re.findall('<svgmtsi class="(.*?)"></svgmtsi>', page)
    print(placeholders)
    for cls in placeholders:
        if cls in last_map:
            page = page.replace(f'<svgmtsi class="{cls}"></svgmtsi>', last_map[cls])
        else:
            print('KeyError', cls)
    return page


def get_text(html):
    """Print the text of each review found in the decoded page *html*.

    Strips the "消费后评价" section title first so it does not pollute the
    review list, then walks every <li> under the reviews container.
    """
    cleaned = html.replace('<div class="richtitle">消费后评价</div>', '')
    tree = etree.HTML(cleaned)
    for review in tree.xpath('//div[@class="reviews-items"]/ul/li'):
        print(review.xpath('div[@class="main-review"]/div[@class="review-words Hide"]/text()'))


if __name__ == '__main__':
    url = 'http://www.dianping.com/shop/130096343/review_all'
    cookie = ''  # paste a logged-in dianping Cookie header here
    try:
        down_data(url, cookie)
    except Exception as e:
        # NOTE(review): this swallows every download error (network failure,
        # captcha page, missing CSS link) and carries on with whatever local
        # files already exist — the following steps may then work on stale data.
        print('遇到验证问题')
    # Pipeline: build class->char map, decode placeholders, print reviews.
    data = crack_data()
    html = decode_html(data)
    get_text(html)

  • 2
    点赞
  • 23
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值