细说还原某点评店铺评论

无唔吾

已于 2023-02-06 15:24:04 修改

阅读量172

点赞数 3

文章标签： python 爬虫

于 2023-02-05 22:19:51 首次发布

本文链接：https://blog.csdn.net/qq_66445364/article/details/128893260

版权

细说还原某点评店铺评论

加密与解密
- 完整代码

加密与解密

某点评的评论采用css+svg混合加密，如果直接获取页面源码提取评论，得不到想要的内容，我们打开想要爬取的页面按F12打开开发者工具，选中评论发现一些字消失了，这些就是被隐藏加密的内容
在这里插入图片描述

要解开这些内容我们需要找到css文件中加密内容的类名和对应位置
在这里插入图片描述

再到svg中映射出我们看到的内容，点击打开这个svg
在这里插入图片描述

在这里插入图片描述
现在我们开始尝试还原评论，还记得刚刚消失的“没”字的位置

我们去svg里找到最靠近17且不小于17的y值（即第一个大于等于17的y）

又因为字体大小为14，我们需要用364/14得到26

最终得到文字位置在“y=40”对应位置的第26位(索引从0开始)
在这里插入图片描述
其他加密文字还原以此类推，我在获取不同店铺评论时还遇到了另一种映射方式，这是我扒下来的svg

看起来好像和上面讲的第一种完全不一样，其实只是换汤不换药，这里我们依然需要比较出最靠近加密文字y位置且不小于它的数，然后再根据对应id定位到真正的textLength后的文本，最后取文本[x/14]得到的字就是还原后的字
在这里插入图片描述

至此，我们已经了解了某点评评论的文字加密及解密方法，
总结一下，还原主要步骤：
1.获取页面源代码
2.从源代码中提取出css文件
3.再从中得到svg
4.根据svg还原加密文字

完整代码

import random
import requests
import re
import time
import parsel
from bs4 import BeautifulSoup
import pandas as pd




def _common_prefix(first, second):
    """Return the longest leading substring shared by two strings."""
    matched = []
    # zip stops at the shorter string, so a short class name cannot raise
    # IndexError the way positional indexing would.
    for left, right in zip(first, second):
        if left != right:
            break
        matched.append(left)
    return ''.join(matched)


def _strip_whitespace(text):
    """Remove newlines, tabs, carriage returns and spaces from text."""
    return text.replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '')


def _load_svg_rows(sel):
    """Return the (y, row_text) pairs of the glyph table found in the SVG.

    Two SVG layouts are handled:
    * ``<defs><path d="M0 34 H600"/>`` rows paired, in document order, with
      ``<textPath>`` texts (the y value is the second number in ``d``);
    * plain ``<text y="...">row</text>`` elements.
    """
    row_ys = []
    for path_node in sel.css('defs path'):
        d_attr = path_node.css('path::attr(d)').get()
        row_ys.append(int(re.findall(r'\d+', d_attr)[1]))

    if row_ys:
        row_texts = []
        # Collect the textPath rows of every <text> element instead of
        # keeping only those of the last one.
        for text_node in sel.css('text'):
            row_texts.extend(text_node.css('textPath::text').getall())
        return list(zip(row_ys, row_texts))

    # No <defs><path> rows: fall back to the layout where each <text>
    # element carries its own y attribute.
    return [
        (int(text_node.css('text::attr(y)').get()),
         text_node.css('text::text').get(default=''))
        for text_node in sel.css('text[y]')
    ]


def _build_glyph_map(class_positions, rows, glyph_width=14):
    """Map each CSS class name to the character it displays.

    For a class at offset (x, y) the row is the first one whose y is not
    smaller than the class's y, and the character sits at x // glyph_width.
    """
    glyph_map = {}
    for cls_name, x, y in class_positions:
        row_text = next((text for row_y, text in rows if row_y >= y), None)
        if row_text is None:
            continue
        try:
            glyph_map[cls_name] = row_text[x // glyph_width]
        except IndexError as e:
            # Offset points past the end of the row; report it and move on.
            print(e)
    return glyph_map


def crew(i):
    """Download review page ``i`` of a shop and return its decoded reviews.

    The page hides some characters behind ``<svgmtsi class="...">`` tags whose
    glyphs come from an SVG referenced by the page's stylesheet. This function
    fetches the page, the stylesheet and the SVG, rebuilds the class-to-
    character table, substitutes the characters back into the HTML and then
    extracts user names and comments.

    Side effects: writes the raw page, the stylesheet, the SVG and the decoded
    page to files in the current directory (same file names on every call).

    Args:
        i: page number inserted into the ``/review_all/p{i}`` URL.

    Returns:
        A tuple ``(user_list, comment_list)`` of lists of strings.

    Raises:
        ValueError: if the page contains no ``<svgmtsi>`` tag.
        IndexError: if the stylesheet link or the SVG URL cannot be found.
    """
    # Fill in your own headers. For another shop in the same region only the
    # Referer has to change; for a shop in a different region replace both the
    # Cookie and the Referer.
    header = {
        'Cookie': '',
        'Host': '',
        'Referer': '',
        'User-Agent': ''
    }
    url = f'店铺网址/review_all/p{i}'  # fill in the shop URL
    response = requests.get(url=url, headers=header)
    response.encoding = 'utf-8'
    html = response.text
    with open('./某点评源代码1.html', 'w', encoding='utf-8', newline='') as f:
        f.write(html)

    soup = BeautifulSoup(html, 'lxml')
    first_glyph = soup.find('svgmtsi')
    if first_glyph is None:
        raise ValueError(
            'no <svgmtsi> tag found in the page; check the headers and the shop URL'
        )
    # The class names of the obfuscated glyphs share a prefix that identifies
    # the matching CSS rule; derive it from the first 10 glyph tags.
    svg = first_glyph['class'][0]
    for glyph in soup.find_all('svgmtsi', limit=10):
        svg = _common_prefix(svg, glyph['class'][0])
    prefix_pattern = re.escape(svg)

    css_url = re.findall(
        r'<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">',
        html,
    )
    css_url = 'https:' + css_url[0]
    css_text = requests.get(css_url).text
    with open('./某点评css1.css', 'w', encoding='utf-8', newline='') as f:
        f.write(css_text)

    svg_url = 'https:' + re.findall(
        r'svgmtsi\[class\^="' + prefix_pattern + r'"\].*?background-image: url\((.*?)\);',
        css_text,
    )[0]
    svg_html = requests.get(svg_url).text
    with open('./某点评svg1.svg', 'w', encoding='utf-8', newline='') as f:
        f.write(svg_html)

    # Load the glyph table (row y position -> row text).
    lines = _load_svg_rows(parsel.Selector(svg_html))

    # Collect every glyph class together with its background offset.
    class_map = re.findall(
        r'\.(' + prefix_pattern + r'\w+)\{background:-(\d+)\.0px -(\d+)\.0px;\}',
        css_text,
    )
    class_map = [(cls_name, int(x), int(y)) for cls_name, x, y in class_map]
    d_map = _build_glyph_map(class_map, lines)

    # Put the real characters back in place of the glyph tags.
    for key, value in d_map.items():
        html = html.replace('<svgmtsi class="' + key + '"></svgmtsi>', value)
    with open('./某点评源代码解密1.html', 'w', encoding='utf-8') as f:
        f.write(html)

    # Extract the reviews from the decoded page.
    soup = BeautifulSoup(html, 'lxml')
    reviews_items = soup.find('div', class_="reviews-items")
    reviews = reviews_items.find_all('li', class_=None)
    comment_list = []
    for review in reviews:
        # Long reviews carry a collapsed full-text block; prefer it and drop
        # its trailing "collapse" link text. Otherwise use the plain block.
        full_block = review.find('div', class_="review-words Hide")
        if full_block is not None:
            comment = _strip_whitespace(full_block.get_text()).replace('收起评价', '')
        else:
            comment = _strip_whitespace(
                review.find('div', class_="review-words").get_text()
            )
        comment_list.append(comment)

    user_list = [
        _strip_whitespace(user_block.text)
        for user_block in soup.find_all(class_='dper-info')
    ]
    return user_list, comment_list

if __name__ == '__main__':
    # Scrape review pages 1 through 24 (p0 and p1 return the same page),
    # one DataFrame per page, then merge everything into a single Excel file.
    page_frames = []
    for i in range(1, 25):
        names, comments = crew(i)
        page_frame = pd.DataFrame({'用户名': names, '评论': comments})
        page_frames.append(page_frame)
        # Pause 11-20 seconds between pages to lower the risk of an IP ban;
        # shorten the delay or use a proxy pool if this is too slow.
        pause_seconds = 10 + random.randint(1, 10)
        time.sleep(pause_seconds)
        print(f'第{i}页获取完毕！')
    # Combine the reviews of all pages and export them.
    merged = pd.concat(page_frames)
    merged.to_excel('outexcel.xlsx', index=False)

最后结果展示：
在这里插入图片描述