大众点评美食评论爬虫

最新推荐文章于 2024-08-08 07:36:59 发布

ColaForced

最新推荐文章于 2024-08-08 07:36:59 发布

阅读量2.1k

点赞数 1

文章标签： python 爬虫大众点评

本文链接：https://blog.csdn.net/py431382/article/details/99951191

版权

大家都知道，大众点评用了 CSS 反爬，脑壳疼：评论文字用 SVG 替换；还需要登录才能查看全部评论，也就是请求要带 cookie；此外时不时会跳验证码，而且验证码还有好几种。这帮人是真的过分了，搬砖的何苦为难搬砖的呢。

'''function：从数据库取店铺id取解析店铺下的评论'''
# -*- coding:utf-8 -*-
# Author : peng

from getter import DP2_get   #改成你自己的get方法，怎么处理验证码之类的建议放在这里面
from redis import StrictRedis,ConnectionPool
import threading, time
import re
from pyquery import PyQuery
import os
import json
import random

# Shared Redis connection pool; work() builds its StrictRedis client from it.
# NOTE: the `***` values are placeholders (not valid Python) — replace them
# with your own Redis port, db index and password before running.
pool = ConnectionPool(host='localhost', port=***, db=***, password=***)

def word_repair(s,index_dict,svg_dict):
    """Resolve one obfuscated glyph class name to the character it stands for.

    Args:
        s: CSS class name of the glyph. Any '@' marker characters left over
            from the caller's splitting are stripped before the lookup.
        index_dict: maps class name -> [x, y] background offsets (numeric
            strings) parsed from the CSS file.
        svg_dict: maps a row's y coordinate (string form of an int) -> the
            text of that row in the SVG glyph sheet.

    Returns:
        The character at the resolved position, or '' when the class name is
        unknown, the glyph sheet is empty, or the column lies outside the row.
    """
    s = re.sub(r'@','',s)  # drop leftover split markers
    try:
        coordinate = index_dict[s]  # [x, y] offsets of this glyph
    except (KeyError, TypeError):
        print('error_______________',s)
        return ''  # unknown or empty class name -> empty string
    x = float(coordinate[0])
    y = float(coordinate[1])

    # Sort numerically: dict order is not guaranteed to be ascending, and the
    # row search below relies on it.
    heights = sorted(int(key) for key in svg_dict)
    if not heights:
        print('error_______________',s)
        return ''

    # The glyph sits in the first row whose y coordinate is >= its offset;
    # offsets below the last row are clamped to the last row.  Using >= (not >)
    # also covers an offset exactly equal to a row height, which previously
    # left the row key unassigned.
    row_height = next((h for h in heights if h >= y), heights[-1])
    row = svg_dict[str(row_height)]

    x_pos = int(x/14)  # each glyph cell is 14px wide
    if x_pos >= len(row):
        print('error_______________',s)
        return ''
    return row[x_pos]

def get_index_dict(html, svg_dir='C:/Users/pengyong/Desktop/DZDP/svg_files/'):
    """Build the glyph lookup tables needed to decode obfuscated review text.

    Args:
        html: response text of a shop review page; it must link the
            s3plus.meituan.net CSS file that positions the glyphs.
        svg_dir: directory where parsed SVG glyph sheets are cached, one file
            per sheet name.  Defaults to the original hard-coded location.

    Returns:
        (index_dict, svg_dict) where index_dict maps a CSS class name to its
        [x, y] background offsets (as strings) and svg_dict maps a row's y
        coordinate (string) to that row's text.  Returns (None, None) when the
        CSS link or the 14x24 glyph sheet cannot be found, a download fails
        (DP2_get returns None), or the sheet contains no rows.
    """
    import ast  # local import: only needed to read the on-disk glyph cache

    css_links = re.findall(r'href="(//s3plus.meituan.net/v1.*?\.css)',html)
    if not css_links:
        return None,None
    url = 'http:'+css_links[0]
    print(url)  # CSS file link
    res = DP2_get(url)
    if res is None:
        return None,None

    # Parse the CSS: class name -> background offsets.
    index_info = re.findall(r'\.(\w+?){background:-(.*?)px\s-(.*?)px;}',res.text)
    index_dict = {name: [x, y] for name, x, y in index_info}

    # Keep only the main glyph sheet (14px x 24px cells); if several rules
    # match, the last one wins, as before.
    svgs = re.findall(r'\[class\^="(\S*?)"]{(.*?)(//.*?\.svg)',res.text)
    svg = None
    for each in svgs:
        if 'width: 14px' in each[1] and 'height: 24px' in each[1]:
            svg = each
    if svg is None:
        return None,None
    svg_url = 'http:'+svg[2]
    svg_name = svg[0]

    file_path = os.path.join(svg_dir, svg_name)

    # A sheet that was already parsed is read back from the cache.
    if os.path.exists(file_path):
        with open(file_path,'r',encoding='utf-8')as f:
            # The cache holds the repr of a str->str dict; literal_eval reads
            # it without executing arbitrary code the way eval() would.
            svg_dict = ast.literal_eval(f.read())
        return index_dict,svg_dict

    res2 = DP2_get(svg_url)
    if res2 is None:
        return None,None
    print(res2.url)

    # Two SVG layouts have been seen.  First layout: <path> elements carry the
    # row heights and <textPath> elements carry the row text, in the same order.
    hight_list = re.findall(r'<path id="\d*" d="\w* (\d*?) \w*"/>', res2.text)
    text_list = re.findall(r'<textPath xlink:href="\S*" textLength="\d*">(\S*)</textPath>', res2.text)
    svg_dict = {str(height): text for height, text in zip(hight_list, text_list)}
    if not svg_dict:
        # Second layout: each <text> element carries both height and text.
        hight_and_text_list = re.findall(r'<text x="\d*" y="(\d*)">(\S*)</text>',res2.text)
        print(hight_and_text_list)
        for height, text in hight_and_text_list:
            svg_dict[str(height)] = text

    if not svg_dict:
        # Nothing parsed: report failure rather than caching an empty sheet
        # that would be reused on every later run.
        return None,None

    if svg_dir:
        os.makedirs(svg_dir, exist_ok=True)
    with open(file_path,'w',encoding='utf-8')as f:
        f.write(str(svg_dict))

    return index_dict,svg_dict

def save(s):
    """Append one review, followed by a newline, to DP_comment.txt (UTF-8)."""
    with open('DP_comment.txt','a',encoding='utf-8') as out:
        out.write(''.join((s, '\n')))

def comment_parse(html,index_dict, svg_dict):
    """Extract every review on a page, decode its glyphs, then print and save it.

    Args:
        html: text of a review page.
        index_dict: class name -> [x, y] offsets.
        svg_dict: row y coordinate -> row text of the glyph sheet.

    Each review is passed to print() and to save(); nothing is returned.
    """
    def rp(temp):
        """Replace a <svgmtsi> tag with '#@<class>#' so it survives tag stripping.

        '#' delimits the piece and '@' flags it as a glyph class name.
        """
        class_name = re.findall(r'"(.*?)"', temp.group())[0]
        return ''.join(('#@', class_name, '#'))

    review_blocks = re.findall(r'<div class="review-words Hide">(.*?)<div', html, re.S)
    for block in review_blocks:
        marked = re.sub(r'<svgmtsi class=".*?"></svgmtsi>', rp, block)
        marked = re.sub(r'\r', '\n', marked)
        # Strip the remaining tags and all whitespace.
        marked = re.sub(r'<[^<]+?>|\t|\s', '', marked)

        # Pieces containing '@' are glyph class names; the rest is plain text.
        decoded = []
        for piece in marked.split('#'):
            if '@' in piece:
                piece = word_repair(piece, index_dict, svg_dict)
            decoded.append(piece)
        comment = ''.join(decoded)

        # Splitting on '#' and re-joining drops every literal '#', so the
        # patterns below are written without it (presumably matching what is
        # left of entities such as "&#x0A;").
        comment = re.sub(u'&x0A|&x20|&x2F|&x0D', '\n', comment)
        comment = comment.replace(';', '')
        comment = re.sub(r'^(\s*)\n','',comment)
        print(comment)
        save(comment)

def work():
    """Pop shop ids from the Redis set 'DPids' and crawl all their review pages.

    For each id the first review page is fetched, the glyph tables are built
    from it, and every review page (1..max page number found in the pager) is
    parsed with comment_parse().  Ids whose first page cannot be fetched are
    appended to error_record.txt, one per line.

    The loop ends when the set is empty, or when the glyph tables cannot be
    built; in the latter case the current id is put back into the set so it
    is not lost.
    """
    redis = StrictRedis(connection_pool=pool)
    while True:
        shop_id = redis.spop('DPids')  # returns one member and removes it
        if shop_id is None:
            # Set exhausted: nothing left to crawl.
            break
        if isinstance(shop_id, bytes):
            shop_id = shop_id.decode()
        url = 'http://www.dianping.com/shop/' + shop_id + '/review_all/p1'
        res = DP2_get(url)

        if not res:
            # Record the failed id on its own line.
            with open('error_record.txt', 'a')as f:
                f.write(shop_id + '\n')
            continue

        index_dict, svg_dict = get_index_dict(res.text)  # offsets and glyph sheet
        if index_dict is None or svg_dict is None:
            # Cannot decode anything without the tables: return the id to the
            # queue and stop.
            redis.sadd('DPids', shop_id)
            break
        comment_parse(res.text, index_dict, svg_dict)

        page_numbers = re.findall(r'<a href="/shop/.*?" data-pg="(\d*?)" class="PageLink"', res.text)
        # The pattern can capture an empty string; skip those before int().
        pages = [int(number) for number in page_numbers if number]
        max_page = max(pages, default=1)  # no pager -> single page

        for i in range(2, max_page + 1):
            href = 'http://www.dianping.com/shop/' + shop_id + '/review_all/p' + str(i)
            print(href)
            res2 = DP2_get(href)
            if res2:
                comment_parse(res2.text, index_dict, svg_dict)
            # Sleep 15-35s (25s on average) after every request, including
            # failed ones, so a blocked crawler does not retry at full speed.
            n = 15+20*random.random()
            time.sleep(n)

        
        

if __name__ == '__main__':
    # Kept as the place to start multiple threads, but multi-threading is not
    # realistic: a single thread already sleeps ~25s per page, login is
    # required, and the account risks being banned.
    work()

'''function：获取美食店铺id'''

# -*- coding:utf-8 -*-
# Author : peng

from pyquery import PyQuery 
from getter import DP_get
import time
import re
from redis import StrictRedis,ConnectionPool

# Redis connection used to store the collected shop ids.
# NOTE: the `***` values are placeholders (not valid Python) — replace them
# with your own Redis port, db index and password before running.
pool = ConnectionPool(host='localhost', port=***, db=***, password=***)
redis = StrictRedis(connection_pool=pool)

# City slugs substituted into 'http://www.dianping.com/{}/food' by the loop below.
areas = ['beijing','shanghai','guangzhou','shenzhen','tianjin','hangzhou','nanjing','suzhou','chengdu','wuhan','chongqing','xian','tokyo','seoul','bangkok','paris']

# Captures the numeric shop id (4 or more digits) from a quoted shop URL that
# is followed by ` data-click-name`.
id_comp = re.compile(r'"http://www.dianping.com/shop/(\d{4,})" data-click-name')

# For every city, open its food landing page, follow each cuisine category
# link, walk listing pages 1-50 of that category and add every shop id found
# to the Redis set 'DPids'.
for area in areas:
    url = 'http://www.dianping.com/{}/food'.format(area)
    res = DP_get(url)
    if not res:
        continue
    doc = PyQuery(res.text)
    label_a = doc('#J_nc_cooking > div > ul > li > a')
    for a in label_a.items():
        half_url = a.attr('href')  # e.g. /beijing/ch10/g2714
        if not half_url:
            # Anchor without an href: nothing to follow (concatenating None
            # below would raise TypeError).
            continue
        for i in range(1, 51):
            href2 = 'http://www.dianping.com' + half_url + 'p' + str(i)
            GetIdPageRes = DP_get(href2)  # category listing page response
            if not GetIdPageRes:
                continue
            shop_ids = id_comp.findall(GetIdPageRes.text)
            if shop_ids:
                # One round trip per page instead of one per id.
                redis.sadd('DPids', *shop_ids)
            for each in shop_ids:
                print(each)

此外，关于大众点评的字体替换，更烦，还好我目前不用搞。

思路是用 fontTools 解析 woff 文件，找到每个编码对应的“笔画”（即字形的轮廓数据，这个东西见下文）。编码是会变的，也就是所谓的字体库换了，但笔画不会变，所以可以根据笔画来还原编码对应的汉字。至于第一次怎么确定笔画对应的是哪个汉字，我也很脑壳疼。。。。