Dianping (大众点评) - Wanda 483 sentiment data

Scraping the data and writing each shop's reviews to its own CSV file

import datetime
import random
import time
import re
# import sys,os
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pymongo
from lxml import etree
import requests
from pyquery import PyQuery as pq
# import the csv module
import csv
# client = pymongo.MongoClient('localhost', 27017)
# shidai = client['gongyuan']
# comments = shidai['comments']

path_one = r'C:\Users\songlk\AppData\Local\Google\Chrome\Application\chromedriver.exe'

COOKIES = '_lxsdk_cuid=16a3e5550cac8-0328ac989f3a72-3c644d0e-100200-16a3e5550cbc8; _lxsdk=16a3e5550cac8-0328ac989f3a72-3c644d0e-100200-16a3e5550cbc8; _hc.v=b108378a-8f67-0f82-24be-f6bd59936218.1555823941; s_ViewType=10; ua=zeroing; ctu=66a794ac79d236ecce433a9dd7bbb8bf29eff0bc049590703a72f844379eb7c5; dper=56648ebad0a12bed853d89482e9f3c35c89ef2504f07d5388fd0dfead6018398ae8c14a81efb6f9e42cb7e1f46473489252facff635921c09c106e3b36b311bafcd118a3e618fff67b5758b9bd5afca901c01dc9ec74027240ac50819479e9fc; ll=7fd06e815b796be3df069dec7836c3df; _lx_utm=utm_source%3Dgoogle%26utm_medium%3Dorganic; cy=2; cye=beijing; _lxsdk_s=16b84e44244-3d8-afd-795%7C1393851569%7C2'
SHOPID = 'iH3N7RpkNlNqIaIG'
f = open('C:\\Users\\songlk\\Desktop\\' +SHOPID+'.csv','w',encoding='utf-8',newline='')
csv_writer = csv.writer(f)
csv_writer.writerow(["姓名", "日期", "星级", "评论", "附图"])
#
# Take a hard-coded shop id, scrape the shop's negative Dianping reviews and write them to a CSV file
# Output: a CSV on the desktop named after the shop id
#
class DianpingComment:
    font_size = 14
    start_y = 23

    def __init__(self, shop_id, cookies, delay=7, handle_ban=True):
        self.shop_id = shop_id
        self._delay = delay
        self.num = 1
        # self.db = comments
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'Cookie': 'cy=2; cye=beijing; _lxsdk_cuid=1791baabd12c8-0bcecf9c18d7ce-376b4502-e1000-1791baabd13c8; '
                      '_hc.v=7b4a1260-41ee-e1a2-da40-e1c6eb9189a6.1619666846; s_ViewType=10; '
                      'ctu=13a958a5a4925f571b94206ddf51e874f4334b6941d1dbc2ab540ad02eada20b; switchcityflashtoast=1; '
                      'cityid=2; _lxsdk=F74FD600A8BA11EBAA418171958965BAB1FC1D73726F4257A4349EE2BC1014A4; fspop=test; '
                      'ua=%E6%88%98%E7%A5%9E%E9%87%91%E5%88%9A%E5%87%B9%E5%87%B8%E6%9B%BC; '
                      'default_ab=citylist%3AA%3A1%7Cindex%3AA%3A3%7CshopList%3AC%3A5%7Cshopreviewlist%3AA%3A1%7Cmap'
                      '%3AA%3A1%7Cugcdetail%3AA%3A1; dplet=82b545d2c8323ce159c4280f7a2418b2; '
                      'dper=abb09aea6d2a5802f6d6e26b6dc7b31a5033eda857968c9c661ee49033a2d648a3703a5c029279ad1d9141d9856bd4257b9a6ccc08f930decaa80f0fe2191aa43944ea48542867ab13260a5c6d0c857401ed5235eee38d1f4f7105ee5f6f822d; aburl=1; ll=7fd06e815b796be3df069dec7836c3df; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1620638055,1620646026,1620718429,1620786381; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1620786411; _lxsdk_s=1795e657f49-2d7-8a6-f40%7C%7C86',
                # 'cy=2; cye=beijing; _lxsdk_cuid=1791baabd12c8-0bcecf9c18d7ce-376b4502-e1000-1791baabd13c8; _hc.v=7b4a1260-41ee-e1a2-da40-e1c6eb9189a6.1619666846; s_ViewType=10; ctu=13a958a5a4925f571b94206ddf51e874f4334b6941d1dbc2ab540ad02eada20b; switchcityflashtoast=1; cityid=2; _lxsdk=F74FD600A8BA11EBAA418171958965BAB1FC1D73726F4257A4349EE2BC1014A4; fspop=test; ua=%E6%88%98%E7%A5%9E%E9%87%91%E5%88%9A%E5%87%B9%E5%87%B8%E6%9B%BC; default_ab=citylist%3AA%3A1%7Cindex%3AA%3A3%7CshopList%3AC%3A5%7Cshopreviewlist%3AA%3A1%7Cmap%3AA%3A1%7Cugcdetail%3AA%3A1; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1620458878,1620638055,1620646026,1620718429; dplet=82b545d2c8323ce159c4280f7a2418b2; dper=abb09aea6d2a5802f6d6e26b6dc7b31a5033eda857968c9c661ee49033a2d648a3703a5c029279ad1d9141d9856bd4257b9a6ccc08f930decaa80f0fe2191aa43944ea48542867ab13260a5c6d0c857401ed5235eee38d1f4f7105ee5f6f822d; ll=7fd06e815b796be3df069dec7836c3df; aburl=1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1620723302; _lxsdk_s=1795a78aada-cdc-572-ddd%7C%7C62',
        }
        # alternative query string: ?queryType=reviewGrade&queryVal=middle (medium reviews)
        self._cur_request_url = 'http://www.dianping.com/shop/{}/review_all?queryType=reviewGrade&queryVal=bad'.format(
            self.shop_id)
        self.sub_url = 'http://www.dianping.com'

    def run(self):
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_comment_page()

    def _delay_func(self):
        delay_time = random.randint((self._delay - 2) * 10, (self._delay + 2) * 10) * 0.1
        time.sleep(delay_time)

    def _init_browser(self):
        """
            初始化游览器
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=path_one)
        browser.get(self._cur_request_url)
        for name, value in self._cookies.items():
            browser.add_cookie({'name': name, 'value': value})
        browser.refresh()
        return browser

    def _handle_ban(self):
        """
            爬取速度过快,出现异常时处理验证
        """
        try:
            self._browser.refresh()
            time.sleep(1)
            button = self._browser.find_element_by_id('yodaBox')
            move_x_offset = self._browser.find_element_by_id('yodaBoxWrapper').size['width']
            webdriver.ActionChains(self._browser).drag_and_drop_by_offset(
                button, move_x_offset, 0).perform()
        except Exception:
            pass

    def _format_cookies(self, cookies):
        '''
        Parse the raw cookie header string into a name -> value dict.
        :param cookies: semicolon-separated cookie string
        :return: dict of cookie names to values
        '''
        cookies = {cookie.split('=', 1)[0]: cookie.split('=', 1)[1]
                   for cookie in cookies.replace(' ', '').split(';')}
        return cookies

    def _get_comment_page(self):
        """
            Request each review page in turn and replace the obfuscated
            <svgmtsi> glyph tags with the characters they stand for.
        """
        while self._cur_request_url:
            self._delay_func()
            print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            # plain HTTP request for the current page
            res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=self._cookies)
            while res.status_code != 200:
                # retry with freshly formatted cookies until the page loads
                cookies = self._format_cookies(COOKIES)
                res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=cookies)
            html = res.text
            class_set = []
            for span in re.findall(r'<svgmtsi class="([a-zA-Z0-9]{5,6})"></svgmtsi>', html):
                class_set.append(span)
            for class_name in class_set:
                try:
                    # swap the tag for the decoded character
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, self._font_dict[class_name], html)
                except KeyError:
                    # class missing from the font dict: drop the tag
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, '', html)
            doc = pq(html)
            self._parse_comment_page(html)
            if doc('.NextPage').attr('href'):
                self._default_headers['Referer'] = self._cur_request_url
                next_page_url1 = doc('.NextPage').attr('href')
                next_page_url = self.sub_url + str(next_page_url1)
                print('next_url:{}'.format(next_page_url))
            else:
                next_page_url = None
            print('next_page_url:{}'.format(next_page_url))
            self._cur_request_url = next_page_url

    def _data_pipeline(self, data):
        """
            处理数据
        """
        # print(data)

    def _parse_comment_page(self, html):
        """
            解析评论页并提取数据,把数据写入文件中;;
        """
        doc = pq(html)
        # print(html)
        # return
        for li in doc('div.review-list-main > div.reviews-wrapper > div.reviews-items > ul > li'):

            doc_text = pq(li)
            # 用户名字
            if doc_text('.dper-info .name').text():
                name = doc_text('.dper-info .name').text()
                # review-pictures
            else:
                name = None
            pictures = []
            try:
                for img in doc_text('.review-pictures  > ul > li img'):
                    # TODO
                    # doc_text2 = img.attr('data-big')
                    doc_text2 = pq(img)
                    doc_text3=doc_text2('img').attr('data-big')
                    print(doc_text3)
                    pictures.append(doc_text3)
                    # pictures = doc_text2('.main-review .review-pictures').attr('data-big')
            except:
                pictures = None

            try:
                star = doc_text('.review-rank .sml-rank-stars').attr('class')

            except IndexError:
                star = None
            if doc_text('div.misc-info.clearfix > .time').text():
                date_time = doc_text('div.misc-info.clearfix > .time').text()
            else:
                date_time = None
            if doc_text('.main-review .review-words').text():
                comment = doc_text('.main-review .review-words').text()
            else:
                comment = None

            data = [name, date_time, star, comment, pictures]
            #     {
            #     'name': name,
            #     'date_time': date_time,
            #     'star': star,
            #     'comment': comment,
            #     'pictures': pictures,
            # }
            # print(data)
            # f.write((str(data) + "\n").encode('utf-8'))
            # print('写入数据完成', data)
            csv_writer.writerow(data)




    def _get_css_link(self, url):
        """
            请求评论首页,获取css样式文件
        """
        try:
            # print(url)
            res = requests.get(url, headers=self._default_headers, cookies=self._cookies)
            html = res.text
            css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', html)
            # print(css_link)
            assert css_link
            css_link = 'http:' + css_link[1]
            return css_link
        except:
            None

    def _get_font_dict(self, url):
        """
            获取css样式对应文字的字典
        """
        res = requests.get(url, headers=self._css_headers)
        html = res.text

        background_image_link = re.findall(r'background-image:.*?\((.*?svg)\)', html)
        # print(background_image_link)
        background_image_link_list = []
        for i in background_image_link:
            url = 'http:' + i
            background_image_link_list.append(url)

        # print(background_image_link_list)

        html = re.sub(r'span.*?\}', '', html)
        group_offset_list = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', html)
        '''
        多个偏移字典,合并在一起;;;
        '''
        font_dict_by_offset_list = {}
        for i in background_image_link_list:
            font_dict_by_offset_list.update(self._get_font_dict_by_offset(i))

        font_dict_by_offset = font_dict_by_offset_list
        # print(font_dict_by_offset)
        font_dict = {}
        for class_name, x_offset, y_offset in group_offset_list:
            x_offset = x_offset.replace('.0', '')
            y_offset = y_offset.replace('.0', '')
            try:
                font_dict[class_name] = font_dict_by_offset[int(y_offset)][int(x_offset)]

            except:
                font_dict[class_name] = ''
        return font_dict

    def _get_font_dict_by_offset(self, url):
        """
            获取坐标偏移的文字字典, 会有最少两种形式的svg文件(目前只遇到两种)
        """
        res = requests.get(url, headers=self._css_headers)
        html = res.text
        font_dict = {}
        y_list = re.findall(r'd="M0 (\d+?) ', html)
        if y_list:
            font_list = re.findall(r'<textPath .*?>(.*?)<', html)
            for i, string in enumerate(font_list):
                y_offset = self.start_y - int(y_list[i])

                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        else:
            font_list = re.findall(r'<text.*?y="(.*?)">(.*?)<', html)
            for y, string in font_list:
                y_offset = self.start_y - int(y)
                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        return font_dict


class Customer(DianpingComment):
    def _data_pipeline(self, data):
        print(data)


if __name__ == "__main__":
    dianping = Customer(SHOPID, cookies=COOKIES)
    dianping.run()
    f.close()
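
How the glyph de-obfuscation works: Dianping serves review text with key characters replaced by empty <svgmtsi class="..."> tags, and the class maps, via the svgtextcss stylesheet, to a background offset into an svg sprite of characters. _get_font_dict ties the two together, which is why the class attributes font_size = 14 and start_y = 23 matter. Below is a minimal standalone sketch of that lookup, using a made-up svg snippet in the <textPath> form handled by the first branch of _get_font_dict_by_offset; the sample characters and the css class .abcde are invented for illustration.

import re

FONT_SIZE = 14   # DianpingComment.font_size
START_Y = 23     # DianpingComment.start_y

# Hypothetical svg body; the real ones are downloaded from the stylesheet links.
svg = '<path id="1" d="M0 37 H600"/><textPath xlink:href="#1">好吃贵</textPath>'

font_dict = {}
y_list = re.findall(r'd="M0 (\d+?) ', svg)
for i, string in enumerate(re.findall(r'<textPath .*?>(.*?)<', svg)):
    y_offset = START_Y - int(y_list[i])  # 23 - 37 = -14
    # the j-th character on a row sits at x offset -j * FONT_SIZE
    font_dict[y_offset] = {-j * FONT_SIZE: ch for j, ch in enumerate(string)}

# A stylesheet rule like `.abcde { background: -28.0px -14.0px; }` then
# resolves to the third character on that row:
print(font_dict[-14][-28])  # -> 贵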

Formatting the links: adding a URL column to the shop list

import openpyxl

# df = pd.read_excel('C:\\Users\\songlk\\Desktop\\dp.xlsx')
# print(df['shopID'])
wb = openpyxl.load_workbook('C:\\Users\\songlk\\Desktop\\dp.xlsx')
ws = wb.active
for row in range(2, ws.max_row + 1):  # walk every data row, from row 2 (below the header) to max_row
    fcell = ws['A' + str(row)].value  # shop id in column 'A'
    if fcell is not None:
        ws['CQ' + str(row)].value = "http://10.199.203.143:8080/dianping/" + str(fcell) + ".csv"
wb.save('C:\\Users\\songlk\\Desktop\\dp2.xlsx')
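
For reference, the same column can be produced with pandas instead of openpyxl. This is only a sketch, not part of the original script: it assumes the shop ids sit in a column named shopID (as the commented-out lines above suggest) and it appends the links as a new trailing column named link rather than writing into column CQ.

import pandas as pd

df = pd.read_excel(r'C:\Users\songlk\Desktop\dp.xlsx')
df['link'] = 'http://10.199.203.143:8080/dianping/' + df['shopID'].astype(str) + '.csv'
df.to_excel(r'C:\Users\songlk\Desktop\dp2.xlsx', index=False)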

 
