大众点评店铺信息及好评、中评、差评爬虫程序(2024年5月最新版):差评数据的采集

废话不多说,直接放代码。本文承接上一篇的内容。注意:代码中的 Cookie 已经失效,运行前请替换成你自己的。

import time
import random
import pymysql
import requests
from lxml import etree


# HTTP request headers passed to requests.get for every review-page request.
# The values imitate a desktop Microsoft Edge 115 browser (see User-Agent and
# the sec-ch-ua entries).
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # Session cookie copied from a browser; it is hard-coded here, so replace
    # it with your own value before running (the script reports HTTP 403 as
    # "blocked, change the cookie").
    'Cookie': 'fspop=test; _lxsdk_cuid=18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc; _lxsdk=18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc; _hc.v=cf49de5b-27f5-2e5a-0129-9c6d2ae50ecb.1716894401; WEBDFPID=xy163uvy5w3y5x56y961u04734552z9681uyvv816yz97958945018y3-2032254401568-1716894401568CKMMEICfd79fef3d01d5e9aadc18ccd4d0c95072361; s_ViewType=10; ctu=a63e4ba3a5b45a2f87beabdf4e08942769d28c83775407376a91791c8159379a; qruuid=6da1df02-0233-4819-a5cc-3a741b7d72fa; dper=02026d2122d76f02a976ffbd0830633d52035e7d004205e139b2f003877d706eb0963ba70bfe813b00406f963447ea48c24cb50f50a4ac13bcf70000000051200000f78cca09b9e1f5df51ef0e84268e1d7648cecdbf51648c4356b73d85c6d64714abcfa1df126f275b1092cd75315caa34; _lxsdk_s=18fccad5714-d16-242-920%7C%7C15; ll=7fd06e815b796be3df069dec7836c3df; cy=224; cye=nanning',
    'Pragma': 'no-cache',
    # Fixed Referer pointing at one shop page; it is sent unchanged for every shop.
    'Referer': 'https://www.dianping.com/shop/H9t6wplYc54itiuo',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
    'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Constants: MySQL connection settings handed to pymysql.connect.
# NOTE(review): the password is hard-coded in source; consider loading it
# from the environment instead.
HOST = 'localhost'
USER = 'root'
PASSWORD = 'admin147'
DATABASE = 'demo'
PORT = 3306
# Running total of HTTP requests issued; incremented once per request and
# printed when the script finishes.
count = 0


def remove_newlines(text):
    """Strip list-repr residue from *text* and split it into clean tokens.

    *text* is expected to be the ``str()`` of an XPath result, so newlines
    and tabs appear as literal backslash escapes rather than real control
    characters. The fragments below are removed in the given order (the
    bracketed forms first, so the list delimiters go together with the
    escape next to them), then the remainder is split on single spaces.

    Returns a list of the non-empty, whitespace-trimmed pieces; the list is
    empty when nothing but whitespace is left.
    """
    junk_fragments = ("['\\n", "\\n']", "']", "\\n", "\\t")
    cleaned = text
    for fragment in junk_fragments:
        cleaned = cleaned.replace(fragment, "")

    # Split on a single space (not arbitrary whitespace) and keep only the
    # pieces that still contain something after trimming.
    pieces = []
    for chunk in cleaned.split(" "):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces


def get_data_from_xpath(qwer, xpath):
    """Evaluate *xpath* on *qwer* and return the result, or ``'N'`` on failure.

    *qwer* is any object exposing an ``xpath(expr)`` method. Any exception
    raised while evaluating (including *qwer* lacking the method) is
    swallowed and the sentinel string ``'N'`` is returned instead.
    """
    try:
        result = qwer.xpath(xpath)
    except Exception:
        # Best-effort lookup: callers treat 'N' as "no data".
        result = 'N'
    return result


# Main crawl: for every pending shop, fetch its bad-review pages, store each
# review in dzdp_guangzhou_bad_pl, and checkpoint progress in dzdp_guangzhou
# so an interrupted run can resume.
with pymysql.connect(host=HOST, user=USER, password=PASSWORD, database=DATABASE, port=PORT) as conn:
    cur = conn.cursor()
    # Shops with bad_del_flag='100' are still pending; bad_page_jl holds the
    # review page to resume from.
    cur.execute(
        """ select name,xq_url,address,bad_page_jl from dzdp_guangzhou where bad_del_flag='100' order by name,address limit 100""")
    df = cur.fetchall()

    # Query-string parameters are the same for every request (bad reviews only).
    params = {
        'queryType': 'reviewGrade',
        'queryVal': 'bad',
    }

    del_flag = False  # set when the site answers 403: stop the whole run
    for i in df:
        name = i[0]
        xq_url = i[1]
        bad_page_jl = int(i[3])
        # Must be reset for each shop; otherwise, once one shop runs out of
        # reviews, every later shop would stop after its first page.
        del_flag1 = False
        for page in range(bad_page_jl, 6):
            url = str(xq_url) + f'/review_all/p{page}'
            response = requests.get(
                url=url,
                params=params,
                headers=headers,
                # Without a timeout a stalled connection would hang forever.
                timeout=30,
            )

            count += 1
            # Random pause between requests to stay under the rate limit.
            time.sleep(random.uniform(5, 8))
            if response.status_code == 403:
                del_flag = True
                print("网站封了,换个cookie")
                break

            data = etree.HTML(response.text)
            next_xx = get_data_from_xpath(
                data, '//*[@id="review-list"]/div[2]/div[3]/div[2]/div[3]/text()')
            # The page shows this notice when the shop has no reviews. The
            # marker has to be searched for inside the stringified XPath
            # result, not the other way round.
            if '暂无点评' in str(next_xx):
                break

            # A review page lists at most 15 <li> items.
            for index in range(1, 16):
                item = f'//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/ul/li[{index}]/div'
                user_name = get_data_from_xpath(data, f'{item}/div[1]/a/text()')
                user_pf = get_data_from_xpath(data, f'string({item}/div[2]/span[2])')
                user_pl = get_data_from_xpath(data, f'{item}/div[3]/text()')

                # A whitespace-only name cleans down to an empty list; fall
                # back to '' instead of raising IndexError.
                name_parts = remove_newlines(str(user_name))
                user_name = name_parts[0] if name_parts else ''

                # An empty XPath result stringifies to '[]': no more reviews
                # on this page, hence none on later pages either.
                if user_name == '[]':
                    del_flag1 = True
                    break

                user_pf = remove_newlines(str(user_pf))
                user_pl = remove_newlines(str(user_pl))
                print(name, user_name)
                cur.execute(
                    """INSERT INTO dzdp_guangzhou_bad_pl(name, xq_url, user_name, user_pf, user_pl)
                       VALUES (%s, %s, %s, %s, %s)""",
                    (str(name), str(xq_url), str(user_name), str(user_pf), str(user_pl))
                )

            # Checkpoint the page just processed together with its rows.
            cur.execute(
                "UPDATE dzdp_guangzhou SET bad_page_jl =%s WHERE xq_url = %s  and bad_del_flag='100'",
                (page, xq_url))
            conn.commit()
            if del_flag1:
                break

        # When blocked, stop before marking the shop as finished so that it
        # is picked up again on the next run.
        if del_flag:
            break

        cur.execute(
            "UPDATE dzdp_guangzhou SET bad_del_flag ='0' WHERE xq_url = %s  and bad_del_flag='100'",
            (xq_url,))
        conn.commit()
print(f'一共请求{count}次')

  • 4
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值