大众点评店铺信息及好评、中评、差评的爬虫程序（2024 年 5 月最新版）：中评数据的采集

最新推荐文章于 2024-08-05 16:50:32 发布

教你打中单

最新推荐文章于 2024-08-05 16:50:32 发布

阅读量144

点赞数 2

文章标签：爬虫

本文链接：https://blog.csdn.net/ymdaa555/article/details/139348756

版权

不多废话，直接放代码。注意：代码里的 cookie 已经失效，运行前记得换成你自己的。本文承接上一篇的内容。

import time
import random
import pymysql
import requests
from lxml import etree


# HTTP request headers sent with every review-page request (passed as
# `headers=` to requests.get in the crawl loop). They mimic a desktop Edge
# browser navigating within www.dianping.com.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # Hard-coded session cookie. The crawl loop prints a "change the cookie"
    # hint when the site answers 403, so replace this value with a cookie from
    # your own logged-in session before running.
    # NOTE(review): this is a credential embedded in source — consider loading
    # it from the environment or a config file instead.
    'Cookie': 'fspop=test; _lxsdk_cuid=18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc; _lxsdk=18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc; _hc.v=cf49de5b-27f5-2e5a-0129-9c6d2ae50ecb.1716894401; WEBDFPID=xy163uvy5w3y5x56y961u04734552z9681uyvv816yz97958945018y3-2032254401568-1716894401568CKMMEICfd79fef3d01d5e9aadc18ccd4d0c95072361; s_ViewType=10; ctu=a63e4ba3a5b45a2f87beabdf4e08942769d28c83775407376a91791c8159379a; cy=4; cye=guangzhou; qruuid=61935c32-61b4-4392-ab5a-ad7060244b8e; dper=0202112487a168ea9715415bcefd640e26e75457d1a736c8913d220fbf19a41eddf43b76f9b9fd7153b84efc2952def77151c6cbc3ebe467ae9e0000000051200000cd114ba8ebebedff9c40521b05ea711b0fe287e9c9d97215629422280b05244c7d2ead9ec6c6025ca776573c88d3c8a0; _lxsdk_s=18fc81ab57c-850-7a3-92b%7C%7C20; ll=7fd06e815b796be3df069dec7836c3df',
    # The Referer is fixed to a single shop page; it is not updated per shop
    # being crawled.
    'Pragma': 'no-cache',
    'Referer': 'https://www.dianping.com/shop/H9t6wplYc54itiuo',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
    'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Constants: MySQL connection settings handed to pymysql.connect by the crawl
# loop further down.
# NOTE(review): the database password is hard-coded in source — consider
# reading it from the environment or a config file.
HOST = 'localhost'
USER = 'root'
PASSWORD = 'admin147'
DATABASE = 'demo'
PORT = 3306
# Not a constant: running total of HTTP requests issued. It is incremented once
# per request in the crawl loop and printed when the script finishes.
count = 0


def remove_newlines(text):
    """Strip list-repr residue from *text* and return its space-separated tokens.

    The input is expected to be the ``str()`` of an XPath result, so escape
    sequences appear as literal backslash pairs. The fragments ``['\\n``,
    ``\\n']``, ``']``, ``\\n`` and ``\\t`` are deleted, in that order. The
    remainder is split on single space characters, each piece is stripped of
    surrounding whitespace, and empty pieces are discarded.

    Returns:
        A list of the non-empty tokens (possibly empty).
    """
    unwanted = ("['\\n", "\\n']", "']", "\\n", "\\t")
    cleaned = text
    for fragment in unwanted:
        cleaned = cleaned.replace(fragment, "")

    # Split on a literal space only (not on arbitrary whitespace), then trim.
    trimmed = (piece.strip() for piece in cleaned.split(" "))
    return [piece for piece in trimmed if piece]


def get_data_from_xpath(qwer, xpath):
    """Evaluate *xpath* on *qwer* and return the result, or ``'N'`` on failure.

    Args:
        qwer: Any object exposing an ``xpath(expr)`` method (the crawl loop
            passes the tree returned by ``etree.HTML``).
        xpath: The XPath expression to evaluate.

    Returns:
        Whatever ``qwer.xpath(xpath)`` returns. If that call raises any
        ``Exception``, the sentinel string ``'N'`` is returned instead.
    """
    try:
        result = qwer.xpath(xpath)
    except Exception:
        # Best-effort lookup: every failure collapses to the 'N' sentinel.
        result = 'N'
    return result


# Crawl loop.
#
# For every pending shop (middle_del_flag='100') in dzdp_guangzhou, request its
# review pages from the saved page number up to page 10, store each review in
# dzdp_guangzhou_middle_pl, and checkpoint the last finished page so that an
# interrupted run can resume. A shop is marked finished (middle_del_flag='0')
# only when its pages were exhausted; it is left pending when the site blocks
# us with HTTP 403.
with pymysql.connect(host=HOST, user=USER, password=PASSWORD, database=DATABASE, port=PORT) as conn:
    with conn.cursor() as cur:
        cur.execute(
            "select name,xq_url,address,middle_page_jl from dzdp_guangzhou "
            "where middle_del_flag='100' order by name,address limit 100")
        shops = cur.fetchall()

        # Query string selecting the review grade. The target table and the
        # progress columns are all "middle", so the medium-grade filter is
        # requested here (the original sent 'good').
        params = {
            'queryType': 'reviewGrade',
            'queryVal': 'middle',
        }
        # Common XPath prefix of the review-list container on a review page.
        review_list = '//*[@id="review-list"]/div[2]/div[3]'
        blocked = False

        for name, xq_url, _address, saved_page in shops:
            # A NULL checkpoint would crash int(); start from page 1 instead.
            first_page = int(saved_page) if saved_page is not None else 1

            for page in range(first_page, 11):
                # The timeout keeps a stalled connection from hanging the run.
                # A network error propagates; pages committed so far are kept
                # and the shop stays pending for the next run.
                response = requests.get(
                    url=f'{xq_url}/review_all/p{page}',
                    params=params,
                    headers=headers,
                    timeout=30,
                )
                count += 1
                # Random pause between requests to avoid hammering the site.
                time.sleep(random.uniform(5, 8))

                if response.status_code == 403:
                    blocked = True
                    print("网站封了,换个cookie")
                    break

                page_tree = etree.HTML(response.text)
                empty_notice = get_data_from_xpath(
                    page_tree, f'{review_list}/div[2]/div[3]/text()')
                # The notice text must be searched for inside the XPath result
                # (the original tested the containment the other way round, so
                # it could never match).
                if '暂无点评' in str(empty_notice):
                    break

                # Reset for every page: a stale flag from a previous shop must
                # not cut the current shop short.
                page_exhausted = False
                for index in range(1, 16):
                    item = f'{review_list}/div[3]/div[3]/ul/li[{index}]'
                    raw_name = get_data_from_xpath(page_tree, f'{item}/div/div[1]/a/text()')
                    name_tokens = remove_newlines(str(raw_name))
                    # Only the first token is kept as the user name; an empty
                    # token list (whitespace-only node) becomes '' rather than
                    # raising IndexError.
                    user_name = name_tokens[0] if name_tokens else ''

                    # str([]) == '[]' means there is no li[index]: the page has
                    # fewer than 15 reviews, so this is the shop's last page.
                    if user_name == '[]':
                        page_exhausted = True
                        break

                    user_pf = remove_newlines(str(get_data_from_xpath(
                        page_tree, f'string({item}/div/div[2]/span[2])')))
                    user_pl = remove_newlines(str(get_data_from_xpath(
                        page_tree, f'{item}/div/div[3]/text()')))
                    print(name, user_name)
                    cur.execute(
                        "INSERT INTO dzdp_guangzhou_middle_pl(name, xq_url, user_name, user_pf, user_pl) "
                        "VALUES (%s, %s, %s, %s, %s)",
                        (str(name), str(xq_url), str(user_name), str(user_pf), str(user_pl)),
                    )

                # Checkpoint the page just processed and persist its reviews.
                cur.execute(
                    "UPDATE dzdp_guangzhou SET middle_page_jl =%s WHERE xq_url = %s  and middle_del_flag='100'",
                    (page, xq_url))
                conn.commit()
                if page_exhausted:
                    break

            if blocked:
                # Leave the shop pending so the next run (with a fresh cookie)
                # resumes from its checkpoint instead of skipping it.
                break

            cur.execute(
                "UPDATE dzdp_guangzhou SET middle_del_flag ='0' WHERE xq_url = %s  and middle_del_flag='100'",
                (xq_url,))
            conn.commit()
print(f'一共请求{count}次')