# 不多废话,直接放代码。注意:代码里的 Cookie 已经失效,记得换成自己的。本篇承接上一篇的内容。
import time
import random
import pymysql
import requests
from lxml import etree
# HTTP request headers passed to every requests.get() call made by this script.
# The values look like they were copied from a desktop Edge/Chromium browser
# session (see the User-Agent and sec-ch-ua entries).
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
# Session cookie. The script prints a "change the cookie" message when the
# site answers 403, so this value has to be replaced with a valid one.
# NOTE(review): a session cookie is hard-coded in source — confirm it is not a live credential.
'Cookie': 'fspop=test; _lxsdk_cuid=18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc; _lxsdk=18a96dd69fcc8-033456bc9cd367-26031d51-144000-18a96dd69fcbc; _hc.v=cf49de5b-27f5-2e5a-0129-9c6d2ae50ecb.1716894401; WEBDFPID=xy163uvy5w3y5x56y961u04734552z9681uyvv816yz97958945018y3-2032254401568-1716894401568CKMMEICfd79fef3d01d5e9aadc18ccd4d0c95072361; s_ViewType=10; ctu=a63e4ba3a5b45a2f87beabdf4e08942769d28c83775407376a91791c8159379a; cy=4; cye=guangzhou; qruuid=61935c32-61b4-4392-ab5a-ad7060244b8e; dper=0202112487a168ea9715415bcefd640e26e75457d1a736c8913d220fbf19a41eddf43b76f9b9fd7153b84efc2952def77151c6cbc3ebe467ae9e0000000051200000cd114ba8ebebedff9c40521b05ea711b0fe287e9c9d97215629422280b05244c7d2ead9ec6c6025ca776573c88d3c8a0; _lxsdk_s=18fc81ab57c-850-7a3-92b%7C%7C20; ll=7fd06e815b796be3df069dec7836c3df',
'Pragma': 'no-cache',
'Referer': 'https://www.dianping.com/shop/H9t6wplYc54itiuo',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
# Define constants: connection settings handed to pymysql.connect() below.
# NOTE(review): database credentials are hard-coded in source — consider
# loading them from the environment or a config file instead.
HOST = 'localhost'
USER = 'root'
PASSWORD = 'admin147'
DATABASE = 'demo'
PORT = 3306
# Running total of HTTP requests issued; incremented after each requests.get()
# and printed once the crawl finishes.
count = 0
def remove_newlines(text):
    r"""Clean a stringified XPath result and split it into tokens.

    The following literal substrings are deleted, in this order: ``['\n``,
    ``\n']``, ``']``, ``\n`` and ``\t``.  Here ``\n`` and ``\t`` are the
    two-character escape sequences (a backslash followed by a letter) as they
    appear in the ``str()`` of a list, not real newline or tab characters.

    The remaining text is split on single spaces; each piece is stripped of
    surrounding whitespace and empty pieces are dropped.

    Args:
        text: The string to clean.

    Returns:
        A list of the non-empty, stripped tokens (possibly empty).
    """
    unwanted = ("['\\n", "\\n']", "']", "\\n", "\\t")
    cleaned = text
    for marker in unwanted:
        cleaned = cleaned.replace(marker, "")

    tokens = []
    for piece in cleaned.split(" "):
        trimmed = piece.strip()
        if trimmed:
            tokens.append(trimmed)
    return tokens
def get_data_from_xpath(qwer, xpath):
    """Evaluate an XPath expression on a node, never raising.

    Args:
        qwer: Any object exposing an ``xpath(expression)`` method.
        xpath: The XPath expression to evaluate.

    Returns:
        Whatever ``qwer.xpath(xpath)`` returns, or the placeholder string
        ``'N'`` if that call raises any ``Exception`` (including the case
        where ``qwer`` has no ``xpath`` attribute at all).
    """
    try:
        matches = qwer.xpath(xpath)
    except Exception:
        # Deliberate best-effort lookup: any failure collapses to a placeholder.
        return 'N'
    else:
        return matches
# Seconds to wait for the site before giving up on a request; without a
# timeout a stalled connection would block the whole crawl indefinitely.
_REQUEST_TIMEOUT = 30


def _store_page_reviews(cur, data, name, xq_url):
    """Insert the reviews found on one parsed review page.

    Up to 15 review slots (``li[1]`` .. ``li[15]``) are read.  For each slot
    the user name, rating text and comment text are extracted, cleaned with
    ``remove_newlines`` and inserted into ``dzdp_guangzhou_middle_pl``.
    Nothing is committed here; the caller commits.

    Args:
        cur: Open database cursor used for the INSERT statements.
        data: Parsed page, as returned by ``etree.HTML``.
        name: Shop name stored with every review row.
        xq_url: Shop detail URL stored with every review row.

    Returns:
        True if all 15 slots held a review, False as soon as a slot has no
        user name (the page has run out of reviews).
    """
    for index in range(1, 16):
        item = f'//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/ul/li[{index}]'
        user_name = get_data_from_xpath(data, f'{item}/div/div[1]/a/text()')
        user_name = remove_newlines(str(user_name))[0]
        if user_name == '[]':
            # An empty XPath result stringifies to "[]": no review in this slot.
            return False
        user_pf = get_data_from_xpath(data, f'string({item}/div/div[2]/span[2])')
        user_pl = get_data_from_xpath(data, f'{item}/div/div[3]/text()')
        user_pf = remove_newlines(str(user_pf))
        user_pl = remove_newlines(str(user_pl))
        print(name, user_name)
        cur.execute(
            "INSERT INTO dzdp_guangzhou_middle_pl(name, xq_url, user_name, user_pf, user_pl)\n"
            "VALUES (%s, %s, %s, %s, %s)",
            (str(name), str(xq_url), str(user_name), str(user_pf), str(user_pl))
        )
    return True


# Crawl the "good" reviews of up to 100 pending shops (middle_del_flag='100'),
# resuming each shop from its stored page number and stopping at page 10.
with pymysql.connect(host=HOST, user=USER, password=PASSWORD, database=DATABASE, port=PORT) as conn, \
        conn.cursor() as cur:
    cur.execute(
        """ select name,xq_url,address,middle_page_jl from dzdp_guangzhou where middle_del_flag='100' order by name,address limit 100""")
    df = cur.fetchall()
    # Query string sent with every request; it does not change between pages.
    params = {
        'queryType': 'reviewGrade',
        'queryVal': 'good',
    }
    del_flag = False  # becomes True once the site answers 403 (blocked)
    for i in df:
        name = i[0]
        xq_url = i[1]
        middle_page_jl = int(i[3])
        # Tracks "last page reached" for the current shop only; it must start
        # fresh for every shop or one short page would cut all later shops
        # down to a single page.
        del_flag1 = False
        for page in range(middle_page_jl, 11):
            url = str(xq_url) + f'/review_all/p{page}'
            response = requests.get(
                url=url,
                params=params,
                headers=headers,
                timeout=_REQUEST_TIMEOUT,
            )
            count += 1
            # Random pause between requests to keep the request rate low.
            time.sleep(random.uniform(5, 8))
            if response.status_code == 403:
                del_flag = True
                print("网站封了,换个cookie")
                break
            data = etree.HTML(response.text)
            next_xx = get_data_from_xpath(
                data, '//*[@id="review-list"]/div[2]/div[3]/div[2]/div[3]/text()')
            # The XPath result is a list, so look for the "no reviews yet"
            # notice inside its string form (not the other way round).
            if '暂无点评' in str(next_xx):
                break
            del_flag1 = not _store_page_reviews(cur, data, name, xq_url)
            # Record the page just processed so an interrupted run can resume.
            # NOTE(review): a resumed run starts at this same page again, which
            # looks like it would insert that page's reviews twice — confirm.
            cur.execute(
                "UPDATE dzdp_guangzhou SET middle_page_jl =%s WHERE xq_url = %s and middle_del_flag='100'",
                (page, xq_url))
            conn.commit()
            if del_flag1:
                break
        if del_flag:
            # Blocked mid-shop: leave the shop pending so it is retried later
            # instead of marking it finished with pages still missing.
            break
        cur.execute(
            "UPDATE dzdp_guangzhou SET middle_del_flag ='0' WHERE xq_url = %s and middle_del_flag='100'",
            (xq_url,))
        conn.commit()
print(f'一共请求{count}次')