1、driver配置设置
采用无头模式,并模拟真实用户的浏览器行为,以绕过闲鱼网站的反爬机制。通过 goog:loggingPrefs 能力(capability)开启 performance 日志,以便借助 CDP 获取点击操作后产生的请求与响应。最后创建 driver 实例:
@pytest.fixture(scope="class")
def driver():
    """Yield a headless Chrome WebDriver opened on the Goofish home page.

    The browser is configured with a desktop user agent, automation switches
    removed, and the ``performance`` log enabled so that network events can be
    read back later with ``get_log('performance')``.

    Yields:
        The ready-to-use WebDriver instance. It is always quit afterwards,
        even if loading the home page fails.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    # NOTE(review): presumably value 2 disables image loading - confirm the pref key.
    chrome_options.add_experimental_option("prefs", {"profile.default_content_settings": {"images": 2}})
    # Experimental options are stored one value per option name, so both
    # switches go into a single call; two separate calls keep only the last.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)
    for argument in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ):
        chrome_options.add_argument(argument)
    # Record every performance (network) event so responses can be captured.
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get('https://www.goofish.com/')
        yield browser
    finally:
        # Runs on normal teardown and also when get() raises before the yield.
        browser.quit()
2、自动化主体,引用实例,根据用户输入的商品名称,在页面搜索,将页面操作产生的请求响应全部捕捉,并提取精确url目标的信息,将获取到的信息处理后提取对应的商品信息,首次运行会直接邮件通知,后面每次循环只有当商品数量比上次多时才会邮件通知。
@pytest.mark.usefixtures("driver")
class Testxianyu1:
    """Search Goofish for a product and e-mail the listings not reported before."""

    # pytest does not collect test classes that define __init__, so the
    # notifier is a class attribute instead of being created in a constructor.
    # NOTE(review): history only carries over between runs while this module
    # stays imported in the same process - confirm for your runner.
    email_notifier = EmailNotifier()

    @pytest.mark.parametrize("name", [("努比亚阿尔法手表")])
    def test_xianyushangping(self, driver, name):
        """Search for ``name``, capture the search API response and notify by e-mail.

        Args:
            driver: WebDriver supplied by the ``driver`` fixture.
            name: Product name typed into the search box.
        """
        print(f"开始搜索 {name}...")
        search_input = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-input--WY2l9QD3"))
        )
        search_input.click()
        search_input.send_keys(name + Keys.ENTER)

        checkbox = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR,
                 ".search-checkbox-item-container--DsTIZUle:nth-child(2) > .search-checkbox--fULWOSyM"))
        )
        checkbox.click()

        # NOTE(review): the log is read right after the click; if the search
        # response has not arrived yet nothing is captured - confirm whether a
        # wait is needed here.
        response_bodies = self.capture_all_network_responses(
            driver, target_url_partial="mtop.taobao.idlemtopsearch.pc.search")
        if not response_bodies:
            return
        relevant_response = self.extract_relevant_response(response_bodies)
        if not relevant_response:
            return

        processed_items = self.process_data(relevant_response)
        filtered_items = self.filter_items(processed_items)

        # Notify on the first run or whenever unseen listings appear. Comparing
        # list lengths would miss new listings once the history has grown.
        already_sent = self.email_notifier.previous_filtered_items
        new_items = [item for item in filtered_items if item not in already_sent]
        if new_items:
            self.email_notifier.send_email(new_items)
            print("发送邮件成功。")
3、capture_all_network_responses捕获全部的响应信息,如果捕获异常会输出对应提示,并对信息进行深度处理,extract_relevant_response方法根据url精确匹配对应响应数据,并使用我们需要的参数进行辅助识别,process_data方法,提取我们需要的信息字段,并拼接、排序、储存
def capture_all_network_responses(self, driver, target_url_partial):
    """Collect bodies of JSON responses whose URL contains ``target_url_partial``.

    Args:
        driver: WebDriver with the ``performance`` log enabled.
        target_url_partial: Substring that the response URL must contain.

    Returns:
        A list of response body strings, in log order. Responses whose body
        cannot be fetched are skipped.
    """
    import base64

    response_bodies = []
    print("开始捕获网络响应...")  # debug output
    for entry in driver.get_log('performance'):
        event = json.loads(entry['message'])['message']
        if event.get('method') != 'Network.responseReceived':
            continue
        params = event.get('params', {})
        response_details = params.get('response', {})
        if target_url_partial not in response_details.get('url', ''):
            continue
        if 'application/json' not in response_details.get('mimeType', ''):
            continue
        request_id = params.get('requestId')
        if request_id is None:
            continue

        try:
            response = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except Exception as e:
            # Best effort: a body that is no longer available is skipped. The
            # common "resource not found" error is not printed to keep the
            # output short.
            if 'No resource with given identifier found' not in str(e):
                print(f"异常捕获: {str(e)}")
            continue

        response_body = response.get('body', '')
        if response_body and response.get('base64Encoded'):
            # The protocol may deliver the body base64-encoded; decode to text.
            response_body = base64.b64decode(response_body).decode('utf-8', errors='replace')
        if response_body:
            response_bodies.append(response_body)

    print(f"捕获到 {len(response_bodies)} 个响应")  # debug output
    return response_bodies
def extract_relevant_response(self, response_bodies):
    """Return the first decoded response that contains ``data.resultList``.

    Args:
        response_bodies: Iterable of raw JSON strings.

    Returns:
        The decoded JSON object (a dict) of the first matching body, or
        ``None`` when no body matches. Bodies that are not valid JSON, or
        whose top level or ``data`` member is not an object, are skipped.
    """
    for response_body in response_bodies:
        try:
            parsed = json.loads(response_body)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
            continue
        if not isinstance(parsed, dict):
            continue
        payload = parsed.get('data')
        if isinstance(payload, dict) and 'resultList' in payload:
            return parsed
    return None
def process_data(self, data):
    """Turn a search response into a price-sorted list of simplified item dicts.

    Args:
        data: Decoded response; listings are read from
            ``data['data']['resultList']``, each under its own ``'data'`` key.

    Returns:
        A list of dicts with the keys ``description``, ``address``,
        ``seller``, ``price``, ``price_value``, ``images``, ``condition``,
        ``seller_credit`` and ``verification_method``, sorted by
        ``price_value`` ascending. Returns ``[]`` when the response has no
        ``data.resultList``. Malformed listings are skipped instead of
        aborting the whole run.
    """
    payload = data.get('data') if isinstance(data, dict) else None
    if not isinstance(payload, dict) or 'resultList' not in payload:
        print("响应格式不符合预期:", data)
        return []

    def as_dict(value):
        # Treat anything that is not a dict (None, list, ...) as "no data".
        return value if isinstance(value, dict) else {}

    def parse_labels(raw):
        # pvLabels is decoded from a JSON string; an already-decoded list is
        # accepted as well, anything else yields no labels.
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except ValueError:
                return []
        return raw if isinstance(raw, list) else []

    def find_condition(item_data):
        # Item condition: first usable value of the label keyed '成色'.
        for label in parse_labels(item_data.get('pvLabels', '[]')):
            label = as_dict(label)
            if label.get('propertyKeyName') != '成色':
                continue
            values = label.get('propertyValuesList')
            if isinstance(values, list) and values and 'value' in as_dict(values[0]):
                return values[0]['value']
        return '无'

    def has_tag(item_data, slot, text):
        # True when fishTags[slot].tagList holds a tag whose content equals text.
        tags = as_dict(as_dict(item_data.get('fishTags')).get(slot)).get('tagList')
        if not isinstance(tags, list):
            return False
        return any(as_dict(as_dict(tag).get('data')).get('content') == text for tag in tags)

    results = payload['resultList']
    processed_items = []
    for item in results if isinstance(results, list) else []:
        item_data = as_dict(item).get('data')
        if not isinstance(item_data, dict):
            continue

        sold_price = item_data.get('soldPrice', '')
        # A missing or non-numeric price sorts to the end of the list.
        try:
            price_value = float(sold_price) if sold_price else float('inf')
        except (TypeError, ValueError):
            price_value = float('inf')

        title = item_data.get('title', '')
        description = item_data.get('description', '')
        province = item_data.get('province', '')
        city = item_data.get('city', '')
        area = item_data.get('area', '')

        processed_items.append({
            "description": f"{title} {description}",
            "address": f"{province} {city} {area}",
            "seller": item_data.get('userNick', ''),
            "price": sold_price,
            "price_value": price_value,
            "images": item_data.get('imageUrls') or [],
            "condition": find_condition(item_data),
            "seller_credit": '卖家信用极好' if has_tag(item_data, 'r4', '卖家信用极好') else '无',
            "verification_method": '验货宝' if has_tag(item_data, 'r1', '验货宝') else '无',
        })

    # Cheapest first.
    return sorted(processed_items, key=lambda entry: entry['price_value'])
4、filter_items方法:根据上面得到的商品信息,按成色、价格、卖家信用等字段设置判断条件,对商品进行精确过滤(示例代码中仅保留价格不低于 300 的商品)。
def filter_items(self, items, min_price=300):
    """Keep only the items whose price is a number of at least ``min_price``.

    Args:
        items: Iterable of item dicts carrying a ``"price"`` entry.
        min_price: Lowest price to keep (inclusive). Defaults to 300.

    Returns:
        A new list with the matching items in their original order. Items
        whose price is missing, empty or not numeric are dropped instead of
        raising.
    """
    kept = []
    for item in items:
        try:
            price = float(item["price"])
        except (KeyError, TypeError, ValueError):
            continue
        if price >= min_price:
            kept.append(item)
    return kept
5、创建EmailNotifier类来对邮件中展示信息格式样式简单处理,当用户首次运行时通知邮件中的信息字体颜色均为黑色,后面每次循环发现的新商品后,邮件中新商品信息的字体颜色会变红,用来和原有的商品进行区分,方便查阅。
# E-mail settings: sender account, SMTP host and recipient list.
email_config = dict(
    user="xxxxxx@qq.com",
    password="xxxxxx",
    host="smtp.qq.com",
    to=["xxxxxxxx@qq.com", "xxxxxxxx@qq.com"],
)
# Subject line used for every notification mail.
EMAIL_SUBJECT = '商品更新通知'
class EmailNotifier:
    """Send product-update e-mails and remember which items were already reported."""

    def __init__(self):
        # Items included in earlier mails; they are listed again in black next time.
        self.previous_filtered_items = []

    def send_email(self, new_items):
        """Mail the product list and record ``new_items`` as reported.

        On the first mail every item is black. Later mails list the
        previously reported items in black followed by the new ones in red.

        Args:
            new_items: Item dicts not reported before. Nothing is sent when
                this is empty.
        """
        if not new_items:
            return

        contents = ["<h3>商品信息如下:</h3>"]
        if self.previous_filtered_items:
            contents.extend(self.format_item(item, color='black') for item in self.previous_filtered_items)
            contents.extend(self.format_item(item, color='red') for item in new_items)
        else:
            contents.extend(self.format_item(item, color='black') for item in new_items)

        # The context manager closes the SMTP connection even if send() fails.
        with yagmail.SMTP(email_config['user'], email_config['password'], email_config['host']) as yag:
            yag.send(email_config['to'], EMAIL_SUBJECT, contents)
        # Only remember the items once the mail has actually gone out.
        self.previous_filtered_items.extend(new_items)

    def format_item(self, item, color):
        """Render one item as an HTML paragraph followed by a separator line.

        Args:
            item: Dict with the keys ``description``, ``price``, ``condition``,
                ``seller_credit``, ``verification_method``, ``seller``,
                ``address`` and ``images``.
            color: CSS colour used for the paragraph text.

        Returns:
            The HTML fragment as a string.
        """
        import html

        def esc(value):
            # Item text comes from scraped pages, so escape it before
            # embedding it in HTML.
            return html.escape(str(value))

        images = ', '.join(esc(url) for url in item['images'])
        return (
            f"<p style='color:{color};'>描述: {esc(item['description'])}<br>价格: {esc(item['price'])}<br>"
            f"成色: {esc(item['condition'])}<br>商家信用: {esc(item['seller_credit'])}<br>"
            f"商品验证: {esc(item['verification_method'])}<br>商家名称: {esc(item['seller'])}<br>"
            f"商家地址: {esc(item['address'])}<br>图片链接: {images}</p>"
            + "<p>" + "-" * 70 + "</p>"  # separator made of 70 dashes
        )
6、运行时间设置:脚本无限循环运行,每天 00:00~24:00(可自定义)每隔一段时间执行一次,间隔时间建议在 10~30 分钟之间随机取值(可自定义)。注意:下方示例代码默认的间隔为 1~2 分钟(便于调试),正式使用时建议调整为 10~30 分钟。
def run_test(target="test_xianyu8.py::Testxianyu1"):
    """Run the scraper test through pytest and print the outcome.

    Args:
        target: pytest node id to run. Defaults to the test class of this
            project.
    """
    print("开始测试运行...")
    try:
        exit_code = pytest.main(["-q", target])
    except Exception as e:
        print(f"测试运行时遇到错误: {e}")
        return
    # pytest.main reports failing tests through its exit code, not by
    # raising, so success must be checked explicitly.
    if exit_code == 0:
        print("测试运行成功")
    else:
        print(f"测试运行失败,退出码: {exit_code}")
def schedule_next_run(min_minutes=1, max_minutes=2):
    """Schedule the next scraper run after a random delay.

    Args:
        min_minutes: Shortest delay in minutes (inclusive).
        max_minutes: Longest delay in minutes (inclusive).

    Returns:
        The chosen delay in minutes.
    """
    next_run_in_minutes = random.randint(min_minutes, max_minutes)
    print(f"下次运行将在 {next_run_in_minutes} 分钟后")
    # Jobs created with every(...) repeat, and every run schedules a new one.
    # Drop the earlier job first so the number of pending jobs does not grow.
    schedule.clear("xianyu-poll")
    schedule.every(next_run_in_minutes).minutes.do(run_and_reschedule).tag("xianyu-poll")
    return next_run_in_minutes
def run_and_reschedule():
    """Run the test once, then queue the next run and print when it is due."""
    started_at = datetime.now()
    print(f"准备在 {started_at:%Y-%m-%d %H:%M:%S} 运行测试...")
    run_test()
    delay_minutes = schedule_next_run()
    # The estimate is taken after the run has finished.
    upcoming = datetime.now() + timedelta(minutes=delay_minutes)
    print(f"测试运行完成,下次运行时间: {upcoming:%Y-%m-%d %H:%M:%S}")
# Hours of the day (24h clock, as returned by datetime.now().hour) between
# which pending jobs may run. 0 and 24 cover the whole day.
RUN_START_HOUR = 0
RUN_END_HOUR = 24


def within_run_window(hour, start_hour=RUN_START_HOUR, end_hour=RUN_END_HOUR):
    """Return True when ``hour`` lies in ``[start_hour, end_hour)``.

    A window whose start is later than its end (for example 22 to 6) is
    treated as wrapping around midnight.
    """
    if start_hour <= end_hour:
        return start_hour <= hour < end_hour
    return hour >= start_hour or hour < end_hour


def main():
    """Run once immediately, then keep executing scheduled runs forever."""
    print("首次运行...")
    run_and_reschedule()
    while True:
        if within_run_window(datetime.now().hour):
            schedule.run_pending()
            time.sleep(1)
        else:
            print(f"当前时间 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 不在运行时间范围内。")
            # Check again in a minute instead of printing every second.
            time.sleep(60)


# NOTE(review): run_test points pytest at test_xianyu8.py; if that is this
# file, pytest imports it to collect the tests, and without this guard the
# import itself would start the loop again.
if __name__ == "__main__":
    main()
7、使用效果
8、完整代码如下,感谢阅读。
import schedule
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pytest
import yagmail
import json
import random
from datetime import datetime, timedelta
import time
# E-mail settings
email_config = dict(
    user="xxxxxxx@qq.com",  # sender mailbox address
    password="xxxxxxxx",  # mailbox authorization code
    host="smtp.qq.com",
    to=["xxxxxx@qq.com"],  # recipient mailbox addresses
)
# Subject line used for every notification mail.
EMAIL_SUBJECT = '商品更新通知'
class EmailNotifier:
    """Send product-update e-mails and remember which items were already reported."""

    def __init__(self):
        # Items included in earlier mails; they are listed again in black next time.
        self.previous_filtered_items = []

    def send_email(self, new_items):
        """Mail the product list and record ``new_items`` as reported.

        On the first mail every item is black. Later mails list the
        previously reported items in black followed by the new ones in red.

        Args:
            new_items: Item dicts not reported before. Nothing is sent when
                this is empty.
        """
        if not new_items:
            return

        contents = ["<h3>商品信息如下:</h3>"]
        if self.previous_filtered_items:
            contents.extend(self.format_item(item, color='black') for item in self.previous_filtered_items)
            contents.extend(self.format_item(item, color='red') for item in new_items)
        else:
            contents.extend(self.format_item(item, color='black') for item in new_items)

        # The context manager closes the SMTP connection even if send() fails.
        with yagmail.SMTP(email_config['user'], email_config['password'], email_config['host']) as yag:
            yag.send(email_config['to'], EMAIL_SUBJECT, contents)
        # Only remember the items once the mail has actually gone out.
        self.previous_filtered_items.extend(new_items)

    def format_item(self, item, color):
        """Render one item as an HTML paragraph followed by a separator line.

        Args:
            item: Dict with the keys ``description``, ``price``, ``condition``,
                ``seller_credit``, ``verification_method``, ``seller``,
                ``address`` and ``images``.
            color: CSS colour used for the paragraph text.

        Returns:
            The HTML fragment as a string.
        """
        import html

        def esc(value):
            # Item text comes from scraped pages, so escape it before
            # embedding it in HTML.
            return html.escape(str(value))

        images = ', '.join(esc(url) for url in item['images'])
        return (
            f"<p style='color:{color};'>描述: {esc(item['description'])}<br>价格: {esc(item['price'])}<br>"
            f"成色: {esc(item['condition'])}<br>商家信用: {esc(item['seller_credit'])}<br>"
            f"商品验证: {esc(item['verification_method'])}<br>商家名称: {esc(item['seller'])}<br>"
            f"商家地址: {esc(item['address'])}<br>图片链接: {images}</p>"
            # 70 dashes, matching the separator comment (the code used 80).
            + "<p>" + "-" * 70 + "</p>"
        )
@pytest.fixture(scope="class")
def driver():
    """Yield a headless Chrome WebDriver opened on the Goofish home page.

    The browser is configured with a desktop user agent, automation switches
    removed, and the ``performance`` log enabled so that network events can be
    read back later with ``get_log('performance')``.

    Yields:
        The ready-to-use WebDriver instance. It is always quit afterwards,
        even if loading the home page fails.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    # NOTE(review): presumably value 2 disables image loading - confirm the pref key.
    chrome_options.add_experimental_option("prefs", {"profile.default_content_settings": {"images": 2}})
    # Experimental options are stored one value per option name, so both
    # switches go into a single call; two separate calls keep only the last.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)
    for argument in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ):
        chrome_options.add_argument(argument)
    # Record every performance (network) event so responses can be captured.
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get('https://www.goofish.com/')
        yield browser
    finally:
        # Runs on normal teardown and also when get() raises before the yield.
        browser.quit()
@pytest.mark.usefixtures("driver")
class Testxianyu1:
    """Search Goofish for a product and e-mail the listings not reported before."""

    # pytest does not collect test classes that define __init__, so the
    # notifier is a class attribute instead of being created in a constructor.
    # NOTE(review): history only carries over between runs while this module
    # stays imported in the same process - confirm for your runner.
    email_notifier = EmailNotifier()

    @pytest.mark.parametrize("name", [("努比亚阿尔法手表")])  # product name to search for
    def test_xianyushangping(self, driver, name):
        """Search for ``name``, capture the search API response and notify by e-mail.

        Args:
            driver: WebDriver supplied by the ``driver`` fixture.
            name: Product name typed into the search box.
        """
        print(f"开始搜索 {name}...")
        search_input = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-input--WY2l9QD3"))
        )
        search_input.click()
        search_input.send_keys(name + Keys.ENTER)

        checkbox = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR,
                 ".search-checkbox-item-container--DsTIZUle:nth-child(2) > .search-checkbox--fULWOSyM"))
        )
        checkbox.click()

        # NOTE(review): the log is read right after the click; if the search
        # response has not arrived yet nothing is captured - confirm whether a
        # wait is needed here.
        response_bodies = self.capture_all_network_responses(
            driver, target_url_partial="mtop.taobao.idlemtopsearch.pc.search")
        if not response_bodies:
            return
        relevant_response = self.extract_relevant_response(response_bodies)
        if not relevant_response:
            return

        processed_items = self.process_data(relevant_response)
        filtered_items = self.filter_items(processed_items)

        # Notify on the first run or whenever unseen listings appear. Comparing
        # list lengths would miss new listings once the history has grown.
        already_sent = self.email_notifier.previous_filtered_items
        new_items = [item for item in filtered_items if item not in already_sent]
        if new_items:
            self.email_notifier.send_email(new_items)
            print("发送邮件成功。")

    def capture_all_network_responses(self, driver, target_url_partial):
        """Collect bodies of JSON responses whose URL contains ``target_url_partial``.

        Args:
            driver: WebDriver with the ``performance`` log enabled.
            target_url_partial: Substring that the response URL must contain.

        Returns:
            A list of response body strings, in log order. Responses whose
            body cannot be fetched are skipped.
        """
        import base64

        response_bodies = []
        print("开始捕获网络响应...")  # debug output
        for entry in driver.get_log('performance'):
            event = json.loads(entry['message'])['message']
            if event.get('method') != 'Network.responseReceived':
                continue
            params = event.get('params', {})
            response_details = params.get('response', {})
            if target_url_partial not in response_details.get('url', ''):
                continue
            if 'application/json' not in response_details.get('mimeType', ''):
                continue
            request_id = params.get('requestId')
            if request_id is None:
                continue

            try:
                response = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
            except Exception as e:
                # Best effort: a body that is no longer available is skipped.
                # The common "resource not found" error is not printed to
                # keep the output short.
                if 'No resource with given identifier found' not in str(e):
                    print(f"异常捕获: {str(e)}")
                continue

            response_body = response.get('body', '')
            if response_body and response.get('base64Encoded'):
                # The protocol may deliver the body base64-encoded; decode to text.
                response_body = base64.b64decode(response_body).decode('utf-8', errors='replace')
            if response_body:
                response_bodies.append(response_body)

        print(f"捕获到 {len(response_bodies)} 个响应")  # debug output
        return response_bodies

    def extract_relevant_response(self, response_bodies):
        """Return the first decoded response that contains ``data.resultList``.

        Args:
            response_bodies: Iterable of raw JSON strings.

        Returns:
            The decoded JSON object (a dict) of the first matching body, or
            ``None`` when no body matches. Bodies that are not valid JSON, or
            whose top level or ``data`` member is not an object, are skipped.
        """
        for response_body in response_bodies:
            try:
                parsed = json.loads(response_body)
            except (TypeError, ValueError):
                # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
                continue
            if not isinstance(parsed, dict):
                continue
            payload = parsed.get('data')
            if isinstance(payload, dict) and 'resultList' in payload:
                return parsed
        return None

    def process_data(self, data):
        """Turn a search response into a price-sorted list of simplified item dicts.

        Args:
            data: Decoded response; listings are read from
                ``data['data']['resultList']``, each under its own ``'data'`` key.

        Returns:
            A list of dicts with the keys ``description``, ``address``,
            ``seller``, ``price``, ``price_value``, ``images``, ``condition``,
            ``seller_credit`` and ``verification_method``, sorted by
            ``price_value`` ascending. Returns ``[]`` when the response has no
            ``data.resultList``. Malformed listings are skipped instead of
            aborting the whole run.
        """
        payload = data.get('data') if isinstance(data, dict) else None
        if not isinstance(payload, dict) or 'resultList' not in payload:
            print("响应格式不符合预期:", data)
            return []

        def as_dict(value):
            # Treat anything that is not a dict (None, list, ...) as "no data".
            return value if isinstance(value, dict) else {}

        def parse_labels(raw):
            # pvLabels is decoded from a JSON string; an already-decoded list
            # is accepted as well, anything else yields no labels.
            if isinstance(raw, str):
                try:
                    raw = json.loads(raw)
                except ValueError:
                    return []
            return raw if isinstance(raw, list) else []

        def find_condition(item_data):
            # Item condition: first usable value of the label keyed '成色'.
            for label in parse_labels(item_data.get('pvLabels', '[]')):
                label = as_dict(label)
                if label.get('propertyKeyName') != '成色':
                    continue
                values = label.get('propertyValuesList')
                if isinstance(values, list) and values and 'value' in as_dict(values[0]):
                    return values[0]['value']
            return '无'

        def has_tag(item_data, slot, text):
            # True when fishTags[slot].tagList holds a tag whose content equals text.
            tags = as_dict(as_dict(item_data.get('fishTags')).get(slot)).get('tagList')
            if not isinstance(tags, list):
                return False
            return any(as_dict(as_dict(tag).get('data')).get('content') == text for tag in tags)

        results = payload['resultList']
        processed_items = []
        for item in results if isinstance(results, list) else []:
            item_data = as_dict(item).get('data')
            if not isinstance(item_data, dict):
                continue

            sold_price = item_data.get('soldPrice', '')
            # A missing or non-numeric price sorts to the end of the list.
            try:
                price_value = float(sold_price) if sold_price else float('inf')
            except (TypeError, ValueError):
                price_value = float('inf')

            title = item_data.get('title', '')
            description = item_data.get('description', '')
            province = item_data.get('province', '')
            city = item_data.get('city', '')
            area = item_data.get('area', '')

            processed_items.append({
                "description": f"{title} {description}",
                "address": f"{province} {city} {area}",
                "seller": item_data.get('userNick', ''),
                "price": sold_price,
                "price_value": price_value,
                "images": item_data.get('imageUrls') or [],
                "condition": find_condition(item_data),
                "seller_credit": '卖家信用极好' if has_tag(item_data, 'r4', '卖家信用极好') else '无',
                "verification_method": '验货宝' if has_tag(item_data, 'r1', '验货宝') else '无',
            })

        # Sort by price, cheapest first.
        return sorted(processed_items, key=lambda entry: entry['price_value'])

    def filter_items(self, items, min_price=300):
        """Keep only the items whose price is a number of at least ``min_price``.

        Args:
            items: Iterable of item dicts carrying a ``"price"`` entry.
            min_price: Lowest price to keep (inclusive). Defaults to 300.

        Returns:
            A new list with the matching items in their original order. Items
            whose price is missing, empty or not numeric are dropped instead
            of raising.
        """
        kept = []
        for item in items:
            try:
                price = float(item["price"])
            except (KeyError, TypeError, ValueError):
                continue
            if price >= min_price:
                kept.append(item)
        return kept
def run_test(target="test_xianyu8.py::Testxianyu1"):
    """Run the scraper test through pytest and print the outcome.

    Args:
        target: pytest node id to run. Defaults to the test class of this
            project.
    """
    print("开始测试运行...")
    try:
        exit_code = pytest.main(["-q", target])
    except Exception as e:
        print(f"测试运行时遇到错误: {e}")
        return
    # pytest.main reports failing tests through its exit code, not by
    # raising, so success must be checked explicitly.
    if exit_code == 0:
        print("测试运行成功")
    else:
        print(f"测试运行失败,退出码: {exit_code}")
def schedule_next_run(min_minutes=1, max_minutes=2):
    """Schedule the next scraper run after a random delay.

    Args:
        min_minutes: Shortest delay in minutes (inclusive).
        max_minutes: Longest delay in minutes (inclusive).

    Returns:
        The chosen delay in minutes.
    """
    next_run_in_minutes = random.randint(min_minutes, max_minutes)
    print(f"下次运行将在 {next_run_in_minutes} 分钟后")
    # Jobs created with every(...) repeat, and every run schedules a new one.
    # Drop the earlier job first so the number of pending jobs does not grow.
    schedule.clear("xianyu-poll")
    schedule.every(next_run_in_minutes).minutes.do(run_and_reschedule).tag("xianyu-poll")
    return next_run_in_minutes
def run_and_reschedule():
    """Run the test once, then queue the next run and print when it is due."""
    started_at = datetime.now()
    print(f"准备在 {started_at:%Y-%m-%d %H:%M:%S} 运行测试...")
    run_test()
    delay_minutes = schedule_next_run()
    # The estimate is taken after the run has finished.
    upcoming = datetime.now() + timedelta(minutes=delay_minutes)
    print(f"测试运行完成,下次运行时间: {upcoming:%Y-%m-%d %H:%M:%S}")
# Hours of the day (24h clock, as returned by datetime.now().hour) between
# which pending jobs may run. 0 and 24 cover the whole day.
RUN_START_HOUR = 0
RUN_END_HOUR = 24


def within_run_window(hour, start_hour=RUN_START_HOUR, end_hour=RUN_END_HOUR):
    """Return True when ``hour`` lies in ``[start_hour, end_hour)``.

    A window whose start is later than its end (for example 22 to 6) is
    treated as wrapping around midnight.
    """
    if start_hour <= end_hour:
        return start_hour <= hour < end_hour
    return hour >= start_hour or hour < end_hour


def main():
    """Run once immediately, then keep executing scheduled runs forever."""
    print("首次运行...")
    run_and_reschedule()
    while True:
        if within_run_window(datetime.now().hour):
            schedule.run_pending()
            time.sleep(1)
        else:
            print(f"当前时间 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 不在运行时间范围内。")
            # Check again in a minute instead of printing every second.
            time.sleep(60)


# NOTE(review): run_test points pytest at test_xianyu8.py; if that is this
# file, pytest imports it to collect the tests, and without this guard the
# import itself would start the loop again.
if __name__ == "__main__":
    main()