Environment:
Check your Chrome version:
Version 96.0.4664.45 (Official Build) (64-bit)
Pick the chromedriver release that matches your Chrome version (whatever version Chrome reports, download that chromedriver):
Download link: CNPM Binaries Mirror
My machine runs Windows, so I grab the win32 build (pick the 32-bit build whether your OS is 64-bit or 32-bit).
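If chromedriver is not on your PATH, you can point Selenium at the downloaded binary explicitly and confirm the versions line up. A minimal sketch in the Selenium 4 style; the chromedriver.exe path below is a hypothetical placeholder for wherever you unzipped the download:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Hypothetical path -- replace with the location of your unzipped chromedriver.exe
service = Service(r"C:\tools\chromedriver_win32\chromedriver.exe")
driver = webdriver.Chrome(service=service)
print(driver.capabilities["browserVersion"])  # should report 96.0.4664.45
driver.quit()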
Pick the target site:
url: https://you.ctrip.com/sight/antu643/136032.html?scene=online
Locate the elements to scrape:
Verify the XPath:
Locate all comments on the current page:
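Before hard-coding an XPath, it is worth confirming how many nodes it actually matches: you can run $x('...') in the Chrome DevTools console, or do the same check through Selenium. A quick sketch, under the assumption that the comment-list XPath used later in this post is still valid:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://you.ctrip.com/sight/antu643/136032.html?scene=online')
time.sleep(10)  # crude wait for the comment module to render

# Count the nodes matched by the comment XPath before relying on it
comment_nodes = driver.find_elements(By.XPATH, '//*[@id="commentModule"]/div[5]/div')
print(f"XPath matched {len(comment_nodes)} comment blocks")
driver.quit()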
Now, straight to the code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import json
import os
import random
# # Build the options object => pick the options class that matches your browser
# option = webdriver.ChromeOptions()
# option.add_experimental_option("detach", True)
#
# # Build the driver object and pass in the configured options
# driver = webdriver.Chrome(options=option)
# driver.get('https://you.ctrip.com/sight/antu643/136032.html?scene=online')
# time.sleep(10)
# print("=========Attempt 1=========")
# # Scrape the text of a single comment
# try:
#     # Locate the element
#     element = driver.find_element("xpath", '//*[@id="commentModule"]/div[5]/div[1]')
#
#     # Get its text content
#     text_content = element.text
#
#     # Print the extracted text
#     print("Extracted text:")
#     print(text_content)
#
# except Exception as e:
#     print(f"Error while scraping text: {e}")
# print("=========First Over!!!!=========")
# time.sleep(20)
# # Scrape the text of every comment on the page
# print("=========Attempt 2=========")
# try:
#     # Locate all matching comment elements (find_elements returns a list)
#     comment_elements = driver.find_elements("xpath", '//*[@id="commentModule"]/div[5]/div')
#
#     # Check whether any comments were found
#     if not comment_elements:
#         print("No comments found!")
#     else:
#         # Walk the comment elements and extract their text
#         for idx, element in enumerate(comment_elements, start=1):
#             # Get the comment text; strip() drops surrounding whitespace
#             text_content = element.text.strip()
#             print(f"Comment {idx}:")
#             print(text_content)
#             print("-" * 50)  # separator
#
# except Exception as e:
#     print(f"Error while scraping text: {e}")
# print("=========Second Over!!!=========")
# time.sleep(20)
# print("=========Try Three=========")
# # Scrape every comment and save the results to JSON
# try:
#     # Locate all matching comment elements (find_elements returns a list)
#     comment_elements = driver.find_elements("xpath", '//*[@id="commentModule"]/div[5]/div')
#
#     # Check whether any comments were found
#     if not comment_elements:
#         print("No comments found!")
#     else:
#         comments_list = []  # holds every comment on the page
#
#         # Walk the comment elements and extract their text
#         for idx, element in enumerate(comment_elements, start=1):
#             # Get the comment text; strip() drops surrounding whitespace
#             text_content = element.text.strip()
#             print(f"Comment {idx}:")
#             print(text_content)
#             print("-" * 50)  # separator
#
#             # Append to the comment list
#             comments_list.append({
#                 "id": idx,
#                 "content": text_content
#             })
#
#         # Save as a JSON file (in the project root)
#         json_file_path = os.path.join(os.getcwd(), "comments.json")
#
#         with open(json_file_path, "w", encoding="utf-8") as f:
#             json.dump(comments_list, f, ensure_ascii=False, indent=4)  # pretty-printed
#
#         print(f"Comment data saved to: {json_file_path}")
#
# except Exception as e:
#     print(f"Error while scraping text: {e}")
# print("========Three Over========")
print("========Four Start========")
# 配置浏览器
option = webdriver.ChromeOptions()
option.add_experimental_option("detach", True)
# 添加随机User-Agent
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15"
]
option.add_argument(f"user-agent={random.choice(user_agents)}")
# 禁用自动化特征检测
option.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(options=option)
# 使用已有的date目录
date_dir = os.path.join(os.getcwd(), "date_CBSBP")
if not os.path.exists(date_dir):
raise Exception(f"目录 {date_dir} 不存在,请先创建该目录")
print(f"将使用已有目录: {date_dir}")
# 访问目标页面
driver.get('https://you.ctrip.com/sight/antu643/136032.html?scene=online')
time.sleep(10 + random.randint(1, 5)) # 随机初始等待
# 在开始爬取前点击一次指定元素
# try:
# click_element = WebDriverWait(driver, 15).until(
# EC.element_to_be_clickable((By.XPATH, '//*[@id="commentModule"]/div[4]/span[2]'))
# )
# exit()
# # click_element.click()
# # print("已成功点击评论筛选元素")
# # time.sleep(5) # 等待评论加载完成
# # except Exception as e:
# # print(f"点击评论筛选元素失败: {e}")
# # driver.quit()
# # exit()
# 存储所有评论
all_comments = []
max_pages = 150 # 最多爬取150页
current_page = 1
try:
    while current_page <= max_pages:
        print(f"\n========= Scraping page {current_page} =========")
        # Scroll randomly to mimic a human reader
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 500)")
            time.sleep(random.uniform(0.5, 2))
        # Extract the comments on the current page
        try:
            comment_elements = WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="commentModule"]/div[5]/div'))
            )
        except Exception as e:
            print(f"Failed to locate comment elements: {e}")
            comment_elements = []
        if not comment_elements:
            print("No comments found! Refreshing the page...")
            driver.refresh()
            time.sleep(10)
            continue
        else:
            comments_list = []  # comments on the current page
            for idx, element in enumerate(comment_elements, start=1):
                try:
                    text_content = element.text.strip()
                    print(f"Comment {idx}:")
                    print(text_content[:50] + "...")  # print only the first 50 characters
                    print("-" * 50)
                    # Append to the current page's list
                    comments_list.append({
                        "page": current_page,
                        "id": idx,
                        "content": text_content
                    })
                except Exception as e:
                    print(f"Error extracting comment {idx}: {e}")
            # Append to the overall list
            all_comments.extend(comments_list)
            # Save each page to the output directory
            json_file_path = os.path.join(date_dir, f"comments_page_{current_page}.json")
            with open(json_file_path, "w", encoding="utf-8") as f:
                json.dump(comments_list, f, ensure_ascii=False, indent=4)
            print(f"Page {current_page} comments saved to: {json_file_path}")
        # Stop once the maximum page count is reached
        if current_page >= max_pages:
            print(f"Reached the maximum of {max_pages} pages; stopping.")
            break
        try:
            # Randomized wait between pages
            wait_time = 10 + random.randint(-5, 5)
            print(f"Waiting {wait_time} seconds before the next page...")
            time.sleep(wait_time)
            # Try to turn the page
            for attempt in range(3):  # at most 3 attempts
                try:
                    # Locate the page-number input box
                    page_input = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located(
                            (By.XPATH, '//*[@id="commentModule"]/div[6]/ul/li[10]/div/input'))
                    )
                    # Locate the jump button
                    page_button = WebDriverWait(driver, 15).until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//*[@id="commentModule"]/div[6]/ul/li[10]/div/span/button'))
                    )
                    # Type the next page number
                    for _ in range(3):  # make sure the input takes
                        page_input.clear()
                        page_input.send_keys(str(current_page + 1))
                        time.sleep(0.5)
                    # Random delay before clicking
                    time.sleep(random.uniform(0.5, 2))
                    # Click the jump button
                    driver.execute_script("arguments[0].click();", page_button)
                    # Wait for the page to load
                    time.sleep(5 + random.randint(1, 3))
                    # Verify that the page actually turned
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.XPATH, '//*[@id="commentModule"]/div[5]/div'))
                        )
                        current_page += 1
                        break  # page turn succeeded; leave the retry loop
                    except Exception:
                        if attempt == 2:
                            raise Exception("No new content loaded after turning the page")
                        continue
                except Exception as e:
                    if attempt == 2:  # last attempt
                        raise
                    print(f"Page-turn attempt {attempt + 1} failed: {e}")
                    time.sleep(5)
                    driver.refresh()
                    time.sleep(5)
        except Exception as e:
            print(f"Page turn ultimately failed: {e}")
            break
finally:
    # Save every comment to the output directory
    all_comments_path = os.path.join(date_dir, "all_comments.json")
    with open(all_comments_path, "w", encoding="utf-8") as f:
        json.dump(all_comments, f, ensure_ascii=False, indent=4)
    print(f"\nAll comments saved to: {all_comments_path}")
    print(f"Scraped {len(all_comments)} comments in total.")
    # Close the browser
    driver.quit()
print("Four Over!!!")
The commented-out blocks at the top of the script are from the testing phase; you can comment out the later sections and run the attempts one at a time. The final scraper is the uncommented code above.
Because the code above keeps re-scraping the last page until it hits 150 pages whenever the site has fewer than 150 pages of comments, here is the updated version, which reads the real page count from the pager first:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import json
import os
import random
print("========Four Start========")
# 配置浏览器
option = webdriver.ChromeOptions()
option.add_experimental_option("detach", True)
# 添加随机User-Agent
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15"
]
option.add_argument(f"user-agent={random.choice(user_agents)}")
# 禁用自动化特征检测
option.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(options=option)
# 使用已有的date目录
date_dir = os.path.join(os.getcwd(), "date_CBSXP")
if not os.path.exists(date_dir):
raise Exception(f"目录 {date_dir} 不存在,请先创建该目录")
print(f"将使用已有目录: {date_dir}")
# 访问目标页面
driver.get('https://you.ctrip.com/sight/fusong2479/136039.html?scene=online')
time.sleep(10 + random.randint(1, 5)) # 随机初始等待
# 获取最大页数
def get_max_pages():
try:
max_page_element = WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.XPATH, '//*[@id="commentModule"]/div[6]/ul/li[8]/a'))
)
max_pages = int(max_page_element.text)
print(f"检测到最大页数: {max_pages}")
return max_pages
except Exception as e:
print(f"获取最大页数失败: {e}, 使用默认值150")
return 150 # 如果获取失败,使用默认值
# 存储所有评论
all_comments = []
max_pages = get_max_pages() # 动态获取最大页数
current_page = 1
try:
    while current_page <= max_pages:
        print(f"\n========= Scraping page {current_page} of {max_pages} =========")
        # Scroll randomly to mimic a human reader
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 500)")
            time.sleep(random.uniform(0.5, 2))
        # Extract the comments on the current page
        try:
            comment_elements = WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="commentModule"]/div[5]/div'))
            )
        except Exception as e:
            print(f"Failed to locate comment elements: {e}")
            comment_elements = []
        if not comment_elements:
            print("No comments found! Refreshing the page...")
            driver.refresh()
            time.sleep(10)
            continue
        else:
            comments_list = []  # comments on the current page
            for idx, element in enumerate(comment_elements, start=1):
                try:
                    text_content = element.text.strip()
                    print(f"Comment {idx}:")
                    print(text_content[:50] + "...")  # print only the first 50 characters
                    print("-" * 50)
                    # Append to the current page's list
                    comments_list.append({
                        "page": current_page,
                        "id": idx,
                        "content": text_content
                    })
                except Exception as e:
                    print(f"Error extracting comment {idx}: {e}")
            # Append to the overall list
            all_comments.extend(comments_list)
            # Save each page to the output directory
            json_file_path = os.path.join(date_dir, f"comments_page_{current_page}.json")
            with open(json_file_path, "w", encoding="utf-8") as f:
                json.dump(comments_list, f, ensure_ascii=False, indent=4)
            print(f"Page {current_page} comments saved to: {json_file_path}")
        # Stop once the maximum page count is reached
        if current_page >= max_pages:
            print(f"Reached the maximum of {max_pages} pages; stopping.")
            break
        try:
            # Randomized wait between pages
            wait_time = 10 + random.randint(-5, 5)
            print(f"Waiting {wait_time} seconds before the next page...")
            time.sleep(wait_time)
            # Try to turn the page
            for attempt in range(3):  # at most 3 attempts
                try:
                    # Locate the page-number input box
                    page_input = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located(
                            (By.XPATH, '//*[@id="commentModule"]/div[6]/ul/li[10]/div/input'))
                    )
                    # Locate the jump button
                    page_button = WebDriverWait(driver, 15).until(
                        EC.element_to_be_clickable(
                            (By.XPATH, '//*[@id="commentModule"]/div[6]/ul/li[10]/div/span/button'))
                    )
                    # Type the next page number
                    for _ in range(3):  # make sure the input takes
                        page_input.clear()
                        page_input.send_keys(str(current_page + 1))
                        time.sleep(0.5)
                    # Random delay before clicking
                    time.sleep(random.uniform(0.5, 2))
                    # Click the jump button
                    driver.execute_script("arguments[0].click();", page_button)
                    # Wait for the page to load
                    time.sleep(5 + random.randint(1, 3))
                    # Verify that the page actually turned
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.XPATH, '//*[@id="commentModule"]/div[5]/div'))
                        )
                        current_page += 1
                        break  # page turn succeeded; leave the retry loop
                    except Exception:
                        if attempt == 2:
                            raise Exception("No new content loaded after turning the page")
                        continue
                except Exception as e:
                    if attempt == 2:  # last attempt
                        raise
                    print(f"Page-turn attempt {attempt + 1} failed: {e}")
                    time.sleep(5)
                    driver.refresh()
                    time.sleep(5)
        except Exception as e:
            print(f"Page turn ultimately failed: {e}")
            break
finally:
    # Save every comment to the output directory
    all_comments_path = os.path.join(date_dir, "all_comments.json")
    with open(all_comments_path, "w", encoding="utf-8") as f:
        json.dump(all_comments, f, ensure_ascii=False, indent=4)
    print(f"\nAll comments saved to: {all_comments_path}")
    print(f"Scraped {len(all_comments)} comments in total.")
    # Close the browser
    driver.quit()
print("Four Over!!!")
Finally, a big thank-you to the engineers!!!!