仅供学习参考
一、获取特定文本和json链接
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Path to the ChromeDriver executable (raw string so the backslashes stay literal).
chrome_driver_path = r'F:\chrome-win64\chromedriver.exe'
# Address of the dynamically rendered listing page (placeholder; replace before running).
url = "动态网页网址链接"


def name_xpath(index):
    """Return the XPath of the name anchor inside the ``index``-th list item (1-based)."""
    return f"/html/body/div[5]/ul[1]/li[{index}]/p/a"


def format_entry(name, link):
    """Return the output line printed for one scraped name/link pair."""
    return f"姓名: {name}, 链接: {link}"


def main():
    """Open the listing page and print the name and link of each list item.

    Items ``li[1]`` .. ``li[99]`` are probed in order. Each probe waits up to
    10 seconds for the anchor to become visible; the first item that does not
    show up in time is treated as the end of the list, so a shorter list ends
    the loop instead of crashing the script with ``TimeoutException``.
    The browser is always closed, even when an error occurs.
    """
    # Local imports: these names are not part of the script's top-level imports.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.chrome.service import Service

    # Selenium 4 takes the driver path through a Service object; the old
    # ``executable_path`` keyword is no longer accepted by current releases.
    driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path))
    try:
        driver.get(url)
        # Explicit wait only: Selenium's documentation advises against mixing
        # it with an implicit wait, which makes timeouts unpredictable.
        wait = WebDriverWait(driver, 10)  # wait at most 10 seconds per item
        for i in range(1, 100):
            locator = (By.XPATH, name_xpath(i))
            try:
                name_element = wait.until(EC.visibility_of_element_located(locator))
            except TimeoutException:
                # No such item within the timeout: the list has ended.
                break
            name = name_element.text
            link = name_element.get_attribute("href")
            print(format_entry(name, link))
    finally:
        # Close the browser no matter how the scraping ended.
        driver.quit()


if __name__ == "__main__":
    main()
结果:将上面脚本打印的内容保存为TXT文本(网址.txt),内容示例如下
姓名: abc, 链接: http://abc.json
二、打开上述TXT文本(网址.txt),保留姓名,逐个访问对应链接并提取邮箱地址,把提取结果放在姓名之后,输出到新的文本文件(个人网址.txt)
import os
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Path to the ChromeDriver executable (raw string: '\c' is not a valid escape sequence).
chrome_driver_path = r'F:\chrome-win64\chromedriver.exe'
# Text file that receives one "name: email" line per processed entry.
output_file_path = '个人网址.txt'

# One "姓名: <name>, 链接: <url>" entry. Accepts ASCII or full-width colon and
# comma, an optional separator comma, and a final entry without a newline.
ENTRY_PATTERN = re.compile(r'姓名[::]\s*(.*?)\s*[,,]?\s*链接[::]\s*(\S+)')
# Generic e-mail pattern used when the page has no dedicated e-mail element.
EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')


def parse_entries(content):
    """Return the ``(name, url)`` pairs found in ``content``.

    The name is returned without the separator comma and the URL is the run
    of non-whitespace characters after the link label, so the last entry is
    found even when the text does not end with a newline.
    """
    return [(m.group(1), m.group(2)) for m in ENTRY_PATTERN.finditer(content)]


def find_email(page_source):
    """Return the first e-mail address in ``page_source``, or ``None``.

    The pattern's last character class also accepts '.' and '-', so a
    sentence-ending period can be captured; such trailing characters are
    stripped from the result.
    """
    match = EMAIL_PATTERN.search(page_source)
    if match is None:
        return None
    return match.group(0).rstrip('.-')


def main():
    """Visit every link listed in 网址.txt and write "name: email" lines.

    For each entry the page is opened and, after a 2 second pause, the
    e-mail is read from the element with the configured id. When that
    element is missing or empty, the first e-mail-like string in the page
    source is used instead, and a fixed marker text when none is found.
    Entries whose page cannot be processed by the browser are reported on
    stdout and skipped. The browser is always closed.
    """
    # Local imports: these names are not part of the script's top-level imports.
    from selenium.common.exceptions import NoSuchElementException, WebDriverException
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By

    # Read the input first so a missing file fails before a browser is started.
    with open('网址.txt', 'r', encoding='utf-8') as file:
        entries = parse_entries(file.read())

    options = Options()
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    # Selenium 4 takes the driver path through a Service object; the old
    # ``executable_path`` keyword is no longer accepted by current releases.
    driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path), options=options)
    try:
        # Mode 'w' creates or truncates the file, so no pre-creation is needed.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for name, url in entries:
                try:
                    driver.get(url)
                    # Pause 2 seconds before reading the page.
                    time.sleep(2)
                    try:
                        # Element id is a placeholder; adjust for the target site.
                        email = driver.find_element(By.ID, '需要填充').text.strip()
                    except NoSuchElementException:
                        email = ''
                    if not email:
                        # Fall back to scanning the page source for an address.
                        email = find_email(driver.page_source) or "无法提取邮箱地址"
                except WebDriverException as e:
                    print(f"打开链接失败: {e}")
                    continue
                # Record the name together with the extracted e-mail address.
                output_file.write(f'{name}: {email}\n')
    finally:
        # Close the browser no matter how the run ended.
        driver.quit()


if __name__ == "__main__":
    main()
结果:新生成的TXT文本(个人网址.txt)内容示例如下
abc: abc@qq.com