刚才爬了一些数据,分享一下代码。
本例是从Excel表中读取数据,将其用于网页搜索,再将搜索结果写入Excel表中。
1.首先引入需要用到的包,没下载的在终端pip install一下
例如:pip install selenium openpyxl
import time

import openpyxl
from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
2.初始化driver
# Browser options: --headless runs Chrome without opening a window.
# For visual debugging, create the driver with a bare webdriver.Chrome()
# instead of passing these options.
options = webdriver.ChromeOptions()
options.add_argument("--headless")

# Location of the chromedriver executable handed to Selenium's Service.
driver_path = "chromedriver.exe"
service = Service(executable_path=driver_path)

# Single browser session shared by the scraping functions in this script.
driver = webdriver.Chrome(service=service, options=options)
这里要先下载与Chrome版本相同的chromedriver(版本要相同!!!)
ChromeDriver下载链接:https://registry.npmmirror.com/binary.html?path=chromedriver/
如果上面的链接找不到合适的版本,请看这里:ChromeDriver - WebDriver for Chrome(https://chromedriver.chromium.org/downloads)
下载好后,将chromedriver路径正确配置在环境变量中
3. 读取Excel文件中的数据
def read_excel(file_path, column_name):
    """Load the active worksheet of an Excel file and read one column.

    If *file_path* does not exist, a new in-memory workbook is created
    instead, with its sheet titled "Search Results" and a
    ["Search Term", "URL"] header row, so that results can still be
    appended to it and saved later.

    Args:
        file_path: Path of the .xlsx workbook to read.
        column_name: Column letter to read, e.g. "A".

    Returns:
        A ``(ws, data_list)`` tuple: the worksheet, and the values of every
        cell in *column_name* from top to bottom. ``data_list`` is empty
        when the file was missing, because the new sheet holds only the
        header row and the header is not a search term.
    """
    try:
        ws = openpyxl.load_workbook(file_path).active
    except FileNotFoundError:
        # Only build a fresh workbook when there is nothing to load.
        wb = Workbook()
        ws = wb.active
        ws.title = "Search Results"
        ws.append(["Search Term", "URL"])  # header row
        # Nothing to search for: do not hand the header back as data.
        return ws, []
    data_list = [cell.value for cell in ws[column_name]]
    return ws, data_list
4.在网页中搜索并点击元素,获取新页面的URL(这是我的项目需求,其他项目视情况而定)
def search_click_and_get_url(url, search_term, xpath_to_click, timeout=10):
    """Open *url* and return the ``href`` of the element matched by an XPath.

    Despite the name, nothing is clicked: the link target is read directly
    from the matched element's ``href`` attribute.

    Args:
        url: Page to load; the caller has already embedded the search term.
        search_term: Not used by this function; kept so existing callers
            keep working.
        xpath_to_click: XPath of the element whose ``href`` is wanted.
        timeout: Maximum number of seconds to wait for the element to
            appear before giving up.

    Returns:
        The ``href`` value as a string, or ``''`` when the element does not
        appear within *timeout* seconds or has no ``href`` attribute.
    """
    driver.get(url)
    try:
        # Poll for the element instead of sleeping a fixed two seconds:
        # this returns as soon as the element exists and tolerates slow
        # page loads. WebDriverWait retries while find_element raises
        # NoSuchElementException and raises TimeoutException at the end.
        element = WebDriverWait(driver, timeout).until(
            lambda d: d.find_element(By.XPATH, xpath_to_click)
        )
    except TimeoutException:
        return ''
    # get_attribute gives None for a missing attribute; always return a str.
    return element.get_attribute('href') or ''
5.将结果写入Excel
def write_url_to_excel(ws, search_term, url,
                       output_path="updated_search_results8.xlsx"):
    """Append one ``[search_term, url]`` row to *ws* and save its workbook.

    The workbook is saved on every call, so rows written so far are already
    on disk if a later step fails.

    Args:
        ws: Worksheet to append to; its ``parent`` workbook is the one saved.
        search_term: Value written to the first column of the new row.
        url: Value written to the second column of the new row.
        output_path: File the workbook is saved to. Defaults to the name
            that was previously hard-coded.
    """
    ws.append([search_term, url])
    ws.parent.save(filename=output_path)
6.主函数(之所以有xpath_to_click,是因为我需要的结果要在点击页面元素后才能得到;代码里并不真正点击,而是直接读取该元素的href。这个也视情况而定)
def main():
    """Look up every search term from the spreadsheet and record its URL.

    Reads the terms from column A of ``data.xlsx``, loads the search page
    for each one, extracts the link matched by the XPath, and appends the
    ``(term, link)`` pair to the worksheet. The browser is always shut down
    at the end, even if a step raises.
    """
    excel_path = 'data.xlsx'
    column_name = 'A'  # column letter holding the search terms
    # Placeholder: replace with the base URL of the site to scrape.
    url_to_search = '爬取数据的网址'
    xpath_to_click = '//*[@id="app"]/div/main/div/div[2]/div[1]/div[1]/div[1]/a'  # XPath

    try:
        ws, search_terms = read_excel(excel_path, column_name)
        for search_term in search_terms:
            if search_term is None:
                # Blank cell: it would otherwise be formatted into the URL
                # as the literal text "None".
                continue
            url_with_search_term = f"{url_to_search}{search_term}/2/1"
            new_url = search_click_and_get_url(
                url_with_search_term, search_term, xpath_to_click
            )
            write_url_to_excel(ws, search_term, new_url)
    finally:
        # Release the browser process even when reading or scraping fails.
        driver.quit()