from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import xlsxwriter as xw

# Configure the Edge driver
options = webdriver.EdgeOptions()
# "none" hands control back as soon as navigation starts;
# the explicit WebDriverWait calls below do the actual waiting
options.page_load_strategy = "none"
# Tell Edge not to load images, to speed things up
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Run without showing a browser window
# options.add_argument('--headless')
# Create an Edge driver
driver = webdriver.Edge(options=options)
# Open the pages
# The URLs are buried too deep in the site, so they had to be collected one by one
urls = ['', '', '',  # ... full list of page URLs omitted
        ]
arr = []
for url in urls:
    driver.get(url)
    title_xpath = "/html/body/div[2]/div[3]/div/div[1]/div/article/section"
    row = []
    # i == 0 scrapes the default (Traditional Chinese) version;
    # i == 1 and i == 2 click the language-switch links first
    for i in range(3):
        try:
            if i == 1:  # scrape the English version
                jump_to_eng = "/html/body/div[2]/div[1]/div/div/div[1]/div/div/ul/li[3]/a"
                WebDriverWait(driver, 100).until(
                    EC.presence_of_element_located((By.XPATH, jump_to_eng))).click()
            if i == 2:  # scrape the Simplified Chinese version
                jump_to_sim = "/html/body/div[2]/div[1]/div/div/div[1]/div/div/ul/li[2]/a"
                WebDriverWait(driver, 100).until(
                    EC.presence_of_element_located((By.XPATH, jump_to_sim))).click()
            # Grab the course content section
            content_sim = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, title_xpath))).get_attribute("outerHTML")
        except Exception:
            print(f" {url} failed to scrape\n")
            content_sim = 'NULL'
            driver.get(url)  # reload so the remaining language versions can still be tried
            row.append(content_sim)
            continue
        row.append(content_sim)
    arr.append(row)
# Store the data in an Excel file with the xlsxwriter library
def xw_toExcel(data, fileName):
    workbook = xw.Workbook(fileName)               # create the workbook
    worksheet1 = workbook.add_worksheet("sheet1")  # create a worksheet
    worksheet1.activate()                          # activate the worksheet
    title = ['major_crs_summary_trad', 'major_crs_summary_eng', 'major_crs_summary_sim']  # header row
    worksheet1.write_row('A1', title)              # write the header starting at A1
    i = 2                                          # data starts on row 2
    for j in range(len(data)):
        insertData = [data[j][0], data[j][1], data[j][2]]
        row = 'A' + str(i)
        worksheet1.write_row(row, insertData)
        i += 1
    workbook.close()                               # close the workbook

fileName = 'uicmajor.xlsx'
xw_toExcel(arr, fileName)
driver.quit()  # shut down the browser when done
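xlsxwriter can only write workbooks, so spot-checking the result takes a separate reader. A minimal sketch, assuming openpyxl is available (it is not used by the script itself):

from openpyxl import load_workbook

wb = load_workbook('uicmajor.xlsx')
ws = wb['sheet1']  # the sheet name the script creates
print(ws.max_row - 1, 'data rows written')  # subtract the header row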
This was my first time using a web scraper; I really didn't want to copy and paste everything by hand.
Before starting the crawl, I checked the site's robots.txt first.
No problems there.
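If you want to automate that robots.txt check, here is a minimal sketch using the standard library's urllib.robotparser; the site root below is a hypothetical placeholder, not the actual site:

from urllib.robotparser import RobotFileParser

# Hypothetical site root; substitute the real one
rp = RobotFileParser("https://example.edu/robots.txt")
rp.read()  # fetch and parse robots.txt
# True if any crawler ("*") is allowed to fetch the given path
print(rp.can_fetch("*", "https://example.edu/programmes/some-course"))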