# 爬虫高校排行获取信息+下载图标到本地 (scrape university ranking info and download logos locally)
"""Scrape the 2020 Best Chinese Universities Ranking page and, for each row,
print rank / names / tags / region / type / score and download the
university's logo into a local folder."""
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os
import requests

driver = webdriver.Chrome()  # replace with your WebDriver path if needed

# Folder used to store the downloaded university logos.
icon_folder = 'university_icons'
if not os.path.exists(icon_folder):
    os.makedirs(icon_folder)

try:
    # Open the ranking page and wait for the dynamic content to render.
    driver.get("https://www.shanghairanking.cn/rankings/bcur/202010")
    time.sleep(5)

    # Parse the fully rendered page source.
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    table_rows = soup.find_all('tr')
    print("找到的行数:", len(table_rows))

    for row in table_rows:
        try:
            # Rank.
            ranking_div = row.find('div', class_='ranking')
            ranking = ranking_div.text.strip() if ranking_div else 'N/A'

            # University name (Chinese).
            univ_name_div_cn = row.find('div', class_='univname-container')
            university_name_cn = (univ_name_div_cn.a.text.strip()
                                  if univ_name_div_cn and univ_name_div_cn.a else 'N/A')

            # University name (English).
            univ_name_div_en = row.find('a', class_='name-en')
            university_name_en = univ_name_div_en.text.strip() if univ_name_div_en else 'N/A'

            # Region / school type / total score come from fixed table columns.
            columns = row.find_all('td')
            region = columns[2].text.strip() if len(columns) > 2 else 'N/A'
            # FIX: renamed `type` -> `univ_type`; the original shadowed the builtin.
            univ_type = columns[3].text.strip() if len(columns) > 3 else 'N/A'
            score = columns[4].text.strip() if len(columns) > 4 else 'N/A'

            # Tags (e.g. 985/211/双一流).
            tags_p = row.find('p', class_='tags')
            tags = tags_p.text.strip() if tags_p else 'N/A'

            # Logo URL. FIX: the original called .find('img') directly on the
            # result of row.find('div', class_='logo'); for rows without a logo
            # div (e.g. the header row) that is None, so an AttributeError was
            # raised and the entire row — including its print — was skipped.
            logo_div = row.find('div', class_='logo')
            icon_img = logo_div.find('img') if logo_div else None
            icon_url = icon_img['src'] if icon_img and icon_img.has_attr('src') else None

            if icon_url:
                # File name is the university name; strip '/' so it cannot
                # break the path or escape the folder.
                icon_filename = f"{university_name_cn.replace('/', '')}.png"
                icon_path = os.path.join(icon_folder, icon_filename)
                # FIX: added a timeout so one stalled download cannot hang the crawl.
                icon_response = requests.get(icon_url, timeout=30)
                if icon_response.status_code == 200:
                    with open(icon_path, 'wb') as file:
                        file.write(icon_response.content)

            print(f"{ranking}, {university_name_cn}, {university_name_en}, {tags}, {region}, {univ_type}, {score} ")
        except Exception as e:
            # Best-effort per-row handling: log and continue with the next row.
            print(f"处理行时发生错误:{e}")
except Exception as e:
    print(f"请求或解析过程中发生错误:{e}")
finally:
    driver.quit()  # always close the browser, even on failure