前言
在当今数据驱动的时代,如何高效地从网络中获取有用的信息成为了研究人员和开发者们关心的重要问题。本文将介绍如何通过Python编写爬虫,结合Selenium自动化工具,从中国知网(CNKI)中爬取有关“大数据”主题的学术数据。希望通过分享我的实践经验,能为有类似需求的读者提供一些参考。
爬虫工具简介
Selenium 是一个用于Web应用程序测试的工具。它提供了一系列的API,可以驱动浏览器执行各种操作,如点击、输入文本、获取页面元素等。通过Selenium,我们可以模拟人类操作,自动化地进行网页数据的爬取。
爬取目标
本次爬取的目标是获取中国知网中关于“大数据”主题的学术论文,并提取以下信息:
-
论文基本信息:标题、作者、发表日期、来源、数据库等。
-
作者详细信息:作者所在机构、标签、总发文量、总下载量、关注领域等。
爬虫代码实现
以下是完整的爬虫代码:
import concurrent.futures
import csv
import os
import time

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def webserver():
    """Create an Edge WebDriver tuned for crawling.

    Returns:
        webdriver.Edge: driver with pageLoadStrategy "none" (do not block on
        full page load) and image loading disabled.
    """
    options = webdriver.EdgeOptions()
    # BUG FIX: the original built a DesiredCapabilities dict with
    # pageLoadStrategy="none" but never passed it to the driver (Selenium 4
    # removed the desired_capabilities argument). Setting it on the options
    # makes the strategy actually take effect.
    options.page_load_strategy = "none"
    # 2 == block images, which speeds up page loads considerably.
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    return webdriver.Edge(options=options)


def open_page(driver, keyword):
    """Open CNKI advanced search, run a search for *keyword*.

    Returns:
        int: total number of results reported by CNKI.
    """
    driver.get("https://kns.cnki.net/kns8/AdvSearch")
    time.sleep(1)
    # The sort list is hidden by default; force it visible, then hover the
    # "RP" (relevance) sort option so the menu registers the selection.
    opt = driver.find_element(By.CSS_SELECTOR, 'div.sort-list')
    driver.execute_script(
        "arguments[0].setAttribute('style', 'display: block;')", opt)
    ActionChains(driver).move_to_element(
        driver.find_element(By.CSS_SELECTOR, 'li[data-val="RP"]')).perform()
    # Type the keyword into the first search field, then click "search".
    WebDriverWait(driver, 100).until(
        EC.presence_of_element_located(
            (By.XPATH, '''//*[@id="gradetxt"]/dd[1]/div[2]/input'''))
    ).send_keys(keyword)
    WebDriverWait(driver, 100).until(
        EC.presence_of_element_located(
            (By.XPATH, '''//*[@id="ModuleSearch"]/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/input'''))
    ).click()
    print("正在搜索,请稍后...")
    res_unm = WebDriverWait(driver, 100).until(
        EC.presence_of_element_located(
            (By.XPATH, '''//*[@id="countPageDiv"]/span[1]/em'''))
    ).text
    # The counter is rendered with thousands separators, e.g. "1,234".
    res_unm = int(res_unm.replace(",", ''))
    page_unm = int(res_unm / 20) + 1  # 20 results per page
    print(f"共找到 {res_unm} 条结果, {page_unm} 页。")
    return res_unm


def get_info(driver, xpath):
    """Return the text of the element at *xpath*, or '无' after a 10 s timeout."""
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
        return element.text
    except Exception:
        return '无'


def get_choose_info(driver, xpath1, xpath2, label):
    """Return the text at *xpath2* if the element at *xpath1* equals *label*.

    Used to probe label/value pairs whose position varies between article
    layouts. Returns '无' when the label does not match or is absent.
    (Parameter renamed from ``str``, which shadowed the builtin; all callers
    pass it positionally.)
    """
    try:
        tag = WebDriverWait(driver, 1).until(
            EC.presence_of_element_located((By.XPATH, xpath1))).text
        if tag == label:
            return WebDriverWait(driver, 1).until(
                EC.presence_of_element_located((By.XPATH, xpath2))).text
        return '无'
    except Exception:
        return '无'


def get_authors(driver):
    """Collect author names and profile URLs from the open article page.

    Returns:
        tuple[list[str], list[str]]: (names, profile URLs); (['无'], []) when
        the author block cannot be located.
    """
    try:
        author_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, "//h3[@id='authorpart']/span/a")))
        # BUG FIX: the original comprehension was
        # "[author.text for author_elements in author_elements]" — it rebound
        # the list name and referenced an undefined "author", raising a
        # NameError that the bare except turned into ['无'] every time.
        authors = [author.text for author in author_elements]
        author_urls = [author.get_attribute("href") for author in author_elements]
        return authors, author_urls
    except Exception:
        return ['无'], []


def get_author_info(driver, author_url, author_name):
    """Navigate to *author_url* and scrape the author's profile.

    Returns:
        dict: keys name, school, tags, publication_count, download_count,
        focus_areas, url; any field that cannot be scraped defaults to '无'.
    """
    driver.get(author_url)
    time.sleep(5)  # profile page loads content asynchronously
    author_info = {"name": author_name}
    try:
        school_element = driver.find_element(
            By.CSS_SELECTOR, "#kcms-author-info h3 > span > a")
        author_info["school"] = school_element.text.strip()
    except Exception:
        author_info["school"] = '无'
    try:
        tags_element = driver.find_element(
            By.CSS_SELECTOR, "#kcms-author-info h3:nth-of-type(2) > span")
        author_info["tags"] = tags_element.text.strip().replace(";", "; ")
    except Exception:
        author_info["tags"] = '无'
    try:
        author_info["publication_count"] = driver.find_element(
            By.XPATH,
            "//h5/span[text()='总发文量:']/following-sibling::em").text.strip()
    except Exception:
        author_info["publication_count"] = '无'
    try:
        author_info["download_count"] = driver.find_element(
            By.XPATH,
            "//h5/span[text()='总下载量:']/following-sibling::em").text.strip()
    except Exception:
        author_info["download_count"] = '无'
    try:
        focus_areas_element = driver.find_element(
            By.CSS_SELECTOR, "#kcms-author-keyword .listcont ul")
        focus_areas = [item.text for item in
                       focus_areas_element.find_elements(By.TAG_NAME, "li")]
        author_info["focus_areas"] = "; ".join(focus_areas)
    except Exception:
        author_info["focus_areas"] = '无'
    author_info["url"] = author_url
    return author_info


def save_author_info(author_info, file_path):
    """Append one author record (as produced by get_author_info) to *file_path*."""
    with open(file_path, "a", encoding="gbk", newline='') as file:
        writer = csv.writer(file)
        writer.writerow([
            author_info["name"],
            author_info["school"],
            author_info["tags"],
            author_info["publication_count"],
            author_info["download_count"],
            author_info["focus_areas"],
            author_info["url"]
        ])


def crawl(driver, papers_need, theme):
    """Crawl up to *papers_need* search results, writing two GBK CSV files.

    ``<theme>_文章.csv`` gets one row per article and ``<theme>_作者.csv`` one
    row per author. If the article file already exists, crawling resumes from
    the record number found on its last line.
    """
    count = 1
    articles_file_path = f"{theme}_文章.csv"
    authors_file_path = f"{theme}_作者.csv"

    # Resume from an existing article file, or create it with a header row.
    if os.path.exists(articles_file_path) and os.path.getsize(articles_file_path) > 0:
        with open(articles_file_path, "r", encoding='gbk') as file:
            lines = file.readlines()
        last_line = lines[-1].strip()
        count = int(last_line.split(",")[0]) + 1
    else:
        with open(articles_file_path, "w", encoding='gbk', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["编号", "标题", "作者", "单位", "日期", "来源",
                             "专辑", "专题", "分类号", "数据库", "引用", "下载",
                             "关键词", "摘要", "URL"])

    if not os.path.exists(authors_file_path):
        with open(authors_file_path, "w", encoding='gbk', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["姓名", "学校", "作者标签", "总发文量", "总下载量",
                             "关注领域", "URL"])

    # Page forward to the result page that contains record `count`.
    for _ in range(count // 20):
        time.sleep(1)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//*[@id='PageNext']"))).click()
    print(f"从第{count}条开始爬取\n")

    while count <= papers_need:
        time.sleep(1)
        # All result-row title links on the current page.
        title_list = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "fz14")))
        for i in range((count - 1) % 20 + 1, 21):
            print(f"正在爬取第{count}条(本页第{i}条)\n")
            try:
                term = (count - 1) % 20 + 1  # row index within the page
                print('基础信息')
                title_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[2]'''
                author_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[3]'''
                source_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[4]'''
                date_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[5]'''
                database_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[6]'''
                quote_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[7]'''
                # BUG FIX: the original id had a stray leading space
                # ('@id=" gridTable"'), so the download cell never matched.
                download_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[8]'''
                xpaths = [title_xpath, author_xpath, source_xpath, date_xpath,
                          database_xpath, quote_xpath, download_xpath]
                # NOTE(review): WebDriver objects are not documented as
                # thread-safe; the pool is kept for parity with the original
                # design, but these lookups could equally run sequentially.
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future_elements = [executor.submit(get_info, driver, xpath)
                                       for xpath in xpaths]
                    title, authors, source, date, database, quote, download = [
                        future.result() for future in future_elements]
                # Citation/download counts may be blank — normalize to '0'.
                if not quote.isdigit():
                    quote = '0'
                if not download.isdigit():
                    download = '0'
                print(f"{title} | {authors} | {source} | {date} | {database} | {quote} | {download}\n")

                # Open the detail page (new tab) and switch to it.
                title_list[i - 1].click()
                n = driver.window_handles
                driver.switch_to.window(n[-1])
                time.sleep(1)
                # Expand the truncated abstract when the "more" link exists.
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.XPATH, '''//*[@id="ChDivSummaryMore"]'''))
                    ).click()
                except Exception:
                    pass

                print('机构信息')
                try:
                    institute = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.XPATH, "/html/body/div[2]/div[1]/div[3]/div/div/div[3]/div/h3[2]"))).text
                except Exception:
                    institute = '无'
                print(institute + '\n')

                print('作者信息')
                authors, author_urls = get_authors(driver)
                print(f"作者列表: {', '.join(authors)}\n")

                print('摘要')
                try:
                    abstract = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "abstract-text"))).text
                except Exception:
                    abstract = '无'
                print(abstract + '\n')

                print('关键词')
                try:
                    # [:-1] drops the trailing separator CNKI appends.
                    keywords = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "keywords"))).text[:-1]
                except Exception:
                    keywords = '无'
                print(keywords + '\n')

                print('专辑')
                # Candidate (label, value) XPath pairs; the layout varies by
                # article type, so probe each pair and keep the first hit.
                # BUG FIX: the last pair used invalid steps ("/div/3/" and
                # "/div/4/"), which is not parsable XPath; corrected to
                # div[3] / div[4].
                xpaths = [
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[1]/span",
                     "/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[1]/p"),
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[2]/span",
                     "/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[2]/p"),
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[1]/span",
                     "/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[1]/p"),
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[2]/span",
                     "/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[2]/p"),
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[4]/ul/li[1]/span",
                     "/html/body/div[2]/div[1]/div[3]/div/div/div[4]/ul/li[1]/p")
                ]
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    futures = [executor.submit(get_choose_info, driver,
                                               xpath1, xpath2, '专辑:')
                               for xpath1, xpath2 in xpaths]
                    results = [future.result() for future in
                               concurrent.futures.as_completed(futures)]
                publication = next(
                    (result for result in results if result != '无'), '无')
                print(publication + '\n')

                print('专题')
                topic_xpath = "//*[@class='top-space']//span[text()='专题:']/following-sibling::p"
                try:
                    topic = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.XPATH, topic_xpath))).text
                except Exception:
                    topic = '无'
                print(topic + '\n')

                print('分类号')
                classification_xpath = "//*[@class='top-space']//span[text()='分类号:']/following-sibling::p"
                try:
                    classification = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.XPATH, classification_xpath))).text
                except Exception:
                    classification = '无'
                print(classification + '\n')

                url = driver.current_url
                res = [count, title, ', '.join(authors), institute, date,
                       source, publication, topic, classification, database,
                       quote, download, keywords, abstract, url]
                try:
                    with open(articles_file_path, 'a', encoding='gbk',
                              newline='') as f:
                        writer = csv.writer(f)
                        writer.writerow(res)
                    print('写入文章信息成功\n')
                except Exception as e:
                    print('写入文章信息失败:', str(e))
                    raise e

                print('开始获取作者信息\n')
                # Visit each author's profile page and persist the details.
                for author_name, author_url in zip(authors, author_urls):
                    author_info = get_author_info(driver, author_url, author_name)
                    save_author_info(author_info, authors_file_path)
                    print(f"写入作者\"{author_info['name']}\"信息")
            except Exception as e:
                print(f"第{count}条爬取失败: {str(e)}\n")
            finally:
                # Close the detail tab (if any) and return to the result list.
                n2 = driver.window_handles
                if len(n2) > 1:
                    driver.close()
                    driver.switch_to.window(n2[0])
                # BUG FIX: always advance the counter. The original ran
                # `continue` in the except branch before `count += 1`, which
                # desynchronized `count` from the page row index and retried
                # the failed record with stale indices forever.
                count += 1
            if count > papers_need:
                break
        # Move to the next result page.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//a[@id='PageNext']"))).click()
    print("\n爬取完毕!")


if __name__ == "__main__":
    keywords = ["大数据"]
    papers_per_keyword = 1200  # cap per keyword; actual count may be lower
    driver = webserver()
    try:
        for keyword in keywords:
            print(f"正在爬取关键词: {keyword}")
            res_unm = open_page(driver, keyword)
            # Never ask for more papers than the search actually returned.
            papers_need = papers_per_keyword if (papers_per_keyword <= res_unm) else res_unm
            crawl(driver, papers_need, keyword)
            time.sleep(5)  # brief pause before the next keyword
    finally:
        # BUG FIX: quit() terminates the whole WebDriver session; close()
        # only closed the current window and leaked the browser process.
        # The try/finally guarantees cleanup even if crawling raises.
        driver.quit()
爬虫代码解析
-
初始化浏览器:函数
webserver()
初始化了一个Edge浏览器实例,设置了页面加载策略和浏览器选项,以提高爬取效率。 -
打开搜索页面并输入关键词:函数
open_page()
打开知网的高级搜索页面,并输入关键词进行搜索。通过模拟鼠标和键盘操作,实现自动化输入和点击。 -
获取页面信息:函数
get_info()
和get_choose_info()
分别用于获取指定XPath位置的文本信息和条件选择信息。 -
获取作者信息:函数
get_authors()
用于获取作者列表及其个人主页链接,函数get_author_info()
用于访问作者个人主页并提取详细信息。 -
保存作者信息:函数
save_author_info()
将提取的作者信息保存到CSV文件中。 -
爬取文章信息:函数
crawl()
实现了主要的爬取逻辑。通过遍历搜索结果页面,依次点击每篇文章的标题,提取文章和作者的详细信息,并保存到CSV文件中。 -
主函数:主函数中定义了爬取的关键词和每个关键词需爬取的文章数量,依次调用上述函数完成爬取任务。
实践总结
通过以上代码,我成功地从中国知网中爬取到了关于“大数据”主题的学术论文及其作者信息。这些数据不仅包括文章的基本信息,还涵盖了作者的详细资料。通过对这些数据的分析和整理,可以进一步了解该领域的研究现状和发展趋势。
在实际操作中,遇到了一些挑战,如反爬虫机制和页面加载时间问题。通过调整代码和设置适当的等待时间,这些问题得到了有效解决。此外,使用多线程技术加快了数据爬取的速度,提高了工作效率。
总之,Python结合Selenium是实现网页数据爬取的有效工具,适用于各种数据采集任务。希望通过本文的分享,能为有类似需求的读者提供一些有益的参考。如果您有任何问题或建议,欢迎在评论区交流。