创新实训爬虫篇2——《山东大学相关知网文章爬虫》_# 获取作者单位 print('正在获取institute...') try:-CSDN博客

本文链接：https://blog.csdn.net/m0_62303445/article/details/139135105

前言

在当今数据驱动的时代，如何高效地从网络中获取有用的信息成为了研究人员和开发者们关心的重要问题。本文将介绍如何通过Python编写爬虫，结合Selenium自动化工具，从中国知网（CNKI）中爬取有关山东大学的学术数据。希望通过分享我的实践经验，能为有类似需求的读者提供一些参考。

爬虫工具简介

Selenium 是一个用于Web应用程序测试的工具。它提供了一系列的API，可以驱动浏览器执行各种操作，如点击、输入文本、获取页面元素等。通过Selenium，我们可以模拟人类操作，自动化地进行网页数据的爬取。

爬取目标

本次爬取的目标是获取中国知网中关于“大数据”主题的学术论文，并提取以下信息：

论文基本信息：标题、作者、发表日期、来源、数据库等。
作者详细信息：作者所在机构、标签、总发文量、总下载量、关注领域等。

爬虫代码实现

以下是完整的爬虫代码：

import time
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.action_chains import ActionChains
import os
import csv

def webserver():
    desired_capabilities = DesiredCapabilities.EDGE
    desired_capabilities["pageLoadStrategy"] = "none"

    options = webdriver.EdgeOptions()
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

    driver = webdriver.Edge(options=options)
    return driver

def open_page(driver, keyword):
    driver.get("https://kns.cnki.net/kns8/AdvSearch")
    time.sleep(1)

    opt = driver.find_element(By.CSS_SELECTOR, 'div.sort-list')
    driver.execute_script("arguments[0].setAttribute('style', 'display: block;')", opt)

    ActionChains(driver).move_to_element(driver.find_element(By.CSS_SELECTOR, 'li[data-val="RP"]')).perform()

    WebDriverWait(driver, 100).until(
        EC.presence_of_element_located((By.XPATH, '''//*[@id="gradetxt"]/dd[1]/div[2]/input'''))
    ).send_keys(keyword)

    WebDriverWait(driver, 100).until(
        EC.presence_of_element_located((By.XPATH, '''//*[@id="ModuleSearch"]/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/input'''))
    ).click()

    print("正在搜索，请稍后...")

    res_unm = WebDriverWait(driver, 100).until(EC.presence_of_element_located(
        (By.XPATH, '''//*[@id="countPageDiv"]/span[1]/em'''))
    ).text

    res_unm = int(res_unm.replace(",", ''))
    page_unm = int(res_unm / 20) + 1
    print(f"共找到 {res_unm} 条结果, {page_unm} 页。")
    return res_unm

def get_info(driver, xpath):
    try:
        element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
        return element.text
    except:
        return '无'

def get_choose_info(driver, xpath1, xpath2, str):
    try: 
        if WebDriverWait(driver, 1).until(EC.presence_of_element_located((By.XPATH, xpath1))).text == str:
            return WebDriverWait(driver, 1).until(EC.presence_of_element_located((By.XPATH, xpath2))).text
        else:
            return '无'
    except:
        return '无'

def get_authors(driver):
    try:
        author_elements = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//h3[@id='authorpart']/span/a")))
        authors = [author.text for author_elements in author_elements]
        author_urls = [author.get_attribute("href") for author in author_elements]
        return authors, author_urls
    except:
        return ['无'], []

def get_author_info(driver, author_url, author_name):
    driver.get(author_url)
    time.sleep(5)  # 等待页面加载完成

    author_info = {}
    author_info["name"] = author_name

    try:
        school_element = driver.find_element(By.CSS_SELECTOR, "#kcms-author-info h3 > span > a")
        author_info["school"] = school_element.text.strip()
    except:
        author_info["school"] = '无'

    try:
        tags_element = driver.find_element(By.CSS_SELECTOR, "#kcms-author-info h3:nth-of-type(2) > span")
        author_info["tags"] = tags_element.text.strip().replace(";", "; ")
    except:
        author_info["tags"] = '无'

    try:
        author_info["publication_count"] = driver.find_element(By.XPATH, "//h5/span[text()='总发文量：']/following-sibling::em").text.strip()
    except:
        author_info["publication_count"] = '无'

    try:
        author_info["download_count"] = driver.find_element(By.XPATH, "//h5/span[text()='总下载量：']/following-sibling::em").text.strip()
    except:
        author_info["download_count"] = '无'

    try:
        focus_areas_element = driver.find_element(By.CSS_SELECTOR, "#kcms-author-keyword .listcont ul")
        focus_areas = [item.text for item in focus_areas_element.find_elements(By.TAG_NAME, "li")]
        author_info["focus_areas"] = "; ".join(focus_areas)
    except:
        author_info["focus_areas"] = '无'

    author_info["url"] = author_url

    return author_info

def save_author_info(author_info, file_path):
    with open(file_path, "a", encoding="gbk", newline='') as file:
        writer = csv.writer(file)
        writer.writerow([
            author_info["name"],
            author_info["school"],
            author_info["tags"],
            author_info["publication_count"],
            author_info["download_count"],
            author_info["focus_areas"],
            author_info["url"]
        ])

def crawl(driver, papers_need, theme):
    count = 1

    articles_file_path = f"{theme}_文章.csv"
    authors_file_path = f"{theme}_作者.csv"
    
    # Initialize article CSV file
    if os.path.exists(articles_file_path) and os.path.getsize(articles_file_path) > 0:
        with open(articles_file_path, "r", encoding='gbk') as file:
            lines = file.readlines()
            last_line = lines[-1].strip()
            count = int(last_line.split(",")[0]) + 1
    else:
        with open(articles_file_path, "w", encoding='gbk', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["编号", "标题", "作者", "单位", "日期", "来源", "专辑", "专题", "分类号", "数据库", "引用", "下载", "关键词", "摘要", "URL"])

    # Initialize author CSV file
    if not os.path.exists(authors_file_path):
        with open(authors_file_path, "w", encoding='gbk', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["姓名", "学校", "作者标签", "总发文量", "总下载量", "关注领域", "URL"])

    for i in range(count // 20):
        time.sleep(1)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@id='PageNext']"))).click()

    print(f"从第{count}条开始爬取\n")

    while count <= papers_need:
        time.sleep(1)

        title_list = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "fz14")))
        for i in range((count-1) % 20 + 1, 21):
            print(f"正在爬取第{count}条(本页第{i}条)\n")

            try:
                term = (count-1) % 20 + 1
                
                print('基础信息')
                title_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[2]'''
                author_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[3]'''
                source_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[4]'''
                date_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[5]'''
                database_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[6]'''
                quote_xpath = f'''//*[@id="gridTable"]/div/div/table/tbody/tr[{term}]/td[7]'''
                download_xpath = f'''//*[@id="

gridTable"]/div/div/table/tbody/tr[{term}]/td[8]'''
                xpaths = [title_xpath, author_xpath, source_xpath, date_xpath, database_xpath, quote_xpath, download_xpath]
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future_elements = [executor.submit(get_info, driver, xpath) for xpath in xpaths]
                title, authors, source, date, database, quote, download = [future.result() for future in future_elements]
                if not quote.isdigit():
                    quote = '0'
                if not download.isdigit():
                    download = '0'
                print(f"{title} | {authors} | {source} | {date} | {database} | {quote} | {download}\n")
               
                title_list[i-1].click()
                
                n = driver.window_handles
                driver.switch_to.window(n[-1])
                time.sleep(1)
                
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.XPATH, '''//*[@id="ChDivSummaryMore"]'''))
                    ).click()
                except:
                    pass
                
                print('机构信息')
                try:
                    institute = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/div[1]/div[3]/div/div/div[3]/div/h3[2]"))).text
                except:
                    institute = '无'
                print(institute+'\n')
                
                print('作者信息')
                authors, author_urls = get_authors(driver)
                print(f"作者列表: {', '.join(authors)}\n")

                print('摘要')
                try:
                    abstract = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "abstract-text"))).text
                except:
                    abstract = '无'
                print(abstract+'\n')
                
                print('关键词')
                try:
                    keywords = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "keywords"))).text[:-1]
                except:
                    keywords = '无'      
                print(keywords+'\n')
                
                print('专辑')
                xpaths = [
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[1]/span", "/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[1]/p"),
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[2]/span", "/html/body/div[2]/div[1]/div[3]/div/div/div[6]/ul/li[2]/p"),
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[1]/span", "/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[1]/p"),
                    ("/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[2]/span", "/html/body/div[2]/div[1]/div[3]/div/div/div[7]/ul/li[2]/p"),
                    ("/html/body/div[2]/div[1]/div/3/div/div/div[4]/ul/li[1]/span", "/html/body/div[2]/div[1]/div/3/div/div/div/4/ul/li[1]/p")
                ]
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    futures = [executor.submit(get_choose_info, driver, xpath1, xpath2, '专辑：') for xpath1, xpath2 in xpaths]
                    results = [future.result() for future in concurrent.futures.as_completed(futures)]
                publication = next((result for result in results if result != '无'), '无')
                print(publication+'\n')
                
                print('专题')
                topic_xpath = "//*[@class='top-space']//span[text()='专题：']/following-sibling::p"
                
                try:
                    topic = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, topic_xpath))).text
                except:
                    topic = '无'
                print(topic+'\n')

                print('分类号')
                classification_xpath = "//*[@class='top-space']//span[text()='分类号：']/following-sibling::p"
                try:
                    classification = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, classification_xpath))).text
                except:
                    classification = '无'
                print(classification+'\n')
                
                url = driver.current_url

                res = [count, title, ', '.join(authors), institute, date, source, publication, topic, classification, database, quote, download, keywords, abstract, url]

                try:
                    with open(articles_file_path, 'a', encoding='gbk', newline='') as f:
                        writer = csv.writer(f)
                        writer.writerow(res)
                        print('写入文章信息成功\n')
                except Exception as e:
                    print('写入文章信息失败:', str(e))
                    raise e
                
                print('开始获取作者信息\n')
                # 获取作者详细信息并保存
                for author_name, author_url in zip(authors, author_urls):
                    author_info = get_author_info(driver, author_url, author_name)
                    save_author_info(author_info, authors_file_path)
                    print(f"写入作者\"{author_info['name']}\"信息")

            except Exception as e:
                print(f"第{count}条爬取失败: {str(e)}\n")
                continue
            
            finally:
                n2 = driver.window_handles
                if len(n2) > 1:
                    driver.close()
                    driver.switch_to.window(n2[0])
                count += 1
                if count > papers_need:
                    break

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[@id='PageNext']"))).click()
        
    print("\n爬取完毕！")

if __name__ == "__main__":
    keywords = ["大数据"]
    papers_per_keyword = 1200

    driver = webserver()

    for keyword in keywords:
        print(f"正在爬取关键词: {keyword}")
        res_unm = open_page(driver, keyword)
        papers_need = papers_per_keyword if (papers_per_keyword <= res_unm) else res_unm
        crawl(driver, papers_need, keyword)
        time.sleep(5)  # 等待片刻再爬取下一个关键词

    driver.close()

爬虫代码解析

初始化浏览器：函数 webserver() 初始化了一个Edge浏览器实例，设置了页面加载策略和浏览器选项，以提高爬取效率。
打开搜索页面并输入关键词：函数 open_page() 打开知网的高级搜索页面，并输入关键词进行搜索。通过模拟鼠标和键盘操作，实现自动化输入和点击。
获取页面信息：函数 get_info() 和 get_choose_info() 分别用于获取指定XPath位置的文本信息和条件选择信息。
获取作者信息：函数 get_authors() 用于获取作者列表及其个人主页链接，函数 get_author_info() 用于访问作者个人主页并提取详细信息。
保存作者信息：函数 save_author_info() 将提取的作者信息保存到CSV文件中。
爬取文章信息：函数 crawl() 实现了主要的爬取逻辑。通过遍历搜索结果页面，依次点击每篇文章的标题，提取文章和作者的详细信息，并保存到CSV文件中。
主函数：主函数中定义了爬取的关键词和每个关键词需爬取的文章数量，依次调用上述函数完成爬取任务。