import random
import re
import time
import os
import requests
from bs4 import BeautifulSoup
from pymysql import Connection
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
# Keyword class: bundles a search keyword with the titles and URLs scraped for it
class Keyword:
def __init__(self, keyword: str, title_list: list, url_list: list):
self.keyword = keyword
self.title_list = title_list
self.url_list = url_list
    def get_url_list_from_keyword(self):
        # Return the URL list collected for this keyword
        return self.url_list
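# A minimal usage sketch (the keyword, title and URL below are made-up examples, not real data):
#   kw = Keyword("知识图谱", ["基于知识图谱的问答方法研究"], ["https://kns.cnki.net/..."])
#   kw.get_url_list_from_keyword()  # -> the list of detail-page URLs gathered for this keyword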
# Paper class: holds the details scraped from one paper's page
class Paper:
def __init__(
self,
title: str,
author_data: dict[str, list[str]],
unit_data: dict[str, str],
author_unit: dict[str, list[str]],
abstract_text: str,
keywords_data: list[str],
name,
time,
content_string
):
        self.title = title  # paper title
        self.author_data = author_data  # author names with their affiliation numbers
        self.unit_data = unit_data  # affiliation number -> affiliation name
        self.author_unit = author_unit  # author name -> list of affiliation names
        self.abstract_text = abstract_text  # abstract
        self.keywords_data = keywords_data  # keywords
        self.name = name  # journal name | conference name
        self.time = time  # journal date | conference date
        self.content_string = content_string  # table of contents
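# Illustrative shape of a Paper built by get_paper() further below (all values here are hypothetical):
#   Paper("示例论文题目", {'张三': ['1']}, {'1': 'a大学'}, {'张三': ['a大学']},
#         "示例摘要……", ['关键词1', '关键词2'], "期刊名:示例期刊", "日期:2023-01-01", "无目录")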
# Open CNKI and run a search for the keyword
def driver_open(driver, keyword):
    # Open the CNKI home page and search for the keyword
    url = "https://www.cnki.net/"
    driver.get(url)  # open the CNKI home page
    time.sleep(5)  # wait for the page to load
    # Type the keyword into the search box
    driver.find_element(By.CSS_SELECTOR, '#txt_SearchText').send_keys(keyword)
    time.sleep(5)  # wait for the input to settle
    # Click the search button
    driver.find_element(By.CSS_SELECTOR,
                        'body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
    time.sleep(7)
    # Click the "Chinese" tab
    try:
        driver.find_element(By.CSS_SELECTOR, 'body > div.wrapper > div.top-doctype > div > div > div > a.ch').click()
        time.sleep(5)
    except NoSuchElementException:
        # Fall back to this selector when the page defaults to the "All" or "English" tab
        driver.find_element(By.CSS_SELECTOR, '#ModuleSearch > div:nth-child(2) > div > div > div > div > a.ch').click()
        time.sleep(5)
    content = driver.page_source.encode('utf-8')  # grab the source of the results page
    soup = BeautifulSoup(content, 'html.parser')
    return soup
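# Usage sketch (assumes Microsoft Edge and its WebDriver are installed; the keyword is a made-up example):
#   driver = webdriver.Edge()
#   soup = driver_open(driver, "知识图谱")
# The CSS selectors above are tied to the CNKI page layout at the time of writing and may need
# updating if the site changes.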
def spider_url(soup, url_list: list, title_list: list) -> tuple[list, list]:
    # Collect every paper title and URL on the current results page
    tbody = soup.find_all('tbody')  # the results table body
    tbody = BeautifulSoup(str(tbody[0]), 'html.parser')  # re-parse the first tbody
    data_td_name = tbody.find_all('a', attrs={'class': 'fz14'})  # every a.fz14 tag (one per paper)
    for i in data_td_name:
        url = i['href']  # paper URL
        title = i.text.strip("\n").strip()  # paper title
        url_list.append(url)
        title_list.append(title)
    return url_list, title_list
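# For reference, a result row's link roughly looks like the (hypothetical) markup
#   <a class="fz14" href="https://kns.cnki.net/kcms2/article/abstract?...">示例论文标题</a>
# spider_url appends the href to url_list and the stripped text "示例论文标题" to title_list.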
# Click "next page" and return the soup of the new results page
def change_page(driver):
driver.find_element(By.CSS_SELECTOR, '#Page_next_top').click()
time.sleep(10)
content = driver.page_source.encode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
return soup
# Read the current page number and the total page count
def get_pn(soup):
    # Check whether pagination info is present
    if soup.find_all('span', attrs={'class': 'countPageMark'}):
        pn = soup.find_all('span', attrs={'class': 'countPageMark'})[0].text
        current_pn = int(pn.split("/")[0])  # current page number
        total_pn = int(pn.split("/")[1])  # total number of pages
    else:
        current_pn = 0
        total_pn = 1
return current_pn, total_pn
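# Example of what get_pn reads (hypothetical markup): a pagination span such as
#   <span class="countPageMark">3/120</span>
# yields (3, 120); when the span is missing, the fallback (0, 1) tells the main loop
# that there is only a single page of results.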
def get_keyword_list(path) -> list[str]:
    # Parse the keyword file: one keyword per line, optionally ending with ";"
    keyword_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            keyword_list.append(line.strip().strip(";"))
return keyword_list
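# Expected input format (hypothetical example file): one keyword per line, optionally ending
# with a semicolon, e.g.
#   知识图谱;
#   深度学习;
# for which get_keyword_list returns ['知识图谱', '深度学习'].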
# Scrape the detail page of every paper in url_list
def get_paper(url_list: list) -> list[Paper]:
paper_list = []
    # Try to scrape each paper's detail page
i = 0
for url in url_list:
i += 1
print(f"开始爬取第{i}个论文页面")
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.46",
"Cache-Control": "no-cache", # 禁用缓存
"Pragma": "no-cache"
}
            response = requests.get(url, headers=headers, timeout=30)  # fetch the detail page (30 s timeout)
html = response.text
soup = BeautifulSoup(html, "html.parser")
            # Paper title
            title = soup.find_all("h1")[0].text
            # Author names, e.g. {'张三': ['1'], '李四': ['1', '2', '3'], '王五': ['1']}
            author_data: dict[str, list[str]] = {}
            spans = soup.select("h3.author span a")  # each a tag holds an author name and, optionally, a sup tag with affiliation numbers
            for span_a in spans:
                # Pull the author name out with a regex; examples: "张三1,", "李四1,2,3", "王五"
                author_name = re.match(r'([^0-9 ,]+)', span_a.text).group(0)
                # A direct <sup> child means the name carries affiliation numbers
                if span_a.find_all("sup", recursive=False):  # non-recursive: only direct sup children of this a tag
                    # Affiliation numbers attached to the name, e.g. "1" or "1,2,3"
                    author_number = span_a.find("sup").text
                    number_list = []
                    # The author may belong to several affiliations
                    if "," in author_number:
                        number_list = author_number.split(",")
                    else:
                        number_list.append(author_number)
                    # Map the name to its list of affiliation numbers
                    author_data[author_name] = number_list
                else:
                    author_data[author_name] = ['1']
            # Affiliations, e.g. {'1': 'a研究所'}; {'1': 'a大学', '2': 'x研究所'}; {'1': '未查找到单位'}
            unit_data: dict[str, str] = {}
            units = soup.select("h3 a.author")  # a.author tags that carry the affiliation strings
            # Only parse affiliations when the page actually lists them
            if units:
                for unit in units:
                    # A leading "number." means several affiliations are listed
                    if "." in unit.text:  # e.g. "1. 国防大学政治学院"
                        key = str(unit.text.split(".")[0])  # affiliation number
                        value = unit.text.split(".")[1]  # affiliation name
                        unit_data[key] = value
                    else:
                        unit_data["1"] = unit.text  # e.g. "辽宁大学"
            else:  # units == []: the detail page does not show any affiliation
                unit_data["1"] = "未查找到单位"
            # Join authors with their affiliation names, e.g.
            #   {'张三': ['a研究所']}
            #   {'李四': ['a大学', '辽宁大学', 'b研究所'], '张三': ['b研究所']}
            #   {'王五': ['未查找到单位']}
            author_unit: dict[str, list[str]] = {}
            for name, numbers in author_data.items():  # iterate over (name, affiliation numbers) pairs
                units = []
                for number in numbers:
                    unit = unit_data.get(number)
                    units.append(unit)
                author_unit[name] = units
            # Abstract
            abstract = soup.find_all("span", attrs={"class": "abstract-text"})[0]
abstract_text = abstract.text
            # Keywords
            keywords_data = []
            keywords = soup.find_all("a", attrs={"name": "keyword"})
for keyword in keywords:
key_word = keyword.text.strip()
keywords_data.append(key_word.strip(";"))
            # Journal name and date | conference name and date
            name = ""
            time = ""
            top_tip_links = soup.select("div.top-tip span a")
            if len(top_tip_links) == 2:
                name = "期刊名:" + top_tip_links[0].text  # journal name
                time = "日期:" + top_tip_links[1].text  # publication date
            elif len(top_tip_links) == 1:
                # Only a journal name, no date
                name = "期刊名:" + top_tip_links[0].text  # journal name
                time = "无日期"
            elif len(top_tip_links) == 0:
                # No journal name or date: fall back to the conference fields
                if soup.find_all('span', attrs={"class": "rowtit"}, string='会议名称:'):
                    row_name = soup.find_all('span', attrs={"class": "rowtit"}, string='会议名称:')[0]
                    name = "会议名称:" + row_name.find_next_sibling('p').text  # conference name
                else:
                    name = "未查找到会议名称"
                # Conference date, if present
                if soup.find_all('span', attrs={"class": "rowtit"}, string='会议时间:'):
                    row_time = soup.find_all('span', attrs={"class": "rowtit"}, string='会议时间:')[0]
                    time = "会议时间:" + row_time.find_next_sibling('p').text  # conference date
                else:
                    time = "未查找到会议时间"
            # Table of contents
            cont = soup.find_all("h5", string='文章目录')
            contents = soup.select('ul.catalog-list li')
            # Only build the string when a table of contents exists
            if cont:
                content_string = ""
                for content in contents:
                    content_string += content['title'] + "\n"
            else:
                content_string = "无目录"
            # Wrap everything scraped into a Paper object
paper = Paper(
title, author_data, unit_data, author_unit, abstract_text, keywords_data, name, time, content_string
)
paper_list.append(paper)
except Exception as e:
print(f"爬取第{i}个论文页面{url}时出现错误了{e}")
return paper_list
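# Standalone usage sketch (the URL is a placeholder, not a real article):
#   papers = get_paper(["https://kns.cnki.net/kcms2/article/abstract?v=..."])
#   print(papers[0].title, papers[0].author_unit)
# Pages that fail to download or parse are skipped with a printed error, so the returned
# list may be shorter than url_list.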
# Create the MySQL table for a keyword
def create_table(keyword: str):
    # Use the keyword itself as the table name
    table_name = keyword
    # SQL statement that creates the table (all fields are stored as text)
    create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS `{table_name}` (
    论文标题 varchar(100) PRIMARY KEY,
    作者姓名 varchar(100),
    作者单位 varchar(100),
    摘要 Text,
    关键字 varchar(100),
    期刊名或会议名称 varchar(100),
    日期或会议时间 varchar(50),
    目录 Text
    )
    """
    cursor.execute(create_table_sql)
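# For a hypothetical keyword "知识图谱", the statement sent to MySQL is roughly:
#   CREATE TABLE IF NOT EXISTS `知识图谱` (论文标题 varchar(100) PRIMARY KEY, 作者姓名 varchar(100), ...)
# Note that the keyword is interpolated directly as the table name, so it must be a valid
# (and trusted) MySQL identifier.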
# Insert one paper's data into the keyword's table
def insert_sql(keyword: str, paper: Paper, i: int):
    authors = ', '.join(paper.author_unit.keys())  # author names
    units = ', '.join([value[0] for value in paper.author_unit.values() if value and value[0]])  # affiliations
    keywords = ', '.join(paper.keywords_data)  # keywords
    # Fill in fallbacks for fields that may be empty
    if not units:
        units = "无单位"
    if not paper.abstract_text:
        paper.abstract_text = "无摘要"
    if not keywords:
        keywords = "无关键词"
    if not paper.name:
        paper.name = "无期刊名或会议名称"
    if not paper.time:
        paper.time = "无日期或会议时间"
    if not paper.content_string:
        paper.content_string = "无目录"
    # Build the parameterized INSERT statement
    insert_table_sql = f"""
    INSERT IGNORE INTO `{keyword}` (论文标题, 作者姓名, 作者单位, 摘要, 关键字, 期刊名或会议名称, 日期或会议时间, 目录)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
    """
    # Run the insert
    print(f"插入第{i}个数据")
    cursor.execute(insert_table_sql, (
        paper.title, authors, units, paper.abstract_text, keywords, paper.name, paper.time, paper.content_string
    ))
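# Usage sketch (paper stands for a hypothetical Paper instance like the one illustrated after the Paper class):
#   create_table("知识图谱")
#   insert_sql("知识图谱", paper, 1)
# INSERT IGNORE means a duplicate 论文标题 (the primary key) is silently skipped rather than raising an error.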
if __name__ == '__main__':
    # Collect the URLs and titles scraped for every keyword
    Keyword_data: list[Keyword] = []  # one Keyword object per search term
    path = r"E:\python_learn\Python爬虫\demo\测试关键字.txt"
    keyword_list = get_keyword_list(path)  # all keywords to search for
for keyword in keyword_list:
driver = webdriver.Edge()
        soup = driver_open(driver, keyword)  # open CNKI and run the search
        url_list = []  # URLs collected for this keyword
        title_list = []  # titles collected for this keyword
        current_pn, total_pn = get_pn(soup)  # current page number and total page count
        # Scrape the current results page, then keep turning pages until the last one
        for pn in range(max(current_pn, 1), total_pn + 1):
            # Scrape the page that is currently loaded
            spider_url(soup, url_list, title_list)
            # Pause for a random interval between 1 and 5 seconds
            random_interval = random.uniform(1, 5)
            time.sleep(random_interval)
            if current_pn != 0 and pn < total_pn:  # only turn the page when a next page exists
                soup = change_page(driver)  # move to the next page and re-parse it
        driver.close()  # close the browser window for this keyword
        Keyword_data.append(Keyword(keyword, title_list, url_list))
    # Scrape the detail pages of every paper for each keyword and save the results to MySQL
    # Connect to the MySQL server
    conn = Connection(
        host='localhost',  # host name (or IP address)
        port=3306,  # port, 3306 by default
        user='root',  # user
        password='123456',  # password
        autocommit=True  # commit automatically
    )
    # Get a cursor object
    cursor = conn.cursor()
    conn.select_db("知网论文信息")  # select the database
for Keyword_class in Keyword_data:
        keyword = Keyword_class.keyword  # the search keyword
        url_list = Keyword_class.url_list  # every paper URL collected for this keyword
        paper_list: list[Paper] = get_paper(url_list)  # one Paper object per detail page
        try:
            # Save url_list to a local file as a backup
            os.makedirs('./临时url', exist_ok=True)  # make sure the target folder exists
            path = f'./临时url/{keyword}.txt'
            with open(path, 'w', encoding='utf-8') as f:
                # Write one URL per line
                for url in url_list:
                    f.write(url + "\n")
        except Exception as e:
            print(f"写入文件时出现错误了{e}")
        # Create the MySQL table for this keyword
create_table(keyword)
        # Insert every paper into the keyword's table
        for i, paper in enumerate(paper_list, start=1):
            insert_sql(keyword, paper, i)
    # Close the connection
cursor.close()
conn.close()