from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
import requests
import re
from pymysql import Connection
class Keyword:
    """Search results collected for one keyword.

    Holds the keyword itself plus two parallel lists: the paper titles
    and the corresponding detail-page URLs found on CNKI.
    """

    def __init__(self, keyword: str, title_list: list, url_list: list):
        self.keyword = keyword        # the search keyword
        self.title_list = title_list  # paper titles, parallel to url_list
        self.url_list = url_list      # detail-page URLs, parallel to title_list
class Paper:
    """One parsed CNKI paper detail page.

    Pure data holder: every attribute is filled in by `get_paper` and read
    back by `insert_sql` when the row is written to MySQL.
    """

    def __init__(
        self,
        title: str,
        author_data: dict[str, list[str]],
        unit_data: dict[str, str],
        author_unit: dict[str, list[str]],
        abstract_text: str,
        keywords_data: list[str],
        name,
        time,
        content_string: str,
    ):
        self.title = title                    # paper title
        self.author_data = author_data        # author name -> affiliation number strings
        self.unit_data = unit_data            # affiliation number -> affiliation name
        self.author_unit = author_unit        # author name -> affiliation names
        self.abstract_text = abstract_text    # abstract text
        self.keywords_data = keywords_data    # keyword strings
        self.name = name                      # journal name or conference name (may be None)
        self.time = time                      # journal date or conference date (may be None)
        self.content_string = content_string  # table of contents, or "无目录"
def driver_open(driver, keyword):
    """Open the CNKI home page, search for `keyword`, switch the result list
    to Chinese-language papers, and return the parsed result page.

    The fixed sleeps give the heavily scripted pages time to load.

    :param driver: a selenium WebDriver instance
    :param keyword: search term typed into the CNKI search box
    :return: BeautifulSoup of the first result page
    """
    url = "https://www.cnki.net/"
    driver.get(url)  # open the CNKI home page
    time.sleep(5)    # wait for the page to render
    # Type the keyword into the search box.
    driver.find_element(By.CSS_SELECTOR, '#txt_SearchText').send_keys(keyword)
    time.sleep(5)
    # Click the search button.
    driver.find_element(By.CSS_SELECTOR,
                        'body > div.wrapper.section1 > div.searchmain > div > div.input-box > input.search-btn').click()
    time.sleep(7)
    # Restrict results to Chinese papers.
    try:
        driver.find_element(By.CSS_SELECTOR, 'body > div.wrapper > div.top-doctype > div > div > div > a.ch').click()
        time.sleep(5)
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). When the page opened on the
        # "all"/"English" library the toggle lives at a different selector.
        driver.find_element(By.CSS_SELECTOR, '#ModuleSearch > div:nth-child(2) > div > div > div > div > a.ch').click()
        time.sleep(5)
    content = driver.page_source.encode('utf-8')  # raw HTML of the result page
    soup = BeautifulSoup(content, 'html.parser')
    return soup
def spider_url(soup, url_list: list, title_list: list) -> tuple[list, list]:
    """Collect every paper's title and URL from one result page.

    Appends to (and returns) the caller-supplied lists so results
    accumulate across pages.
    """
    # The result rows live inside the first <tbody>; re-parse it so the
    # anchor search is scoped to that table only.
    result_tables = soup.find_all('tbody')
    table_soup = BeautifulSoup(str(result_tables[0]), 'html.parser')
    # Each result title is an <a class="fz14"> carrying the detail-page URL.
    for anchor in table_soup.find_all('a', attrs={'class': 'fz14'}):
        url_list.append(anchor['href'])
        title_list.append(anchor.text.strip("\n").strip())
    return url_list, title_list
def change_page(driver):
    """Click the 'next page' button and return the parsed HTML of the new page."""
    next_button = driver.find_element(By.CSS_SELECTOR, '#Page_next_top')
    next_button.click()
    time.sleep(10)  # give the next result page time to load
    page_html = driver.page_source.encode('utf-8')
    return BeautifulSoup(page_html, 'html.parser')
def get_pn(soup):
    """Read the "current/total" page counter from a result page.

    :return: (current page number, total page number) as ints
    """
    marker_text = soup.find_all('span', attrs={'class': 'countPageMark'})[0].text
    # The counter is rendered as e.g. "3/17".
    current_text, _, total_text = marker_text.partition("/")
    return int(current_text), int(total_text)
def get_keyword_list(path) -> list[str]:
    """Read search keywords from a text file, one keyword per line.

    Strips surrounding whitespace and any ';' terminator. Blank lines are
    skipped — the original kept them as empty strings, which later became
    invalid (empty) MySQL table names. Using `.strip()` instead of
    `.strip("\\n")` also handles CRLF files, where a trailing '\\r' used to
    shield the ';' from removal.

    :param path: path of the UTF-8 keyword file
    :return: list of non-empty keyword strings
    """
    keyword_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            keyword = line.strip().strip(";")
            if keyword:
                keyword_list.append(keyword)
    return keyword_list
def get_paper(url_list: list) -> list[Paper]:
    """Download and parse every paper detail page in `url_list`.

    Pages that fail to download or parse are skipped with a printed error,
    so the returned list may be shorter than `url_list`.

    :param url_list: CNKI detail-page URLs
    :return: one Paper per successfully parsed page
    """
    paper_list = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.46"
    }
    for i, url in enumerate(url_list, start=1):
        print(f"开始爬取第{i}个论文页面")
        try:
            # Timeout added so one dead server cannot hang the whole crawl.
            response = requests.get(url, headers=headers, timeout=30)
            soup = BeautifulSoup(response.text, "html.parser")
            # Paper title.
            title = soup.findAll("h1")[0].text
            # Author names -> list of affiliation index strings ("1", "2", ...).
            author_data: dict[str, list[str]] = {}
            for span_a in soup.select("h3.author span a"):
                # The visible text is "Name" or "Name1,2"; take the name part.
                match = re.match(r'([^0-9 ,]+)', span_a.text)
                if match is None:
                    # No readable name in this span — skip it instead of
                    # letting .group(0) raise and abort the whole paper.
                    continue
                author_name = match.group(0)
                # A <sup> child lists this author's affiliation number(s).
                if span_a.find_all(recursive=False):
                    author_number = span_a.find("sup").text
                    if "," in author_number:
                        number_list = author_number.split(",")
                    else:
                        number_list = [author_number]
                    author_data[author_name] = number_list
                else:
                    # No superscript: single affiliation, implicitly "1".
                    author_data[author_name] = ['1']
            # Affiliation number -> affiliation name.
            unit_data: dict[str, str] = {}
            units = soup.select("h3 a.author")
            if units:
                for unit in units:
                    # "1.Some University" style entries carry an explicit number.
                    if "." in unit.text:
                        number, _, unit_name = unit.text.partition(".")
                        # partition keeps dots inside the name; the original
                        # split(".")[1] truncated names like "Univ. of X".
                        unit_data[str(number)] = unit_name
                    else:
                        unit_data["1"] = unit.text
            else:
                # No affiliation information on the page.
                unit_data["1"] = ""
            # Author name -> list of affiliation names.
            author_unit: dict[str, list[str]] = {}
            for author_name, numbers in author_data.items():
                affiliations = []
                for number in numbers:
                    affiliation = unit_data.get(number)
                    if affiliation is not None:
                        affiliations.append(affiliation)
                author_unit[author_name] = affiliations
            # Abstract.
            abstract_text = soup.findAll("span", attrs={"class": "abstract-text"})[0].text
            # Keywords, with surrounding whitespace and trailing ';' removed.
            keywords_data = [
                keyword.text.strip().strip(";")
                for keyword in soup.findAll("a", attrs={"name": "keyword"})
            ]
            # Journal name/date (journal papers) or conference name/date.
            # Renamed from name/time — the local `time` shadowed the module.
            source_name = None
            source_date = None
            top_links = soup.select("div.top-tip span a")  # hoisted: was selected 5x
            if len(top_links) == 2:
                source_name = top_links[0].text  # journal name
                source_date = top_links[1].text  # publication date
            elif len(top_links) == 1:
                # Journal name present, no date.
                source_name = top_links[0].text
                source_date = "无日期"
            elif len(top_links) == 0:
                # Conference paper: name and date live in labelled rows.
                row_name = soup.find_all('span', attrs={"class": "rowtit"}, string='会议名称:')[0]
                row_time = soup.find_all('span', attrs={"class": "rowtit"}, string='会议时间:')[0]
                source_name = row_name.find_next_sibling('p').text
                source_date = row_time.find_next_sibling('p').text
            # Table of contents, when the page has one.
            if soup.find_all("h5", string='文章目录'):
                entries = soup.select('ul.catalog-list li')
                content_string = "".join(entry['title'] + "\n" for entry in entries)
            else:
                content_string = "无目录"
            paper_list.append(Paper(
                title, author_data, unit_data, author_unit,
                abstract_text, keywords_data, source_name, source_date, content_string,
            ))
        except Exception as e:
            # One bad page must not abort the whole batch.
            print(f"爬取{url}时出现错误{e}")
    return paper_list
def create_table(keyword: str):
    """Create (if absent) the MySQL table that stores papers for one keyword.

    The table is named after the keyword itself; backticks allow names
    containing characters MySQL would otherwise reject. Relies on the
    module-level `cursor`.

    :param keyword: search keyword, used verbatim as the table name
    """
    table_name = keyword
    # 作者姓名 holds a comma-joined string of names, so it must be a text
    # column — the original `int(50)` could never store them. Likewise
    # 日期或会议时间 receives strings such as "无日期", which a DATE column
    # rejects, so it is stored as text too.
    create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS `{table_name}` (
        论文标题 varchar(100) PRIMARY KEY,
        作者姓名 varchar(100),
        作者单位 varchar(100),
        摘要 Text,
        关键字 varchar(100),
        期刊名或会议名称 varchar(30),
        日期或会议时间 varchar(30),
        目录 Text
    )
    """
    cursor.execute(create_table_sql)
def insert_sql(keyword: str, Paper: Paper):
    """Insert one Paper as a row into the keyword's table (duplicates ignored
    via INSERT IGNORE on the title primary key). Relies on the module-level
    `cursor`.

    :param keyword: table name (created by `create_table`)
    :param Paper: parsed paper to persist
    """
    authors = ', '.join(Paper.author_data.keys())  # author names
    # Join the affiliation *names* — the original joined the numeric
    # affiliation indices from author_data by mistake.
    units = ', '.join(Paper.unit_data.values())
    keywords = ', '.join(Paper.keywords_data)  # keyword strings
    # `str.join` never returns None, so the original `is None` checks were
    # dead code; test for emptiness instead, and keep defaults in locals
    # rather than mutating the caller's Paper object.
    if not units:
        units = "无单位"
    abstract = Paper.abstract_text if Paper.abstract_text else "无摘要"
    if not keywords:
        keywords = "无关键词"
    source_name = Paper.name if Paper.name else "无期刊名或会议名称"
    source_date = Paper.time if Paper.time else "无日期或会议时间"
    contents = Paper.content_string if Paper.content_string else "无目录"
    # Values go through placeholders; the table name cannot be parameterized,
    # so it is backtick-quoted.
    insert_table_sql = f"""
    INSERT IGNORE INTO `{keyword}` (论文标题, 作者姓名, 作者单位, 摘要, 关键字, 期刊名或会议名称, 日期或会议时间, 目录)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
    """
    cursor.execute(insert_table_sql, (
        Paper.title, authors, units, abstract, keywords,
        source_name, source_date, contents,
    ))
if __name__ == '__main__':
    # Phase 1: for every keyword, collect the title/URL of each search result.
    Keyword_data: list[Keyword] = []  # accumulated results, one entry per keyword
    # Raw string: the original literal relied on "\p", "\d" etc. not being
    # escape sequences, which raises SyntaxWarning on modern Python.
    path = r"E:\python_learn\Python爬虫\demo\测试关键字.txt"
    keyword_list = get_keyword_list(path)  # all keywords to search for
    for keyword in keyword_list:
        driver = webdriver.Edge()
        soup = driver_open(driver, keyword)  # open CNKI and run the search
        url_list = []    # detail-page URLs for this keyword
        title_list = []  # paper titles for this keyword
        current_pn, total_pn = get_pn(soup)  # current page / total pages
        # Scrape the current page, then page forward and refresh the soup.
        # The original discarded change_page's return value, so it scraped
        # the first page over and over — and never scraped the last page.
        while True:
            spider_url(soup, url_list, title_list)
            time.sleep(3)
            if current_pn >= total_pn:
                break
            soup = change_page(driver)  # advance to the next result page
            current_pn += 1
        driver.close()  # done with this keyword's browser window
        Keyword_data.append(Keyword(keyword, title_list, url_list))
    # Phase 2: crawl each paper detail page and persist it to MySQL.
    conn = Connection(
        host='localhost',   # host name (or IP address)
        port=3306,          # MySQL port, default 3306
        user='root',        # account
        password='123456',  # password
        autocommit=True     # commit after every statement
    )
    cursor = conn.cursor()
    conn.select_db("知网论文信息")  # select the target database
    for Keyword_class in Keyword_data:
        keyword = Keyword_class.keyword
        url_list = Keyword_class.url_list  # all paper URLs for this keyword
        Pager_list: list[Paper] = get_paper(url_list)  # parsed papers
        # One table per keyword, then one row per paper.
        create_table(keyword)
        for pager in Pager_list:
            insert_sql(keyword, pager)
    # Release database resources.
    cursor.close()
    conn.close()
# CNKI crawler study notes
# (original article published 2024-05-09 16:53:01)