import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time  # time module, used to pause between requests

# Request headers that mimic a real browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Crawl the journal list for one CAS partition (1-4)
def crawl_journals(partition):
    journals = []  # accumulates one dict per journal
    # First request: fetch page 1 of the results just to read the page count
    base_url = f"https://www.letpub.com.cn/index.php?page=journalapp&view=search&searchname=&searchissn=&searchfield=&searchimpactlow=&searchimpacthigh=&searchimpacttrend=&searchscitype=&searchcategory1=%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%A7%91%E5%AD%A6&searchcategory2=&searchjcrkind={partition}&searchopenaccess=&searchsort=relevance&searchsortorder=desc&currentsearchpage=1#journallisttable"
    time.sleep(5)  # pause 5 seconds between requests to avoid rate limiting
    response = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the total page count from the first <b> tag (text like "共12页")
    total_pages = 1
    page_tag = soup.find('b')
    if page_tag and '共' in page_tag.text:
        total_pages = int(page_tag.text.split('共')[-1].split('页')[0].strip())
    # Walk every results page for this partition
    for page in range(1, total_pages + 1):
        url = f"https://www.letpub.com.cn/index.php?page=journalapp&view=search&searchname=&searchissn=&searchfield=&searchimpactlow=&searchimpacthigh=&searchimpacttrend=&searchscitype=&searchcategory1=%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%A7%91%E5%AD%A6&searchcategory2=&searchjcrkind={partition}&searchopenaccess=&searchsort=relevance&searchsortorder=desc&currentsearchpage={page}#journallisttable"
        time.sleep(5)  # pause 5 seconds between requests to avoid rate limiting
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'class': 'table_yjfx'})
            if table:
                for row in table.find_all('tr')[1:]:  # skip the header row
                    columns = row.find_all('td')
                    if len(columns) >= 12:
                        issn = columns[0].text.strip()
                        # Journal name: strip trailing digits and parenthesized counts
                        journal_name = columns[1].text.strip()
                        journal_name = re.sub(r'\s*\(\d+\)|\d+\.*\d*$', '', journal_name).strip()
                        overall_score = columns[2].text.strip()
                        # Journal metrics: put h-index and CiteScore on separate lines
                        metrics = columns[3].text.strip().replace('\n', '').replace('\r', '')
                        metrics = metrics.replace('h-index:', 'h-index: ').replace('CiteScore:', '\nCiteScore: ')
                        cas_partition = columns[4].text.strip()
                        category = columns[5].text.strip()
                        sci_category = columns[6].text.strip()
                        is_oa = columns[7].text.strip()
                        acceptance_rate = columns[8].text.strip()
                        review_period = columns[9].text.strip()
                        # Recent articles: strip the trailing count
                        recent_articles = columns[10].text.strip()
                        recent_articles = re.sub(r'\s*\(\d+\)|\d+\.*\d*$', '', recent_articles).strip()
                        views = columns[11].text.strip()
                        # Extract the detail-page link from the journal-name cell
                        detail_link = columns[1].find('a')
                        if detail_link and 'href' in detail_link.attrs:
                            detail_url = detail_link['href']
                            # Drop leading dots, e.g. "./index.php" -> "/index.php"
                            detail_url = detail_url.lstrip('.')
                            if not detail_url.startswith('http'):
                                detail_url = "https://www.letpub.com.cn" + detail_url
                        else:
                            detail_url = "无详情页链接"
if "计算机科学" in category and cas_partition == f"{partition}区":
journals.append({
'ISSN': issn,
'期刊名': journal_name,
'综合评分': overall_score,
'期刊指标': metrics,
'中科院分区': cas_partition,
'学科领域': category,
'SCI收录': sci_category,
'是否OA': is_oa,
'录用比例': acceptance_rate,
'审稿周期': review_period,
'近期文章': recent_articles,
'查看数': views,
'详情页链接': detail_url
})
print(f"分区 {partition} - 第 {page} 页爬取完成")
else:
print(f"分区 {partition} - 第 {page} 页未找到表格数据")
else:
print(f"分区 {partition} - 请求第 {page} 页失败,状态码:{response.status_code}")
    # Save the results to an Excel file
    df = pd.DataFrame(journals)
    # Create a pandas Excel writer backed by xlsxwriter
    with pd.ExcelWriter(f'computer_science_journals_partition_{partition}.xlsx', engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name=f'分区{partition}期刊')
        # Grab the workbook and worksheet objects
        workbook = writer.book
        worksheet = writer.sheets[f'分区{partition}期刊']
        # Hyperlink style: blue, underlined
        hyperlink_format = workbook.add_format({
            'font_color': 'blue',
            'underline': 1
        })
        # Walk the data rows and turn the detail-page URLs into hyperlinks
        for row_idx, row in df.iterrows():
            # Recent-article links could be written the same way into column K:
            # article_cell = f'K{row_idx + 2}'
            # worksheet.write_url(article_cell, row['近期文章'], hyperlink_format, string='文章')
            detail_cell = f'M{row_idx + 2}'  # column M holds the detail-page link
            if str(row['详情页链接']).startswith('http'):  # skip rows with the "无详情页链接" placeholder
                worksheet.write_url(detail_cell, row['详情页链接'], hyperlink_format, string='详情页')
        # Column widths
        worksheet.set_column('A:A', 10)  # ISSN
        worksheet.set_column('B:B', 30)  # 期刊名
        worksheet.set_column('C:C', 10)  # 综合评分
        worksheet.set_column('D:D', 30)  # 期刊指标
        worksheet.set_column('E:E', 10)  # 中科院分区
        worksheet.set_column('F:F', 30)  # 学科领域
        worksheet.set_column('G:G', 15)  # SCI收录
        worksheet.set_column('H:H', 10)  # 是否OA
        worksheet.set_column('I:I', 15)  # 录用比例
        worksheet.set_column('J:J', 15)  # 审稿周期
        worksheet.set_column('K:K', 15)  # 近期文章
        worksheet.set_column('L:L', 20)  # 查看数
        worksheet.set_column('M:M', 40)  # 详情页链接
    print(f"Partition {partition} data successfully saved to computer_science_journals_partition_{partition}.xlsx")

# Crawl partitions 1, 2, 3 and 4 in turn
for partition in range(1, 5):
    crawl_journals(partition)
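
# --- Optional hardening (a sketch, not part of the original script) ---
# The bare requests.get calls above set no timeout and never retry, so one
# dropped connection loses a whole partition. Below is a minimal retry helper
# that could replace them; fetch_with_retries and its retries/timeout/backoff
# parameters are illustrative names, not an existing API. It reuses the
# requests/time imports from the top of the script.
def fetch_with_retries(url, headers, retries=3, timeout=30, backoff=5):
    """Return the response for url, or None after `retries` failed attempts."""
    for attempt in range(1, retries + 1):
        try:
            # timeout bounds both connect and read, so a stalled server
            # cannot hang the crawl indefinitely
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass  # network error: fall through and retry
        time.sleep(backoff * attempt)  # back off a little longer each attempt
    return None
# Swapping it in is a one-line change at each call site, with a None check
# replacing the status_code test.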
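
# --- Quick check of the name-cleaning regex (hypothetical cell values) ---
# Per the comments in the row loop, the LetPub cells arrive with a trailing
# count or number glued onto the text; the pattern strips either form.
pattern = r'\s*\(\d+\)|\d+\.*\d*$'
print(re.sub(pattern, '', 'IEEE TRANSACTIONS ON COMPUTERS (123)').strip())
# -> IEEE TRANSACTIONS ON COMPUTERS
print(re.sub(pattern, '', 'ACM COMPUTING SURVEYS23.8').strip())
# -> ACM COMPUTING SURVEYS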
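
# --- Combining the output (a sketch, assuming the filenames written above) ---
# crawl_journals() writes one workbook per partition; pandas can stack the
# four files into a single sheet. Reading .xlsx needs openpyxl installed;
# the output name computer_science_journals_all.xlsx is arbitrary.
frames = [
    pd.read_excel(f'computer_science_journals_partition_{p}.xlsx')
    for p in range(1, 5)
]
combined = pd.concat(frames, ignore_index=True)  # the 中科院分区 column keeps each row's partition
combined.to_excel('computer_science_journals_all.xlsx', index=False)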