Scraping CAS Journal Partition Data from LetPub (Computer Science)
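The script below walks LetPub's journal search for the Computer Science category one CAS partition at a time. For each partition it first reads the total page count from the pager, then fetches every result page, parses the journal table with BeautifulSoup, and writes the matching journals to a per-partition Excel file with clickable detail-page links (built with xlsxwriter).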


import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time  # used to pause between requests

# Request headers that mimic a regular browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Crawl every Computer Science journal in the given CAS partition (1-4)
def crawl_journals(partition):
    journals = []  # collected journal records
    base_url = f"https://www.letpub.com.cn/index.php?page=journalapp&view=search&searchname=&searchissn=&searchfield=&searchimpactlow=&searchimpacthigh=&searchimpacttrend=&searchscitype=&searchcategory1=%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%A7%91%E5%AD%A6&searchcategory2=&searchjcrkind={partition}&searchopenaccess=&searchsort=relevance&searchsortorder=desc&currentsearchpage=1#journallisttable"
    time.sleep(5)  # wait 5 seconds before each request to avoid being rate-limited
    response = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the total page count from the pager's <b> tag (text like "共 X 页")
    total_pages = 1
    page_info = soup.find('b')
    if page_info and '共' in page_info.text:
        total_pages = int(page_info.text.split('共')[-1].split('页')[0].strip())

    # Walk every result page
    for page in range(1, total_pages + 1):
        url = f"https://www.letpub.com.cn/index.php?page=journalapp&view=search&searchname=&searchissn=&searchfield=&searchimpactlow=&searchimpacthigh=&searchimpacttrend=&searchscitype=&searchcategory1=%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%A7%91%E5%AD%A6&searchcategory2=&searchjcrkind={partition}&searchopenaccess=&searchsort=relevance&searchsortorder=desc&currentsearchpage={page}#journallisttable"
        time.sleep(5)  # wait 5 seconds before each request to avoid being rate-limited
        response = requests.get(url, headers=headers)


        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'class': 'table_yjfx'})

            if table:
                for row in table.find_all('tr')[1:]:  # skip the header row
                    columns = row.find_all('td')
                    if len(columns) >= 12:
                        issn = columns[0].text.strip()

                        # Clean the journal name: strip parenthesized counts and trailing numbers
                        journal_name = columns[1].text.strip()
                        journal_name = re.sub(r'\s*\(\d+\)|\d+\.?\d*$', '', journal_name).strip()

                        overall_score = columns[2].text.strip()

                        # Reflow the journal metrics so each indicator sits on its own line
                        metrics = columns[3].text.strip().replace('\n', '').replace('\r', '')
                        metrics = metrics.replace('h-index:', 'h-index: ').replace('CiteScore:', '\nCiteScore: ')

                        cas_partition = columns[4].text.strip()
                        category = columns[5].text.strip()
                        sci_category = columns[6].text.strip()
                        is_oa = columns[7].text.strip()
                        acceptance_rate = columns[8].text.strip()
                        review_period = columns[9].text.strip()

                        # Clean the recent-articles text the same way as the journal name
                        recent_articles = columns[10].text.strip()
                        recent_articles = re.sub(r'\s*\(\d+\)|\d+\.?\d*$', '', recent_articles).strip()

                        views = columns[11].text.strip()

                        # Extract the detail-page link and turn it into an absolute URL
                        detail_link = columns[1].find('a')
                        if detail_link and 'href' in detail_link.attrs:
                            detail_url = detail_link['href']
                            # relative links look like './index.php?...'; drop the leading dot
                            if detail_url.startswith('./'):
                                detail_url = detail_url[1:]
                            if not detail_url.startswith('http'):
                                detail_url = "https://www.letpub.com.cn" + detail_url
                        else:
                            detail_url = ''  # no detail-page link

                        if "计算机科学" in category and cas_partition == f"{partition}区":
                            journals.append({
                                'ISSN': issn,
                                '期刊名': journal_name,
                                '综合评分': overall_score,
                                '期刊指标': metrics,
                                '中科院分区': cas_partition,
                                '学科领域': category,
                                'SCI收录': sci_category,
                                '是否OA': is_oa,
                                '录用比例': acceptance_rate,
                                '审稿周期': review_period,
                                '近期文章': recent_articles,
                                '查看数': views,
                                '详情页链接': detail_url
                            })
                print(f"分区 {partition} - 第 {page} 页爬取完成")
            else:
                print(f"分区 {partition} - 第 {page} 页未找到表格数据")
        else:
            print(f"分区 {partition} - 请求第 {page} 页失败,状态码:{response.status_code}")

    # Save the results to an Excel file
    df = pd.DataFrame(journals)
    # Create a pandas Excel writer backed by xlsxwriter
    with pd.ExcelWriter(f'computer_science_journals_partition_{partition}.xlsx', engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name=f'分区{partition}期刊')

        # Grab the workbook and worksheet objects
        workbook = writer.book
        worksheet = writer.sheets[f'分区{partition}期刊']

        # Hyperlink cell format (blue, underlined)
        hyperlink_format = workbook.add_format({
            'font_color': 'blue',
            'underline': 1
        })

        # Walk the data rows and add a clickable hyperlink in the detail-page column
        for row_idx, row in df.iterrows():
            # (A similar write_url call could link the recent-articles column if it held URLs.)
            detail_url = row['详情页链接']
            if isinstance(detail_url, str) and detail_url.startswith('http'):
                detail_cell = f'M{row_idx + 2}'  # column M holds the detail-page link
                worksheet.write_url(detail_cell, detail_url, hyperlink_format, string='详情页')

        # Set sensible column widths
        worksheet.set_column('A:A', 10)  # ISSN
        worksheet.set_column('B:B', 30)  # journal name
        worksheet.set_column('C:C', 10)  # overall score
        worksheet.set_column('D:D', 30)  # journal metrics
        worksheet.set_column('E:E', 10)  # CAS partition
        worksheet.set_column('F:F', 30)  # subject area
        worksheet.set_column('G:G', 15)  # SCI coverage
        worksheet.set_column('H:H', 10)  # open access
        worksheet.set_column('I:I', 15)  # acceptance rate
        worksheet.set_column('J:J', 15)  # review period
        worksheet.set_column('K:K', 15)  # recent articles
        worksheet.set_column('L:L', 20)  # view count
        worksheet.set_column('M:M', 40)  # detail-page link
    print(f"分区 {partition} 的数据已成功保存到 computer_science_journals_partition_{partition}.xlsx 文件中!")

# Scrape partitions 1, 2, 3 and 4 in turn
for partition in range(1, 5):
    crawl_journals(partition)
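
Once all four runs finish, the per-partition workbooks can be merged into a single overview file. The sketch below is a minimal example assuming the four computer_science_journals_partition_N.xlsx files produced above sit in the current directory; the combined filename is illustrative, not part of the original script.

import pandas as pd

# Read each per-partition workbook and concatenate the rows.
# Reading .xlsx files with pandas requires the openpyxl package.
frames = []
for partition in range(1, 5):
    frames.append(pd.read_excel(f'computer_science_journals_partition_{partition}.xlsx'))

combined = pd.concat(frames, ignore_index=True)
# 'computer_science_journals_all.xlsx' is an illustrative output name.
combined.to_excel('computer_science_journals_all.xlsx', index=False)
print(f"{len(combined)} journals written to computer_science_journals_all.xlsx")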
