Automatically Fetching the Latest NLP Papers from Arxiv Every Day

Introduction

This project automatically fetches the latest NLP papers from Arxiv and generates a markdown table listing each paper's title, authors, link, and open-source code (when one is available).
Project repository: https://github.com/JackHCC/Arxiv-NLP-Reporter
Read on the web: blog.creativecc.cn/arxiv-nlp-reporter
With a GitHub Action configured, the paper list is updated daily and published as a GitHub Page.
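Each paper becomes one row of the generated markdown table. With hypothetical paper details (the id, title, author, and repository below are made up; the format follows the f-strings in the code), a row looks like:

|Publish Date|Title|Authors|PDF|Code|
|---|---|---|---|---|
|**2022-01-01**|**An Example Paper Title**|Jane Doe et.al.|[2201.00001](http://arxiv.org/abs/2201.00001v1)|**[link](https://github.com/example/repo)**|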

Code

import datetime
import os
import requests
import json
import arxiv
import re

# Papers with Code API: given an arXiv paper id, this endpoint returns
# metadata that includes the official code repository, if one is linked.
base_url = "https://arxiv.paperswithcode.com/api/v0/papers/"


def del_unicode(string):
    # Strip literal "\uXXXX" escape sequences from the repr of the string.
    # (Helper kept from the original script; unused below.)
    return re.sub(r'\\u.{4}', '', repr(string))


def del_not_english(string):
    # Keep ASCII letters only. (Also unused below.)
    return re.sub('[^A-Za-z]', '', str(string))


def get_authors(authors, first_author=False):
    # Join all author names, or return only the first author's name
    # (wrapped in str() so an arxiv Author object becomes plain text).
    if first_author:
        return str(authors[0])
    return ", ".join(str(author) for author in authors)


def sort_papers(papers):
    # Return a copy of the dict ordered by key (paper id), newest first.
    output = dict()
    for key in sorted(papers.keys(), reverse=True):
        output[key] = papers[key]
    return output


def get_daily_papers(topic, query="nlp", max_results=2):
    """
    Fetch the newest arXiv papers matching `query`, then look up each
    paper's official implementation on Papers with Code.
    @param topic: str, section heading used in the generated files
    @param query: str, arXiv search query
    @param max_results: int, maximum number of papers to fetch
    @return (data, data_web): dicts mapping topic -> {paper_id: markdown row}
    """

    # accumulators: markdown rows for the README and for the web page
    content = dict()
    content_to_web = dict()

    search_engine = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    cnt = 0

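    # Note: Search.results() is deprecated in newer releases of the arxiv
    # package in favour of arxiv.Client().results(search); this script
    # assumes an older release where the call below still works.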
    for result in search_engine.results():

        paper_id = result.get_short_id()
        paper_title = result.title
        paper_url = result.entry_id

        code_url = base_url + paper_id
        paper_abstract = result.summary.replace("\n", " ")
        paper_authors = get_authors(result.authors)
        paper_first_author = get_authors(result.authors, first_author=True)
        primary_category = result.primary_category

        publish_time = result.published.date()
        update_time = result.updated.date()

        print("Time = ", update_time,
              " title = ", paper_title,
              " author = ", paper_first_author)

        # eg: 2108.09112v1 -> 2108.09112
        ver_pos = paper_id.find('v')
        if ver_pos == -1:
            paper_key = paper_id
        else:
            paper_key = paper_id[0:ver_pos]

        try:
            # a timeout keeps one slow API call from hanging the whole run
            r = requests.get(code_url, timeout=30).json()
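            # When an official implementation is registered, the JSON response
            # contains roughly {"official": {"url": "https://github.com/..."}}
            # (shape inferred from the fields accessed below)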
            if "official" in r and r["official"]:
                cnt += 1
                repo_url = r["official"]["url"]
                content[
                    paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|**[link]({repo_url})**|\n"
                content_to_web[
                    paper_key] = f"- **{update_time}**, **{paper_title}**, {paper_first_author} et.al., [PDF:{paper_id}]({paper_url}), **[code]({repo_url})**\n"
            else:
                content[
                    paper_key] = f"|**{update_time}**|**{paper_title}**|{paper_first_author} et.al.|[{paper_id}]({paper_url})|null|\n"
                content_to_web[
                    paper_key] = f"- **{update_time}**, **{paper_title}**, {paper_first_author} et.al., [PDF:{paper_id}]({paper_url})\n"

        except Exception as e:
            print(f"exception: {e} with id: {paper_key}")

    data = {topic: content}
    data_web = {topic: content_to_web}
    return data, data_web


def update_json_file(filename, data_all):
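    # Merge the newly fetched papers into the JSON file already on disk.
    # The file is nested as {topic: {paper_id: markdown_row}}; with
    # hypothetical values it looks like:
    #   {"NLP": {"2108.09112": "|**2021-08-20**|**Some Title**|A. Author et.al.|...|null|\n"}}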
    with open(filename, "r") as f:
        content = f.read()
        if not content:
            m = {}
        else:
            m = json.loads(content)

    json_data = m.copy()

    # update papers in each keywords
    for data in data_all:
        for keyword in data.keys():
            papers = data[keyword]

            if keyword in json_data.keys():
                json_data[keyword].update(papers)
            else:
                json_data[keyword] = papers

    with open(filename, "w") as f:
        json.dump(json_data, f)


def json_to_md(filename, to_web=False):
    """
    @param filename: str
    @return None
    """

    DateNow = datetime.date.today()
    DateNow = str(DateNow)
    DateNow = DateNow.replace('-', '.')

    with open(filename, "r") as f:
        content = f.read()
        if not content:
            data = {}
        else:
            data = json.loads(content)

    if not to_web:
        md_filename = "README.md"
    else:
        md_filename = "./docs/index.md"

    # truncate the markdown file if it already exists, else create it
    with open(md_filename, "w+") as f:
        pass

    # write data into README.md
    with open(md_filename, "a+", encoding='utf-8') as f:

        if to_web:
            # Jekyll front matter so GitHub Pages renders docs/index.md
            f.write("---\n" + "layout: default\n" + "---\n\n")

        f.write("## Updated on " + DateNow + "\n\n")

        for keyword in data.keys():
            day_content = data[keyword]
            if not day_content:
                continue
            # the head of each part
            f.write(f"## {keyword}\n\n")

            if not to_web:
                f.write("|Publish Date|Title|Authors|PDF|Code|\n" + "|---|---|---|---|---|\n")
            else:
                f.write("| Publish Date | Title | Authors | PDF | Code |\n")
                f.write("|:---------|:-----------------------|:---------|:------|:------|\n")

            # sort papers by date
            day_content = sort_papers(day_content)

            for _, v in day_content.items():
                if v is not None:
                    f.write(v)

            f.write(f"\n")

    print("finished")


if __name__ == "__main__":
    data_collector = []
    data_collector_web = []

    # arXiv search queries; OR must be surrounded by spaces to be parsed
    # as a boolean operator
    keywords = dict()
    keywords["NLP"] = "NLP OR \"Natural Language Processing\""
    keywords["Sequence Annotation"] = "\"Sequence Annotation\" OR \"Sequence Marking\" OR \"Named Entity Recognition\""
    keywords["Text Classification"] = "\"Text Classification\" OR \"Sentiment Analysis\" OR \"Topic Labeling\" OR \"News Classification\" OR \"Question Answering\" OR \"Dialog Act Classification\" OR \"Natural Language Inference\" OR \"Relation Classification\" OR \"Event Prediction\""
    keywords["Information Extraction"] = "\"Information Extraction\" OR \"Automatic Summary\" OR \"Title Generation\" OR \"Event Extraction\""
    keywords["Recommendation System"] = "\"Recommendation System\" OR \"Semantic Matching\" OR \"Chatbots\" OR \"Knowledge Graph\" OR \"Knowledge Graphs\""
    keywords["GNN"] = "GNN OR \"Recommendation System\" OR \"Graph Neural Network\""

    for topic, keyword in keywords.items():
        # topic = keyword.replace("\"","")
        print("Keyword: " + topic)

        data, data_web = get_daily_papers(topic, query=keyword, max_results=10)
        data_collector.append(data)
        data_collector_web.append(data_web)

        print("\n")

    # update README.md file
    json_file = "nlp-arxiv-daily.json"
    #     if ~os.path.exists(json_file):
    #         with open(json_file,'w')as a:
    #             print("create " + json_file)

    # update json data
    update_json_file(json_file, data_collector)
    # json data to markdown
    json_to_md(json_file)

    # update docs/index.md file
    json_file = "./docs/nlp-arxiv-daily-web.json"
    #     if ~os.path.exists(json_file):
    #         with open(json_file,'w')as a:
    #             print("create " + json_file)

    # update json data
    update_json_file(json_file, data_collector)
    # json data to markdown
    json_to_md(json_file, to_web=True)

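To run the script locally, install the two dependencies and execute it (the file name arxiv-report.py matches the workflow step below):

pip install arxiv requests
python arxiv-report.py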
Workflow Configuration
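The GitHub Actions workflow below runs on a schedule (and can also be triggered manually from the Actions tab), installs the arxiv and requests packages, runs the script, and commits the regenerated markdown and JSON files back to the repository: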

# This is a basic workflow to help you get started with Actions

name: Run Arxiv Papers Daily

# Controls when the workflow will run
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
  schedule:
    - cron: "0 */12 * * *"  # at minute 0, every 12 hours
  # Triggers the workflow on push or pull request events but only for the main branch
#   push:
#     branches:
#     - main

env:

  GITHUB_USER_NAME: JackHCC
  GITHUB_USER_EMAIL: jackcc0701@163.com
  
  
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    name: update
    # The type of runner that the job will run on
    runs-on: ubuntu-latest
    
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - name: Checkout
        uses: actions/checkout@v2
        
      - name: Set up Python Env
        uses: actions/setup-python@v1
        with:
          python-version: 3.6        

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install arxiv
          pip install requests
          
      - name: Run arxiv report 
        run: |
          python arxiv-report.py
          
      - name: Push new nlp-arxiv-daily.md
        uses: github-actions-x/commit@v2.8
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "Github Action Automatic Update NLP Arxiv Papers"
          files: README.md nlp-arxiv-daily.json docs/nlp-arxiv-daily-web.json docs/index.md
          rebase: 'true'
          name: ${{ env.GITHUB_USER_NAME }}
          email: ${{ env.GITHUB_USER_EMAIL }}
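Note that the workflow only commits docs/index.md; for the web version to actually be served, GitHub Pages needs to be enabled for the repository (e.g. with the docs/ folder as the Pages source), which is a repository setting rather than part of the workflow.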

Code: https://github.com/JackHCC/Arxiv-NLP-Reporter
Author homepage: JackHCC
