2025.3.13更新
更新代码如下:
加入了添加文件名后PMID的功能,如不需要可自行删去pmid相关变量
import pandas as pd
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
def download(pmc_id, filename, pmid):
    """Download the PDF of a PMC article and save it locally.

    Args:
        pmc_id: PMC accession (e.g. "PMC1234567") identifying the article.
        filename: Base file name (without extension) for the saved PDF.
        pmid: PubMed ID, appended to the file name for traceability.

    Raises:
        requests.RequestException: on network failure.
        IndexError: if the article page does not expose a second link with
            the expected CSS class (i.e. no PDF link was found).
    """
    # BUG FIX: the original body referenced the loop global `pmc` instead of
    # the `pmc_id` parameter, so the function only worked by accident when
    # called from the main loop. Use the parameter (str() in case the Excel
    # column was parsed as a non-string type).
    url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/' + str(pmc_id) + "/"
    ua = UserAgent()
    headers = {'User-Agent': ua.firefox}
    request = requests.get(url, headers=headers)
    bs = BeautifulSoup(request.content, 'lxml')
    # On the current PMC page layout, the second element with this class is
    # the PDF download anchor.
    download_url = bs.find_all(attrs={'class': "usa-link display-flex usa-tooltip"})
    download_link = download_url[1].get('href')
    print(download_link)
    download_url = "https://pmc.ncbi.nlm.nih.gov/articles" + "/" + str(pmc_id) + "/" + download_link
    print(filename)
    print(download_url)
    r = requests.get(download_url, headers=headers)
    with open("The Path You Want to Store These/Papers/" + filename + "_" + str(pmid) + ".pdf", 'wb') as f:
        f.write(r.content)
# Read the Excel file containing the list of papers.
# Expected columns (no header row): title, PMC accession, PMID.
df = pd.read_excel('The Path Where Your Paper List is Located/Demo_Paper_List.xlsx', header=None)
df.columns = ['Title', 'PMCNum', "PMID"]
# Drop rows with missing values
PMC = df.dropna(axis=0, how='any')
print(PMC)
PMCNum = PMC['PMCNum']
TitleO = list(PMC['Title'])
PMID = list(PMC['PMID'])
# Replace characters that are illegal/awkward in file names with spaces.
# BUG FIX: str.maketrans(x, y) requires len(x) == len(y); the original passed
# a single space for three characters, which raises ValueError at runtime.
# The table is invariant, so build it once instead of per title.
table = str.maketrans(':"/', '   ')
Title = [i.translate(table) for i in TitleO]
# Download the papers, collecting the titles that fail so they can be
# retried manually afterwards.
failed_titles = []
for m, pmc in enumerate(PMCNum):
    file = Title[m]
    pmid = PMID[m]
    try:
        download(filename=file, pmc_id=pmc, pmid=pmid)
    except Exception as e:
        print(f"Error downloading {pmc} for title '{Title[m]}': {e}")
        failed_titles.append(Title[m])
        continue
# Print the titles that failed to download
if failed_titles:
    print("Failed to download the following titles:")
    for title in failed_titles:
        print(title)
原回答:
1.首先需要准备需要下载文献的PMC ID(必须),和文献的题目(非必须,为了给下载的文献命名)
2.准备一个Excel表格,包括题目和PMC ID(若无题目则取1、2、3......)
格式如下图(左右顺序不能颠倒,若某项有缺失值则在程序中自动删除该行)
将表格命名为 PMCNum.xlsx,并将其与py文件放在同一文件夹中
3.安装Python及相应包
包括bs4、requests、fake_useragent、pandas
在cmd中输入:
pip install xxx
即可安装
如:
pip install bs4
4.运行代码
import pandas as pd
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
def download(pmc_id, filename):
    """Download the PDF of a PMC article and save it as `filename`.

    Args:
        pmc_id: PMC accession (e.g. "PMC1234567") identifying the article.
        filename: Destination path for the saved PDF (should end in .pdf).

    Raises:
        requests.RequestException: on network failure.
        IndexError: if the page does not contain a second "int-view" link
            (i.e. no PDF link was found).
    """
    # BUG FIX: the original body referenced the loop global `pmc` instead of
    # the `pmc_id` parameter; use the parameter so the function stands alone.
    url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/' + str(pmc_id) + "/"
    ua = UserAgent()
    headers = {'User-Agent': ua.firefox}
    request = requests.get(url, headers=headers)
    bs = BeautifulSoup(request.content, 'lxml')
    # On the older PMC page layout, the second "int-view" element holds the
    # relative href of the PDF.
    download_url = bs.find_all(attrs={'class': "int-view"})
    download_link = download_url[1].get('href')
    download_url = "https://www.ncbi.nlm.nih.gov" + download_link
    print(filename)
    print(download_url)
    r = requests.get(download_url, headers=headers)
    with open(filename, 'wb') as f:
        f.write(r.content)
# Read the Excel file listing the papers.
# Expected columns (no header row): title, PMC accession.
df = pd.read_excel('PMCNum.xlsx', header=None)
df.columns = ['Title', 'PMCNum']
# Drop rows that contain missing values.
PMC = df.dropna(axis=0, how='any')
PMCNum = PMC['PMCNum']
TitleO = list(PMC['Title'])
# Replace characters that are illegal/awkward in file names with spaces.
# BUG FIX: str.maketrans(x, y) requires len(x) == len(y); the original passed
# a single space for three characters, which raises ValueError at runtime.
# The table is invariant, so build it once instead of per title.
table = str.maketrans(':"/', '   ')
Title = [i.translate(table) for i in TitleO]
# Download the PDFs, collecting the titles that fail so they can be
# downloaded manually afterwards.
failed_titles = []
for m, pmc in enumerate(PMCNum):
    file = Title[m] + '.pdf'
    try:
        download(pmc_id=pmc, filename=file)
    except Exception as e:
        print(f"Error downloading {pmc} for title '{Title[m]}': {e}")
        failed_titles.append(Title[m])
        continue
# Report the titles that failed to download.
if failed_titles:
    print("Failed to download the following titles:")
    for title in failed_titles:
        print(title)
5.所有文件下载到了同一文件夹中,由于某些原因下载失败的文献在程序完成后显示,可以手动下载。(亲测成功率达到80%以上)