如今,在 NCBI 查找文献已经成为读研必不可少的技能之一。今天记录一下如何根据 GSE 号在 NCBI 批量获取对应的文献。
需要安装biopython
安装方法:pip install biopython
biopython教程:https://biopython-cn.readthedocs.io/zh_CN/latest/cn/chr09.html
1、查看ncbi当前所有的数据库
from Bio import Entrez
# Ask the NCBI EInfo endpoint which Entrez databases are currently available.
# NOTE(review): NCBI asks clients to identify themselves; consider setting
# Entrez.email before issuing requests.
hd_info = Entrez.einfo()
try:
    # Parse the XML reply into a dict-like record.
    read_info = Entrez.read(hd_info)
finally:
    # Release the HTTP response even if parsing fails.
    hd_info.close()

# Print one database name per line.
for db in read_info['DbList']:
    print(db)
2、根据gse号批量获取文献
from Bio import Entrez
from Bio import Medline
from Bio import Geo
import csv
# --- Settings ---------------------------------------------------------------
Entrez.email = "abcd@163.com"  # tells NCBI who is issuing the requests
Entrez.tool = "abcd"

# Output is tab-separated text; one header row followed by one row per article.
OUTPUT_PATH = "mouse_pubmed.xls"
OUTPUT_HEADER = "title\tauthors\tsource\tPubMed\n"


def _read_gse_ids(path):
    """Return the GSE accessions stored in the first column of *path*.

    The first row is treated as a header and skipped; blank rows are ignored.
    """
    with open(path, encoding='utf-8', newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row
        return [row[0].strip() for row in rows if row and row[0].strip()]


def _entrez_read(handle):
    """Parse an Entrez XML handle and close it, even when parsing fails."""
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def _pubmed_ids_for_gse(gse):
    """Return the PubMed IDs linked to the first GEO DataSets hit for *gse*.

    Returns an empty list when the search finds nothing or when the record
    has no linked PubMed entry, so callers can simply iterate the result.
    """
    # "gds" is the name of the Entrez database that indexes GEO records.
    record = _entrez_read(Entrez.esearch(db="gds", term=gse))
    if not record["IdList"]:
        print("no gds record found for", gse)
        return []
    uid = record["IdList"][0]  # only the first hit is used

    read_elink = _entrez_read(Entrez.elink(dbfrom="gds", db="pubmed", id=uid))
    link_sets = read_elink[0]["LinkSetDb"] if read_elink else []
    print("LinkSetDb: ", link_sets)
    # Summarise every link set that came back.
    for lsd in link_sets:
        print(lsd["DbTo"], lsd["LinkName"], len(lsd["Link"]))
    if not link_sets:
        print("no linked PubMed article for", gse)
        return []
    # PMIDs are taken from the first link set only.
    return [link["Id"] for link in link_sets[0]["Link"]]


def _format_row(ele):
    """Build one output line (title, authors, source, PMID) from a Medline record.

    Missing fields become empty strings instead of raising KeyError; 'AU' in
    particular appears to be absent on some records (e.g. large-consortium
    papers).
    """
    fields = (
        ele.get('TI', ''),
        ",".join(ele.get('AU', [])),
        ele.get('SO', ''),
        ele.get('PMID', ''),
    )
    return "\t".join(fields) + "\n"


# GEO.csv holds a single column of GSE accessions below a header row.
list1 = _read_gse_ids('GEO.csv')

# Open the output once in append mode so earlier runs are preserved.
# NOTE: written as UTF-8 so non-ASCII titles/authors cannot fail to encode.
with open(OUTPUT_PATH, "a", encoding="utf-8") as file:
    # Append mode starts at end-of-file, so position 0 means the file is
    # empty: write the header exactly once instead of once per article.
    if file.tell() == 0:
        file.write(OUTPUT_HEADER)

    for t in list1:
        print(t)
        for mid in _pubmed_ids_for_gse(t):
            print(mid)
            # Download the article's MEDLINE-format record with EFetch.
            hd_efetch = Entrez.efetch(db="pubmed", id=mid, rettype="medline", retmode="text")
            try:
                for i, ele in enumerate(Medline.parse(hd_efetch)):
                    line = _format_row(ele)
                    file.write(line)
                    print(i, line)
            finally:
                hd_efetch.close()
# 有的文章作者很多(比如 ENCODE 计划等),这类文章在批量获取时,需要把 'AU'(作者)字段去掉,不然总会报错(原因可能是这类记录里没有 'AU' 字段,直接取 ele['AU'] 会触发 KeyError;也可以改用 ele.get('AU', []) 来避免)。