import urllib.request
import pandas as pd
import re
# 1.逐行读取xls文件列名并获取基因ID
def read_xlsx(path, sheetname, i):
    """Return the values of one column of an Excel sheet as a list.

    Args:
        path: Location of the workbook; passed straight to
            ``pandas.read_excel``.
        sheetname: Sheet to load (forwarded as ``sheet_name``).
        i: 1-based number of the column to extract.

    Returns:
        list: The cells of column ``i`` in row order. With the default
        ``read_excel`` settings the first sheet row is consumed as the
        header, so it is not part of the result.

    Raises:
        ValueError: If ``i`` is smaller than 1.
    """
    # Reject 0/negative numbers up front: ``i - 1`` would otherwise wrap
    # around and silently pick a column counted from the end of the sheet.
    if i < 1:
        raise ValueError(f"column number must be >= 1, got {i!r}")

    sheet = pd.read_excel(path, sheet_name=sheetname)
    # Slice the column positionally instead of walking index labels one by
    # one and feeding them back into ``iloc`` as if they were positions.
    return sheet.iloc[:, i - 1].tolist()
# 2.NCBI搜索基因ID,找到注释信息
def get_infoID(GeneID):
    """Fetch the NCBI Gene search page for each ID and extract its annotation.

    Args:
        GeneID: Iterable of gene identifiers. Each one is sent as the
            ``term`` query parameter of the NCBI Gene search URL.

    Returns:
        list: One entry per input ID, in input order. Each entry is the
        text found between the first ``<dd>`` and the following
        ``<span class="prov">`` in the returned HTML, or ``None`` when the
        page contains no such fragment.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a request
            fails.
    """
    # Local import keeps this function self-contained; only the
    # ``urllib.request`` submodule is imported at file level.
    from urllib.parse import quote

    # Loop-invariant pieces are built once instead of once per gene.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    # The capture group is the target region of the page.
    annotation_re = re.compile(r'<dd>(.*?)<span class="prov">')

    infoID = []
    for gene_id in GeneID:
        # Percent-encode the term so IDs containing spaces, '&', '#', ...
        # cannot corrupt the query string.
        term = quote(str(gene_id), safe="")
        url = f'https://www.ncbi.nlm.nih.gov/gene/?term={term}'
        request = urllib.request.Request(url, headers=headers)
        # The context manager closes the HTTP response even if reading or
        # decoding fails.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('utf-8')

        # Keep the result aligned with the input: a page without the
        # expected fragment yields None instead of an IndexError that
        # would abort the whole batch.
        match = annotation_re.search(content)
        infoID.append(match.group(1) if match else None)
    return infoID
# NCBI crawler: gene ID annotation
# First published 2022-05-17 20:10:26