- 记录一下通过uniprotID获取网页内容并匹配的方法一,这里主要是通过获取的网页内容作为字符串,然后利用python字符串匹配的方法来解决。另外也可以通过获取标签的方法,如果没有这个标签的话,那么就是没有这个字段,直接返回false。
import requests
import bs4
import xlrd
import xlwt
from xlutils import copy
import time
def get_ID(file):
    """Read the IDs stored in the first column of an .xls workbook.

    Args:
        file: Path of the workbook to open with xlrd.

    Returns:
        A list holding the column-0 value of every row after row 0 of the
        sheet named 'Sheet1'. The list is empty when the sheet has no rows
        beyond row 0.
    """
    data = xlrd.open_workbook(file)
    print("sheets:" + str(data.sheet_names()))
    table = data.sheet_by_name('Sheet1')
    rows = table.nrows  # total number of rows, row 0 included
    if rows > 1:
        # Echo the first data cell; guarded so a sheet with no data rows
        # does not raise IndexError.
        print(table.cell_value(1, 0))
    # Row 0 is skipped; collect column 0 of every remaining row.
    all_content = [table.cell_value(i, 0) for i in range(1, rows)]
    print(all_content)
    return all_content
def get_result(ID, timeout=30):
    """Report whether the UniProt page for *ID* contains the text "Pathway".

    The page body is fetched over HTTP and checked with a plain substring
    test. An alternative approach would be to parse the HTML and look for
    the corresponding tag instead of matching on the raw text.

    Args:
        ID: Identifier appended to the UniProt entry URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        True if "Pathway" occurs anywhere in the response text, else False.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"}
    host = "https://www.uniprot.org/uniprot/"
    url = host + ID
    res = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): the HTTP status is not checked, so an error page is
    # matched like any other body — confirm this is intended.
    result = "Pathway" in res.text
    print(result)
    return result
def write_result(file, result, i_num):
    """Store *result* in column 1 of row *i_num* of an existing .xls file.

    The workbook is reopened, copied into a writable form, updated, and
    saved back to the same path on every call. The function then sleeps
    for three seconds before returning.

    Args:
        file: Path of the workbook to update in place.
        result: Value to write into the cell.
        i_num: Zero-based row index of the target cell.
    """
    target_col = 1
    # formatting_info=True asks xlrd to load cell formatting so that the
    # writable copy keeps it.
    source_book = xlrd.open_workbook(file, formatting_info=True)
    writable_book = copy.copy(source_book)
    first_sheet = writable_book.get_sheet(0)  # sheet at index 0
    first_sheet.write(i_num, target_col, result)
    writable_book.save(file)
    time.sleep(3)
file = './test.xls'
content = get_ID(file)
# get_ID skips sheet row 0, so the first ID it returns sits in sheet row 1.
# Start counting at 1 so each result is written next to its own ID rather
# than one row above it.
for i_num, ID in enumerate(content, start=1):
    print(ID)
    result = get_result(ID)
    write_result(file, result, i_num)
表格数据类型如下:

| ID | |
|---|---|
| A0FDW2 | |
| A0FDW3 | |