1. 目的:
从 UniProt 中下载某个蛋白(以 P05067 为例)的 N-糖基化位点注释以及该蛋白的氨基酸序列。
2. Python 3 脚本:
import requests
import re
from requests.adapters import HTTPAdapter
#import time
class Get_Nglyco_Seq(object):
    """Fetch a UniProtKB entry as XML and pull out N-glycosylation sites and sequence.

    The entry is downloaded from ``https://rest.uniprot.org/uniprotkb/<id>.xml``
    and scanned with regular expressions (no XML parser is involved).
    """

    # Matches a "glycosylation site" feature whose description starts with
    # "N-linked" and whose location is a single <position/>; captures the
    # position number. Raw strings keep `\s` / `\d` from being treated as
    # (invalid) string escape sequences.
    _PATT_GLY = re.compile(
        r'\s+<feature type="glycosylation site" description="N-linked.+'
        r'\s+<location>'
        r'\s+<position position="(\d+)"/>'
        r'\s+</location>',
        re.MULTILINE)

    # Matches the <sequence length=...>RESIDUES</sequence> element and
    # captures the residue string (uppercase letters only).
    _PATT_SEQ = re.compile(
        r'<sequence length[\w\s="-]+>([A-Z]+)</sequence>',
        re.MULTILINE)

    def __init__(self, uniprot_id):
        """Remember the UniProt accession (e.g. ``'P05067'``) to query."""
        self.uniprot_id = uniprot_id

    def __parse_xml_page(self, content_xml):
        """Extract N-glycosylation positions and sequences from entry XML text.

        Args:
            content_xml: The XML document as a string.

        Returns:
            A tuple ``(positions, sequences)``: ``positions`` is a list of
            position numbers as strings, ``sequences`` is a list of residue
            strings. Either list is empty when nothing matches.
        """
        match_gly = self._PATT_GLY.findall(content_xml)
        match_seq = self._PATT_SEQ.findall(content_xml)
        return match_gly, match_seq

    def get_xml_page(self, timeout=30):
        """Download the entry XML for ``self.uniprot_id`` and parse it.

        Args:
            timeout: Seconds to wait for the server before giving up. Without
                a timeout, ``requests`` may block forever on a stalled
                connection.

        Returns:
            The ``(positions, sequences)`` tuple described in
            ``__parse_xml_page``.

        Raises:
            requests.exceptions.RequestException: On connection failure,
                timeout, or a non-success HTTP status (via
                ``raise_for_status``), so an error page is not silently parsed
                into empty results.
        """
        url = 'https://rest.uniprot.org/uniprotkb/' + self.uniprot_id + '.xml'
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        # The context manager closes the session's connection pool on exit.
        with requests.Session() as s:
            # Retry failed HTTPS connections up to two times.
            s.mount('https://', HTTPAdapter(max_retries=2))
            response = s.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            content_xml = response.text
        glyco_pos, sequ = self.__parse_xml_page(content_xml=content_xml)
        return glyco_pos, sequ
def main(uniprot_id):
    """Print the N-glycosylation positions and sequence for one UniProt ID.

    On success prints three lines (ID, positions or ``No`` when none were
    found, sequence). On failure prints ``Error: <id>\\t<reason>`` instead of
    raising, so a caller looping over many IDs is not interrupted.

    NOTE(review): the original author left a disabled
    ``time.sleep(random.randint(1, 3))`` after this logic, presumably to
    throttle batch queries; add a pause in the calling loop if needed.

    Args:
        uniprot_id: UniProt accession to look up, e.g. ``'P05067'``.
    """
    # Only the fetch is guarded: this is the script's top-level boundary, so
    # any failure is reported rather than propagated.
    try:
        glyco_pos, sequ = Get_Nglyco_Seq(uniprot_id).get_xml_page()
    except Exception as e:
        print('Error: %s\t%s' % (uniprot_id, e))
        return

    # Without this check, sequ[0] would fail with an unhelpful
    # "list index out of range" message.
    if not sequ:
        print('Error: %s\t%s' % (uniprot_id, 'no sequence found in the UniProt response'))
        return

    shown_pos = glyco_pos if glyco_pos else 'No'
    print('Uniprot ID: %s\nN-Glyco Pos: %s\nSequence: %s' % (uniprot_id, shown_pos, sequ[0]))
if __name__ == '__main__':
    # Example run: fetch the N-glycosylation sites and the sequence of
    # protein P05067.
    main('P05067')
3. 测试结果:
P05067的N-糖基化位点和序列结果如下:
Uniprot ID: P05067
N-Glyco Pos: ['542', '571']
Sequence: MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMNVQNGKWDSDPSGTKTCIDTKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPHFVIPYRCLVGEFVSDALLVPDKCKFLHQERMDVCETHLHWHTVAKETCSEKSTNLHDYGMLLPCGIDKFRGVEFVCCPLAEESDNVDSADAEEDDSDVWWGGADTDYADGSEDKVVEVAEEEEVAEVEEEEADDDEDDEDGDEVEEEAEEPYEEATERTTSIATTTTTTTESVEEVVREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSAMSQSLLKTTQEPLARDPVKLPTTAASTPDAVDKYLETPGDENEHAHFQKAKERLEAKHRERMSQVMREWEEAERQAKNLPKADKKAVIQHFQEKVESLEQEAANERQQLVETHMARVEAMLNDRRRLALENYITALQAVPPRPRHVFNMLKKYVRAEQKDRQHTLKHFEHVRMVDPKKAAQIRSQVMTHLRVIYERMNQSLSLLYNVPAVAEEIQDEVDELLQKEQNYSDDVLANMISEPRISYGNDALMPSLTETKTTVELLPVNGEFSLDDLQPWHSFGADSVPANTENEVEPVDARPAADRGLTTRPGSGLTNIKTEEISEVKMDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHHGVVEVDAAVTPEERHLSKMQQNGYENPTYKFFEQMQN