import re

# Read one line from the user and report which characters are ASCII letters,
# spaces and digits (with their counts); every other character is only counted.
message = input('input words>')
words = re.findall(r'[a-zA-Z]', message)
blank = re.findall(r' ', message)
digit = re.findall(r'[0-9]', message)
print(f"{words}length={len(words)}")
print(f"{blank}length={len(blank)}")
print(f"{digit}length={len(digit)}")
# The three character classes are disjoint, so whatever is left over after
# subtracting them from the total length is the "others" count.
print(f"others_length={len(message)-len(words)-len(blank)-len(digit)}")
# Scan one annotation file line by line (presumably a single UniProt flat-file
# entry -- confirm against the input) and print the entry name, recommended
# protein name, organism and KEGG identifier found in it.
filename = r"C:\Users\guanh\Desktop\ZN558_HUMAN.txt"
with open(filename, 'r') as f:
    _list = f.readlines()

# Raw strings keep the regex escapes (\s, \w, \d, \() from being treated as
# invalid Python string escapes.
id_pattern = re.compile(r'ID\s+\w+\s+Reviewed;')
recname_pattern = re.compile(r'RecName:\sFull=[\w+(\s)]+')
sapiens_pattern = re.compile(r'OS\s{3}[\w+\s]+(\(\w+\))')
kegg_pattern = re.compile(r'KEGG;\s\w+:\d+;')

# Every field starts out empty so the summary line below can always be
# formatted, even when only some of the fields were found.
entry_id = recname = sapiens = kegg = ''
id_flag = recname_flag = sapiens_flag = kegg_flag = False

for line in _list:
    found = id_pattern.search(line)
    if found is not None:
        # Splitting on single spaces and taking piece 3 assumes exactly three
        # spaces between "ID" and the entry name.
        entry_id = found.group().split(' ')[3]
        id_flag = True

    found = recname_pattern.search(line)
    if found is not None:
        # Keep the text that follows "Full=".
        recname = found.group().split('=')[1]
        recname_flag = True

    found = sapiens_pattern.search(line)
    if found is not None:
        # "OS" plus three spaces yields three leading pieces; the remainder is
        # the organism text.
        sapiens = found.group().split(' ', maxsplit=3)[3]
        sapiens_flag = True

    found = kegg_pattern.search(line)
    if found is not None:
        # Drop the "KEGG;" prefix, then cut at the terminating ';'.
        kegg = found.group().split(' ', maxsplit=1)[1].split(';')[0]
        kegg_flag = True

# NOTE(review): the original indentation was lost; the summary is assumed to
# be printed once, after all lines have been scanned.
if (id_flag and recname_flag and sapiens_flag) or kegg_flag:
    print(f"{entry_id} {recname} {sapiens} {kegg}")
def is_search_ok(id_p, rec_p, sna_p, kegg_p, seg):
    """Extract the ID, organism, protein-name and optional KEGG fields from ``seg``.

    Each pattern is applied to ``seg`` with ``re.search``. The ID, RecName and
    OS matches are required; the KEGG match is optional.

    Args:
        id_p: Pattern (compiled or string) locating the ``ID ... Reviewed;`` text.
        rec_p: Pattern locating the ``RecName: Full=...`` text.
        sna_p: Pattern locating the ``OS ...`` organism text.
        kegg_p: Pattern locating the ``KEGG; ...;`` cross-reference.
        seg: The text of one entry to search.

    Returns:
        ``(ID, sna, rec)`` when only the KEGG match is missing,
        ``(ID, sna, rec, kegg)`` when all four patterns match, and ``None``
        when any of the three required patterns does not match.
    """
    id_m, rec_m, sna_m, kegg_m = (
        re.search(pattern, seg) for pattern in (id_p, rec_p, sna_p, kegg_p)
    )

    # Test the required matches explicitly: merely counting the None results
    # cannot tell which pattern failed, and calling .group() on a missing
    # required match would raise AttributeError.
    if id_m is None or rec_m is None or sna_m is None:
        return None

    # Splitting on single spaces assumes exactly three spaces after the
    # two-letter line code, which puts the value in piece 3.
    ID = id_m.group().split(' ')[3]
    sna = sna_m.group().split(' ', maxsplit=3)[3]
    # Keep the text that follows "Full=".
    rec = rec_m.group().split('=')[1]

    if kegg_m is None:
        return (ID, sna, rec)

    # Only the "KEGG;" prefix is removed; the trailing ';' of the match is
    # kept, as in the previous behaviour of this function.
    kegg = kegg_m.group().split(' ', maxsplit=1)[1]
    return (ID, sna, rec, kegg)
# Split a multi-entry file into entries, print the fields that is_search_ok
# extracts from each one, and finally print how many entries yielded a result.
filename = r"C:\Users\guanh\Desktop\uniprot_test.txt"
with open(filename, 'r') as f:
    file = f.read()

# Entries are separated by "//" with one whitespace character on each side.
seg_file = re.split(r'\s//\s', file)

# Raw strings keep the regex escapes from being treated as invalid Python
# string escapes.
id_pattern = re.compile(r'ID\s+\w+\s+Reviewed;')
recname_pattern = re.compile(r'RecName:\sFull=[\w+(\s)]+')
sapiens_pattern = re.compile(r'OS\s{3}[\w+\s]+(\(\w+\))')
kegg_pattern = re.compile(r'KEGG;\s\w+:\d+;')

count = 0
for seg in seg_file:
    # The fourth argument is the KEGG pattern; passing the organism pattern
    # twice would leave kegg_pattern unused and never extract a KEGG id.
    result = is_search_ok(id_pattern, recname_pattern, sapiens_pattern, kegg_pattern, seg)
    if result is not None:
        # Fields are written back to back on one line, then the line is ended.
        for res in result:
            print(res, end='')
        print('')
        count += 1
    else:
        # NOTE(review): the original indentation was lost; this branch is
        # assumed to pair with the `if`, so 'end' is printed for every segment
        # that yields no result.
        print('end')
print(count)
实验正则表达式记录
- 文件中的空格很难发现,用专门的包处理比较方便;`//` 和 `\` 等符号需要转义;`[]` 内的字符集在正则表达式中只匹配一个字符,除非加限定符(如 `+`、`*`)
- 写代码前的构思比较重要:如果先把 re.search 的结果存下来,后面就不会有那么多重复代码;其次,用 None、yes、no 等常见的是/否标记来做判断,比逐一比较各种结果更方便
- map函数,zip打包
- 相比于实验中的思路发现findall函数具有代码上的优越性
# For each entry, collect the findall results of the four patterns (in the
# order ID, RecName, OS, KEGG) and print them as one list of lists.
for seg in seg_file:
    _list = [
        re.findall(pattern, seg)
        for pattern in (id_pattern, recname_pattern, sapiens_pattern, kegg_pattern)
    ]
    print(_list)
结果中的一部分:
[['ID ZN558_HUMAN Reviewed;'], ['RecName: Full=Zinc finger protein 558'], ['(Human)'], ['KEGG; hsa:148156;']]
以上部分直接解决了原实验步骤中 is_search_ok 函数中的问题,后续清洗流程基本相同