正则实验四

# Experiment 1: sort every character typed by the user into ASCII letters,
# spaces, decimal digits or "others", then print each group and its size.
import re  # used by the findall calls below; no import appears earlier in this file

message = input('input words>')  # input() already returns a str
words = re.findall('[a-zA-Z]', message)
blank = re.findall(' ', message)
digit = re.findall('[0-9]', message)
print(f"{words}length={len(words)}")
print(f"{blank}length={len(blank)}")
print(f"{digit}length={len(digit)}")
# Anything that is not a letter, a space or a digit is counted as "other".
print(f"others_length={len(message)-len(words)-len(blank)-len(digit)}")

# Experiment 2: scan one record file line by line and pull out the entry
# name (ID line), the RecName full name, the OS line value and the KEGG
# cross-reference, then print them on one line.
filename = r"C:\Users\guanh\Desktop\ZN558_HUMAN.txt"

# Raw strings keep "\s", "\w" and "\d" from being parsed as (invalid) string
# escapes; the compiled regular expressions are unchanged.
id_pattern = re.compile(r'ID\s+\w+\s+Reviewed;')
recname_pattern = re.compile(r'RecName:\sFull=[\w+(\s)]+')
sapiens_pattern = re.compile(r'OS\s{3}[\w+\s]+(\(\w+\))')
kegg_pattern = re.compile(r'KEGG;\s\w+:\d+;')

# None means "not found"; this replaces the separate 0/1 flags and guarantees
# every name exists when the result is printed.
entry_id = recname = sapiens = kegg = None
with open(filename, 'r') as f:
    for line in f:
        # Each pattern is searched once per line; when several lines match
        # the same pattern, the last match wins.
        found = id_pattern.search(line)
        if found is not None:
            # Takes the 4th piece when splitting on single spaces, i.e. this
            # assumes exactly three spaces follow "ID".
            entry_id = found.group().split(' ')[3]
        found = recname_pattern.search(line)
        if found is not None:
            recname = found.group().split('=')[1]
        found = sapiens_pattern.search(line)
        if found is not None:
            sapiens = found.group().split(' ', 3)[3]
        found = kegg_pattern.search(line)
        if found is not None:
            # "KEGG; hsa:148156;" -> "hsa:148156"
            kegg = found.group().split(' ', 1)[1].split(';')[0]

# ID, RecName and OS are required; KEGG is appended only when it was found.
# Nothing is printed when a required field is missing.
if entry_id is not None and recname is not None and sapiens is not None:
    fields = [entry_id, recname, sapiens]
    if kegg is not None:
        fields.append(kegg)
    print(' '.join(fields))

def is_search_ok(id_p, rec_p, sna_p, kegg_p, seg):
    """Extract the ID, OS and RecName values (plus KEGG when present) from one record.

    Each pattern is searched in ``seg`` with ``re.search``, so compiled
    patterns and pattern strings are both accepted.

    Args:
        id_p: Pattern for the ID text; the value returned is the 4th piece of
            the match when split on single spaces.
        rec_p: Pattern for the RecName text; the value returned is whatever
            follows the first ``=`` in the match.
        sna_p: Pattern for the OS text; the value returned is everything after
            the third space of the match.
        kegg_p: Pattern for the KEGG text; the value returned is everything
            after the first space of the match (a trailing ``;`` is kept).
        seg: Text of a single record.

    Returns:
        ``(ID, sna, rec, kegg)`` when all four patterns match,
        ``(ID, sna, rec)`` when only the KEGG pattern does not match, and
        ``None`` when any of the ID, RecName or OS patterns does not match.
    """
    id_m, rec_m, sna_m, kegg_m = (
        re.search(pattern, seg) for pattern in (id_p, rec_p, sna_p, kegg_p)
    )

    # Check the three required matches by name. Counting None values cannot
    # tell which pattern failed, and calling .group() on a missing match
    # raises AttributeError.
    if id_m is None or rec_m is None or sna_m is None:
        return None

    entry_id = id_m.group().split(' ')[3]
    sna = sna_m.group().split(' ', 3)[3]
    rec = rec_m.group().split('=')[1]

    if kegg_m is None:
        return (entry_id, sna, rec)

    # NOTE(review): the trailing ';' is deliberately left on the KEGG value,
    # as in the previous implementation where stripping it was commented out.
    kegg = kegg_m.group().split(' ', 1)[1]
    return (entry_id, sna, rec, kegg)

    # Pre-optimization version of this function, kept for reference:
    # if re.search(id_p, seg) is not None and re.search(rec_p, seg) is not None and re.search(
    #         sna_p, seg) is not None and re.search(kegg_p, seg) is not None:
    #     id = re.search(id_p, seg).group()
    #     id = re.split(' ', id)[3]
    #
    #     recname = re.search(rec_p, seg).group()
    #     recname = re.split('=', recname)[1]
    #
    #     sapiens = re.search(sna_p, seg).group()
    #     sapiens = re.split(' ', sapiens, maxsplit=3)[3]
    #
    #     kegg = re.search(kegg_p, seg).group()
    #     kegg = re.split(' ', kegg, maxsplit=1)[1]
    #     kegg = re.split(';', kegg)[0]
    #
    #     return (id, recname, sapiens, kegg)
    # elif re.search(id_p, seg) is not None and re.search(rec_p, seg) is not None and re.search(
    #         sna_p, seg) is not None :
    #     id = re.search(id_p, seg).group()
    #     id = re.split(' ', id)[3]
    #
    #     recname = re.search(rec_p, seg).group()
    #     recname = re.split('=', recname)[1]
    #
    #     sapiens = re.search(sna_p, seg).group()
    #     sapiens = re.split(' ', sapiens, maxsplit=3)[3]
    #     return (id,recname,sapiens)


# Experiment 3: split a multi-record file wherever "//" is surrounded by
# whitespace, run is_search_ok on every piece, print the extracted values and
# finally the number of records that yielded a result.
filename=r"C:\Users\guanh\Desktop\uniprot_test.txt"
with open(filename,'r') as f:
    file=f.read()

seg_file=re.split(r'\s//\s',file)
# Raw strings keep "\s", "\w" and "\d" from being parsed as (invalid) string
# escapes; the compiled regular expressions are unchanged.
id_pattern = re.compile(r'ID\s+\w+\s+Reviewed;')
recname_pattern=re.compile(r'RecName:\sFull=[\w+(\s)]+')
sapiens_pattern=re.compile(r'OS\s{3}[\w+\s]+(\(\w+\))')
kegg_pattern=re.compile(r'KEGG;\s\w+:\d+;')

count=0
for seg in seg_file:
    # The fourth argument must be the KEGG pattern. Passing sapiens_pattern
    # twice meant the KEGG value was never extracted and the OS text was
    # returned a second time in its place.
    result=is_search_ok(id_pattern,recname_pattern,sapiens_pattern,kegg_pattern,seg)
    if result is None:
        # No usable record in this piece.
        print('end')
        continue
    # The values are written back to back with no separator, then a newline.
    for res in result:
        print(res,end='')
    print('')
    count+=1

print(count)

实验正则表达式记录

  1. 文件中的空格很难发现,用专门的包处理比较方便;注意转义 // 符号和 \ 符号;[] 内的字符集在正则表达式中只匹配一个字符,除非加限定符(如 + 或 *)
  2. 写代码前的构思比较重要:如果能够先把 re.search 的结果存储起来,后面就不会有那么多重复代码;其次,用 None、yes、no 等常见的是/否标记来做判断,比逐一比较各种结果更方便
  3. map函数,zip打包
  4. 相比于实验中的思路发现findall函数具有代码上的优越性
 for seg in seg_file:
     # Apply all four patterns to the record; each entry of _list is the
     # findall result of one pattern, in the order listed here.
     _list = [
         re.findall(pattern, seg)
         for pattern in (id_pattern, recname_pattern, sapiens_pattern, kegg_pattern)
     ]
     print(_list)
结果中的一部分:
[['ID   ZN558_HUMAN             Reviewed;'], ['RecName: Full=Zinc finger protein 558'], 		['(Human)'], ['KEGG; hsa:148156;']]
以上部分直接解决了原实验步骤中 is_search_ok 函数中的问题,后续清洗流程基本相同
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值