MD标签的值记录了比对上的相对位置信息和错配(mismatch/deletion)信息。请注意:MD中并不包含insertion的信息。
使用re模块的正则表达式进行匹配,获取MD标签中记录的比对情况:
import re
def md_match(md_strings):
    """Split an MD tag value into its alternating number / mismatch tokens.

    The value is read left to right as a leading match length followed by
    zero or more ``(mismatch, match length)`` pairs, where a mismatch is
    either a run of upper-case bases (e.g. ``A`` or ``CT``) or a deletion
    written as ``^`` plus the deleted bases (e.g. ``^AG``).  MD carries no
    information about insertions.

    Args:
        md_strings: The MD value, e.g. ``"70T9A10T8"``.  A leading
            ``"MD:Z:"`` prefix is accepted and ignored.

    Returns:
        A tuple of strings ``(n0, x1, n1, x2, n2, ...)`` terminated by the
        first pair whose number is missing.  For a well-formed value that
        terminator is ``(None, None)``, e.g. ``"80A19"`` gives
        ``("80", "A", "19", None, None)``.

    Raises:
        ValueError: If the value does not start with a number (after the
            optional ``"MD:Z:"`` prefix).
    """
    head_re = re.compile(r"(?:MD:Z:)?([0-9]+)")
    pair_re = re.compile(r"([A-Z]+|\^[A-Z]+)*([0-9]+)*")

    head = head_re.match(md_strings)
    if head is None:
        raise ValueError(
            "MD tag must start with a number: {!r}".format(md_strings)
        )

    tokens = [head.group(1)]
    pos = head.end()
    # Scan one (mismatch, number) pair at a time instead of recompiling a
    # longer and longer pattern; every part of the pair is optional, so
    # pair_re always matches (possibly the empty string).
    while True:
        pair = pair_re.match(md_strings, pos)
        tokens.extend(pair.groups())
        # A missing number marks the end of the parsable part of the tag.
        if not pair.group(2):
            break
        pos = pair.end()
    return tuple(tokens)
进一步判断,比对的reads是否有较多错配:
import re
def is_good_match(md_strings, mis_times=3):
    """Return True if an MD tag records at most ``mis_times`` mismatch events.

    One mismatch event is either a run of substituted bases (``A``, ``CT``)
    or one deletion from the reference (``^`` plus the deleted bases, e.g.
    ``^AG``); a multi-base run counts as a single event.  MD carries no
    information about insertions, so they are never counted.

    Examples with the default ``mis_times=3``:
        ``100``             -> True  (no mismatch)
        ``80A19``           -> True  (one mismatch)
        ``70T9A10T8``       -> True  (three mismatches)
        ``69CT9A10T8``      -> True  (three events; the run ``CT`` is one)
        ``69^AG9A10T8``     -> True  (three events; deletion ``^AG`` is one)
        ``20G48CT9^T10T8``  -> False (four events)

    Args:
        md_strings: The MD value, e.g. ``"70T9A10T8"``.  A leading
            ``"MD:Z:"`` prefix is accepted and ignored.
        mis_times: Largest number of mismatch events still considered a
            good match (default 3).

    Returns:
        True when the tag holds ``mis_times`` or fewer mismatch events,
        False otherwise.

    Raises:
        ValueError: If the value does not start with a number (after the
            optional ``"MD:Z:"`` prefix).
    """
    num_re = r"(?:MD:Z:)?([0-9]+)"
    mis_num_re = r"([A-Z]+|\^[A-Z]+)*([0-9]+)*"
    # Ask for one pair more than allowed: if the tag has few enough events,
    # at least one optional group stays unmatched and is reported as None.
    re_md = re.compile(num_re + mis_num_re * (int(mis_times) + 1))
    matched = re_md.match(md_strings)
    if matched is None:
        raise ValueError(
            "MD tag must start with a number: {!r}".format(md_strings)
        )
    return None in matched.groups()