建议在Jupyter实践。
1. 使用正则表达式匹配指定的氨基酸序列
import re
# 氨基酸序列
seq = 'VSVLTMFRYAGWLDRLYMLVGTQLAAIIHGVALPLMMLI'
# 正则表达式匹配
match = re.search(r'[A|G]W', seq)
# 打印match及匹配到开始位置和结束位置
print(match)
# <re.Match object; span=(10, 12), match='GW'>
print(match.start())
print(match.end())
if match:
# 打印匹配到氨基酸
print(match.group())
# GW
else:
print("no match!")
2. 使用正则表达式查找全部的氨基酸序列
import re
seq = 'RQSAMGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQRPSKP'
# 匹配R开头、第二个氨基酸为任意、第三个氨基酸为S或T、第四个氨基酸不为P的连续4个氨基酸徐磊
matches = re.findall(r'R.[ST][^P]', seq)
print(matches)
# ['RQSA', 'RRSL', 'RPSK']
# finditer 匹配对象迭代器
match_iter = re.finditer(r'R.[ST][^P]', seq)
# 遍历
for match in match_iter:
# 打印group和span
print(match.group(), match.span())
print(match.start(), match.end())
# RQSA (0, 4)
# 0 4
# RRSL (18, 22)
# 18 22
# RPSK (40, 44)
# 40 44
3. 使用正则表达式匹配多个特殊字符,分割字符串
import re
# 匹配特殊字符|和;,并分割字符串
annotation = 'ATOM:CA|RES:ALA|CHAIN:B;NUMRES:166'
split_string = re.split(r'[|;]', annotation)
print(split_string)
# ['ATOM:CA', 'RES:ALA', 'CHAIN:B', 'NUMRES:166']
4. 正则表达式获取核型染色体数量,区带和CNV大小
karyotype1 = '46,XY; -11{p11.2-p13, 48.32Mb}'
karyotype2 = '47,XXX; +X{+3};-11{p11.2-p13.2, 48.32Mb}'
#### 匹配染色体数量 ####
match = re.search(r'(\d+,\w+);', karyotype1)
print(match)
# <re.Match object; span=(0, 6), match='46,XY;'>
chr = match.group(1)
print(chr)
# 46,XY
#### 匹配染色体开始和结束区带和CNV大小 ####
match2 = re.search(r'([p|q|pter]\d+.?\d+)-([p|q|qter]\d+.?\d+), (\d+.?\d+)Mb', karyotype2)
print(match2)
cyto_start = match2.group(1)
cyto_end = match2.group(2)
size = match2.group(3)
print(cyto_start)
# p11.2
print(cyto_end)
# p13.2
print(size)
# 48.32
5. 正则表达式获取指定格式的字符串内容
# 结果变异VCF文件描述信息
string = """
##ALT=<ID=DEL,Description="Deletion">
##ALT=<ID=DUP,Description="Duplication">
##ALT=<ID=INV,Description="Inversion">
##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
##ALT=<ID=TRA,Description="Translocation">
##ALT=<ID=INS,Description="Insertion">
##FILTER=<ID=UNRESOLVED,Description="An insertion that is longer than the read and thus we cannot predict the full size.">
##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate in case of a translocation">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant">
##INFO=<ID=MAPQ,Number=1,Type=Integer,Description="Median mapping quality of paired-ends">
##INFO=<ID=RE,Number=1,Type=Integer,Description="read support">
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=PRECISE,Number=0,Type=Flag,Description="Precise structural variation">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of the SV">
##INFO=<ID=SVMETHOD,Number=1,Type=String,Description="Type of approach used to detect SV">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SEQ,Number=1,Type=String,Description="Extracted sequence from the best representative read.">
##INFO=<ID=STRANDS2,Number=4,Type=Integer,Description="alt reads first + ,alt reads first -,alt reads second + ,alt reads second -.">
##INFO=<ID=REF_strand,Number=.,Type=Integer,Description="plus strand ref, minus strand ref.">
##INFO=<ID=Strandbias_pval,Number=A,Type=Float,Description="P-value for fisher exact test for strand bias.">
##INFO=<ID=STD_quant_start,Number=A,Type=Float,Description="STD of the start breakpoints across the reads.">
##INFO=<ID=STD_quant_stop,Number=A,Type=Float,Description="STD of the stop breakpoints across the reads.">
##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Float,Description="Kurtosis value of the start breakpoints across the reads.">
##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Float,Description="Kurtosis value of the stop breakpoints across the reads.">
##INFO=<ID=SUPTYPE,Number=.,Type=String,Description="Type by which the variant is supported.(SR,AL,NR)">
##INFO=<ID=STRANDS,Number=A,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency.">
##INFO=<ID=ZMW,Number=A,Type=Integer,Description="Number of ZMWs (Pacbio) supporting SV.">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DR,Number=1,Type=Integer,Description="# high-quality reference reads">
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# high-quality variant reads">
"""
import re
# 创建空dataframe
df_output = pd.DataFrame()
list_type = []
list_id = []
list_description = []
# 遍历字符串内容,内容拷贝至结构变异VCF文件
for str in string.split('\n'):
# 去除末尾\n和字符串内空格
str = str.strip().replace(' ', '')
# 内容为空或字符串为空则跳过
if not str or str == '':
continue
# 正则表达式匹配##后的英文字符
match = re.search(r'##(\w+)', str)
type = match.group(1) if match else 'ERORR'
# 匹配ID内容
match = re.search(r'ID=(\w+)', str)
id = match.group(1) if match else 'ERORR'
# 匹配Description内容
match = re.search(r'Description=\"(.*?)\"', str)
description = match.group(1) if match else 'ERORR'
# 加入列表
list_type.append(type)
list_id.append(id)
list_description.append(description)
print(list_description)
# 加入dataframe
df_output['Type'] = list_type
df_output['ID'] = list_id
df_output['Description'] = list_description
# 保存至excel
df_output.to_excel('结构变异描述信息说明.xlsx', index=False)