# 这里是使用Python对四种核苷酸(A、C、G、T)的压缩实现
class CompressedGene:
    """Store a nucleotide sequence packed two bits per base in one ``int``.

    Each nucleotide is mapped to a two-bit code (A=00, C=01, G=10, T=11) and
    appended to ``bit_string``.  ``bit_string`` starts at 1, so the highest
    set bit is a sentinel: it keeps leading ``A`` (00) codes from being lost
    and marks where the packed sequence begins.

    Attributes:
        bit_string: The sentinel bit followed by two bits per nucleotide,
            first nucleotide in the most significant position.
    """

    # Two-bit code assigned to each nucleotide.
    _NUCLEOTIDE_TO_BITS = {"A": 0b00, "C": 0b01, "G": 0b10, "T": 0b11}
    # Inverse table: the character at index ``code`` is that code's nucleotide.
    _BITS_TO_NUCLEOTIDE = "ACGT"

    def __init__(self, gene: str) -> None:
        """Compress *gene*, a string made of the letters A, C, G and T.

        Lower-case letters are accepted; the sequence is upper-cased first.

        Raises:
            ValueError: If *gene* contains any other character.
        """
        self._compress(gene)

    def _compress(self, gene: str) -> None:
        """Pack *gene* into ``self.bit_string``.

        ``self.bit_string`` is assigned only after the whole sequence has
        been validated, so a failed call never leaves a half-built value.

        Raises:
            ValueError: If *gene* contains a character other than A/C/G/T.
        """
        packed = 1  # sentinel bit, see the class docstring
        for nucleotide in gene.upper():
            code = self._NUCLEOTIDE_TO_BITS.get(nucleotide)
            if code is None:
                raise ValueError("无效核苷酸:{}".format(nucleotide))
            # Make room for two more bits, then store this nucleotide's code.
            packed = (packed << 2) | code
        self.bit_string: int = packed

    def decompress(self) -> str:
        """Return the stored sequence as an upper-case string of A/C/G/T."""
        # bit_length() - 1 excludes the sentinel bit.  The shifts are walked
        # from the highest pair down so the nucleotides come out in their
        # original order without a final reversal.
        shifts = range(0, self.bit_string.bit_length() - 1, 2)
        return "".join(
            self._BITS_TO_NUCLEOTIDE[(self.bit_string >> shift) & 0b11]
            for shift in reversed(shifts)
        )

    def __str__(self) -> str:
        """Return the decompressed sequence."""
        return self.decompress()
if __name__ == "__main__":
    from sys import getsizeof

    # Demo: compare the in-memory size of a long sequence stored as a str
    # with the size of the packed int, then check that it round-trips.
    motif = "TACGAAGTCAGTCATGCCCGAACTTGTACTGGAATACATGATCATGTCAGTCACGTGCTACGGGCTATATCAAAACGTCCTGCTTTATA"
    original: str = motif * 100

    # Size of the plain string.
    original_size = getsizeof(original)
    print("original is {}bytes".format(original_size))

    # Size of the packed integer held by the compressed object.
    compressed: CompressedGene = CompressedGene(original)
    compressed_size = getsizeof(compressed.bit_string)
    print("compressed is {} bytes".format(compressed_size))

    # Printing the object goes through __str__, i.e. decompresses it.
    print(compressed)

    round_trip_ok = original == compressed.decompress()
    print("original and decompressed are the same:{}".format(round_trip_ok))