1、统计各碱基的数量, 计算整条序列长度
seq = ''
with open('./1.txt', "r") as f:
for line in f:
if line[0] != ">":
#print(line)
line = line.strip().upper()
seq += line
#统计碱基
print("A={0}".format(seq.count("A")))
print("T={0}".format(seq.count("T")))
print("G={0}".format(seq.count("G")))
print("C={0}".format(seq.count("C")))
#统计长度
print("length={0}".format(len(seq)))
b='AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
print("A={0}".format(b.count("A")))
print("G={0}".format(b.count("G")))
print("C={0}".format(b.count("C")))
print("T={0}".format(b.count("T")))
print("length={0}".format(len(b)))
f = 'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
counts = []
for i in ['A','G','C','T']:
counts.append(f.count(i))
print (' '.join(map(str, counts)))
print ('seqlength =',len(f))
2、将DNA序列转换为RNA序列
import re
dna = 'GATGGAACTTGACTACGTAAATT'
rnaseq1 = re.sub('T' , 'U', dna)
rnaseq2 = dna.replace('T', 'U')
print(rnaseq1)
print(rnaseq2)
import re
dnaseq = ''
g = open ('C:/Users/Administrator/Desktop/base1.txt', 'w')
f = open ('C:/Users/Administrator/Desktop/base.txt')
for line in f:
if line[0] != '>':
line = line.strip()
dnaseq = dnaseq+line
rnaseq1 = re.sub('T' , 'U', dnaseq)
g.write(rnaseq1)
f.close()
g.close()
str = 'GATGGAACTTGACTACGTAAATT'
print(str.replace('T','U'))
3、打印出DNA互补链和反向互补链
def complement(seq):
ntCom = {'A':'T', 'C':'G', 'T':'A', 'G':'C'}
dna1= list(seq)
dna2 = [ntCom[k] for k in dna1]
dna3 = ''.join(dna2)
return dna3
seq = 'TGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCGTGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGAT'
print (complement(seq))
4、斐波那契兔子繁殖问题:
n个月后,每对兔子至少繁殖k对兔子,最后兔子总数是?
def fibonacciRabbits(n,k):
F = [0,1,1]
generation = 3
while generation <= n:
F.append(F[generation-1]+F[generation-2]*k)
generation += 1
return (F[n])
fibonacciRabbits(5,3)
def fibonacciRabbits(n,k):
if n <= 2:
return (1)
else:
return (fibonacciRabbits(n-1,k) + fibonacciRabbits(n-2,k)*k)
print (fibonacciRabbits(5,3))
斐波那契数列
i, j = 1, 1
while i < 10000:
print (i)
i, j = j, i+j
# 递归
def Fibonacci_Recursion_tool(n):
if n <= 0:
return 0
elif n == 1:
return 1
else:
return Fibonacci_Recursion_tool(n - 1) + Fibonacci_Recursion_tool(n - 2)
def Fibonacci_Recursion(n):
result_list = []
for i in range(1, n + 1):
result_list.append(Fibonacci_Recursion_tool(i))
return result_list
Fibonacci_Recursion(15)
[1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610]
5、计算GC含量最大的DNA序列
### 5. Computing GC Content ###
from operator import itemgetter
from collections import OrderedDict
seqTest = OrderedDict()
gcContent = OrderedDict()
with open('./5.txt', 'rt') as f:
for line in f:
line = line.rstrip()
if line.startswith('>'):
seqName = line[1:]
seqTest[seqName] = ''
continue
seqTest[seqName] += line.upper()
for ke, val in seqTest.items():
totalLength = len(val)
gcNumber = val.count('G') + val.count('C')
gcContent[ke] = float(gcNumber/totalLength)*100
sortedGCContent = sorted(gcContent.items(), key = itemgetter(1))
largeName = sortedGCContent[-1][0]
largeGCContent = sortedGCContent[-1][1]
print ('most GC ratio gene is %s and it is %s ' %(largeName,largeGCContent))
# to open FASTA format sequence file:
s=open('./5.txt','r').readlines()
# to create two lists, one for names, one for sequences
name_list=[]
seq_list=[]
data='' # to put the sequence from several lines together
for line in s:
line=line.strip()
for i in line:
if i == '>':
name_list.append(line[1:])
if data:
seq_list.append(data) #将每一行的的核苷酸字符串连接起来
data='' # 合完后data 清零
break
else:
line=line.upper()
if all([k==k.upper() for k in line]): #验证是不是所有的都是大写
data=data+line
seq_list.append(data) # is there a way to include the last sequence in the for loop?
GC_list=[]
for seq in seq_list:
i=0
for k in seq:
if k=="G" or k=='C':
i+=1
GC_cont=float(i)/len(seq)*100.0
GC_list.append(GC_cont)
m=max(GC_list)
print (name_list[GC_list.index(m)]) # to find the index of max GC
print ("{:0.6f}".format(m)) # 保留6位小数
def parse_fasta(s):
results = {}
strings = s.strip().split('>')
# Python split()通过指定分隔符对字符串进行切片,如果参数num 有指定值,则仅分隔 num 个子字符串
for s in strings:
if len(s) == 0:
continue
# 如果字符串长度为0,就跳出循环。
parts = s.split()
label = parts[0]
bases = ''.join(parts[1:])
results[label] = bases
return results
def gc_content(s):
n = len(s)
m = 0
for c in s:
if c == 'G' or c == 'C':
m += 1
return 100 * (float(m) / n)
if __name__ == "__main__":
small_dataset = """
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
"""
#large_dataset = open('datasets/rosalind_gc.txt').read()
results = parse_fasta(small_dataset)
results = dict([(k, gc_content(v)) for k, v in results.items()])
# 这里iteritem()和item()功能是一样的
# 前一个results输出,名称+序列,后一个results输出,名称+百分比
highest_k = None
highest_v = 0
for k, v in results.items():
if v > highest_v:
highest_k = k
highest_v = v
# 输出GC含量高的
print (highest_k)
print ('%f%%' % highest_v)
6、计算点突变个数
with open('./6.txt', 'rt') as f:
seq = f.readlines()
seq1,seq2 = seq[0].strip(),seq[1].strip()
HD = 0
for i in range(len(seq1)):
if seq1[i] != seq2[i]:
HD += 1
print (HD)
s = 'GAGCCTACTAACGGGAT'
t = 'CATCGTAATGACGGCCT'
count = 0
for i in range(len(s)):
if s[i] != t[i]:
count +=1
print (count)
7、孟德尔第一定律
from scipy.misc import comb
individuals = input('Number of individuals(k,m,n):')
[k, m, n] = map(int,individuals.split(','))
t = k+m+n
rr = comb(n,2)/comb(t,2)
hh = comb(m,2)/comb(t,2)
hr = comb(n,1)*comb(m,1)/comb(t,2)
prob = 1 - (rr+hh*1/4+hr*1/2)
print(rr, hh,hr)
print (prob)
def f(x, y, z):
s = x + y + z # the sum of population
c = s * (s - 1) / 2.0 # comb(2,s)
p = 1 - (z * (z - 1) / 2 + 0.25 * y * (y - 1) / 2 + y * z * 0.5) / c
return p
print (f(2, 2, 2))
8、RNA序列翻译为蛋白质
def translate_rna(sequence):
codonTable = {
'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACU':'T',
'AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R',
'CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P',
'CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R',
'GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A',
'GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G',
'UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S',
'UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L',
'UAC':'Y', 'UAU':'Y', 'UAA':'', 'UAG':'',
'UGC':'C', 'UGU':'C', 'UGA':'', 'UGG':'W',
}
proteinsequence = ''
for n in range(0,len(sequence),3):
proteinsequence += codonTable[sequence[n:n+3]]
return proteinsequence
se = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA' #sequence or se = open('rosalind_prot.txt').read().strip('\n') #sequence
print(translate_rna(se))
import re
def mRNA_protein(RNA_string):
start_code = 'AUG'
end_code = ['UAA', 'UAG', 'UGA']
protein_table = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', \
'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', \
'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', \
'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', \
'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', \
'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', \
'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', \
'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', \
'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', \
'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', \
'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', \
'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', \
'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', \
'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', \
'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', \
'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
}
#找到起始密码子的位置
start_sit = re.search(start_code, RNA_string)
protein = ''
#按阅读框匹配蛋白质
for sit in range(start_sit.start(), len(RNA_string), 3):
protein = protein + protein_table[RNA_string[sit:sit+3]]
return protein
if __name__ == '__main__':
RNA_string = 'ACGGGGAUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'
print(mRNA_protein(RNA_string))
import re
from collections import OrderedDict
codonTable = OrderedDict()
with open('rna_codon_table.txt') as f:
for line in f:
line = line.rstrip()
lst = re.split('\s+', line) #\s+ 匹配空格1次或无限次
for i in [0, 2, 4, 6]:
codonTable[lst[i]] = lst[i + 1]
rnaSeq = ''
with open('rosalind_prot.txt', 'rt') as f:
for line in f:
line = line.rstrip()
rnaSeq += line.upper()
aminoAcids = []
i = 0
while i < len(rnaSeq):
codon = rnaSeq[i:i + 3]
if codonTable[codon] != 'Stop':
aminoAcids.append(codonTable[codon])
i += 3
peptide = ''.join(aminoAcids)
print (peptide)
9、寻找DNA Motif
import re
matches = re.finditer('(?=ATAT)', 'GATATATGCATATACTT')
for match in matches:
print (match.start()+1,end = '\t')
seq = 'GATATATGCATATACTT'
pattern = 'ATAT'
def find_motif(seq, pattern):
position = []
for i in range(len(seq) - len(pattern)):
if seq[i:i + len(pattern)] == pattern:
position.append(str(i + 1))
print ('\t'.join(position))
find_motif(seq, pattern)
import re
seq='GATATATGCATATACTT'
print ([i.start()+1 for i in re.finditer('(?=ATAT)',seq)])
10、寻找共同祖先
import numpy as np
import os
from collections import Counter
fhand = open('./10.txt')
t = []
for line in fhand:
if line.startswith('>'):
continue
else:
line = line.rstrip()
line_list = list(line)
t.append(line_list)
a = np.array(t)#创建一个二维数组
print(a)
L1,L2,L3,L4 = [], [], [], []
comsquence=''
for i in range(a.shape[1]):
l = [x[i] for x in a] #调出二维数组的每一列
L1.append(l.count('A'))
L2.append(l.count('C'))
L3.append(l.count('T'))
L4.append(l.count('G'))
comsquence=comsquence+Counter(l).most_common()[0][0]
print (comsquence)
print ('A:',L1,'\n','C:',L2,'\n','T:',L3,'\n','G:',L4)
from collections import Counter
from collections import OrderedDict
fh = open('./11.txt', 'wt')
rosalind = OrderedDict()
seqLength = 0
with open('./10.txt') as f:
for line in f:
line = line.rstrip()
if line.startswith('>'):
rosalindName = line
rosalind[rosalindName] = ''
continue
rosalind[rosalindName] += line
seqLength = len(rosalind[rosalindName]) #len(ATCCAGCT)
A, C, G, T = [], [], [], []
consensusSeq = ''
for i in range(seqLength):
seq = ''
for k in rosalind.keys(): # rosalind.keys = Rosalind_1...Rosalind_7
seq += rosalind[k][i] # 把 Rosalind_1...Rosalind_7 相同顺序的碱基加起来
A.append(seq.count('A'))
C.append(seq.count('C'))
G.append(seq.count('G'))
T.append(seq.count('T'))
counts = Counter(seq) # 为seq计数
consensusSeq += counts.most_common()[0][0]
print(consensusSeq)#从多到少返回,是个list啊,只返回第一个
fh.write(consensusSeq + '\n')
fh.write('\n'.join(['A:\t' + '\t'.join(map(str, A)), 'C:\t' + '\t'.join(map(str, C)),
'G:\t' + '\t'.join(map(str, G)), 'T:\t' + '\t'.join(map(str, T))]))
#.join(map(str,A)) 把 A=[5, 1, 0, 0, 5, 5, 0, 0] 格式转化成 A:5 1 0 0 5 5 0 0
fh.close()
12、重叠序列
from collections import OrderedDict
import re
def overlap_graph(dna,n):
edges = []
for key1, val1 in dna:
for key2, val2 in dna:
if key1 != key2 and val1[-n:] == val2[0:n]:
edges.append(key1 +'\t' +key2)
return edges
seq = OrderedDict()
with open ('./12.txt', 'r') as f:
for line in f:
line = line.rstrip()
if line.startswith('>'):
seqName = re.sub('>','',line)
seq[seqName] = ''
continue
seq[seqName] += line.upper()
fh = open('./12.1.txt', 'wt')
for x in overlap_graph(dna,3):
fh.write(x + '\n')
fh.close()
seq_list = []
stseq = ''
for line in open('./12.txt','r'):
if line[0] == '>':
if stseq != '':
seq_list.append([stname, stseq])
stseq = ''
stname = line[1:-1]
else:
stseq = stseq + line.strip('\n')
seq_list.append([stname, stseq])
l = len(seq_list)
for i in range(0, l):
for j in range(0, i):
if seq_list[i][1] == seq_list[j][1]:
continue
if seq_list[i][1][0:3] == seq_list[j][1][-3:]:
print (seq_list[j][0], seq_list[i][0])
if seq_list[i][1][-3:] == seq_list[j][1][0:3]:
print (seq_list[i][0], seq_list[j][0])
13、计算后代的概率
a = int(input('AA-AA:'))
b = int(input('AA-Aa:'))
c = int(input('AA-aa:'))
d = int(input('Aa-Aa:'))
e = int(input('Aa-aa:'))
f = int(input('aa-aa:'))
def fun(a,b,c,d,e,f):
x1 = 1*a
x2 = 1*b
x3 = 1*c
x4 = 0.75*d
x5 = 0.5*e
x6 = 0*f
return sum([x1,x2,x3,x4,x5,x6])*2
print(fun(a,b,c,d,e,f))
14、发现共享Motif
s = """>Rosalind_1
GATTACA
>Rosalind_2
TAGACCA
>Rosalind_3
ATACA""".split(">")[1:]
for i in range(len(s)):
s[i] = s[i].replace("\n", '')
while s[i][0] not in "ACGT":
s[i] = s[i][1:]
# ^^^^^^^^^^^^^ all of that to format in FAST in array
#Get shortest of DNA strings
index = s.index(min(s, key=len))
motif = 'A'
shortest = s[index]
#cycle over the DNA string letters
for i in range(len(shortest)):
n = 0
present = True
while present:
#cycle inside over all other DNA strings and if it's present in there considered a motif and length gets increased by 1
for each in s:
if shortest[i:i+n] not in each or n>1000:
present = False
break
if present:
motif = max(shortest[i:i+n], motif, key=len)
n += 1
print (motif)
def readfasta(filename, sample):
fa = open('./14.txt', 'r')
fo = open('./14.1.txt', 'w')
res = {}
rres = []
ID = ''
for line in fa:
if line.startswith('>'):
ID = line.strip('\n')
res[ID] = ''
else:
res[ID] += line.strip('\n')
for key in res.values():
rres.append(key)
fo.write(key + '\n')
return rres
def fragement(seq_list):
res = []
seq = seq_list[0]
for i in range(len(seq)):
s_seq = seq[i:]
#print s_seq
for j in range(len(s_seq)):
res.append(s_seq[:(len(s_seq) - j)])
#print res
return res
def main(infile, sample):
seq_list = readfasta(infile, sample) #['TAGACCA','ATACA','GATTACA']
print(seq_list)
frags = fragement(seq_list)
frags.sort(key=len, reverse=True) # 从长到短排列
for i in range(len(frags)):
ans = []
# s = 0
# m+=1
# print(m)
# res[frags[i]] = 0
for j in seq_list:
r = j.count(frags[i])
if r != 0:
ans.append(r)
if len(ans) >= len(seq_list):
print(frags[i])
break
main('14.txt', 'sample.txt')
15、Independent Alleles
import itertools
def f(k,n):
p = []
child_num = 2**k
for i in range(n):
p.append(len(list(itertools.combinations([x for x in range(child_num)],i)))*(0.25**i)*(0.75**(child_num-i)))
# combinations('ABCD', 2) AB AC AD BC BD CD
return 1-sum(p)
print (f(5,8))
16、Finding a Protein Motif
import urllib.request
import re
list = ['A2Z669','B5ZC00','P07204_TRBM_HUMAN','P20840_SAG1_YEAST']
for one in list:
name = one.strip('\n')
url = 'http://www.uniprot.org/uniprot/'+name+'.fasta'
req = urllib.request.Request(url)
response = urllib.request.urlopen(req) #在python3.3里面,用urllib.request代替urllib2
the_page = response.read()
start = the_page.decode().find('\nM') #str→bytes:encode()方法。str通过encode()方法可以转换为bytes。
# bytes→str:decode()方法。如果我们从网络或磁盘上读取了字节流,那么读到的数据就是bytes。
#要把bytes变为str,就需要用decode()方法。
seq = the_page[start+1:].decode().replace('\n','')
seq = ' '+seq
regex = re.compile(r'N(?=[^P][ST][^P])')
index = 0
out = []
'''
out = [m.start() for m in re.finditer(regex, seq)]
'''
index = 0
while(index<len(seq)):
index += 1
if re.search(regex,seq[index:]) == None:
break
#print S[index:]
if re.match(regex,seq[index:]) != None:
out.append(index)
if out != []:
print (name)
print (' '.join([ str(i) for i in out]))