rosalind练习题

最新推荐文章于 2023-12-06 16:45:40 发布

qq_38790244

最新推荐文章于 2023-12-06 16:45:40 发布

阅读量1.4k

点赞数 2

分类专栏： python

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

1、统计各碱基的数量, 计算整条序列长度

seq = ''

with open('./1.txt', "r") as f:
    for line in f:
        if line[0] != ">":
            #print(line)
            line = line.strip().upper()
            
            seq += line
		

#统计碱基
print("A={0}".format(seq.count("A")))
print("T={0}".format(seq.count("T")))
print("G={0}".format(seq.count("G")))
print("C={0}".format(seq.count("C")))

#统计长度
print("length={0}".format(len(seq)))

b='AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
print("A={0}".format(b.count("A")))
print("G={0}".format(b.count("G")))
print("C={0}".format(b.count("C")))
print("T={0}".format(b.count("T")))
print("length={0}".format(len(b)))

f = 'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
counts = []
for i in ['A','G','C','T']: 
    counts.append(f.count(i))
print (' '.join(map(str, counts)))
print ('seqlength =',len(f))

2、将DNA序列转换为RNA序列

import re 
dna = 'GATGGAACTTGACTACGTAAATT'

rnaseq1 = re.sub('T' , 'U', dna)
rnaseq2 = dna.replace('T', 'U')
print(rnaseq1)
print(rnaseq2)

import re
dnaseq = ''
g = open ('C:/Users/Administrator/Desktop/base1.txt', 'w')
f = open ('C:/Users/Administrator/Desktop/base.txt')
for line in f:
    if line[0] != '>':
        line = line.strip()
        dnaseq = dnaseq+line
    
rnaseq1 = re.sub('T' , 'U', dnaseq)
g.write(rnaseq1)
f.close()
g.close()

str = 'GATGGAACTTGACTACGTAAATT'
print(str.replace('T','U'))

3、打印出DNA互补链和反向互补链

def complement(seq):
    ntCom = {'A':'T', 'C':'G', 'T':'A', 'G':'C'}
     
    dna1= list(seq)
    dna2 = [ntCom[k] for k in dna1]
    dna3 = ''.join(dna2)
    return dna3
    
seq = 'TGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCGTGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGAT'
         
print (complement(seq))

4、斐波那契兔子繁殖问题：
n个月后，每对兔子至少繁殖k对兔子，最后兔子总数是？

def fibonacciRabbits(n,k):
    F = [0,1,1]
    generation = 3
    while generation <= n:
        F.append(F[generation-1]+F[generation-2]*k)
        generation += 1
 
    return (F[n])
 
fibonacciRabbits(5,3)

def fibonacciRabbits(n,k):
    if n <= 2:
        return (1)
    else:
        return (fibonacciRabbits(n-1,k) + fibonacciRabbits(n-2,k)*k)
print (fibonacciRabbits(5,3))

斐波那契数列

i, j = 1, 1
while i < 10000:
    print (i)
    i, j = j, i+j

# 递归
def Fibonacci_Recursion_tool(n):
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    else:
        return Fibonacci_Recursion_tool(n - 1) + Fibonacci_Recursion_tool(n - 2)


def Fibonacci_Recursion(n):
    result_list = []
    for i in range(1, n + 1): 
        result_list.append(Fibonacci_Recursion_tool(i))
    return result_list

Fibonacci_Recursion(15)

[1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610]

5、计算GC含量最大的DNA序列

### 5. Computing GC Content ###
from operator import itemgetter
from collections import OrderedDict
  
seqTest = OrderedDict()
gcContent = OrderedDict()
  
with open('./5.txt', 'rt') as f:
    for line in f:  
        line = line.rstrip()
        if line.startswith('>'):
            seqName = line[1:]
            seqTest[seqName] = ''
            continue
        seqTest[seqName] += line.upper()
  
  
for ke, val in seqTest.items():
    totalLength = len(val)
    gcNumber = val.count('G') + val.count('C')
    gcContent[ke] = float(gcNumber/totalLength)*100
      
sortedGCContent = sorted(gcContent.items(), key = itemgetter(1))
  
largeName = sortedGCContent[-1][0]
largeGCContent = sortedGCContent[-1][1]
 
print ('most GC ratio gene is %s and it is %s ' %(largeName,largeGCContent))

# to open FASTA format sequence file:
s=open('./5.txt','r').readlines()
 
# to create two lists, one for names, one for sequences
name_list=[]
seq_list=[]
 
data='' # to put the sequence from several lines together
 
for line in s:
    line=line.strip()
    for i in line:
        if i == '>':
            name_list.append(line[1:])
            if data:
                seq_list.append(data)         #将每一行的的核苷酸字符串连接起来
                data=''                       # 合完后data 清零
            break
        else:
            line=line.upper()
    if all([k==k.upper() for k in line]):    #验证是不是所有的都是大写
        data=data+line
seq_list.append(data)                         # is there a way to include the last sequence in the for loop?
GC_list=[]
for seq in seq_list:
    i=0
    for k in seq:
        if k=="G" or k=='C':
            i+=1
    GC_cont=float(i)/len(seq)*100.0
    GC_list.append(GC_cont)
 
 
m=max(GC_list)
print (name_list[GC_list.index(m)])             # to find the index of max GC
print ("{:0.6f}".format(m))                  # 保留6位小数

def parse_fasta(s):
    results = {}
    strings = s.strip().split('>')
    # Python split()通过指定分隔符对字符串进行切片，如果参数num 有指定值，则仅分隔 num 个子字符串
 
    for s in strings:
        if len(s) == 0:
            continue
            # 如果字符串长度为0，就跳出循环。
 
        parts = s.split()
        label = parts[0]
        bases = ''.join(parts[1:])
 
        results[label] = bases
 
    return results
 
def gc_content(s):
    n = len(s)
    m = 0
 
    for c in s:
        if c == 'G' or c == 'C':
            m += 1
 
    return 100 * (float(m) / n)
 
if __name__ == "__main__":
 
    small_dataset = """
    >Rosalind_6404
    CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
    TCCCACTAATAATTCTGAGG
    >Rosalind_5959
    CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
    ATATCCATTTGTCAGCAGACACGC
    >Rosalind_0808
    CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
    TGGGAACCTGCGGGCAGTAGGTGGAAT
    """
 
    #large_dataset = open('datasets/rosalind_gc.txt').read()
 
    results = parse_fasta(small_dataset)
    results = dict([(k, gc_content(v)) for k, v in results.items()])
    # 这里iteritem()和item()功能是一样的
    # 前一个results输出，名称+序列，后一个results输出，名称+百分比
 
 
    highest_k = None
    highest_v = 0
 
    for k, v in results.items():
        if v > highest_v:
            highest_k = k
            highest_v = v
            # 输出GC含量高的
    print (highest_k)
    print ('%f%%' % highest_v)

6、计算点突变个数

with open('./6.txt', 'rt') as f:
    seq = f.readlines()
seq1,seq2 = seq[0].strip(),seq[1].strip()    
HD = 0
 
for i in range(len(seq1)):
    if seq1[i] != seq2[i]:
        HD += 1
         
print (HD)

s = 'GAGCCTACTAACGGGAT'
t = 'CATCGTAATGACGGCCT'
count = 0
for i in range(len(s)):
    if s[i] != t[i]:
        count +=1
print (count)

7、孟德尔第一定律

from scipy.misc import comb
  
individuals = input('Number of individuals(k,m,n):')
[k, m, n] = map(int,individuals.split(','))
t = k+m+n
 
rr = comb(n,2)/comb(t,2)
hh = comb(m,2)/comb(t,2)
hr = comb(n,1)*comb(m,1)/comb(t,2) 
 
prob = 1 - (rr+hh*1/4+hr*1/2)
print(rr, hh,hr)  
print (prob)

def f(x, y, z):
    s = x + y + z  # the sum of population
    c = s * (s - 1) / 2.0  # comb(2,s)
    p = 1 - (z * (z - 1) / 2 + 0.25 * y * (y - 1) / 2 + y * z * 0.5) / c
    return p
 
print (f(2, 2, 2))

8、RNA序列翻译为蛋白质

def translate_rna(sequence):
    codonTable = {
    'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACU':'T',
    'AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R',
    'CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P',
    'CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R',
    'GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A',
    'GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G',
    'UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S',
    'UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L',
    'UAC':'Y', 'UAU':'Y', 'UAA':'', 'UAG':'',
    'UGC':'C', 'UGU':'C', 'UGA':'', 'UGG':'W',
    }
    proteinsequence = ''
    for n in range(0,len(sequence),3):
            proteinsequence += codonTable[sequence[n:n+3]]
    return proteinsequence
 
se = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA' #sequence or se = open('rosalind_prot.txt').read().strip('\n') #sequence
print(translate_rna(se))

import re
def mRNA_protein(RNA_string):
    start_code = 'AUG'
    end_code = ['UAA', 'UAG', 'UGA']
    protein_table = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', \
                     'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', \
                     'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', \
                     'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', \
                     'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', \
                     'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', \
                     'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', \
                     'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', \
                     'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', \
                     'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', \
                     'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', \
                     'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', \
                     'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', \
                     'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', \
                     'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', \
                     'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
                     }
    #找到起始密码子的位置
    start_sit = re.search(start_code, RNA_string)
    protein = ''
    #按阅读框匹配蛋白质
    for sit in range(start_sit.start(), len(RNA_string), 3):
        protein = protein + protein_table[RNA_string[sit:sit+3]]
    return protein

if __name__ == '__main__':
    RNA_string =  'ACGGGGAUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'
    print(mRNA_protein(RNA_string))

import re
from collections import OrderedDict
 
codonTable = OrderedDict()
with open('rna_codon_table.txt') as f:
    for line in f:
        line = line.rstrip()
        lst = re.split('\s+', line)      #\s+ 匹配空格1次或无限次
        for i in [0, 2, 4, 6]:
            codonTable[lst[i]] = lst[i + 1]
rnaSeq = ''
with open('rosalind_prot.txt', 'rt') as f:
    for line in f:
        line = line.rstrip()
        rnaSeq += line.upper()
aminoAcids = []
i = 0
while i < len(rnaSeq):
    codon = rnaSeq[i:i + 3]
    if codonTable[codon] != 'Stop':
        aminoAcids.append(codonTable[codon])
    i += 3
    peptide = ''.join(aminoAcids)
print (peptide)

9、寻找DNA Motif

import re
matches = re.finditer('(?=ATAT)', 'GATATATGCATATACTT')
for match in matches:
    print (match.start()+1,end = '\t')

seq = 'GATATATGCATATACTT'
pattern = 'ATAT'
 
def find_motif(seq, pattern):
    position = []
    for i in range(len(seq) - len(pattern)):
        if seq[i:i + len(pattern)] == pattern:
            position.append(str(i + 1))
 
    print ('\t'.join(position))
 
 
find_motif(seq, pattern)

import re
seq='GATATATGCATATACTT'
print ([i.start()+1 for i in re.finditer('(?=ATAT)',seq)])

10、寻找共同祖先

import numpy as np
import os
from collections import Counter
fhand = open('./10.txt')
t = []
for line in fhand:
    if line.startswith('>'):
        continue
    else:
        line = line.rstrip()
    line_list = list(line)
    t.append(line_list)
a = np.array(t)#创建一个二维数组
print(a)
L1,L2,L3,L4 = [], [], [], []
comsquence=''
for i in range(a.shape[1]):
    l = [x[i] for x in a] #调出二维数组的每一列
    L1.append(l.count('A'))
    L2.append(l.count('C'))
    L3.append(l.count('T'))
    L4.append(l.count('G'))
    comsquence=comsquence+Counter(l).most_common()[0][0]
print (comsquence)
print ('A:',L1,'\n','C:',L2,'\n','T:',L3,'\n','G:',L4)

from collections import Counter
from collections import OrderedDict
 
fh = open('./11.txt', 'wt')
 
rosalind = OrderedDict()
seqLength = 0
with open('./10.txt') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            rosalindName = line
            rosalind[rosalindName] = ''
            continue
        rosalind[rosalindName] += line
    seqLength = len(rosalind[rosalindName])   #len(ATCCAGCT)
A, C, G, T = [], [], [], []
consensusSeq = ''
for i in range(seqLength):
    seq = ''
    for k in rosalind.keys():      # rosalind.keys = Rosalind_1...Rosalind_7
        seq += rosalind[k][i]       # 把 Rosalind_1...Rosalind_7 相同顺序的碱基加起来
    A.append(seq.count('A'))
    C.append(seq.count('C'))
    G.append(seq.count('G'))
    T.append(seq.count('T'))
    counts = Counter(seq)           # 为seq计数
    consensusSeq += counts.most_common()[0][0] 
    print(consensusSeq)#从多到少返回,是个list啊，只返回第一个
fh.write(consensusSeq + '\n')
fh.write('\n'.join(['A:\t' + '\t'.join(map(str, A)), 'C:\t' + '\t'.join(map(str, C)),
                    'G:\t' + '\t'.join(map(str, G)), 'T:\t' + '\t'.join(map(str, T))]))
                    #.join(map(str,A))  把 A=[5, 1, 0, 0, 5, 5, 0, 0] 格式转化成 A：5 1 0 0 5 5 0 0
fh.close()

12、重叠序列

from collections import OrderedDict
import re

def overlap_graph(dna,n):
    edges = []
    for key1, val1 in dna:
        for key2, val2 in dna:
            if key1 != key2 and val1[-n:] == val2[0:n]:
                edges.append(key1 +'\t' +key2)
    return edges

seq = OrderedDict()

with open ('./12.txt', 'r') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            seqName = re.sub('>','',line)
            seq[seqName] = ''
            continue
        seq[seqName] += line.upper()

fh = open('./12.1.txt', 'wt')

for x in overlap_graph(dna,3):
    fh.write(x + '\n')
fh.close()

seq_list = []
stseq = ''
for line in open('./12.txt','r'):
    if line[0] == '>':
        if stseq != '':
            seq_list.append([stname, stseq])
            stseq = ''
        stname = line[1:-1]
    else:
        stseq = stseq + line.strip('\n')
seq_list.append([stname, stseq])
l = len(seq_list)
 
for i in range(0, l):
    for j in range(0, i):
        if seq_list[i][1] == seq_list[j][1]:
            continue
        if seq_list[i][1][0:3] == seq_list[j][1][-3:]:
            print (seq_list[j][0], seq_list[i][0])
        if seq_list[i][1][-3:] == seq_list[j][1][0:3]:
            print (seq_list[i][0], seq_list[j][0])

13、计算后代的概率

a = int(input('AA-AA:'))
b = int(input('AA-Aa:'))
c = int(input('AA-aa:'))
d = int(input('Aa-Aa:'))
e = int(input('Aa-aa:'))
f = int(input('aa-aa:'))

def fun(a,b,c,d,e,f):
    x1 = 1*a
    x2 = 1*b
    x3 = 1*c
    x4 = 0.75*d
    x5 = 0.5*e
    x6 = 0*f
    return sum([x1,x2,x3,x4,x5,x6])*2

print(fun(a,b,c,d,e,f))

14、发现共享Motif

s = """>Rosalind_1
GATTACA
>Rosalind_2
TAGACCA
>Rosalind_3
ATACA""".split(">")[1:]

for i in range(len(s)):
    s[i] = s[i].replace("\n", '')
    while s[i][0] not in "ACGT":
        s[i] = s[i][1:]
# ^^^^^^^^^^^^^ all of that to format in FAST in array

#Get shortest of DNA strings
index = s.index(min(s, key=len))

motif = 'A'
shortest = s[index]

#cycle over the DNA string letters
for i in range(len(shortest)):
    n = 0
    present = True
    while present:
            #cycle inside over all other DNA strings and if it's present in there considered a motif and length gets increased by 1
        for each in s:
            if shortest[i:i+n] not in each or n>1000:
                present = False
                break
        if present:
            motif = max(shortest[i:i+n], motif, key=len)
        n += 1
print (motif)

def readfasta(filename, sample):
    fa = open('./14.txt', 'r')
    fo = open('./14.1.txt', 'w')
    res = {}
    rres = []
    ID = ''
    for line in fa:
        if line.startswith('>'):
            ID = line.strip('\n')
            res[ID] = ''
        else:
            res[ID] += line.strip('\n')
 
    for key in res.values():
        rres.append(key)
        fo.write(key + '\n')
    return rres
 
def fragement(seq_list):
    res = []
    seq = seq_list[0]
    for i in range(len(seq)):
        s_seq = seq[i:]
        #print s_seq
        for j in range(len(s_seq)):
            res.append(s_seq[:(len(s_seq) - j)])
            #print res
 
    return res
 
def main(infile, sample):
    seq_list = readfasta(infile, sample)   #['TAGACCA','ATACA','GATTACA']
    print(seq_list)
    frags = fragement(seq_list)
    frags.sort(key=len, reverse=True)     # 从长到短排列
    for i in range(len(frags)):
        ans = []
        # s = 0
        # m+=1
        # print(m)
        # res[frags[i]] = 0
        for j in seq_list:
            r = j.count(frags[i])
            if r != 0:
                ans.append(r)
        if len(ans) >= len(seq_list):
            print(frags[i])
            break
 
 
main('14.txt', 'sample.txt')

15、Independent Alleles

import itertools
def f(k,n):
    p = []
    child_num = 2**k
    for i in range(n):
        p.append(len(list(itertools.combinations([x for x in range(child_num)],i)))*(0.25**i)*(0.75**(child_num-i)))
        # combinations('ABCD', 2)       AB AC AD BC BD CD
    return 1-sum(p)
 
print (f(5,8))

16、Finding a Protein Motif

import urllib.request
import re
list = ['A2Z669','B5ZC00','P07204_TRBM_HUMAN','P20840_SAG1_YEAST']
 
for one in list:
    name = one.strip('\n')
    url = 'http://www.uniprot.org/uniprot/'+name+'.fasta'
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)   #在python3.3里面，用urllib.request代替urllib2
    the_page = response.read()
    start = the_page.decode().find('\nM')    #str→bytes：encode()方法。str通过encode()方法可以转换为bytes。
                                             # bytes→str：decode()方法。如果我们从网络或磁盘上读取了字节流，那么读到的数据就是bytes。
                                             #要把bytes变为str，就需要用decode()方法。
    seq = the_page[start+1:].decode().replace('\n','')
    seq = ' '+seq
    regex = re.compile(r'N(?=[^P][ST][^P])')
    index = 0
    out = []
    '''
    out = [m.start() for m in re.finditer(regex, seq)]
    '''
 
    index = 0
    while(index<len(seq)):
        index += 1
 
        if re.search(regex,seq[index:]) == None:
            break
 
 
        #print S[index:]
        if re.match(regex,seq[index:]) != None:
            out.append(index)
 
 
 
 
    if out != []:
        print (name)
        print (' '.join([ str(i) for i in out]))