Biopython -- Sequence motif analysis using Bio.motifs_motifs and alignment python-CSDN博客

本文链接：https://blog.csdn.net/jiangshandaiyou/article/details/121745393

Sequence motif analysis using Bio.motifs

Motif objects

# Sequence motif analysis using Bio.motifs
## Motif objects
# We can either create a Motif object from a list of instances of the motif, or we can obtain a Motif object by parsing a file 
# from a motif database or motif finding software

# Creating a motif from instances
from Bio import motifs
from Bio.Seq import Seq

instances = [Seq("TACAA"),Seq("TACGC"),Seq("TACAC"),Seq("TACCC"),Seq("AACCC"),Seq("AATGC"),Seq("AATGC"),]
m = motifs.create(instances) # then we can create a Motif object

len(m) # The length of the motif is defined as the sequence length, which should be the same for all instances
print(m.counts) # Motif object has an attribute .counts containing the counts of each nucleotide at each position

        0      1      2      3      4
A:   3.00   7.00   0.00   2.00   1.00
C:   0.00   0.00   5.00   2.00   6.00
G:   0.00   0.00   0.00   3.00   0.00
T:   4.00   0.00   2.00   0.00   0.00

m.counts["A"] # You can access these counts as a dictionary
m.counts["T", 0] #  you can also think of it as a 2D array with the nucleotide as the first dimension and the position as the second dimension
m.counts[:, 3]

# The motif has an associated consensus sequence, defined as the sequence of letters along the positions of the
# motif for which the largest value in the corresponding columns of the .counts matrix is obtained
m.consensus 

# as well as an anticonsensus sequence, corresponding to the smallest values in the columns of the .counts matrix
m.anticonsensus

# 避免某些位置有多个最大值或者最小值，可以使用模糊碱基代替相应的位置
m.degenerate_consensus

# 先取反向互补序列在取得motif也是不错的选择
# The reverse complement and the degenerate consensus sequence are only defined for DNA motifs
r = m.reverse_complement()

Creating a sequence logo

# Creating a sequence logo
m.weblogo("mymotif.png")

I/O function Reading motifs

# I/O function Reading motifs
# One of the most popular motif databases is JASPAR. In addition to the motif sequence information, the JASPAR database stores a lot of 
# meta-information for each motif. The module Bio.motifs contains a specialized class jaspar.Motif in which this meta-information is represented as attributes

# JASPAR stores motifs in several different ways including three different flat file formats and as an SQL
# database. All of these formats facilitate the construction of a counts matrix

The JASPAR sites format

# The JASPAR sites format
# This format does not store any meta information

from Bio import motifs

with open("Arnt.sites") as handle:
    arnt = motifs.read(handle, "sites")
    
for instance in arnt.instances:
    print(instance) # The instances from which this motif was created is stored in the .instances property
    
print(arnt.counts) # The counts matrix of this motif is automatically calculated from the instances

The JASPAR pfm format

# The JASPAR pfm format
# JASPAR also makes motifs available directly as a count matrix, without the instances from which it was
# created. This pfm format only stores the counts matrix for a single motif
# As with the instances file, no meta information is stored in this format

with open("SRF.pfm") as handle:
    srf = motifs.read(handle, "pfm")

print(srf.counts) # As this motif was created from the counts matrix directly, it has no instances associated with it

print(srf.counts.consensus)

The JASPAR format jaspar

# The JASPAR format jaspar
# The jaspar file format allows multiple motifs to be specified in a single file.In this format each of the motif records consist of a header 
# line followed by four lines defining the counts matrix. The header line begins with a > character (similar to the Fasta file format) 
# and is followed by the unique JASPAR matrix ID and the TF name

fh = open("jaspar_motifs.txt")
for m in motifs.parse(fh, "jaspar"))：
    print(m)

Accessing the JASPAR database

# Accessing the JASPAR database
# In addition to parsing these flat file formats, we can also retrieve motifs from a JASPAR SQL database
from Bio.motifs.jaspar.db import JASPAR5

JASPAR_DB_HOST = <hostname>
JASPAR_DB_NAME = <db_name>
JASPAR_DB_USER = <user>
JASPAR_DB_PASS = <passord>

jdb = JASPAR5(host=JASPAR_DB_HOST, name=JASPAR_DB_NAME,user=JASPAR_DB_USER,password=JASPAR_DB_PASS)

arnt = jdb.fetch_motif_by_id("MA0004") # Now we can fetch a single motif by its unique JASPAR ID with the fetch_motif_by_id method
print(arnt)

motifs = jdb.fetch_motifs_by_name("Arnt") # We can also fetch motifs by name

  File "/tmp/ipykernel_111/2144074979.py", line 5
    JASPAR_DB_HOST = <hostname>
                     ^
SyntaxError: invalid syntax

MEME

# MEME
# MEME is a tool for discovering motifs in a group of related DNA or protein sequences. It takes as input a group of DNA or protein sequences 
# and outputs as many motifs as requested. Therefore, in contrast to JASPAR files, MEME output files typically contain multiple motifs.

with open("meme.INO_up800.classic.oops.xml") as handle:
    record = motifs.parse(handle, "meme") # To parse this file (stored as meme.dna.oops.txt)
    
record.version
record.datafile
record.command
record.alphabet
record.sequences

motif = record[0]
print(motif.consensus)
print(motif.degenerate_consensus)
motif.num_occurrences
motif.length
motif.name
motif.id

Writing motifs

# Writing motifs
arnt.format("pfm") # use the format built-in function to write the motif in the simple JASPAR pfm format
arnt.format("jaspar")

# OR To write out multiple motifs, you can use motifs.write
two_motifs = [arnt, mef2a]
motifs.write(two_motifs, "jaspar") #

Position-Weight Matrices

# Position-Weight Matrices
# The .counts attribute of a Motif object shows how often each nucleotide appeared at each position along the alignment. We can normalize 
# this matrix by dividing by the number of instances in the alignment, resulting in the probability of each nucleotide at each position 
# along the alignment. We refer to these probabilities as the position-weight matrix

# Usually, pseudocounts are added to each position before normalizing. This avoids overfitting of the position-weight matrix to the 
# limited number of motif instances in the alignment, and can also prevent probabilities from becoming zero. To add a fixed pseudocount to 
# all nucleotides at all positions, specify a number for the pseudocounts argument

pwm = m.counts.normalize(pseudocounts=0.5)
print(pwm)

# Alternatively, pseudocounts can be a dictionary specifying the pseudocounts for each nucleotide. For example, as the GC content of the 
# human genome is about 40%, you may want to choose the pseudocounts accordingly:

pwm = m.counts.normalize(pseudocounts={"A":0.6, "C": 0.4, "G": 0.4, "T": 0.6})
print(pwm)

# attribute
pwm.consensus
pwm.anticonsensus
pwm.degenerate_consensus

Position-Specific Scoring Matrices

# Position-Specific Scoring Matrices
# Using the background distribution and PWM with pseudo-counts added, it’s easy to compute the log-odds ratios, telling us what are 
# the log odds of a particular symbol to be coming from a motif against the background.
# We can use the .log_odds() method on the position-weight matrix
pssm = pwm.log_odds() # This assumes that A, C, G, and T are equally likely in the background
print(pssm)

# Here we can see positive values for symbols more frequent in the motif than in the background and negative for symbols more frequent in the background

# To calculate the position-specific scoring matrix against a background with unequal probabilities for A, C, G, T, use the background argument
background = {"A":0.3,"C":0.2,"G":0.2,"T":0.3}
pssm = pwm.log_odds(background)
print(pssm)

Comparing motifs

# Comparing motifs
# Once we have more than one motif, we might want to compare them

# To align the motifs, we use ungapped alignment of PSSMs and substitute zeros for any missing columns at the beginning and end of the 
# matrices. This means that effectively we are using the background distribution for columns missing from the PSSM. The distance function 
# then returns the minimal distance between motifs, as well as the corresponding offset in their alignment

with open("REB1.pfm") as handle:
    m_reb1 = motifs.read(handle, "pfm")

m_reb1.consensus
m_reb1.counts

# To make the motifs comparable, we choose the same values for the pseudocounts and the background distribution as our motif m
m_reb1.pseudocounts = {"A":0.6, "C": 0.4, "G": 0.4, "T": 0.6}
m_reb1.background = {"A":0.3,"C":0.2,"G":0.2,"T":0.3}
pssm_reb1 = m_reb1.pssm
print(pssm_reb1)

# We’ll compare these motifs using the Pearson correlation. Since we want it to resemble a distance measure,
# we actually take 1 − r, where r is the Pearson correlation coefficient (PCC)
distance, offset = pssm.dist_pearson(pssm_reb1)
print("distance = %5.3g" % distance)
print(offset)