病毒启动子预测--基于随机森林算法

gdgfdfj

已于 2024-03-29 14:03:20 修改

阅读量358

点赞数 3

文章标签：算法随机森林机器学习

于 2024-03-29 14:01:13 首次发布

本文链接：https://blog.csdn.net/gdgfdfj/article/details/137143075

版权

本文介绍了如何在噬菌体基因组数据分析中生成负样本，包括随机插入、子序列交换和非编码区域截取，以及使用sklearn库中的随机森林分类器进行模型训练、评估和参数优化的过程。

摘要由CSDN通过智能技术生成

包的导入

import random
from Bio import SeqIO
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, classification_report
import re
import os
import sys
sys.path.append('/home/hanzequan/test_bectiral/rf_model/model1/algroasm')
from feature import com_seq_feature

负数据集的生成

def inter_group_disruption(seq, k=3):
    """随机交换序列中的子序列位置来生成负样本"""
    subsequences = [seq[i:i+k] for i in range(0, len(seq), k)]
    random.shuffle(subsequences)
    return ''.join(subsequences)
def intra_group_disruption(seq, k=3):
    """随机交换子序列内的碱基位置来生成负样本"""
    def shuffle_subsequence(subseq):
        subseq_list = list(subseq)
        random.shuffle(subseq_list)
        return ''.join(subseq_list)
    
    subsequences = [shuffle_subsequence(seq[i:i+k]) for i in range(0, len(seq), k)]
    return ''.join(subsequences)

def find_noncoding_regions(gbk_record, seq_length):
    """识别非编码区域，并返回一个包含(start, end)元组的列表"""
    noncoding_regions = []
    coding_regions = [feature for feature in gbk_record.features if feature.type == "CDS"]
    sorted_coding_regions = sorted(coding_regions, key=lambda x: x.location.start)

    last_end = 0
    for feature in sorted_coding_regions:
        start = int(feature.location.start)
        if last_end < start:  # 找到非编码区域
            noncoding_regions.append((last_end, start))
        last_end = int(feature.location.end)

    # 处理最后一个编码区域后的序列
    if last_end < len(gbk_record.seq):
        noncoding_regions.append((last_end, len(gbk_record.seq)))
    
    # 筛选出长度至少为seq_length的非编码区域
    suitable_regions = [region for region in noncoding_regions if region[1] - region[0] >= seq_length]
    return suitable_regions

def random_interception_noncoding_extended(accession, seq_length, fasta_dir, gbk_dir):
    gbk_path = f"{gbk_dir}/{accession}.gbk"

    try:
        gbk_record = next(SeqIO.parse(gbk_path, "genbank"))
        genome_seq = str(gbk_record.seq)
        noncoding_regions = find_noncoding_regions(gbk_record, seq_length)
        
        if noncoding_regions:
            selected_region = random.choice(noncoding_regions)
            start_pos = random.randint(selected_region[0], selected_region[1] - seq_length)
            end_pos = start_pos + seq_length
            neg_seq = genome_seq[start_pos:end_pos]
            return neg_seq, start_pos, end_pos
        else:
            # 如果没有足够长的非编码区域，随机选择起始位置并截取，可能涉及编码区
            start_pos = random.randint(0, len(genome_seq) - seq_length)
            end_pos = start_pos + seq_length
            neg_seq = genome_seq[start_pos:end_pos]
            return neg_seq, start_pos, end_pos
    except Exception as e:
        print(f"Error processing {accession}: {e}")
    
    return None, None, None




def generate_wgri_negative_sample_with_start(seq_length, fasta_dir, accessions_list):
    """从整个基因组中随机截取相应长度的序列作为一个WGRI负样本，同时返回起始位置"""
    accession = random.choice(accessions_list)
    phage_fasta_path = f"{fasta_dir}/{accession}.fasta"
    try:
        seq_record = next(SeqIO.parse(phage_fasta_path, "fasta"))
        genome_seq = str(seq_record.seq)
        start_pos = random.randint(0, len(genome_seq) - seq_length)
        neg_seq = genome_seq[start_pos:start_pos + seq_length]
        return neg_seq, start_pos
    except:
        return None, None



def generate_negative_samples_for_missing_accessions(missing_accessions_list, fasta_dir, seq_length):
    negative_samples = []
    num_samples_list = [13, 13, 12]  # 分配每个accession的样本数量
    #共生成正负数据集的差个序列
    for accession, num_samples_per_accession in zip(missing_accessions_list, num_samples_list):
        for _ in range(num_samples_per_accession):  # 对每个缺失的accession生成指定数量的负样本
            neg_seq, start_pos = generate_wgri_negative_sample_with_start(seq_length, fasta_dir, [accession])
            if neg_seq is not None:
                end_pos = start_pos + seq_length - 1
                negative_samples.append({
                    'accession': accession,
                    'start': start_pos,
                    'end': end_pos,
                    'sequence': neg_seq
                })
    
    return pd.DataFrame(negative_samples)

编码

def encode_sequences(sequences, max_len=99):
    # 核苷酸编码，加上一个用于填充的0编码
    nucleotide_to_int = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
    encoded_seqs = []
    for seq in sequences:
        # 编码序列
        encoded_seq = [nucleotide_to_int.get(nuc, 0) for nuc in seq]
        # 如果序列长度小于max_len，则用0填充
        padded_seq = encoded_seq + [0] * (max_len - len(encoded_seq))
        encoded_seqs.append(padded_seq[:max_len])  # 确保序列长度不超过max_len
    return np.array(encoded_seqs)

添加负数据

from Bio.Seq import Seq
from Bio import SeqIO
def calculate_end(row):
    if row['stand'] == '+':
        return row['start'] + len(row['sequence']) - 1
    else:
        return row['start'] - len(row['sequence']) + 1

# 反转互补的函数
def reverse_complement(seq):
    return str(Seq(seq).reverse_complement())

# 延伸序列的函数，包括反转互补处理
def extend_sequence(row, fasta_folder):
    fasta_path = f"{fasta_folder}/{row['accession']}.fasta"
    record = SeqIO.read(fasta_path, "fasta")
    
    if row['stand'] == '+':
        start_pos = max(0, row['start'] - 11)
        end_pos = min(len(record.seq), row['end'] + 10)
        extended_sequence = str(record.seq[start_pos:end_pos])
    else:
        start_pos = max(0, row['end'] - 10)
        end_pos = min(len(record.seq), row['start'] + 11)
        extended_sequence = str(record.seq[start_pos:end_pos])
        # 对负链数据进行反转互补处理
        extended_sequence = reverse_complement(extended_sequence)
    
    return extended_sequence
def extend_and_update(row, fasta_folder):
    fasta_path = f"{fasta_folder}/{row['accession']}.fasta"
    record = SeqIO.read(fasta_path, "fasta")
    
    if row['stand'] == '+':
        # 计算新的起始和终止位置
        new_start = max(0, row['start'] - 11)
        new_end = min(len(record.seq), row['end'] + 10)
        extended_seq = record.seq[new_start:new_end]
    else:
        # 对于负链，延伸并反转互补
        new_start = max(0, row['end'] - 10)
        new_end = min(len(record.seq), row['start'] + 11)
        extended_seq = record.seq[new_start:new_end].reverse_complement()
    
    # 更新行信息
    row['sequence'] = str(extended_seq)
    row['start'] = new_start + 1  # BioPython 序列位置从 0 开始，但通常表示时从 1 开始
    row['end'] = new_end
    return row

将新建启动子数据构造负数据集

#将新建启动子数据构造负数据集
def generate_negative_sequence(seq):
    # 随机选择一个 K 值
    k = randint(2, 20)
    # 将序列划分为长度为 K 的子序列
    subseqs = [seq[i:i+k] for i in range(0, len(seq), k)]
    # 随机交换这些子序列的位置
    shuffle(subseqs)
    # 将交换后的子序列重新组合成一个新的序列
    negative_seq = ''.join(subseqs)
    return negative_seq

def apply_negative_generation(df):
    # 复制 DataFrame 以避免修改原始数据
    negative_df = df.copy()
    # 为每个序列生成负序列
    negative_df['sequence'] = negative_df['sequence'].apply(generate_negative_sequence)
    return negative_df

# 假设 freash_promoter_updated 是你的 DataFrame
# freash_promoter_updated = pd.DataFrame({
#     'sequence': ['AAGCTG', 'TTGCAG', ...],  # 示例序列
#     ...
# })

# # 应用函数生成负序列
# negative_sequences_df = apply_negative_generation(freash_promoter_updated)

# # 查看结果
# negative_sequences_df.head()

读取文件和交叉验证独立数据集，选择最好的模型参数



def read_fasta_extract_data(file_path, header_pattern):
    """
    Read FASTA file and extract data based on a given header pattern.
    
    Parameters:
    - file_path: Path to the FASTA file.
    - header_pattern: Compiled regular expression pattern for extracting data from the header.
    
    Returns:
    - DataFrame containing the extracted data.
    """
    with open(file_path, 'r') as file:
        content = file.read()
    
    cleaned_content = content.replace('..', ',')  # Replace '..' with ',' between start and end numbers
    data = []
    entries = cleaned_content.strip().split('>')[1:]  # Skip the first empty entry if any

    for entry in entries:
        lines = entry.strip().split('\n')
        header = lines[0]
        sequence = ''.join(lines[1:]).strip()  # Join the rest of the lines to form the sequence
        match = header_pattern.search(header)
        
        if match:
            accession = match.group(1).strip()
            start = match.group(2)
            end = match.group(3)
            data.append({
                'accession': accession,
                'start': start,
                'end': end,
                'sequence': sequence
            })
    
    return pd.DataFrame(data)

def predict_and_evaluate(model, sequences, true_labels, max_len=99):
    """
    Predict and evaluate the model performance on given sequences.
    
    Parameters:
    - model: The trained model.
    - sequences: Sequences to predict.
    - true_labels: The true labels for the sequences.
    - max_len: The maximum length for sequence encoding.
    
    Returns:
    - None, but prints the accuracy and classification report.
    """
    # Encoding sequences and extracting handcrafted features
    encoded_sequences = encode_sequences(sequences, max_len=max_len)
    handcrafted_features = com_seq_feature(sequences)
    
    # Combine encoded sequences and handcrafted features
    X = np.hstack([encoded_sequences, handcrafted_features])
    
    # Predict using the model
    y_pred = model.predict(X)
    
    # Evaluate the model
    print("Accuracy:", accuracy_score(true_labels, y_pred))
    print(classification_report(true_labels, y_pred))


def calculate_metrics(labels, y_pred, y_prob=None):
    """
    Calculate and return requested metrics including Recall, Specificity (SPE), Precision (PRE),
    F1 score, Accuracy (ACC), and AUC.
    
    Parameters:
    - labels: True labels.
    - y_pred: Predicted labels.
    - y_prob: Predicted probabilities for the positive class. Optional for AUC calculation.
    
    Returns:
    - A dictionary containing all the requested metrics.
    """
    metrics = {}
    
    # Calculate basic metrics
    precision, recall, f1, _ = precision_recall_fscore_support(labels, y_pred, average='binary')
    accuracy = accuracy_score(labels, y_pred)
    
    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(labels, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    # Assign calculated metrics to the dictionary
    metrics['Recall (Sensitivity)'] = recall
    metrics['Specificity (SPE)'] = specificity
    metrics['Precision (PRE)'] = precision
    metrics['F1 Score'] = f1
    metrics['Accuracy (ACC)'] = accuracy
    
    # Calculate AUC if y_prob is provided
    if y_prob is not None:
        try:
            auc = roc_auc_score(labels, y_prob)
            metrics['AUC'] = auc
        except ValueError:
            metrics['AUC'] = None
    else:
        metrics['AUC'] = None
    
    return metrics

负数据集生成

fasta_dir = "/home/public_new/dowlond_phage/phage_fasta"
gbk_dir = "/home/public_new/dowlond_phage/phage_gbk"
#负样本生成
negative_samples = []

for index, row in df_positive.iterrows():
    accession = row['accession']
    seq_length = len(row['sequence'])
    neg_seq, start_pos, end_pos = random_interception_noncoding_extended(accession, seq_length, fasta_dir, gbk_dir)
    if neg_seq:
        negative_samples.append({
            'accession': accession,
            'sequence': neg_seq,
            'start': start_pos,
            'end': end_pos,
            'label': 0  # 假设负样本的标签为0
        })

df_negative = pd.DataFrame(negative_samples)
df_positive['label'] = 1  # 为正样本添加标签

df_combined = pd.concat([df_positive, df_negative], ignore_index=True)

建模

sequences = np.array(df_combined['sequence'])

# 使用com_seq_feature函数计算特征
handcrafted_features = com_seq_feature(sequences)
sequences = df_combined['sequence'].values
encoded_sequences = encode_sequences(sequences)
handcrafted_features = com_seq_feature(df_combined['sequence'])

# 确保encoded_sequences已经通过encode_sequences函数准备好
# 合并特征
X = np.hstack([encoded_sequences, handcrafted_features])
y = df_combined['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化随机森林分类器
rf = RandomForestClassifier(n_estimators=500, random_state=42)

# 训练模型
rf.fit(X_train, y_train)

# 评估模型
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

迭代建模

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# 假设read_fasta_extract_data, encode_sequences, com_seq_feature, calculate_metrics已定义

def process_data(positive_path, negative_path):
    X_combined, labels = prepare_data(positive_path, negative_path)
    X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

def optimize_model(X_train, y_train, param_grid):
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=2, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

def evaluate_all_models(models, X_tests, y_tests):
    """
    评估所有模型在各自数据集上的表现，并汇总结果。
    """
    results = []
    for model, X_test, y_test in zip(models, X_tests, y_tests):
        metrics = evaluate_model(model, X_test, y_test)
        results.append(metrics)
    return pd.DataFrame(results)

# 数据集路径
path = '/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/'
datasets = [
    (path+'independ_test_promoters1.fasta', path+'independent_neg1.txt'),
    (path+'independ_test_promoters2.fasta', path+'independent_neg2.txt'),
    (path+'independ_test_promoters3.fasta', path+'independent_neg3.txt')
]

# 参数网格
param_grid = {
    'n_estimators': [100, 200,300,400,500],
    'max_depth': [None, 10,20,30],
    'min_samples_split': [2, 5,7,9],
    'min_samples_leaf': [1, 2,3,4],
    'max_features': ['sqrt','log2']
}

# 训练并优化每个数据集对应的模型
models = []
X_tests = []
y_tests = []

for positive, negative in datasets:
    X_train, X_test, y_train, y_test = process_data(positive, negative)
    best_model = optimize_model(X_train, y_train, param_grid)
    models.append(best_model)
    X_tests.append(X_test)
    y_tests.append(y_test)

# 评估所有模型
metrics_df = evaluate_all_models(models, X_tests, y_tests)
print(metrics_df)

结果分析

# 假设已经定义了read_fasta_extract_data和其他相关函数

# 读取正类数据
positive_sequences = read_fasta_extract_data(
    '/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/independ_test_promoters1.fasta',
    header_pattern
)['sequence'].values

# 读取负类数据
# with open('/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/independent_neg3.txt', 'r') as file:
#     negative_sequences = file.read().splitlines()
negative_sequences = read_fasta_extract_data(
    '/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/independent_neg1.txt',
    header_pattern
)['sequence'].values
# 合并序列和标签
sequences = np.concatenate([positive_sequences, negative_sequences])
labels = np.concatenate([np.ones(len(positive_sequences)), np.zeros(len(negative_sequences))])

# 整数编码和手工特征提取
encoded_sequences = encode_sequences(sequences, max_len=99)
handcrafted_features = com_seq_feature(sequences)

# 合并特征
X_combined = np.hstack([encoded_sequences, handcrafted_features])

# 使用随机森林模型进行预测
y_pred = best_model.predict(X_combined)

# # 评估模型
# print("Accuracy:", accuracy_score(labels, y_pred))
# print(classification_report(labels, y_pred))
# 使用训练好的模型对特征矩阵X_combined的数据进行概率预测
y_prob = grid_search.predict_proba(X_combined)[:, 1]
# 计算评估指标，包括AUC
metrics = calculate_metrics(labels, y_pred, y_prob)

# 转换为DataFrame并显示结果
metrics_df = pd.DataFrame([metrics])

metrics_df