包的导入
import random
from Bio import SeqIO
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, classification_report
import re
import os
import sys
sys.path.append('/home/hanzequan/test_bectiral/rf_model/model1/algroasm')
from feature import com_seq_feature
负数据集的生成
def inter_group_disruption(seq, k=3):
"""随机交换序列中的子序列位置来生成负样本"""
subsequences = [seq[i:i+k] for i in range(0, len(seq), k)]
random.shuffle(subsequences)
return ''.join(subsequences)
def intra_group_disruption(seq, k=3):
"""随机交换子序列内的碱基位置来生成负样本"""
def shuffle_subsequence(subseq):
subseq_list = list(subseq)
random.shuffle(subseq_list)
return ''.join(subseq_list)
subsequences = [shuffle_subsequence(seq[i:i+k]) for i in range(0, len(seq), k)]
return ''.join(subsequences)
def find_noncoding_regions(gbk_record, seq_length):
"""识别非编码区域,并返回一个包含(start, end)元组的列表"""
noncoding_regions = []
coding_regions = [feature for feature in gbk_record.features if feature.type == "CDS"]
sorted_coding_regions = sorted(coding_regions, key=lambda x: x.location.start)
last_end = 0
for feature in sorted_coding_regions:
start = int(feature.location.start)
if last_end < start: # 找到非编码区域
noncoding_regions.append((last_end, start))
last_end = int(feature.location.end)
# 处理最后一个编码区域后的序列
if last_end < len(gbk_record.seq):
noncoding_regions.append((last_end, len(gbk_record.seq)))
# 筛选出长度至少为seq_length的非编码区域
suitable_regions = [region for region in noncoding_regions if region[1] - region[0] >= seq_length]
return suitable_regions
def random_interception_noncoding_extended(accession, seq_length, fasta_dir, gbk_dir):
gbk_path = f"{gbk_dir}/{accession}.gbk"
try:
gbk_record = next(SeqIO.parse(gbk_path, "genbank"))
genome_seq = str(gbk_record.seq)
noncoding_regions = find_noncoding_regions(gbk_record, seq_length)
if noncoding_regions:
selected_region = random.choice(noncoding_regions)
start_pos = random.randint(selected_region[0], selected_region[1] - seq_length)
end_pos = start_pos + seq_length
neg_seq = genome_seq[start_pos:end_pos]
return neg_seq, start_pos, end_pos
else:
# 如果没有足够长的非编码区域,随机选择起始位置并截取,可能涉及编码区
start_pos = random.randint(0, len(genome_seq) - seq_length)
end_pos = start_pos + seq_length
neg_seq = genome_seq[start_pos:end_pos]
return neg_seq, start_pos, end_pos
except Exception as e:
print(f"Error processing {accession}: {e}")
return None, None, None
def generate_wgri_negative_sample_with_start(seq_length, fasta_dir, accessions_list):
"""从整个基因组中随机截取相应长度的序列作为一个WGRI负样本,同时返回起始位置"""
accession = random.choice(accessions_list)
phage_fasta_path = f"{fasta_dir}/{accession}.fasta"
try:
seq_record = next(SeqIO.parse(phage_fasta_path, "fasta"))
genome_seq = str(seq_record.seq)
start_pos = random.randint(0, len(genome_seq) - seq_length)
neg_seq = genome_seq[start_pos:start_pos + seq_length]
return neg_seq, start_pos
except:
return None, None
def generate_negative_samples_for_missing_accessions(missing_accessions_list, fasta_dir, seq_length):
negative_samples = []
num_samples_list = [13, 13, 12] # 分配每个accession的样本数量
#共生成正负数据集的差个序列
for accession, num_samples_per_accession in zip(missing_accessions_list, num_samples_list):
for _ in range(num_samples_per_accession): # 对每个缺失的accession生成指定数量的负样本
neg_seq, start_pos = generate_wgri_negative_sample_with_start(seq_length, fasta_dir, [accession])
if neg_seq is not None:
end_pos = start_pos + seq_length - 1
negative_samples.append({
'accession': accession,
'start': start_pos,
'end': end_pos,
'sequence': neg_seq
})
return pd.DataFrame(negative_samples)
编码
def encode_sequences(sequences, max_len=99):
# 核苷酸编码,加上一个用于填充的0编码
nucleotide_to_int = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
encoded_seqs = []
for seq in sequences:
# 编码序列
encoded_seq = [nucleotide_to_int.get(nuc, 0) for nuc in seq]
# 如果序列长度小于max_len,则用0填充
padded_seq = encoded_seq + [0] * (max_len - len(encoded_seq))
encoded_seqs.append(padded_seq[:max_len]) # 确保序列长度不超过max_len
return np.array(encoded_seqs)
添加负数据
from Bio.Seq import Seq
from Bio import SeqIO
def calculate_end(row):
if row['stand'] == '+':
return row['start'] + len(row['sequence']) - 1
else:
return row['start'] - len(row['sequence']) + 1
# 反转互补的函数
def reverse_complement(seq):
return str(Seq(seq).reverse_complement())
# 延伸序列的函数,包括反转互补处理
def extend_sequence(row, fasta_folder):
fasta_path = f"{fasta_folder}/{row['accession']}.fasta"
record = SeqIO.read(fasta_path, "fasta")
if row['stand'] == '+':
start_pos = max(0, row['start'] - 11)
end_pos = min(len(record.seq), row['end'] + 10)
extended_sequence = str(record.seq[start_pos:end_pos])
else:
start_pos = max(0, row['end'] - 10)
end_pos = min(len(record.seq), row['start'] + 11)
extended_sequence = str(record.seq[start_pos:end_pos])
# 对负链数据进行反转互补处理
extended_sequence = reverse_complement(extended_sequence)
return extended_sequence
def extend_and_update(row, fasta_folder):
fasta_path = f"{fasta_folder}/{row['accession']}.fasta"
record = SeqIO.read(fasta_path, "fasta")
if row['stand'] == '+':
# 计算新的起始和终止位置
new_start = max(0, row['start'] - 11)
new_end = min(len(record.seq), row['end'] + 10)
extended_seq = record.seq[new_start:new_end]
else:
# 对于负链,延伸并反转互补
new_start = max(0, row['end'] - 10)
new_end = min(len(record.seq), row['start'] + 11)
extended_seq = record.seq[new_start:new_end].reverse_complement()
# 更新行信息
row['sequence'] = str(extended_seq)
row['start'] = new_start + 1 # BioPython 序列位置从 0 开始,但通常表示时从 1 开始
row['end'] = new_end
return row
将新建启动子数据构造负数据集
#将新建启动子数据构造负数据集
def generate_negative_sequence(seq):
# 随机选择一个 K 值
k = randint(2, 20)
# 将序列划分为长度为 K 的子序列
subseqs = [seq[i:i+k] for i in range(0, len(seq), k)]
# 随机交换这些子序列的位置
shuffle(subseqs)
# 将交换后的子序列重新组合成一个新的序列
negative_seq = ''.join(subseqs)
return negative_seq
def apply_negative_generation(df):
# 复制 DataFrame 以避免修改原始数据
negative_df = df.copy()
# 为每个序列生成负序列
negative_df['sequence'] = negative_df['sequence'].apply(generate_negative_sequence)
return negative_df
# 假设 freash_promoter_updated 是你的 DataFrame
# freash_promoter_updated = pd.DataFrame({
# 'sequence': ['AAGCTG', 'TTGCAG', ...], # 示例序列
# ...
# })
# # 应用函数生成负序列
# negative_sequences_df = apply_negative_generation(freash_promoter_updated)
# # 查看结果
# negative_sequences_df.head()
读取文件和交叉验证独立数据集,选择最好的模型参数
def read_fasta_extract_data(file_path, header_pattern):
"""
Read FASTA file and extract data based on a given header pattern.
Parameters:
- file_path: Path to the FASTA file.
- header_pattern: Compiled regular expression pattern for extracting data from the header.
Returns:
- DataFrame containing the extracted data.
"""
with open(file_path, 'r') as file:
content = file.read()
cleaned_content = content.replace('..', ',') # Replace '..' with ',' between start and end numbers
data = []
entries = cleaned_content.strip().split('>')[1:] # Skip the first empty entry if any
for entry in entries:
lines = entry.strip().split('\n')
header = lines[0]
sequence = ''.join(lines[1:]).strip() # Join the rest of the lines to form the sequence
match = header_pattern.search(header)
if match:
accession = match.group(1).strip()
start = match.group(2)
end = match.group(3)
data.append({
'accession': accession,
'start': start,
'end': end,
'sequence': sequence
})
return pd.DataFrame(data)
def predict_and_evaluate(model, sequences, true_labels, max_len=99):
"""
Predict and evaluate the model performance on given sequences.
Parameters:
- model: The trained model.
- sequences: Sequences to predict.
- true_labels: The true labels for the sequences.
- max_len: The maximum length for sequence encoding.
Returns:
- None, but prints the accuracy and classification report.
"""
# Encoding sequences and extracting handcrafted features
encoded_sequences = encode_sequences(sequences, max_len=max_len)
handcrafted_features = com_seq_feature(sequences)
# Combine encoded sequences and handcrafted features
X = np.hstack([encoded_sequences, handcrafted_features])
# Predict using the model
y_pred = model.predict(X)
# Evaluate the model
print("Accuracy:", accuracy_score(true_labels, y_pred))
print(classification_report(true_labels, y_pred))
def calculate_metrics(labels, y_pred, y_prob=None):
"""
Calculate and return requested metrics including Recall, Specificity (SPE), Precision (PRE),
F1 score, Accuracy (ACC), and AUC.
Parameters:
- labels: True labels.
- y_pred: Predicted labels.
- y_prob: Predicted probabilities for the positive class. Optional for AUC calculation.
Returns:
- A dictionary containing all the requested metrics.
"""
metrics = {}
# Calculate basic metrics
precision, recall, f1, _ = precision_recall_fscore_support(labels, y_pred, average='binary')
accuracy = accuracy_score(labels, y_pred)
# Calculate specificity
tn, fp, fn, tp = confusion_matrix(labels, y_pred).ravel()
specificity = tn / (tn + fp)
# Assign calculated metrics to the dictionary
metrics['Recall (Sensitivity)'] = recall
metrics['Specificity (SPE)'] = specificity
metrics['Precision (PRE)'] = precision
metrics['F1 Score'] = f1
metrics['Accuracy (ACC)'] = accuracy
# Calculate AUC if y_prob is provided
if y_prob is not None:
try:
auc = roc_auc_score(labels, y_prob)
metrics['AUC'] = auc
except ValueError:
metrics['AUC'] = None
else:
metrics['AUC'] = None
return metrics
负数据集生成
fasta_dir = "/home/public_new/dowlond_phage/phage_fasta"
gbk_dir = "/home/public_new/dowlond_phage/phage_gbk"
#负样本生成
negative_samples = []
for index, row in df_positive.iterrows():
accession = row['accession']
seq_length = len(row['sequence'])
neg_seq, start_pos, end_pos = random_interception_noncoding_extended(accession, seq_length, fasta_dir, gbk_dir)
if neg_seq:
negative_samples.append({
'accession': accession,
'sequence': neg_seq,
'start': start_pos,
'end': end_pos,
'label': 0 # 假设负样本的标签为0
})
df_negative = pd.DataFrame(negative_samples)
df_positive['label'] = 1 # 为正样本添加标签
df_combined = pd.concat([df_positive, df_negative], ignore_index=True)
建模
sequences = np.array(df_combined['sequence'])
# 使用com_seq_feature函数计算特征
handcrafted_features = com_seq_feature(sequences)
sequences = df_combined['sequence'].values
encoded_sequences = encode_sequences(sequences)
handcrafted_features = com_seq_feature(df_combined['sequence'])
# 确保encoded_sequences已经通过encode_sequences函数准备好
# 合并特征
X = np.hstack([encoded_sequences, handcrafted_features])
y = df_combined['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 初始化随机森林分类器
rf = RandomForestClassifier(n_estimators=500, random_state=42)
# 训练模型
rf.fit(X_train, y_train)
# 评估模型
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
迭代建模
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
# 假设read_fasta_extract_data, encode_sequences, com_seq_feature, calculate_metrics已定义
def process_data(positive_path, negative_path):
X_combined, labels = prepare_data(positive_path, negative_path)
X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=42)
return X_train, X_test, y_train, y_test
def optimize_model(X_train, y_train, param_grid):
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
return grid_search.best_estimator_
def evaluate_all_models(models, X_tests, y_tests):
"""
评估所有模型在各自数据集上的表现,并汇总结果。
"""
results = []
for model, X_test, y_test in zip(models, X_tests, y_tests):
metrics = evaluate_model(model, X_test, y_test)
results.append(metrics)
return pd.DataFrame(results)
# 数据集路径
path = '/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/'
datasets = [
(path+'independ_test_promoters1.fasta', path+'independent_neg1.txt'),
(path+'independ_test_promoters2.fasta', path+'independent_neg2.txt'),
(path+'independ_test_promoters3.fasta', path+'independent_neg3.txt')
]
# 参数网格
param_grid = {
'n_estimators': [100, 200,300,400,500],
'max_depth': [None, 10,20,30],
'min_samples_split': [2, 5,7,9],
'min_samples_leaf': [1, 2,3,4],
'max_features': ['sqrt','log2']
}
# 训练并优化每个数据集对应的模型
models = []
X_tests = []
y_tests = []
for positive, negative in datasets:
X_train, X_test, y_train, y_test = process_data(positive, negative)
best_model = optimize_model(X_train, y_train, param_grid)
models.append(best_model)
X_tests.append(X_test)
y_tests.append(y_test)
# 评估所有模型
metrics_df = evaluate_all_models(models, X_tests, y_tests)
print(metrics_df)
结果分析
# 假设已经定义了read_fasta_extract_data和其他相关函数
# 读取正类数据
positive_sequences = read_fasta_extract_data(
'/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/independ_test_promoters1.fasta',
header_pattern
)['sequence'].values
# 读取负类数据
# with open('/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/independent_neg3.txt', 'r') as file:
# negative_sequences = file.read().splitlines()
negative_sequences = read_fasta_extract_data(
'/home/hanzequan/test_bectiral/rf_model/model1/independ_test_data/independent_neg1.txt',
header_pattern
)['sequence'].values
# 合并序列和标签
sequences = np.concatenate([positive_sequences, negative_sequences])
labels = np.concatenate([np.ones(len(positive_sequences)), np.zeros(len(negative_sequences))])
# 整数编码和手工特征提取
encoded_sequences = encode_sequences(sequences, max_len=99)
handcrafted_features = com_seq_feature(sequences)
# 合并特征
X_combined = np.hstack([encoded_sequences, handcrafted_features])
# 使用随机森林模型进行预测
y_pred = best_model.predict(X_combined)
# # 评估模型
# print("Accuracy:", accuracy_score(labels, y_pred))
# print(classification_report(labels, y_pred))
# 使用训练好的模型对特征矩阵X_combined的数据进行概率预测
y_prob = grid_search.predict_proba(X_combined)[:, 1]
# 计算评估指标,包括AUC
metrics = calculate_metrics(labels, y_pred, y_prob)
# 转换为DataFrame并显示结果
metrics_df = pd.DataFrame([metrics])
metrics_df