feature_selection目录下的代码
在终端输入
python -u ffs_final.py "./data/Final_sample_dataset_v1.csv" "./data/Pairwise_featcorr_0.8.pkl" 3 5 "all" "./output"
运行后显示报错。
原因是数据量太大,因此需要修改代码。
将 ffs_final.py 修改为如下代码后重新运行:
import csv
import itertools
import math
import os
import pickle
import sys
from itertools import combinations
from multiprocessing import Pool

import pandas as pd
from scipy.stats import pearsonr
from sklearn import linear_model
# Positional command-line arguments (all required):
#   1. data       - path to the feature table
#   2. two_combos - path to a pickle of allowed feature pairs
#   3. nomit      - number of leading columns to skip when pass_feat is "all"
#   4. n_feat     - number of features per combination
#   5. pass_feat  - "all", or path to a text file with one feature name per line
#   6. outpath    - output location for the result log
if len(sys.argv) < 7:
    # Fail with a readable usage message instead of a bare IndexError.
    sys.exit(
        "Usage: python ffs_final.py <data> <pair_corr_pickle> <nomit> "
        "<n_feat> <pass_feat|all> <outpath>"
    )

data = sys.argv[1]
two_combos = sys.argv[2]
nomit = int(sys.argv[3])
n_feat = int(sys.argv[4])
pass_feat = sys.argv[5]
outpath = sys.argv[6]

# NOTE(review): the table is parsed as tab-separated with a header row;
# confirm this matches the delimiter actually used in the data file.
df = pd.read_csv(data, sep='\t', header=0)

feats_final = []
if pass_feat == "all":
    # Use every column except the first `nomit` columns and the last column.
    feats_final = list(df.columns)[nomit:-1]
else:
    with open(pass_feat) as feats:
        # One feature name per line. Blank lines are skipped so that an empty
        # string never ends up being used as a column name later on.
        feats_final = [line.strip() for line in feats if line.strip()]
print("Total no. of features = " + str(len(feats_final)))

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only pass pickle files that come from a trusted source.
with open(two_combos, 'rb') as f:
    pair_corrs = pickle.load(f)
# A set gives constant-time membership tests for the pair lookups.
pair_corrs = set(pair_corrs)
def regression_model(data, combination, y):
    """Fit a linear regression on the selected columns and score it.

    Args:
        data: Table that can be indexed with a list of column names (the
            script passes the loaded pandas DataFrame).
        combination: Iterable of column names to use as predictors.
        y: Target values to regress on.

    Returns:
        A tuple ``(combination, pearson_corr, coefficients)``, where
        ``pearson_corr`` is the Pearson correlation between ``y`` and the
        model's predictions on the same rows it was fitted on, and
        ``coefficients`` is the fitted model's ``coef_`` attribute.
    """
    predictors = data[list(combination)]
    # fit() returns the estimator itself, so construction and fitting chain.
    fitted = linear_model.LinearRegression().fit(predictors, y)
    # Only the correlation coefficient is kept; the p-value is discarded.
    correlation = pearsonr(y, fitted.predict(predictors))[0]
    return combination, correlation, fitted.coef_
def find_pass_combos(combination, pair_corrs, data, y):
    """Run the regression only if every feature pair is in ``pair_corrs``.

    Args:
        combination: Tuple of feature names to evaluate.
        pair_corrs: Container of allowed 2-tuples of feature names; it is
            queried with ``in`` once per pair.
        data: Table forwarded unchanged to ``regression_model``.
        y: Target values forwarded unchanged to ``regression_model``.

    Returns:
        The result of ``regression_model(data, combination, y)`` when all
        pairs drawn from ``combination`` are present in ``pair_corrs``,
        otherwise ``None``.
    """
    # NOTE(review): membership is order-sensitive, so (a, b) only matches if
    # pair_corrs stores the pair in that same orientation. Confirm against
    # the code that produced the pickle.
    # all() stops at the first missing pair, which matters because this
    # function is called once for every candidate combination.
    if all(pair in pair_corrs for pair in combinations(combination, 2)):
        return regression_model(data, combination, y)
    return None
# Regression target: the "pKd" column of the loaded table. It is passed as
# `y` to every model fit performed below.
y = df["pKd"]
# Define generator function to yield batches of combinations
def batch_combinations(features, batch_size):
    """Yield consecutive batches of at most ``batch_size`` items.

    Args:
        features: Either a sliceable sequence (list, tuple, ...) or any other
            iterable, including one-shot iterators and generators.
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        For a sliceable sequence, slices ``features[i:i + batch_size]`` (same
        type as the input). For any other iterable, lists of up to
        ``batch_size`` items, produced lazily so the input never has to be
        held in memory as a whole.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if hasattr(features, "__len__") and hasattr(features, "__getitem__"):
        # Sequence path: slice in place, preserving the input's type.
        for start in range(0, len(features), batch_size):
            yield features[start:start + batch_size]
        return

    # Streaming path: pull batch_size items at a time from the iterator.
    stream = iter(features)
    while True:
        batch = list(itertools.islice(stream, batch_size))
        if not batch:
            return
        yield batch
# Process combinations in batches
batch_size = 1000  # Adjust this according to your memory capacity
# The range covers exactly one value, n_feat.
for n in range(n_feat, n_feat + 1):
    # Count the combinations arithmetically instead of materialising them:
    # holding every combination in a set (and again in a list) is what
    # exhausts memory on large feature sets.
    # NOTE: this equals the number of combinations processed as long as the
    # feature names in feats_final are unique.
    n_combos = math.comb(len(feats_final), n)
    print("No. of possible {} feature combinations: {}".format(n, n_combos))

    # Join with a path separator so that an outpath without a trailing slash
    # (e.g. "./output") still places the log inside that directory.
    os.makedirs(outpath, exist_ok=True)
    log_path = os.path.join(
        outpath, "Aptamers_best_" + str(n) + "_feature_combos.log"
    )

    # combinations() is lazy; islice pulls one batch at a time, so only
    # batch_size combinations are in memory at once and the processing order
    # is deterministic.
    combo_stream = combinations(feats_final, n)
    while True:
        feat_batch = list(itertools.islice(combo_stream, batch_size))
        if not feat_batch:
            break

        results = []
        for combination in feat_batch:
            result = find_pass_combos(combination, pair_corrs, df, y)
            # None means at least one feature pair was not in pair_corrs.
            if result is not None:
                results.append(result)

        # Append after every batch so finished work is already on disk if a
        # later batch fails.
        with open(log_path, 'a') as f:
            for feat_combo, corr, coef in results:
                f.write('\t'.join(feat_combo) + "\t" + str(corr) + "\t" + str(coef) + "\n")
        print("Processed {} combinations".format(len(feat_batch)))
    print(str(n) + " feature combinations done.")