mclust example1
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# n_features=2是默认的
plt.scatter(X[:,0], X[:,1])
plt.show()
def mclust(features, num_cluster, modelNames='EEE', random_seed=2020):
"""\
Clustering using the mclust algorithm.
The parameters are the same as those in the R package mclust.
"""
np.random.seed(random_seed)
import rpy2.robjects as robjects
robjects.r.library("mclust")
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
r_random_seed = robjects.r['set.seed']
r_random_seed(random_seed)
rmclust = robjects.r['Mclust']
res = rmclust(rpy2.robjects.numpy2ri.numpy2rpy(features), num_cluster, modelNames)
mclust_res = np.array(res[-2])
return mclust_res.astype(int)
label_mclust = mclust(X, num_cluster=4)
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
print("ARI = {}".format(adjusted_rand_score(label_mclust,y)))
print("NMI = {}".format(normalized_mutual_info_score(label_mclust,y)))
pred_y = label_mclust.copy()
fig=plt.figure()
for label in np.unique(pred_y):
plt.scatter(X[label==pred_y,0], X[label==pred_y,1],label=label)
plt.show()
结果如下
mclust (example 2)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
def mclust(features, num_cluster, modelNames='EEE', random_seed=2020):
"""\
Clustering using the mclust algorithm.
The parameters are the same as those in the R package mclust.
"""
np.random.seed(random_seed)
import rpy2.robjects as robjects
robjects.r.library("mclust")
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
r_random_seed = robjects.r['set.seed']
r_random_seed(random_seed)
rmclust = robjects.r['Mclust']
res = rmclust(rpy2.robjects.numpy2ri.numpy2rpy(features), num_cluster, modelNames)
mclust_res = np.array(res[-2])
return mclust_res.astype(int)
X, y = make_blobs(n_samples=1000, n_features=50,centers=5, random_state=0)
print(X.shape)
label_mclust = mclust(X, num_cluster=5)
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
print("ARI = {}".format(adjusted_rand_score(label_mclust,y)))
print("NMI = {}".format(normalized_mutual_info_score(label_mclust,y)))
mclust问题
stagate的默认代码是对30dim的数据进行聚类,所以并不出现问题
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
from sklearn.metrics.cluster import adjusted_rand_score
import STAGATE
#####################################################
data_root='/DATA2/zhangjingxiao/yxk/dataset/LIBD/'
section_id = '151676'
save_root="./LIBD_result/"
#####################################################
input_dir = os.path.join(data_root, section_id)
adata = sc.read_visium(path=input_dir, count_file='filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()
print(adata)
#Normalization
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=3000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.tl.pca(adata)
# read the annotation
Ann_df = pd.read_csv(data_root+"/"+section_id+ "/" + section_id +"_layer_guess.csv",index_col=0)
del Ann_df["barcode"]
del Ann_df["sample_name"]
Ann_df.columns = ['Ground Truth']
#Ann_df
adata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']
def mclust(features, num_cluster, modelNames='EEE', random_seed=2020):
"""\
Clustering using the mclust algorithm.
The parameters are the same as those in the R package mclust.
"""
np.random.seed(random_seed)
import rpy2.robjects as robjects
robjects.r.library("mclust")
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()
r_random_seed = robjects.r['set.seed']
r_random_seed(random_seed)
rmclust = robjects.r['Mclust']
res = rmclust(rpy2.robjects.numpy2ri.numpy2rpy(features), num_cluster, modelNames)
mclust_res = np.array(res[-2])
return mclust_res.astype(int)
features = adata.obsm["X_pca"][:,0:30]
label_mclust = mclust(features, num_cluster=7)
print(label_mclust)
但是奇怪的问题出现出现了
features = adata.obsm["X_pca"]
label_mclust = mclust(features, num_cluster=7)
print(label_mclust)
结果如下
所以目前只能默认对于50维的数据做不了,慢慢来