The sample data is in file “taylor2010_data.txt” included in this homework. This dataset has expression profiles of 23974 genes in 27 normal samples, 129 primary cancer samples, 18 metastasized cancer samples, and 5 unknown samples. Assume the data has been normalized and summarized to expression index.
Clustering (unsupervised learning)
1. Try hierarchical clustering average linkage on the samples (normal, primary, metastasized and unknown) using all genes, and plot the dendrogram. Hint: use functionsdist, hclust, plot
taylor = as.matrix(read.csv("taylor2010_data.txt", sep="\t",row.names=1,header=T))
In = grepl("N.P", colnames(taylor))
Ip = grepl("P.P", colnames(taylor))
Im = grepl("M.P", colnames(taylor))
Nn = sum(In)
Np = sum(Ip)
Nm = sum(Im)
expr <- read.table("week_5/taylor2010_data.txt",sep='\t',header=TRUE)
pcor <- cor(expr)
lower<-as.dist(1-pcor)
pdf('hierarchical_cluster.pdf',width=5,height=5)
hc_average<-hclust(lower,method=c("average"))
plot(hc_average,main="average",cex=0.2)
hc_complete<-hclust(lower,method=c("complete"))
plot(hc_complete,main="complete",cex=0.2)
hc_single<-hclust(lower,method=c("single"))
plot(hc_single,main="single",cex=0.2)
dev.off()
2. Do PCA biplot on the samples with all genes, and use 4 different colors to distinguish the 4 types of samples (normal, primary, metastasized and unknown). The samples from different groups look separable. Hint: use the PCA biplot R code from lab2, also function legend is useful.
pc.cr <- princomp(expr)
summary(pc.cr)
pdf('pca.pdf',width=5,height=5)
plot(pc.cr)
load<-loadings(pc.cr)
colors <- function(mol.biol) { if (substr(mol.biol,1,5)=="N.PAN") "#FF0000" else if (substr(mol.biol,1,5)=="P.PCA") "#0000FF" else if (substr(mol.biol,1,5)=="M.PCA") "Tomato" else "lawngreen"}
plot(load[,1:2],pch=16,col=unlist(lapply(colnames(expr), colors)))
legend("topright",c("normal", "primary", "metastasized","unknown"),col=c("#FF0000","#0000FF","Tomato","lawngreen"),lty=1,box.lty=0)
dev.off()
pc.cr <- prcomp(t(expr))
summary(pc.cr)
28.463% of variation in the data is captured in the first two principle components. We need 83 principle components do to capture 85% of the variation in the data.
3. For the 174 samples with known type (normal, primary, metastasized), use LIMMA to find the differentially expressed genes with fold change threshold 1.5, and adjusted p-value threshold 0.05. 946 differentially expressed genes are found.Hint: the design vector consists of type indicator for the 174 samples. For example, 0 for normal, 1 for primary, and 2 for metastasized.
expr<-read.table("/mnt/Storage/home/tuser/cailingling/week_5/taylor2010_data.txt",sep='\t',header=TRUE)
library(limma)
library(affy)
design <- c(rep(0,27),rep(1,129),rep(2,18))
design.mat <- model.matrix(~design)
fit <- lmFit(expr[1:174],design.mat)
fit <- eBayes(fit)
topTable(fit,coef = 'design')
difexp <- topTable(fit, n=nrow(fit),lfc = log2(1.5))
difexp <-difexp[difexp[,"P.Value"]<0.05,]
expr_difexp <- expr[rownames(difexp),]
> nrow(difexp)
[1] 946
4. Repeat the hierarchical clustering and PCA on all samples with the differentially expressed genes found in 3.
pcor <- cor(expr_difexp)
lower<-as.dist(1-pcor)
pdf('hierarchical_cluster_difexp.pdf',width=5,height=5)
hc_average<-hclust(lower,method=c("average"))
plot(hc_average,main="average",cex=0.2)
hc_complete<-hclust(lower,method=c("complete"))
plot(hc_complete,main="complete",cex=0.2)
hc_single<-hclust(lower,method=c("single"))
plot(hc_single,main="single",cex=0.2)
dev.off()
#use differentially expressed genes to do pca
pc.cr <- princomp(expr_difexp)
summary(pc.cr)
pdf('pca_difexp.pdf',width=5,height=5)
plot(pc.cr)
load<-loadings(pc.cr)
colors <- function(mol.biol) { if (substr(mol.biol,1,5)=="N.PAN") "#FF0000" else if (substr(mol.biol,1,5)=="P.PCA") "#0000FF" else if (substr(mol.biol,1,5)=="M.PCA") "Tomato" else "lawngreen"}
plot(load[,1:2],pch=16,col=unlist(lapply(colnames(expr), colors)))
legend("topright",c("normal", "primary", "metastasized","unknown"),col=c("#FF0000","#0000FF","Tomato","lawngreen"),lty=1,box.lty=0)
dev.off()
Based on the PCA biplot, I can classify the 5 unknown samples.2 are classfied into primary cancer samples. 3 are classfied into metastasized cancer samples.
5. Try k-means clustering on all samples using the differentially expressed genes.
Hint: function kmeans
# use differentially expressed genes to do kmeans cluster
km_3 <- kmeans(t(expr_difexp),3)
km_4 <- kmeans(t(expr_difexp),4)
pdf('kmeans_difexp_3.pdf',width=5,height=5)
plot(t(expr_difexp)[km_3$cluster==1,],col="red")
points(t(expr_difexp)[km_3$cluster==2,],col="blue")
points(t(expr_difexp)[km_3$cluster==3,],col="seagreen")
points(km_3$centers,pch=2,col="green")
dev.off()
pdf('kmeans_difexp_4.pdf',width=5,height=5)
plot(t(expr_difexp)[km_4$cluster==1,],col="red")
points(t(expr_difexp)[km_4$cluster==2,],col="blue")
points(t(expr_difexp)[km_4$cluster==3,],col="yellow")
points(t(expr_difexp)[km_4$cluster==4,],col="black")
points(km_4$centers,pch=2,col="green")
dev.off()
k = 3
k = 4
We can run hierarchical cluster to pick initial cluster centers. And we can randomly start many times with different initial centers.
Classification (supervised learning)
1. Use differentially expressed genes in the known samples to perform LDA, and predict the unknown samples. Hint: use MASS library and function lda.
expr_difexp_t <- data.frame(t(expr_difexp))
expr_difexp_t$sample<-c(rep("normal",27),rep("primary",129),rep("metastasized",18),rep("unknown",5))
library(MASS)
ldaout <- lda(sample~.data=expr_difexp_t[1:174,])
class.p <- predict(ldaout,newdata=expr_difexp_t[175:179,])$class
2. K-Nearest-Neighbor (KNN)
For observation X with unknown label, find the k observation in the training data closest ( e.g correlation ) to X. Predict the label of X based on majority vote by KNN. K can be determined by predictability of known samples ( semi-supervised ).Now I run KNN (try K = 1, 3 and 5) on all the genes and all the samples, and predict the unknown samples based on the 174 labeled samples. Hint: use library class and functionknn.
expr_t <- data.frame(t(expr))
expr_t$sample <- c(rep("normal",27),rep("primary",129),rep("metastasized",18),rep("unknown",5))
data_train <- expr_t[1:174,]
data_test <- expr_t[175:179,]
train_sample <- expr_t$sample[1:174]
#test_sample <- expr_difexp_t[175:179,ncol(expr_difexp_t)+1]
library(class)
ml_1 <- knn(t(exp_difexp[, 1:174]), t(exp_difexp[, 175:179]), c(rep("normal",Nn),rep("primary",Np),rep("metastasized",Nm)),k=1)
ml_3 <- knn(t(exp_difexp[, 1:174]), t(exp_difexp[, 175:179]), c(rep("normal",Nn),rep("primary",Np),rep("metastasized",Nm)),k=3)
ml_4 <- knn(t(exp_difexp[, 1:174]), t(exp_difexp[, 175:179]), c(rep("normal",Nn),rep("primary",Np),rep("metastasized",Nm)),k=4)
3. SVM
Run SVM (try a linear kernel) on all the genes and all the samples, and predict the unknown samples based on the 174 labeled samples. Hint: use library e1071 and function svm.
library('e1071')
model1 = svm(t(exp_diff[, 1:174]), c(rep("normal",Nn),rep("primary",Np),rep("metastasized",Nm)), type='C', kernel='linear')
predict(model1, t(exp_diff[, 175:179]))
4. K-fold cross-validation
Divide data into K subset ( equal size ), build classifier in ( K-1 ) subsets, compute error rate on left out subset. They will tell me how good is my method's sensitivity on accuracy.Implement a 5-fold cross validation to estimate the classification error rate of KNN and SVM, based on the 174 samples with known label. Here is the code.
exp <- read.table("taylor2010_data.txt",sep='\t',header=TRUE)
ex<-data.frame(t(exp[1:174]))
a <- sample(174,140)
rownames(ex)<-c(1:174)
train <- ex[a,]
test <- ex[-a,]
b<-rbind(a,rep("train",140))
c<-rbind(rownames(ex[-a,]),rep("test",34))
d<-cbind(b,c)
sort(as.numeric(d[1,]))
pred.knn <- class::knn(train=train,test=test,cl=ex$sample[a,],k=3)
accuracy.knn <- sum(pred.knn==ex$sample[-a,])/length(ex$sample[a,])
accuracy.knn
( To be continued ... )