-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.R
136 lines (107 loc) · 6.69 KB
/
analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# box link to download "input_data" folder containing input data files used for the original analysis:
## https://bcm.box.com/s/5lxqk347vvwhkmeh34ml7jybejrpmh9y
# box link to download "ClusteringTree" folder containing clustering tree results from the original analysis:
## https://bcm.box.com/s/y0accowv3zm3j8a72nokwfpkcsc8ty6v
# reading in discovery cohort metadata
metadf <- openxlsx::read.xlsx("./input_data/Metadata-AllBalanced-MA.xlsx")
# reading in discovery cohort combat normalized expression data
exprsdf_cbat_ma <- readRDS("./input_data/exprs_cbat-AllBalanced-MA.rds")
# creating a new metadata dataframe with sample ID, test or control group, and batch number
metadata <- data.frame("ID"=metadf$GSM.code[match(colnames(exprsdf_cbat_ma),metadf$GSM.code)],
"Group"=metadf$Group[match(colnames(exprsdf_cbat_ma),metadf$GSM.code)],
"Set"=metadf$Batch[match(colnames(exprsdf_cbat_ma),metadf$GSM.code)],
stringsAsFactors = F)
# adding GSEs as batch names
metadata$Set <- unname(sapply(metadata$ID, function(x){metadf$GSE[metadf$GSM.code==x]}))
# reorder samples to match sample order between columns of expression data and rows of metadata
exprsdf_cbat_ma <- exprsdf_cbat_ma[, metadata$ID]
identical(colnames(exprsdf_cbat_ma), metadata$ID)
# clustering Tree
source("R/ClusteringTree.R")
# generating the clustering tree
ClusteringTree(outdir = "./ClusteringTree/", exprs_df = exprsdf_cbat_ma, meta_df = metadata, res.seq = seq(0,1,0.2), initrun = T)
# loading in clustering information
my_obj <- readRDS("./ClusteringTree/FullSeuratObj_0-1-0.2.rds")
clusteringdf <- [email protected]
# loading UQ normalized expression data from the Borstel cohort (validation cohort)
exprsdf_uq_bor <- readRDS("./input_data/Exprs_UQ-combined.rds")
# loading UQ normalized RNA-Seq expression data (validation cohort)
exprsdf_uq_rna <- readRDS("./input_data/exprs_UQ-AllBalanced-RNASeq.rds")
# reducing discovery cohort to genes common between discovery, borstel and rna-seq cohorts
exprsdf_training <- exprsdf_cbat_ma[rownames(exprsdf_cbat_ma) %in% rownames(exprsdf_uq_bor), ]
exprsdf_training <- exprsdf_training[rownames(exprsdf_training) %in% rownames(exprsdf_uq_rna), ]
identical(colnames(exprsdf_training), metadata$ID)
# z-score normalization
source("./R/Zscore_normalization_by_controls.R")
exprsdf_training <- zscore_norm_by_ctrls(exprsdf = exprsdf_training, metadf = metadata)
# using cluster assignment for samples at resolution 0.4
cluster.list <- clusteringdf[,"RNA_snn_res.0.4"]
metadata$Cluster <- sapply(metadata$ID,
function(x){
if(x %in% rownames(clusteringdf)){as.numeric(cluster.list[rownames(clusteringdf)==x])-1}else{99}
})
exprsdf_training_tb <- data.frame(Cluster=as.factor(metadata$Cluster[metadata$Cluster!=99]), t(exprsdf_training[, metadata$Cluster!=99]))
# RF training
# create classification output directory if it does not exist
outdir <- "./Classification_0.4/"
if (!dir.exists(outdir)){dir.create(outdir, recursive = TRUE)}
library(randomForest)
# myRF <- randomForest(Cluster~., data = exprsdf_training_tb, importance = T)
#
# myimp <- data.frame(Gene=row.names(myRF$importance), myRF$importance, stringsAsFactors = F)
# myimp <- myimp[order(myimp$MeanDecreaseGini, decreasing = T), ]
# write.xlsx(myimp[,-c(2,3)], "./Classification_0.4/RF_Importancedf_AllBalanced_0.4_commonDBV.xlsx", rownames=F)
myimp <- read.xlsx("./Classification_0.4/RF_Importancedf_AllBalanced_0.4_commonDBV.xlsx")
# # building RF models with a range of number of genes and saving their OOB error rate estimate
# myseq <- c(seq(10,100,10), 200, 500, seq(1000,ncol(exprsdf_training_tb)-1,1000), ncol(exprsdf_training_tb)-1)
# oob.err <- c()
# allRFmodels <- list()
#
# for (numgenes in myseq) {
# myRF <- randomForest(Cluster ~ ., data = exprsdf_training_tb[,c("Cluster", myimp$Gene[1:numgenes])], importance = T)
# oob.err <- c(oob.err,mean(predict(myRF)!= exprsdf_training_tb$Cluster))
# allRFmodels[[numgenes]] <- myRF
# }
# ooberrdf <- data.frame("Genes"=myseq,"OOBErrorRate"=paste0(round(oob.err*100, 2),"%"), row.names = NULL)
# ooberrdf$Rank <- rank(oob.err, ties.method = "min")
# write.xlsx(ooberrdf,"./Classification_0.4/OOBerrorrate.xlsx", overwrite = T)
# # after investigating OOBerrorrate.xlsx, we pick the 50 gene model as the best one to move forward with
# # running validation on rna-seq and borstel data using this 50 gene RF classifier model
# numgenes <- 50
#
# myRF <- allRFmodels[[numgenes]]
# saveRDS(myRF, paste0("./Classification_0.4/RF_Classifier_",numgenes,"genes_AllBalanced_0.4_commonDBV.rds"))
# loading borstel metadata
metadata_bor <- read.xlsx("./input_data/meta_Borstel_TB_HC_Combined-Batch.xlsx")
metadata_bor$Set <- metadata_bor$Set %>% gsub(1, "GIC",.) %>% gsub(2, "GVC",.) %>% gsub(3, "RVC",.) %>% as.factor()
identical(colnames(exprsdf_uq_bor), metadata_bor$ID)
# loading rna-seq metadata
metadf_rna <- read.xlsx("./input_data/Metadata-AllBalanced-RNASeq.xlsx")
metadata_rna <- data.frame("ID"=metadf_rna$GSM.code[match(colnames(exprsdf_uq_rna),metadf_rna$GSM.code)],
"Group"=metadf_rna$Group[match(colnames(exprsdf_uq_rna),metadf_rna$GSM.code)],
"Set"=metadf_rna$Batch[match(colnames(exprsdf_uq_rna),metadf_rna$GSM.code)])
metadata_rna$Set <- as.factor(metadata_rna$Set)
identical(colnames(exprsdf_uq_rna), metadata_rna$ID)
# validation - borstel
# Loading RF classifier model
myRF <- readRDS(paste0("./Classification_0.4/RF_Classifier_",numgenes,"genes_AllBalanced_0.4_commonDBV.rds"))
myRFpred <- function(exprs_df, meta_df, cohort, numgenes) {
source("./R/Zscore_normalization_by_controls.R")
topgenes <- myimp$Gene[1:numgenes]
# Z-Score Normalization
# range(exprsdf)
testingdf <- zscore_norm_by_ctrls(exprs_df, meta_df)
# range(exprsdf)
## Restricting testingdf to top 50 genes and picking out only Test samples
## Check the conversion of "-" to "." in gene names to keep uniformity
testingdf <- t(testingdf[topgenes,meta_df$Group=="Test"])
# Testing
myprob_RF <- predict(myRF, testingdf, type = "prob")
# table(round(myprob_RF[,2]))
resultsdf <- data.frame(Sample=row.names(myprob_RF), Cluster=round(myprob_RF[,2]))
meta_df$Cluster <- sapply(meta_df$ID,function(x){ifelse(x %in% resultsdf$Sample, resultsdf$Cluster[resultsdf$Sample==x], 99)})
table(meta_df$Group,meta_df$Cluster)
write.table(meta_df, paste0("./Classification_0.4/RF_FullClusterPred_",numgenes,"genes_",cohort,"_AllBalanced_0.4_commonDBV.xls"), row.names = F, sep = "\t")
}
myRFpred(exprs_df = exprsdf_uq_bor, meta_df = metadata_bor, cohort = "Borstel", numgenes = 50)
myRFpred(exprs_df = exprsdf_uq_rna, meta_df = metadata_rna, cohort = "RNA", numgenes = 50)