#!/usr/bin/env Rscript
start.time <- Sys.time() # record the start time so the total execution time can be reported at the end
# -------------------------
# FUNCTIONS
# -------------------------
library("parallel")
library("optparse")
home <- strsplit(paste0(getopt::get_Rscript_filename()),split="/")[[1]]
home <- paste(home[-length(home)],collapse="/")
print(home)
if (nchar(home) == 0) {
home <- "."
}
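# NOTE: the helper functions check(), find_alignment_hits(), carve() and smetana()
# are called further down but are not defined in this file; they are assumed to be
# provided elsewhere (e.g. sourced or loaded before this script is run).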
# -------------------------------
# 1 --> Preliminary definitions
# -------------------------------
option_list <- list(
make_option(c("-n", "--nodes"), type="character", default=NULL,
help="Node input information text file name (e.g. 'my_node_data.txt'). The file needs to have at least three columns, that are assumed to be on the 1st, 9th and 10th fields: a 'node name' column (Node35562), a 'leaf list' column (4296486;593890;740825;4322506;4343405;1552835;99357;1697002;1116749;542614;) and a taxonomy column (k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae)", metavar="character"),
make_option(c("-m", "--medium"), type="character", default="M9",
help="medium (e.g. 'M9', 'M9[glc]'", metavar="character"),
make_option(c("--mediadb"), type="character", default=paste0(home,"/my_media.tsv"),
help="media database file name", metavar="character"),
make_option(c("-c", "--checking"), type="logical", default=FALSE,
help="TRUE or FALSE (default). If TRUE, generated and analysed models are limited to those pairs of species that are co-ocurring in pairs in one or more samples of a specified experiment.", metavar="logical"),
make_option(c("-a", "--abuntable"), type="character", default=NULL,
help="Abundance table text file name (e.g. 'table.from_biom.tsv'). The tab-separated file needs to have the following format:\n\t\t#OTU ID\tS1\tS2\tS3\t(...)\n\t\tO1\t000000\t000001\t000002\t(...) ", metavar="character"),
make_option(c("--coupling"), type="logical", default=TRUE,
help="If TRUE, smetana computes an aditional analysis without the --no-coupling option (see smetana help). Default is TRUE.", metavar="logical"),
make_option(c("--nucmer"), type="character", default=paste0(home,"/MUMmer3.23/nucmer"),
help="Path to the nucmer executable (e.g. './MUMmer3.23/nucmer', '~/my_apps/nucmer'.", metavar="character"),
make_option(c("--showcoords"), type="character", default=paste0(home,"/MUMmer3.23/show-coords"),
help="Path to the show-coords executable (e.g. './MUMmer3.23/show-coords', '~/my_apps/show_coords'.", metavar="character"),
make_option(c("--fasta"), type="character", default=paste0(home,"/99_otus.fasta"),
help="16S sequences to be analyzed in multifasta format. Default is './99_otus_nodes.tree' (Greengenes gg_13_5).", metavar="character"),
make_option(c("--db16s"), type="character", default=paste0(home,"/bac120_ssu_reps_r95.fna"),
help="16S sequences database. Nucmer will align the tree leaves' 16S sequences to this database. Default is './bac120_ssu_reps_r95.fna' (from GTDB https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/genomic_files_reps/).", metavar="character"),
make_option(c("--dbproteins"), type="character", default=paste0(home,"/protein_faa_reps/bacteria/"),
help="Aminoacid sequences database. CarveMe will take files from here that correspond to Nucmer hits and create SBML models from those files. Default is 'protein_faa_reps/bacteria/' (from GTDB https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/genomic_files_reps/).", metavar="character"),
make_option(c("--run_smetana"), type="logical", default=TRUE,
help="If TRUE, runs a Smetana analysis for inter-node pairs. If FALSE, skips the Smetana analysis. Default: TRUE.", metavar="logical"),
make_option(c("--cores"), type="numeric", default=4,
help="Number of cores to use in parallelization processes (mclapply). Default: 4.", metavar="numeric"))
parser <- OptionParser(option_list=option_list)
opt <- parse_args(parser)
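# Example invocation (hypothetical file names, for illustration only; every flag
# is documented in option_list above):
#   Rscript modelado.R -n my_node_data.txt -m M9 --mediadb my_media.tsv \
#     -c TRUE -a table.from_biom.tsv --cores 8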
tab <- na.omit(read.csv(opt$nodes,sep = "\t"))
node_names <- tab[[1]]
leaves <- tab[[9]]
taxonom <- tab[[10]] # for Gram-type checking
medium <- opt$medium
mediadb <- opt$mediadb
checking <- opt$checking
run_smetana <- opt$run_smetana
coupling <- opt$coupling
if (checking == TRUE && is.null(opt$abuntable)) {
stop("When --checking is TRUE, an abundance table file must be specified")
} else if (checking == TRUE) {
abuntable <- opt$abuntable
}
nucmer_path <- opt$nucmer
showcoords <- opt$showcoords
otus_fasta_file <- opt$fasta
db_16S <- opt$db16s
db_protein_folder <- opt$dbproteins
grampospath <- paste0(home,"/grampos.csv") # these two paths are not customizable (I see no point in making them options)
gramnegpath <- paste0(home,"/gramneg.csv")
cores <- opt$cores
# -----------------------
# 2 --> File loading
# -----------------------
# TODO insert length warning here and not in check and smetana
nodos <- mclapply((1:length(node_names)), function(nodo) {
list(tip.label=strsplit(leaves[nodo], ";")[[1]])
}, mc.cores=cores)
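# 'nodos' holds, for each input node, the list of that node's leaf (tip) labels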
# rndnum is used below as a suffix for temporary file names but is never defined
# in this script; a random integer suffix is assumed here
rndnum <- sample.int(1e6, 1)
# split the fasta into two files (headers and sequences) to simplify parsing later:
system(paste0("cat ",otus_fasta_file," | grep '>' > 99_otus_col1_",rndnum))
system(paste0('cat ',otus_fasta_file,' | grep ">" -v > 99_otus_col2_',rndnum))
df1 <- read.csv(paste0("99_otus_col1_",rndnum), header = F)
fasta <- read.csv(paste0("99_otus_col2_",rndnum),header = F)
rownames(fasta) <- sub(">", "", df1[,1]) # strip the leading ">"
fasta <- as.data.frame(t(fasta)) # one column per sequence ID, a single row holding the sequences
system(paste0("rm 99_otus_col*_",rndnum)) # remove temporary files
# ------------------------------------------------
# 3 --> Align each leaf, filter, and build models
# ------------------------------------------------
# Assign 16S sequences to each leaf
if (checking == TRUE) {
# For each leaf that passes the checking step, take its 16S sequence from the original fasta.
# These are all the leaves of each node that are present in the abundance table
checked_tipl <- check(nodos = nodos,
cutoff = 0,
abuntable = abuntable,
cores = cores)
nodos_16S <- mclapply(checked_tipl, function(n) {fasta[as.character(unique(n))]}, mc.cores=cores)
} else {
# For each leaf (without checking), take its 16S sequence from the original fasta
nodos_16S <- mclapply(nodos, function(nodo) {fasta[nodo$tip.label]}, mc.cores=cores)
}
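# 'nodos_16S' is a list with one element per node: a one-row data frame whose
# column names are leaf IDs and whose values are the corresponding 16S sequences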
system("mkdir models")
# Create a results folder for each node
# Skip OTUs that already have a model
for (i in 1:length(nodos_16S)) {
filtered_nodos_16S <- list()
filepath <- paste("models/",node_names[i],"/",sep="")
system(paste("mkdir", filepath)) # la carpeta tendrá el mismo nombre que el nodo
already_existing <- lapply(list.files(filepath), basename)
for (otu in names(nodos_16S[[i]])) {
if (paste0(otu, ".xml") %in% already_existing) {
message(paste0("skipping ", otu, ", model already exists"))
} else {
filtered_nodos_16S[otu] <- nodos_16S[[i]][otu]
}
}
# Align each leaf of each node with Nucmer and obtain a suitable genome for each one,
# keeping only the leaves whose hits pass a quality filter
nucmer_res_final <- find_alignment_hits(filepath, setNames(data.frame(filtered_nodos_16S), names(filtered_nodos_16S)), nucmer_path, db_16S, showcoords, cores)
# CarveMe modelling of all the leaves of each node that pass the Nucmer filter
print(paste0("Creating models for ",node_names[i],"..."))
if (!is.null(nucmer_res_final)) {
out <- tryCatch({
dump <- simplify2array(mclapply(nucmer_res_final,
FUN = function(line) {carve(line, taxonom[i], mediadb, medium, filepath, db_protein_folder)},mc.cores=cores))
print(paste0("Finished generating models for ",node_names[i],"."))
}, error=function(cond) {
message("nucmer_res_final:")
message(nucmer_res_final)
print(paste0("No models generated for ",node_names[i],"; THERE USED TO BE AN ERROR HERE, IS IT OK NOW; debug"))
},
warning=function(cond) {warning(cond)})
} else {
print(paste0("No models generated for ",node_names[i],"."))
}
}
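# After this loop, models/<node_name>/ holds one <otu>.xml SBML model per leaf
# that passed the Nucmer filter (assuming carve() writes the models there).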
# -------------------------------------
# 4 --> Metabolic analysis with Smetana
# -------------------------------------
save.image("ola.RData")
if (run_smetana == TRUE) {
# Define the list of pairs to analyse
if (checking == TRUE) {
pairs <- checked_tipl
} else {
pairs <- expand.grid(nodos[[1]]$tip.label, nodos[[2]]$tip.label, KEEP.OUT.ATTRS = F)
}
# Run Smetana for every inter-node pair of leaves for which a model has been created.
# The list of generated pairs is saved in the output folder (see generated_pairs_filename).
# The pairs that were not analysed by Smetana are saved in the same folder, in
# filtered_out_pairs_<rndnum>.txt, together with the percentage of pairs that
# do and do not pass the filter.
#~~~~~~~~~~~~~~~~~~
output <- paste0(node_names[1],"_",node_names[2],"_smetana_results/")
system(paste0("mkdir ",output))
system(paste0("mkdir ",output,"global"))
system(paste0("mkdir ",output,"detailed"))
if (coupling==TRUE) {
output_coupling = paste0(output,"coupling/")
system(paste("mkdir",output_coupling))
system(paste0("mkdir ",output_coupling,"global"))
system(paste0("mkdir ",output_coupling,"detailed"))
} else {
output_coupling = NULL
}
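# Resulting output layout: <output>/global/, <output>/detailed/ and, when
# coupling is TRUE, <output>/coupling/global/ and <output>/coupling/detailed/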
#~~~~~~~~~~~~~~~~~
generated_pairs_filename = paste0(node_names[1],"_",node_names[2],"_pairs.txt")
dump <- file.create(paste0(output, generated_pairs_filename)) # empty the file if it already exists, or create it otherwise
# Parallelisation with mclapply (only for this step)
failed_pairs <- mclapply(data.frame(t(pairs)), FUN=function(z) {
smetana(z, nodos=node_names, modelfilepath="models/", output=output,
coupling=coupling, output_coupling=output_coupling,
generated_pairs_filename=generated_pairs_filename)
}, mc.cores=cores)
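# Each smetana() call (defined elsewhere) is assumed to return NULL for a pair
# that was analysed successfully, and the pair itself when it had to be skipped;
# the NULL entries are dropped below and the rest are reported as filtered out.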
failed_pairs[simplify2array(mclapply(failed_pairs, is.null, mc.cores=cores))] <- NULL
failed_pairs <- t(failed_pairs) # prepare the matrix for the next step
colnames(failed_pairs) <- seq(length(failed_pairs[1,]))
# Record the pairs that were not analysed by Smetana (they did not pass the Nucmer filter)
write(x=paste0("filtered out: ", 100*length(failed_pairs)/length(pairs[,1])/2,"%"), file=paste0(output,"filtered_out_pairs_",rndnum,".txt"))
dump <- mclapply(colnames(failed_pairs), FUN=function(col){
cat(failed_pairs[,col],"\n", file=paste0(output,"filtered_out_pairs_",rndnum,".txt"), append=T)
}, mc.cores=cores)
print(paste0("Finished SMETANA analysis."))
}
#
end.time <- Sys.time()
time.taken <- end.time - start.time
print(paste0("Execution time: ",format(time.taken,format = "%H %M %S")))