-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotate.R
187 lines (150 loc) · 9.67 KB
/
annotate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env Rscript
start.time <- Sys.time() # para devolver al final el tiempo de ejecución
# INPUT: annotate.R -g genomes -o outputname --outputdir outputdir
# INFO: THIS SCRIPT TAKES A LIST OF GENOMES, ANNOTATES ALL OF THEM WITH EGGNOG MAPPER AND RETURNS THE ANNOTATED FILES PLUS A CONSENSUS ONE
# =======================
# FUNCIONES
# =======================
library("optparse")
library("parallel")
home <- strsplit(paste0(getopt::get_Rscript_filename()),split="/")[[1]]
home <- paste(home[-length(home)],collapse="/")
print(home)
if (nchar(home) == 0) {
home <- "."
}
source(paste0(home,"/utils.R")) # cargo las funciones del paquete
# ==============
# Definiciones
# ==============
option_list <- list(
make_option(c("-g", "--genomes"), type="character", default=NULL,
help="File containing the list of genomes to annotate. The format may be the following:\n\tRS_GCF_000281895.1\n\tRS_GCF_900100495.1\n\tRS_GCF_900104015.1\n\t(...)", metavar="character"),
make_option(c("-n", "--nodes"), type="character", default=NULL,
help="Node input information text file name (e.g. 'my_node_data.txt'). The file needs to have at least three columns, that are assumed to be on the 1st, 9th and 10th fields: a 'node name' column (Node35562), a 'leaf list' column (4296486;593890;740825;4322506;4343405;1552835;99357;1697002;1116749;542614;) and a taxonomy column (k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae)", metavar="character"), make_option(c("--skip_consensus"), type="logical", default=FALSE,
help="If TRUE, eggNOG-mapper creates the annotation files but a consensus file is not made. Default: FALSE.", metavar="logical"),
make_option(c("-m", "--medium"), type="character", default="M9",
help="medium (e.g. 'M9', 'M9[glc]'", metavar="character"),
make_option(c("--outputdir"), type="character", default="./annotate_results",
help="output directory for annotation files and the consensus model(s)", metavar="character"),
make_option(c("-o","--outputname"), type="character", default="node_consensus",
help="output name for the consensus model(s)", metavar="character"),
make_option(c("--mediadb"), type="character", default=paste0(home,"/my_media.tsv"),
help="media database file name", metavar="character"),
make_option(c("-c", "--checking"), type="logical", default=FALSE,
help="TRUE or FALSE (default). If TRUE, annotated genomes are limited to those pairs of species that are co-ocurring in pairs in one or more samples of a specified experiment. Only applicable when input is --nodes.", metavar="logical"),
make_option(c("-e", "--experiment"), type="character", default=NULL,
help="Experiment input information text file name (e.g. 'table.from_biom.tsv'). Only needed when input is --nodes and --checking is TRUE. The tab-separated file needs to have the following format:\n\t\t#OTU ID\tS1\tS2\tS3\t(...)\n\t\tO1\t000000\t000001\t000002\t(...) ", metavar="character"),
make_option(c("--nucmer"), type="character", default=paste0(home,"/MUMmer3.23/nucmer"),
help="Path to the Nucmer executable (e.g. './MUMmer3.23/nucmer', '~/my_apps/nucmer'). Only needed when input is --nodes.", metavar="character"),
make_option(c("--showcoords"), type="character", default=paste0(home,"/MUMmer3.23/show-coords"),
help="Path to the show-coords executable (e.g. './MUMmer3.23/show-coords', '~/my_apps/show_coords'). Only needed when input is --nodes.", metavar="character"),
make_option(c("--emapper_path"), type="character", default=paste0(home,"/eggnog-mapper-master/emapper.py"),
help="Path to the emapper.py executable file (e.g. './eggnog-mapper-my_version/emapper.py').",metavar="character"),
make_option(c("--fasta"), type="character", default=paste0(home,"/99_otus.fasta"),
help="16S sequences to be analyzed in multifasta format. Only needed when input is --nodes. Default is './99_otus_nodes.tree' (Greengenes gg_13_5).", metavar="character"),
make_option(c("--db16s"), type="character", default=paste0(home,"/bac120_ssu_reps_r95.fna"),
help="16S sequences database. Only needed when input is --nodes. Nucmer will align the 16S sequences to this database. Default is './bac120_ssu_reps_r95.fna' (from GTDB https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/genomic_files_reps/).", metavar="character"),
make_option(c("--dbproteins"), type="character", default=paste0(home,"/protein_faa_reps/bacteria/"),
help="Aminoacid sequences database. EggNOG-mapper will annotate files from here. Default is 'protein_faa_reps/bacteria/' (from GTDB https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/genomic_files_reps/).", metavar="character"),
make_option(c("--dmnd_db"), type="character", default=NULL,
help="Path to a custom Diamond database. Specially useful when using a Diamond version different from the eggNOG-mapper one, such as the one created with create_compatible_database.R.", metavar="character"),
make_option(c("--cores"), type="numeric", default=4,
help="Number of cores to use in parallelization processes (mclapply). Default: 4.", metavar="numeric"))
parser <- OptionParser(option_list=option_list, usage = "Desktop/TFM/annotate.R [options]\n\nThe main input data is either --genomes or --nodes")
opt <- parse_args(parser)
cores <- opt$cores
skip_consensus <- opt$skip_consensus
if (!is.null(opt$nodes) && is.null(opt$genomes)) {
skip_alignment <- FALSE
t <- read.table(opt$nodes,sep = "\t")
nodos <- levels(t[[1]])
taxonom <- levels(t[[2]])
nucmer_path<- opt$nucmer
showcoords <- opt$showcoords
otus_fasta_file <- opt$fasta
db_16S <- opt$db16s
checking <- opt$checking
if (checking==TRUE & is.null(opt$experiment)) {
stop("When --checking TRUE, a experiment file must be specified")
} else if (checking == TRUE) {
exp = opt$experiment
}
} else if (is.null(opt$nodes) && !is.null(opt$genomes)) {
skip_alignment=TRUE
} else if (is.null(opt$nodes) && is.null(opt$genomes)) {
stop("Please provide an input file. It may be either a list of genomes to annotate or a node data file.")
} else { # both --genomes and --nodes were provided
stop("Only one kind of input (--genomes or --nodes) can be provided")}
# Annotation variables
outputdir <- opt$outputdir
outputname <- opt$outputname
db_protein_folder <- opt$dbproteins
emapper_path <- opt$emapper_path
genomes <- opt$genomes
# dmnd_db is used by emapper() , which is used by annotate()
if (is.null(opt$dmnd_db)) {
emapper_folder <- system(paste0("echo $(dirname ",emapper_path,")"),intern=TRUE)
dmnd_db <- paste0(emapper_folder,"/data/eggnog_proteins.dmnd")
} else {
dmnd_db <- opt$dmnd_db
}
# =======================
# Anotar los genomas
# =======================
if (skip_alignment == TRUE) {
genomes <- scan(genomes, character(),sep="\n")
# Anotar el genoma asignado a cada hoja con eggNOG-mapper
# ------------------------------------------------------------
annotate(genomes, outputdir, db_protein_folder, emapper_path, cores)
system(paste0("rm ",outputdir,"/*seed_orthologs"))
# Crear genoma consenso
# --------------------------
if (!skip_consensus) {
system(paste("python3 consenso_EGG.py $(realpath",outputdir,") $(realpath",outputdir,")",outputname))
# Fabricar modelo con CarveMe
# -----------------------------
# TODO input taxonomy for -g?
system(paste0("carve --egg ",outputname,".tsv -o ",outputdir,"/",outputname,".xml"))
}
} else {
# Obtener secuencias 16S para todas las hojas de ambos nodos
# --------------------------------------------------------------
if (checking == TRUE) {
filtered_pairs <- check(nodos=nodos_hojas, exp=exp)
checked_tipl <- list(levels(filtered_pairs[,1]),levels(filtered_pairs[,2]))
nodos_16S <- mclapply(checked_tipl, function(n) {fasta[n]},mc.cores=cores)
} else {
nodos_16S <- mclapply(nodos_hojas, function(nodo) {fasta[nodo$tip.label]},mc.cores=cores)
}
system("mkdir models")
# Para cada nodo haremos alineamientos, anotaciones y un consenso
genomes=list()
for (i in c(1:length(nodos))) {
filepath <- paste("models/",nodos[i],"/",sep="")
system(paste("mkdir", filepath))
# Alineamiento con Nucmer
# ------------------------------
nucmer_res_final <- find_alignment_hits(filepath, nodos_16S[[i]], nucmer_path, db_16S, showcoords, cores)
genomes[[i]] <- scan(file=paste0(filepath,"genomes"),what=character(),sep="\n")
# Anotar el genoma asignado a cada hoja con eggNOG-mapper
# ------------------------------------------------------------
node_outputdir <- paste0(outputdir,"/",nodos[i])
node_outputname <- paste0(outputname,"_",nodos[i])
# En esta(s) carpeta(s) se guardará la lista de los nombres de los genomas obtenidos por Nucmer
system(paste0("mkdir ",node_outputdir))
annotate(genomes[[i]], node_outputdir, db_protein_folder, emapper_path, cores)
system(paste0("rm ",outputdir,"/*seed_orthologs"))
if (!skip_consensus) {
# Crear genoma consenso
# --------------------------
system(paste("python3 consenso_EGG.py $(realpath",node_outputdir,") $(realpath",node_outputdir,")",node_outputname))
# Fabricar modelo con CarveMe
# -----------------------------
system(paste0("carve --egg ",node_outputname,".tsv ",gram(taxonom[i])," -o ",node_outputdir,"/",node_outputname,".xml"))
}
}
}
end.time <- Sys.time()
time.taken <- end.time - start.time
print(paste0("Execution time: ",format(time.taken,format = "%H %M %S")))