-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_metadata.R
122 lines (106 loc) · 3.79 KB
/
create_metadata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
library(tidyverse)
# Read the ncbi metadatafile
ncbi <- read_tsv("/home/jonr/Prosjekter/mpox/phylogenetic/data/metadata.tsv.gz")
# Read the consensus sequences using seqinr
fasta_files <- list.files("/home/jonr/Prosjekter/mpox/phylogenetic/data/", ".fa$", full.names = T)
for (i in seq_along(fasta_files)) {
seq <- seqinr::read.fasta(fasta_files[i])
if (!exists("fasta")) {
fasta <- seq
} else {
fasta <- c(fasta, seq)
}
}
# Simplify the fasta headers and write out as a single fasta
seqinr::write.fasta(fasta, names = names(fasta), file.out = "/home/jonr/Prosjekter/mpox/phylogenetic/data/norw_seqs.fa")
# This file will be concatenated with the ncbi fasta file
# Create the metadata with the same headers as the ncbi metadatafile
fields <- colnames(ncbi)
metadata <- matrix(nrow = length(fasta), ncol = length(fields))
colnames(metadata) <- fields
metadata <- as.data.frame(metadata)
# Add the fasta header as strain name
for (i in seq_along(fasta)) {
metadata$strain[i] <- names(fasta[i])
metadata$accession[i] <- names(fasta[i])
metadata$country[i] <- "Norway"
metadata$region[i] <- "Europe"
metadata$host[i] <- "Homo sapiens"
metadata$QC_rare_mutations <- "good"
}
metadata <- tibble(metadata)
# Add sampling dates
metadata <- metadata %>%
mutate(date = case_when(
strain == "2023122020" ~ "2022-06-22",
strain == "202401196" ~ "2023-10-05",
strain == "202401262" ~ "2022-06-22",
strain == "202401263" ~ "2022-06-27",
strain == "202401264" ~ "2022-07-25",
strain == "202401265" ~ "2022-07-29",
strain == "202401266" ~ "2022-06-22",
strain == "202401267" ~ "2023-11-29",
.default = date
)) %>%
mutate(date_submitted = case_when(
strain == "2023122020" ~ "2022-06-22",
strain == "202401196" ~ "2023-10-05",
strain == "202401262" ~ "2022-06-22",
strain == "202401263" ~ "2022-06-27",
strain == "202401264" ~ "2022-07-25",
strain == "202401265" ~ "2022-07-29",
strain == "202401266" ~ "2022-06-22",
strain == "202401267" ~ "2023-11-29",
.default = date_submitted
))
# Read the nextclade results from the Norwegian sequences
nc <- read_tsv("/home/jonr/Prosjekter/mpox/phylogenetic/data/nextclade.tsv") %>%
mutate(seqName = as.character(seqName)) %>%
# keep relevant fields
select(seqName, lineage, outbreak, clade, coverage, isReverseComplement, frameShifts)
# merge nextclade data
metadata <- metadata %>%
left_join(nc, by = c("strain" = "seqName"))
# Create final data structure
metadata <- metadata %>%
select(accession,
genbank_accession_rev,
strain,
date,
region,
country,
division,
location,
host,
date_submitted,
sra_accession,
abbr_authors,
reverse,
authors,
institution,
"clade" = clade.y, # Use clade from Nextclade
"outbreak" = outbreak.y, # Use outbreak from Nextclade
"lineage" = lineage.y, # Use lineage from Nextclade
"coverage" = coverage.y, # Use coverage from Nextclade
missing_data,
divergence,
nonACGTN,
QC_missing_data,
QC_mixed_sites,
QC_rare_mutations,
QC_frame_shifts,
QC_stop_codons,
"frame_shifts" = frameShifts, # Use frame shifts from Nextclade
"is_reverse_complement" = isReverseComplement) # Use reverse complement from Nextclade
metadata <- metadata %>%
mutate(date_submitted = as.Date(date_submitted))
# Merge own data with ncbi and write file
# Test if column structures are the same
if (identical(colnames(metadata), colnames(ncbi))) {
final_metadata <- bind_rows(metadata, ncbi)
} else {
print("Colnames are not the same")
}
if (exists("final_metadata")) {
write_tsv(final_metadata, "/home/jonr/Prosjekter/mpox/phylogenetic/data/metadata.tsv")
}