From ebee26dbcd5ba144b8e0267560c6d933cce03663 Mon Sep 17 00:00:00 2001 From: dereckmezquita Date: Sun, 14 Jul 2024 09:19:25 -0500 Subject: [PATCH] Docs and dev example for Pca. --- dev/pca/pca-comparison.R | 13 +- man/Comparison.Rd | 63 +++++---- man/Pca.Rd | 270 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 304 insertions(+), 42 deletions(-) create mode 100644 man/Pca.Rd diff --git a/dev/pca/pca-comparison.R b/dev/pca/pca-comparison.R index 9a71448..f49d3ce 100644 --- a/dev/pca/pca-comparison.R +++ b/dev/pca/pca-comparison.R @@ -6,16 +6,9 @@ data <- feature_counts[GeneBiotype == "protein_coding", ] colnames(data)[1] <- "feature" -comp_table <- dt$fread( -"group sample -A T64552 -A T64553 -A T64554 -A T64555 -B T64546 -B T64548 -B T64549 -B T64550" +comp_table <- data.frame( + group = c("A", "A", "A", "A", "B", "B", "B", "B"), + sample = c("T64552", "T64553", "T64554", "T64555", "T64546", "T64548", "T64549", "T64550") ) comp <- Comparison$new( diff --git a/man/Comparison.Rd b/man/Comparison.Rd index 206d35e..4284afc 100644 --- a/man/Comparison.Rd +++ b/man/Comparison.Rd @@ -4,49 +4,36 @@ \alias{Comparison} \title{Comparison Class} \description{ -The `Comparison` class represents a comparison between two groups of samples. It includes methods to initialize, print, and validate the data. +The `Comparison` class represents a comparison between two groups of samples. + +It records which samples belong to each group and which group is treated as the control. } \details{ -An R6 class that represents a comparison between two groups of samples. +An R6 class that represents a comparison between two groups of samples. This class contains the comparison name, the group order, and the comparison table. +It includes methods to initialise, print, and validate the data. 
} -\section{Initialisation}{ - -``` -Comparison$new( - comparison_name = "My comparison", - group_order = c("Control group", "Test group"), - comparison_table = data.table::data.table( - group = c("Control group", "Control group", "Test group", "Test group"), - sample = c("Sample1", "Sample2", "Sample3", "Sample4") - ) -) -``` -} - -\section{Usage}{ - -``` +\examples{ comparison <- Comparison$new( - comparison_name = "My comparison", - group_order = c("Control group", "Test group"), + comparison_name = "Treatment vs Control", + group_order = c("Control", "Treatment"), comparison_table = data.table::data.table( - group = c("Control group", "Control group", "Test group", "Test group"), - sample = c("Sample1", "Sample2", "Sample3", "Sample4") - ) + group = c("Control", "Control", "Treatment", "Treatment"), + sample = c("Sample1", "Sample2", "Sample3", "Sample4") + ) ) -comparison$print() # Print the Comparison object -``` -} +print(comparison) +} \section{Public fields}{ \if{html}{\out{
}} \describe{ \item{\code{comparison_name}}{Character. The name of the comparison.} -\item{\code{group_order}}{Character vector. The order of groups for the comparison, with length 2. The first element will be treated as the "control" group, and the second as the "test" group.} +\item{\code{group_order}}{Character vector. The order of groups for the comparison, with length 2. +The first element is treated as the "control" group, and the second as the "test" group.} -\item{\code{comparison_table}}{A data.table that contains the group and sample information for the comparison.} +\item{\code{comparison_table}}{A data.table that contains the group, sample, and condition information for the comparison.} } \if{html}{\out{
}} } @@ -62,6 +49,7 @@ comparison$print() # Print the Comparison object \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Comparison-new}{}}} \subsection{Method \code{new()}}{ +Create a new Comparison object. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Comparison$new(comparison_name, group_order, comparison_table)}\if{html}{\out{
}} } @@ -69,11 +57,18 @@ comparison$print() # Print the Comparison object \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{comparison_name}}{A character string representing the name of the comparison. Must be of length 1 and not exceed 100 characters.} +\item{\code{comparison_name}}{A character string representing the name of the comparison. +Must be of length 1 and not exceed 100 characters.} -\item{\code{group_order}}{A character vector specifying the order of groups for the comparison. Must be of length 2.} +\item{\code{group_order}}{A character vector specifying the order of groups for the comparison. +Must be of length 2.} -\item{\code{comparison_table}}{A data.table that contains group and sample information for the comparison. This table is crucial for tracking which samples belong to which clinical group. It is used to distinguish samples for the purposes of experimental analysis. It should have two columns, named "group" and "sample". The "group" column identifies the clinical group to which each sample belongs, and the "sample" column lists the names/IDs of the samples. The groups in this table should match the names specified in `group_order`.} +\item{\code{comparison_table}}{A data.table that contains group and sample information for the comparison. + It should have two columns, named "group" and "sample". The "group" column identifies + the clinical group to which each sample belongs, and the "sample" column lists the + names/IDs of the samples. The groups in this table should match the names specified + in `group_order`. +(The \code{print()} method, documented separately, prints a summary of the Comparison object.)} } \if{html}{\out{
}} } @@ -86,6 +81,10 @@ comparison$print() # Print the Comparison object \if{html}{\out{
}}\preformatted{Comparison$print()}\if{html}{\out{
}} } +\subsection{Returns}{ +None. This method is called for its side effect of printing to the console. +(Validating the Comparison object is a separate step and is not part of this return value.) +} } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/Pca.Rd b/man/Pca.Rd new file mode 100644 index 0000000..4b3ed74 --- /dev/null +++ b/man/Pca.Rd @@ -0,0 +1,270 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Pca.R +\name{Pca} +\alias{Pca} +\title{Principal Component Analysis (PCA) Class} +\description{ +The `Pca` class implements Principal Component Analysis, a dimensionality reduction +technique widely used in data analysis and visualisation. This class provides methods +for performing PCA on a dataset, visualising the results, and interpreting the output. +} +\details{ +PCA is a powerful technique for analysing high-dimensional data, such as gene expression +data in bioinformatics. It works by transforming the data into a new coordinate system +where the axes (principal components) are ordered by the amount of variance they explain. + +The PCA process involves several steps: + +1. Standardisation: + PCA begins with a dataset of n-dimensions (in the below demonstration, genes are + dimensions and samples are observations). The data is standardised, transforming + each dimension to have a mean of 0 and a standard deviation of 1. + +2. Covariance Matrix Computation: + A covariance matrix is computed. This matrix indicates the covariance + (shared variance) between each pair of dimensions. The covariance between different + dimensions is used to understand the correlation structure of the original dimensions. + +3. Eigendecomposition: + The covariance matrix is then decomposed into its eigenvectors and eigenvalues. + Each eigenvector represents a principal component, which is a linear combination of + the original dimensions. The associated eigenvalue represents the amount of variance + explained by the principal component. The eigenvectors are ordered by their corresponding + eigenvalues, so the first principal component (PC1) explains the most variance, followed by PC2, etc. + +4. 
Selection of Principal Components: + Depending on the goal of the analysis, some or all of the principal components can + be selected for further analysis. The 'elbow method' is commonly used, where you plot + the explained variance by each principal component and look for an 'elbow' in the plot as a cut-off point. + +5. Interpretation: + The 'top rotations' in the context of PCA refer to the features (genes) that contribute + most to each principal component. The 'rotation' matrix from prcomp() gives the loadings + of each feature onto each PC. By identifying features with large absolute loadings, we can + understand what features drive the separation in the data along the principal components. + In other words, the top rotations tell us which genes are most important for explaining + the variance in our data along each PC. + +This class provides methods for each step of the PCA process, from data preparation +to visualisation of results. It's designed to work with any kind of high-dimensional +numerical data, as long as the data is in a tabular format with features as rows and +samples as columns. The first column must be named "feature" and contain the feature names. 
+} +\examples{ +# Load required packages +box::use(dmplot[Pca, Comparison]) + +# Load example data +data(feature_counts, package = "dmplot") + +# Prepare the data +data <- feature_counts[GeneBiotype == "protein_coding", ] +colnames(data)[1] <- "feature" + +# Create a comparison table +comp_table <- data.frame( + group = c("A", "A", "A", "A", "B", "B", "B", "B"), + sample = c("T64552", "T64553", "T64554", "T64555", "T64546", "T64548", "T64549", "T64550") +) + +# Create a Comparison object +comp <- Comparison$new( + comparison_name = "A_over_B", + group_order = c("B", "A"), + comparison_table = comp_table +) + +# Create a Pca object +pca_obj <- Pca$new(data, comp) + +# Perform PCA +pca_obj$prcomp() + +# Access PCA results +pca_obj$data # View the input data +pca_obj$prcomp_results # View the raw PCA results +pca_obj$prcomp_refined # View the refined PCA results + +# Create visualisations +scree_plot <- pca_obj$plot_scree() # Generate a scree plot +scatter_plot <- pca_obj$plot_scatter() # Generate a scatter plot + +# Print the scree plot +print(scree_plot) + +# Print the scatter plot +print(scatter_plot) + +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{data}}{The input data for PCA, typically a data.table with features as rows and samples as columns} + +\item{\code{comparison}}{An optional Comparison object for group comparisons} + +\item{\code{prcomp_results}}{Results from the stats::prcomp function, containing the raw PCA output} + +\item{\code{prcomp_refined}}{Refined PCA results, including percentage of variance explained by each PC} + +\item{\code{top_rotations}}{Top contributors (features) to each principal component} + +\item{\code{scatter}}{The scatter plot of the first two principal components} + +\item{\code{scree}}{The scree plot showing variance explained by each PC} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-Pca-new}{\code{Pca$new()}} +\item \href{#method-Pca-prcomp}{\code{Pca$prcomp()}} +\item \href{#method-Pca-print}{\code{Pca$print()}} +\item \href{#method-Pca-plot_scree}{\code{Pca$plot_scree()}} +\item \href{#method-Pca-plot_scatter}{\code{Pca$plot_scatter()}} +\item \href{#method-Pca-clone}{\code{Pca$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Pca-new}{}}} +\subsection{Method \code{new()}}{ +Create a new Pca object +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Pca$new(data, comparison = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{data}}{A data.table containing the input data for PCA. The first column must be named "feature".} + +\item{\code{comparison}}{An optional Comparison object for group comparisons} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Pca-prcomp}{}}} +\subsection{Method \code{prcomp()}}{ +Perform Principal Component Analysis on the data +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Pca$prcomp(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{Additional arguments passed to stats::prcomp} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +NULL (results stored in Pca$prcomp_results, Pca$prcomp_refined) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Pca-print}{}}} +\subsection{Method \code{print()}}{ +Print a summary of the PCA results +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Pca$print()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +NULL (prints to console) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Pca-plot_scree}{}}} +\subsection{Method \code{plot_scree()}}{ +Generate a scree plot of the PCA results +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Pca$plot_scree(num_pc = 50)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{num_pc}}{Number of principal components to include in the plot} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A ggplot2 object representing the scree plot +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Pca-plot_scatter}{}}} +\subsection{Method \code{plot_scatter()}}{ +Generate a scatter plot of the first two principal components +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Pca$plot_scatter( + point_default_colour = "grey", + point_size = 3, + point_alpha = 1, + point_labels = list(show = TRUE, size = 4, max_overlaps = 10, alpha = 0.75, font_face = + "bold"), + top_contributors = list(show = TRUE, truncate = 30), + title = if (!is.null(self$comparison)) + stringr::str_interp("${self$comparison$comparison_name}: principal components 1 and 2") + else "Principal components 1 and 2", + subtitle = + stringr::str_interp("${nrow(self$prcomp_results$x)} samples, ${ncol(self$prcomp_results$rotation)} principal components, calculated from ${nrow(self$prcomp_results$rotation)} features"), + caption = if (top_contributors$show) + stringr::str_interp("Top contributors to variance:\\nPC1: ${paste0(stringr::str_trunc(names(self$top_rotations$PC1), top_contributors$truncate), collapse = \\", \\")}\\nPC2: ${paste0(stringr::str_trunc(names(self$top_rotations$PC2), top_contributors$truncate), collapse = \\", \\")}") + else NULL +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{point_default_colour}}{Default colour for points when no comparison is provided} + +\item{\code{point_size}}{Size of the points in the scatter plot} + +\item{\code{point_alpha}}{Alpha (transparency) of the points} + +\item{\code{point_labels}}{List of parameters for point labels} + +\item{\code{top_contributors}}{List of parameters for displaying top contributors} + +\item{\code{title}}{Title of the plot} + +\item{\code{subtitle}}{Subtitle of the plot} + +\item{\code{caption}}{Caption of the plot} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A ggplot2 object representing the scatter plot. +The remaining lines describe separate, non-public steps of the class rather than this return value: filtering samples based on a comparison object, +refining PCA results for easier interpretation, +and preparing data for PCA by dropping non-numerical columns. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Pca-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Pca$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +}