Merge pull request #178 from nf-core/max-shift-plots
Max shift plots
nictru authored Dec 4, 2024
2 parents 7e01b25 + 25d4c9a commit 1e16e22
Showing 4 changed files with 160 additions and 3 deletions.
7 changes: 7 additions & 0 deletions modules/local/combinebeds/shifts/environment.yml
@@ -0,0 +1,7 @@
channels:
  - conda-forge
  - bioconda
dependencies:
  - conda-forge::polars=1.8.2
  - conda-forge::altair=5.5.0
  - conda-forge::vl-convert-python==1.7.0
21 changes: 21 additions & 0 deletions modules/local/combinebeds/shifts/main.nf
@@ -0,0 +1,21 @@
process COMBINEBEDS_SHIFTS {
tag "$meta.id"
label "process_low"

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'oras://community.wave.seqera.io/library/altair_polars_vl-convert-python:e6f1dca28de76d13' :
'community.wave.seqera.io/library/altair_polars_vl-convert-python:a6c5ee679445250d' }"

input:
tuple val(meta), path(beds)

output:
path "*.png" , emit: plots
path "*.json" , emit: multiqc
path "versions.yml", emit: versions

script:
prefix = task.ext.prefix ?: "${meta.id}"
template "shifts.py"
}
120 changes: 120 additions & 0 deletions modules/local/combinebeds/shifts/templates/shifts.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python

import platform
import base64
import json
from itertools import product

import polars as pl
import altair as alt

def format_yaml_like(data: dict, indent: int = 0) -> str:
"""Formats a dictionary to a YAML-like string.
Args:
data (dict): The dictionary to format.
indent (int): The current indentation level.
Returns:
str: A string formatted as YAML.
"""
yaml_str = ""
for key, value in data.items():
spaces = " " * indent
if isinstance(value, dict):
yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
else:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str

meta_id = "${meta.id}"

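# Read all per-sample/per-tool BED files lazily and collapse duplicate coordinates,
# recording which tools and samples reported each BSJ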
df = pl.scan_csv("${beds}".split(" "),
                 separator="\\t",
                 has_header=False,
                 new_columns=["chr", "start", "end", "name", "score", "strand", "sample", "tool"])

df = df.group_by("chr", "start", "end", "strand").agg(tools=pl.col("tool").unique(), samples=pl.col("sample").unique())

def get_group_sizes(df: pl.LazyFrame, max_shift: int, consider_strand: bool) -> pl.LazyFrame:
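    """Cluster BSJs whose start and end coordinates each differ by at most max_shift.

    Coarse groups are formed by cumulative-summing jumps larger than max_shift along the sorted
    start and end coordinates; a self-join within (chr, start_group, end_group[, strand]) then
    collects, for every BSJ, the union of samples and tools reporting a position within the
    allowed shift. The returned frame counts how many BSJs reach each agreement level
    (n_samples / n_tools), tagged with the max_shift and consider_strand settings used.
    """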
df = df.sort("end" ).with_columns(end_group =pl.col("end" ).diff().fill_null(0).gt(max_shift).cum_sum())
df = df.sort("start").with_columns(start_group=pl.col("start").diff().fill_null(0).gt(max_shift).cum_sum())

group_cols = ["chr", "start_group", "end_group"] + (["strand"] if consider_strand else [])
df = df.join(df, on=group_cols, how="inner")
df = df.filter((pl.col("start") - pl.col("start_right")).abs() <= max_shift)
df = df.filter((pl.col("end") - pl.col("end_right")).abs() <= max_shift)
df = df.select(["chr", "start", "end", "strand", "samples_right", "tools_right"])
df = df.group_by(["chr", "start", "end", "strand"]).agg(**{
"samples": pl.col("samples_right").flatten().unique(),
"tools": pl.col("tools_right").flatten().unique()
}).with_columns(n_samples=pl.col("samples").map_elements(lambda x: len(x), return_dtype=int),
n_tools=pl.col("tools").map_elements(lambda x: len(x), return_dtype=int))

df_sample_counts = (df.group_by("n_samples").agg(count=pl.col("n_samples").count())
.sort("n_samples")
.rename({"n_samples": "value"})
.with_columns(metric=pl.lit("n_samples")))
df_tool_counts = (df.group_by("n_tools").agg(count=pl.col("n_tools").count())
.sort("n_tools")
.rename({"n_tools": "value"})
.with_columns(metric=pl.lit("n_tools")))

df = (pl.concat([df_sample_counts, df_tool_counts])
.with_columns(max_shift=max_shift, consider_strand=consider_strand))

return df

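# Recompute the agreement counts for a range of maximum allowed shifts, with and without strand matching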
shifts = [0, 1, 2, 3, 4, 5, 10, 20, 50]
consider_strands = [True, False]

dfs = []
for max_shift, consider_strand in product(shifts, consider_strands):
    df_ = get_group_sizes(df, max_shift, consider_strand)
    dfs.append(df_.collect())

df = pl.concat(dfs).with_columns(max_shift=pl.col("max_shift"))

metrics = {
"n_samples": "Number of samples",
"n_tools": "Number of tools"
}

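# One stacked bar chart per metric, faceted by strand handling, rendered to PNG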
for metric, title in metrics.items():
    n_unique = df.filter(pl.col("metric") == metric).select("value").n_unique("value")
    df_ = df.filter(pl.col("metric") == metric)
    plot = df_.plot.bar(x="max_shift:N", y="count", color=f'value:{"Q" if n_unique > 10 else "N"}', column="consider_strand")
    plot = plot.properties(title=title)

    plot_file = f"{metric}.png"
    plot.save(plot_file)

    image_string = base64.b64encode(open(plot_file, "rb").read()).decode("utf-8")
    image_html = f'<div class="mqc-custom-content-image"><img src="data:image/png;base64,{image_string}" /></div>'

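    # MultiQC custom-content section embedding the rendered PNG as a base64 image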
    multiqc = {
        'id': f"{meta_id}_shifts_{metric}",
        'parent_id': "shift_plots",
        'parent_name': 'Shift Plots',
        'parent_description': 'Stacked bar plots showing the agreement between tools and samples for different shift values',
        'section_name': title,
        'description': 'Stacked bar plot showing the agreement between tools and samples for different shift values, split by whether strand is considered',
        'plot_type': 'image',
        'data': image_html
    }

with open(f"{metric}.shifts_mqc.json", "w") as f:
f.write(json.dumps(multiqc, indent=4))

# Versions

versions = {
"${task.process}": {
"python": platform.python_version(),
"polars": pl.__version__,
"altair": alt.__version__
}
}

with open("versions.yml", "w") as f:
f.write(format_yaml_like(versions))
15 changes: 12 additions & 3 deletions subworkflows/local/bsj_detection.nf
@@ -4,9 +4,10 @@ include { CSVTK_JOIN as COMBINE_COUNTS_PER_TOOL } from '../../modules/n
include { GAWK as FILTER_BSJS } from '../../modules/nf-core/gawk'
include { GAWK as BED_ADD_SAMPLE_TOOL } from '../../modules/nf-core/gawk'
include { COMBINEBEDS_FILTER as COMBINE_TOOLS_PER_SAMPLE } from '../../modules/local/combinebeds/filter'
include { COMBINEBEDS_SHIFTS as INVESTIGATE_SHIFTS } from '../../modules/local/combinebeds/shifts'
include { COMBINEBEDS_FILTER as COMBINE_SAMPLES } from '../../modules/local/combinebeds/filter'
include { AGAT_SPADDINTRONS as ADD_INTRONS } from '../../modules/nf-core/agat/spaddintrons'
include { GAWK as EXTRACT_EXONS_INTRONS } from '../../modules/nf-core/gawk'
include { AGAT_SPADDINTRONS as ADD_INTRONS } from '../../modules/nf-core/agat/spaddintrons'
include { GAWK as EXTRACT_EXONS_INTRONS } from '../../modules/nf-core/gawk'
include { BEDTOOLS_GETFASTA as FASTA_COMBINED } from '../../modules/nf-core/bedtools/getfasta'
include { BEDTOOLS_GETFASTA as FASTA_PER_SAMPLE } from '../../modules/nf-core/bedtools/getfasta'
include { BEDTOOLS_GETFASTA as FASTA_PER_SAMPLE_TOOL } from '../../modules/nf-core/bedtools/getfasta'
@@ -148,8 +149,12 @@ workflow BSJ_DETECTION {
    ch_bsj_bed_per_sample = COMBINE_TOOLS_PER_SAMPLE.out.combined
        .filter{ meta, bed -> !bed.isEmpty() }

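    // Pool all per-sample/per-tool BSJ BEDs under a single "all" meta,
    // feeding both the global combination and the shift analysis below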
    ch_all_samples = ch_bsj_bed_per_sample_tool_meta
        .map{ meta, bed -> [[id: "all"], bed] }
        .groupTuple()

    COMBINE_SAMPLES(
        ch_bsj_bed_per_sample_tool_meta.map{ meta, bed -> [[id: "all"], bed] }.groupTuple(),
        ch_all_samples,
        params.max_shift,
        params.consider_strand,
        params.min_tools,
@@ -160,6 +165,10 @@
        .filter{ meta, bed -> !bed.isEmpty() }
        .collect()

    INVESTIGATE_SHIFTS(ch_all_samples)
    ch_versions = ch_versions.mix(INVESTIGATE_SHIFTS.out.versions)
    ch_multiqc_files = INVESTIGATE_SHIFTS.out.multiqc

    //
    // ANNOTATION
    //
