monarch-initiative · yaseminbridges · Sep 21, 2023 · Sep 14, 2023 · Sep 14, 2023 · Sep 14, 2023
diff --git a/src/pheval/analyse/analysis.py b/src/pheval/analyse/analysis.py
diff --git a/src/pheval/analyse/disease_prioritisation_analysis.py b/src/pheval/analyse/disease_prioritisation_analysis.py
@@ -0,0 +1,138 @@
+from collections import defaultdict
+from pathlib import Path
+
+from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
+from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
+from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResult
+from pheval.analyse.rank_stats import RankStats
+from pheval.analyse.run_data_parser import TrackInputOutputDirectories
+from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
+from pheval.utils.file_utils import all_files, obtain_closest_file_name
+from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
+
+
+class AssessDiseasePrioritisation:
+    """Assess disease prioritisation for the known diseases of a single phenopacket."""
+
+    def __init__(
+        self,
+        phenopacket_path: Path,
+        results_dir: Path,
+        standardised_disease_results: [RankedPhEvalDiseaseResult],
+        threshold: float,
+        score_order: str,
+        proband_diseases: [ProbandDisease],
+    ):
+        """
+        Store the inputs required to rank the proband's diseases.
+
+        Args:
+            phenopacket_path: Path of the phenopacket being assessed.
+            results_dir: Results directory handed on to the rank recorder.
+            standardised_disease_results: Ranked disease results to search through.
+            threshold: Score threshold; a value equal to 0.0 disables the threshold check.
+            score_order: "ascending" selects the ascending-order threshold check, any other
+                value selects the default check.
+            proband_diseases: Diseases to look for in the results.
+        """
+        self.phenopacket_path = phenopacket_path
+        self.results_dir = results_dir
+        self.standardised_disease_results = standardised_disease_results
+        self.threshold = threshold
+        self.score_order = score_order
+        self.proband_diseases = proband_diseases
+
+    def _record_disease_prioritisation_match(
+        self,
+        disease: ProbandDisease,
+        result_entry: RankedPhEvalDiseaseResult,
+        rank_stats: RankStats,
+    ) -> DiseasePrioritisationResult:
+        """
+        Add the matched entry's rank to the rank statistics and wrap it in a result.
+
+        Args:
+            disease: The proband disease that was matched.
+            result_entry: The result entry that matched the disease.
+            rank_stats: Statistics object that receives the rank through add_rank.
+
+        Returns:
+            A DiseasePrioritisationResult holding the phenopacket path, disease and rank.
+        """
+        matched_rank = result_entry.rank
+        rank_stats.add_rank(matched_rank)
+        return DiseasePrioritisationResult(self.phenopacket_path, disease, matched_rank)
+
+    def _assess_disease_with_threshold_ascending_order(
+        self,
+        result_entry: RankedPhEvalDiseaseResult,
+        disease: ProbandDisease,
+        rank_stats: RankStats,
+    ) -> DiseasePrioritisationResult:
+        """
+        Record the rank only when the threshold is strictly greater than the entry's score.
+
+        Returns:
+            The recorded result, or None when the score does not pass the threshold.
+        """
+        passes_threshold = float(self.threshold) > float(result_entry.score)
+        if not passes_threshold:
+            return None
+        return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)
+
+    def _assess_disease_with_threshold(
+        self,
+        result_entry: RankedPhEvalDiseaseResult,
+        disease: ProbandDisease,
+        rank_stats: RankStats,
+    ) -> DiseasePrioritisationResult:
+        """
+        Record the rank only when the threshold is strictly less than the entry's score.
+
+        Returns:
+            The recorded result, or None when the score does not pass the threshold.
+        """
+        passes_threshold = float(self.threshold) < float(result_entry.score)
+        if not passes_threshold:
+            return None
+        return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)
+
+    def _record_matched_disease(
+        self,
+        disease: ProbandDisease,
+        rank_stats: RankStats,
+        standardised_disease_result: RankedPhEvalDiseaseResult,
+    ) -> DiseasePrioritisationResult:
+        """
+        Return the rank result for a matched disease, applying the threshold if one is set.
+
+        Returns:
+            The recorded result, or None when a threshold is set and the score fails it.
+        """
+        # A threshold of 0.0 means "no threshold": every match is recorded.
+        if float(self.threshold) == 0.0:
+            return self._record_disease_prioritisation_match(
+                disease, standardised_disease_result, rank_stats
+            )
+        if self.score_order == "ascending":
+            return self._assess_disease_with_threshold_ascending_order(
+                standardised_disease_result, disease, rank_stats
+            )
+        return self._assess_disease_with_threshold(
+            standardised_disease_result, disease, rank_stats
+        )
+
+    def assess_disease_prioritisation(
+        self, rank_stats: RankStats, rank_records: defaultdict
+    ) -> None:
+        """
+        Assess disease prioritisation.
+
+        For every proband disease the total is incremented, the first result entry sharing
+        the disease identifier or the disease name is looked up, and the outcome is handed
+        to a PrioritisationRankRecorder.
+
+        Args:
+            rank_stats: Statistics object whose total and ranks are updated.
+            rank_records: Mapping passed to the rank recorder.
+        """
+        for disease in self.proband_diseases:
+            rank_stats.total += 1
+            # Only the first matching entry is considered, later matches are ignored.
+            first_hit = next(
+                (
+                    candidate
+                    for candidate in self.standardised_disease_results
+                    if disease.disease_identifier == candidate.disease_identifier
+                    or disease.disease_name == candidate.disease_name
+                ),
+                None,
+            )
+            outcome = None
+            if first_hit is not None:
+                outcome = self._record_matched_disease(disease, rank_stats, first_hit)
+            if outcome is None:
+                # No matching entry, or the match failed the threshold: keep the default rank.
+                outcome = DiseasePrioritisationResult(self.phenopacket_path, disease)
+            PrioritisationRankRecorder(
+                rank_stats.total, self.results_dir, outcome, rank_records
+            ).record_rank()
+
+
+def _obtain_causative_diseases(phenopacket_path: Path) -> [ProbandDisease]:
+    """
+    Read a phenopacket and return its known diseases.
+
+    Args:
+        phenopacket_path: Path to the phenopacket file to read.
+
+    Returns:
+        The value of PhenopacketUtil.diagnoses() for the loaded phenopacket.
+    """
+    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnoses()
+
+
+def assess_phenopacket_disease_prioritisation(
+    standardised_disease_result: Path,
+    score_order: str,
+    results_dir_and_input: TrackInputOutputDirectories,
+    threshold: float,
+    disease_rank_stats: RankStats,
+    disease_rank_comparison: defaultdict,
+) -> None:
+    """
+    Assess disease prioritisation for one standardised disease result file.
+
+    Args:
+        standardised_disease_result: Path to the standardised disease result file.
+        score_order: Score ordering forwarded to the assessment ("ascending" or other).
+        results_dir_and_input: Phenopacket directory and results directory of the run.
+        threshold: Score threshold forwarded to the assessment.
+        disease_rank_stats: Rank statistics updated by the assessment.
+        disease_rank_comparison: Mapping in which the ranks are recorded.
+    """
+    # The phenopacket is the one obtain_closest_file_name selects for this result file.
+    phenopackets = all_files(results_dir_and_input.phenopacket_dir)
+    phenopacket_path = obtain_closest_file_name(standardised_disease_result, phenopackets)
+    raw_result = read_standardised_result(standardised_disease_result)
+    known_diseases = _obtain_causative_diseases(phenopacket_path)
+    assessor = AssessDiseasePrioritisation(
+        phenopacket_path=phenopacket_path,
+        results_dir=results_dir_and_input.results_dir.joinpath("pheval_disease_results/"),
+        standardised_disease_results=parse_pheval_result(RankedPhEvalDiseaseResult, raw_result),
+        threshold=threshold,
+        score_order=score_order,
+        proband_diseases=known_diseases,
+    )
+    assessor.assess_disease_prioritisation(disease_rank_stats, disease_rank_comparison)
diff --git a/src/pheval/analyse/gene_prioritisation_analysis.py b/src/pheval/analyse/gene_prioritisation_analysis.py
@@ -0,0 +1,135 @@
+from collections import defaultdict
+from pathlib import Path
+
+import pandas as pd
+
+from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
+from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
+from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
+from pheval.analyse.rank_stats import RankStats
+from pheval.analyse.run_data_parser import TrackInputOutputDirectories
+from pheval.post_processing.post_processing import RankedPhEvalGeneResult
+from pheval.utils.file_utils import all_files, obtain_closest_file_name
+from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
+
+
+class AssessGenePrioritisation:
+    """Assess gene prioritisation for the causative genes of a single phenopacket."""
+
+    def __init__(
+        self,
+        phenopacket_path: Path,
+        results_dir: Path,
+        standardised_gene_results: [RankedPhEvalGeneResult],
+        threshold: float,
+        score_order: str,
+        proband_causative_genes: [ProbandCausativeGene],
+    ):
+        """
+        Store the inputs required to rank the proband's causative genes.
+
+        Args:
+            phenopacket_path: Path of the phenopacket being assessed.
+            results_dir: Results directory handed on to the rank recorder.
+            standardised_gene_results: Ranked gene results to search through.
+            threshold: Score threshold; a value equal to 0.0 disables the threshold check.
+            score_order: "ascending" selects the ascending-order threshold check, any other
+                value selects the default check.
+            proband_causative_genes: Causative genes to look for in the results.
+        """
+        self.phenopacket_path = phenopacket_path
+        self.results_dir = results_dir
+        self.standardised_gene_results = standardised_gene_results
+        self.threshold = threshold
+        self.score_order = score_order
+        self.proband_causative_genes = proband_causative_genes
+
+    def _record_gene_prioritisation_match(
+        self,
+        gene: ProbandCausativeGene,
+        result_entry: RankedPhEvalGeneResult,
+        rank_stats: RankStats,
+    ) -> GenePrioritisationResult:
+        """
+        Add the matched entry's rank to the rank statistics and wrap it in a result.
+
+        Args:
+            gene: The causative gene that was matched.
+            result_entry: The result entry that matched the gene.
+            rank_stats: Statistics object that receives the rank through add_rank.
+
+        Returns:
+            A GenePrioritisationResult holding the phenopacket path, gene symbol and rank.
+        """
+        rank = result_entry.rank
+        rank_stats.add_rank(rank)
+        return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank)
+
+    def _assess_gene_with_threshold_ascending_order(
+        self,
+        result_entry: RankedPhEvalGeneResult,
+        gene: ProbandCausativeGene,
+        rank_stats: RankStats,
+    ) -> GenePrioritisationResult:
+        """
+        Record the rank only when the threshold is strictly greater than the entry's score.
+
+        Returns:
+            The recorded result, or None when the score does not pass the threshold.
+        """
+        if float(self.threshold) > float(result_entry.score):
+            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
+        return None
+
+    def _assess_gene_with_threshold(
+        self,
+        result_entry: RankedPhEvalGeneResult,
+        gene: ProbandCausativeGene,
+        rank_stats: RankStats,
+    ) -> GenePrioritisationResult:
+        """
+        Record the rank only when the threshold is strictly less than the entry's score.
+
+        Returns:
+            The recorded result, or None when the score does not pass the threshold.
+        """
+        if float(self.threshold) < float(result_entry.score):
+            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
+        return None
+
+    def _record_matched_gene(
+        self,
+        gene: ProbandCausativeGene,
+        rank_stats: RankStats,
+        standardised_gene_result: RankedPhEvalGeneResult,
+    ) -> GenePrioritisationResult:
+        """
+        Return the rank result for a matched gene, applying the threshold if one is set.
+
+        The entry is annotated as RankedPhEvalGeneResult because it is forwarded unchanged
+        to helpers that declare that type and is taken from standardised_gene_results.
+
+        Returns:
+            The recorded result, or None when a threshold is set and the score fails it.
+        """
+        # A threshold of 0.0 means "no threshold": every match is recorded.
+        if float(self.threshold) == 0.0:
+            return self._record_gene_prioritisation_match(
+                gene, standardised_gene_result, rank_stats
+            )
+        if self.score_order == "ascending":
+            return self._assess_gene_with_threshold_ascending_order(
+                standardised_gene_result, gene, rank_stats
+            )
+        return self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats)
+
+    def assess_gene_prioritisation(self, rank_stats: RankStats, rank_records: defaultdict) -> None:
+        """
+        Assess gene prioritisation.
+
+        For every causative gene the total is incremented, the first result entry sharing
+        the gene identifier or the gene symbol is looked up, and the outcome is handed to a
+        PrioritisationRankRecorder.
+
+        Args:
+            rank_stats: Statistics object whose total and ranks are updated.
+            rank_records: Mapping passed to the rank recorder.
+        """
+        for gene in self.proband_causative_genes:
+            rank_stats.total += 1
+            gene_match = None
+            for standardised_gene_result in self.standardised_gene_results:
+                if (
+                    gene.gene_identifier == standardised_gene_result.gene_identifier
+                    or gene.gene_symbol == standardised_gene_result.gene_symbol
+                ):
+                    # Only the first matching entry is considered.
+                    gene_match = self._record_matched_gene(
+                        gene, rank_stats, standardised_gene_result
+                    )
+                    break
+            if gene_match is None:
+                # No matching entry, or the match failed the threshold: keep the default rank.
+                gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
+            PrioritisationRankRecorder(
+                rank_stats.total, self.results_dir, gene_match, rank_records
+            ).record_rank()
+
+
+def _obtain_causative_genes(phenopacket_path: Path) -> [ProbandCausativeGene]:
+    """
+    Read a phenopacket and return its causative genes.
+
+    Args:
+        phenopacket_path: Path to the phenopacket file to read.
+
+    Returns:
+        The value of PhenopacketUtil.diagnosed_genes() for the loaded phenopacket.
+    """
+    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_genes()
+
+
+def assess_phenopacket_gene_prioritisation(
+    standardised_gene_result: Path,
+    score_order: str,
+    results_dir_and_input: TrackInputOutputDirectories,
+    threshold: float,
+    gene_rank_stats: RankStats,
+    gene_rank_comparison: defaultdict,
+) -> None:
+    """
+    Assess gene prioritisation for one standardised gene result file.
+
+    Args:
+        standardised_gene_result: Path to the standardised gene result file.
+        score_order: Score ordering forwarded to the assessment ("ascending" or other).
+        results_dir_and_input: Phenopacket directory and results directory of the run.
+        threshold: Score threshold forwarded to the assessment.
+        gene_rank_stats: Rank statistics updated by the assessment.
+        gene_rank_comparison: Mapping in which the ranks are recorded.
+    """
+    # The phenopacket is the one obtain_closest_file_name selects for this result file.
+    phenopackets = all_files(results_dir_and_input.phenopacket_dir)
+    phenopacket_path = obtain_closest_file_name(standardised_gene_result, phenopackets)
+    raw_result = read_standardised_result(standardised_gene_result)
+    causative_genes = _obtain_causative_genes(phenopacket_path)
+    assessor = AssessGenePrioritisation(
+        phenopacket_path=phenopacket_path,
+        results_dir=results_dir_and_input.results_dir.joinpath("pheval_gene_results/"),
+        standardised_gene_results=parse_pheval_result(RankedPhEvalGeneResult, raw_result),
+        threshold=threshold,
+        score_order=score_order,
+        proband_causative_genes=causative_genes,
+    )
+    assessor.assess_gene_prioritisation(gene_rank_stats, gene_rank_comparison)
diff --git a/src/pheval/analyse/prioritisation_rank_recorder.py b/src/pheval/analyse/prioritisation_rank_recorder.py
@@ -0,0 +1,52 @@
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+
+from pheval.analyse.prioritisation_result_types import (
+    DiseasePrioritisationResult,
+    GenePrioritisationResult,
+    VariantPrioritisationResult,
+)
+
+
+@dataclass
+class PrioritisationRankRecorder:
+    """
+    Record the rank of one prioritisation result so that different runs can be compared.
+
+    Each call to record_rank writes into run_comparison[index]: the phenopacket file name,
+    the gene / variant / disease the result refers to, and the rank keyed by directory.
+    """
+
+    # Key of the record inside run_comparison.
+    index: int
+    # Used as the key under which the rank is stored.
+    directory: Path
+    # The result whose rank is recorded.
+    prioritisation_result: Union[
+        GenePrioritisationResult, VariantPrioritisationResult, DiseasePrioritisationResult
+    ]
+    # NOTE(review): nested item assignment below presumably relies on the default factory
+    # producing a dict-like record for a new index - confirm against the caller.
+    run_comparison: defaultdict
+
+    def _record_gene_rank(self) -> None:
+        """Store the result's gene under the "Gene" key."""
+        self.run_comparison[self.index]["Gene"] = self.prioritisation_result.gene
+
+    def _record_variant_rank(self) -> None:
+        """Store the variant as "chrom_pos_ref_alt" under the "Variant" key."""
+        variant = self.prioritisation_result.variant
+        variant_key = "_".join([variant.chrom, str(variant.pos), variant.ref, variant.alt])
+        self.run_comparison[self.index]["Variant"] = variant_key
+
+    def _record_disease_rank(self) -> None:
+        """Store the disease identifier under the "Disease" key."""
+        disease_identifier = self.prioritisation_result.disease.disease_identifier
+        self.run_comparison[self.index]["Disease"] = disease_identifier
+
+    def record_rank(self) -> None:
+        """
+        Record the phenopacket name, the prioritised entity and its rank for this run.
+
+        A result that is none of the three known result types still gets its phenopacket
+        name and rank recorded, but no entity column.
+        """
+        result = self.prioritisation_result
+        self.run_comparison[self.index]["Phenopacket"] = result.phenopacket_path.name
+        # isinstance (rather than an exact type comparison) also accepts subclasses of the
+        # result types, as recommended by PEP 8.
+        if isinstance(result, GenePrioritisationResult):
+            self._record_gene_rank()
+        elif isinstance(result, VariantPrioritisationResult):
+            self._record_variant_rank()
+        elif isinstance(result, DiseasePrioritisationResult):
+            self._record_disease_rank()
+        self.run_comparison[self.index][self.directory] = result.rank
diff --git a/src/pheval/analyse/prioritisation_result_types.py b/src/pheval/analyse/prioritisation_result_types.py
@@ -0,0 +1,31 @@
+from dataclasses import dataclass
+from pathlib import Path
+
+from pheval.utils.phenopacket_utils import GenomicVariant, ProbandDisease
+
+
+@dataclass
+class GenePrioritisationResult:
+    """Store rank data for causative genes."""
+
+    # Path of the phenopacket the gene was assessed for.
+    phenopacket_path: Path
+    # The gene as a string (the gene analysis passes the gene symbol here).
+    gene: str
+    # Rank of the gene; stays at 0 when no rank is supplied.
+    rank: int = 0
+
+
+@dataclass
+class VariantPrioritisationResult:
+    """Store rank data for causative variants."""
+
+    # Path of the phenopacket the variant was assessed for.
+    phenopacket_path: Path
+    # The variant, described by a GenomicVariant.
+    variant: GenomicVariant
+    # Rank of the variant; stays at 0 when no rank is supplied.
+    rank: int = 0
+
+
+@dataclass
+class DiseasePrioritisationResult:
+    """Store rank data for known diseases."""
+
+    # Path of the phenopacket the disease was assessed for.
+    phenopacket_path: Path
+    # The disease, described by a ProbandDisease.
+    disease: ProbandDisease
+    # Rank of the disease; stays at 0 when no rank is supplied.
+    rank: int = 0
diff --git a/src/pheval/analyse/run_data_parser.py b/src/pheval/analyse/run_data_parser.py
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from pathlib import Path
+
+import pandas as pd
+
+
+@dataclass
+class TrackInputOutputDirectories:
+    """Track the input testdata for a corresponding pheval output directory"""
+
+    # Directory whose files are searched for the phenopacket matching a result file.
+    phenopacket_dir: Path
+    # Output directory of the run; result sub-directories are joined onto it.
+    results_dir: Path
+
+
+def parse_run_data_text_file(run_data_path: Path) -> [TrackInputOutputDirectories]:
+    """
+    Parse a run data .txt file into input testdata / output directory pairs.
+
+    The file is read as tab-separated values without a header row. The first column of
+    each row becomes the phenopacket directory and the second the results directory.
+
+    Args:
+        run_data_path: Path to the run data .txt file.
+
+    Returns:
+        One TrackInputOutputDirectories per row, in file order.
+    """
+    run_table = pd.read_csv(run_data_path, delimiter="\t", header=None)
+    return [
+        TrackInputOutputDirectories(phenopacket_dir=Path(record[0]), results_dir=Path(record[1]))
+        for _, record in run_table.iterrows()
+    ]