Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

197 move gene prioritisation specific methods in analysispy #198

Merged
490 changes: 8 additions & 482 deletions src/pheval/analyse/analysis.py

Large diffs are not rendered by default.

138 changes: 138 additions & 0 deletions src/pheval/analyse/disease_prioritisation_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from collections import defaultdict
from pathlib import Path

from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResult
from pheval.analyse.rank_stats import RankStats
from pheval.analyse.run_data_parser import TrackInputOutputDirectories
from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
from pheval.utils.file_utils import all_files, obtain_closest_file_name
from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader


class AssessDiseasePrioritisation:
    """Assess the prioritisation of a proband's known diseases within standardised results."""

    def __init__(
        self,
        phenopacket_path: Path,
        results_dir: Path,
        standardised_disease_results: [RankedPhEvalDiseaseResult],
        threshold: float,
        score_order: str,
        proband_diseases: [ProbandDisease],
    ):
        """
        Store the inputs needed to assess disease prioritisation.

        Args:
            phenopacket_path: Path of the phenopacket being assessed.
            results_dir: Results directory handed on to the rank recorder.
            standardised_disease_results: Ranked disease results to search.
            threshold: Score threshold; a value of 0.0 disables threshold filtering.
            score_order: With "ascending" a match is recorded only when its score is
                below the threshold; with any other value only when it is above.
            proband_diseases: Known diseases to look up in the results.
        """
        self.phenopacket_path = phenopacket_path
        self.results_dir = results_dir
        self.standardised_disease_results = standardised_disease_results
        self.threshold = threshold
        self.score_order = score_order
        self.proband_diseases = proband_diseases

    def _record_disease_prioritisation_match(
        self,
        disease: ProbandDisease,
        result_entry: RankedPhEvalDiseaseResult,
        rank_stats: RankStats,
    ) -> DiseasePrioritisationResult:
        """Add the rank of the matched entry to the stats and return it as a result."""
        matched_rank = result_entry.rank
        rank_stats.add_rank(matched_rank)
        return DiseasePrioritisationResult(self.phenopacket_path, disease, matched_rank)

    def _assess_disease_with_threshold_ascending_order(
        self,
        result_entry: RankedPhEvalDiseaseResult,
        disease: ProbandDisease,
        rank_stats: RankStats,
    ) -> DiseasePrioritisationResult:
        """Record the match only if its score is strictly below the threshold, else return None."""
        below_threshold = float(self.threshold) > float(result_entry.score)
        if not below_threshold:
            return None
        return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)

    def _assess_disease_with_threshold(
        self,
        result_entry: RankedPhEvalDiseaseResult,
        disease: ProbandDisease,
        rank_stats: RankStats,
    ) -> DiseasePrioritisationResult:
        """Record the match only if its score is strictly above the threshold, else return None."""
        above_threshold = float(self.threshold) < float(result_entry.score)
        if not above_threshold:
            return None
        return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)

    def _record_matched_disease(
        self,
        disease: ProbandDisease,
        rank_stats: RankStats,
        standardised_disease_result: RankedPhEvalDiseaseResult,
    ) -> DiseasePrioritisationResult:
        """Return the rank result for a matched disease, honouring the threshold if one is set."""
        # A threshold of exactly 0.0 means "no threshold": always record the match.
        if float(self.threshold) == 0.0:
            return self._record_disease_prioritisation_match(
                disease, standardised_disease_result, rank_stats
            )
        if self.score_order == "ascending":
            return self._assess_disease_with_threshold_ascending_order(
                standardised_disease_result, disease, rank_stats
            )
        return self._assess_disease_with_threshold(
            standardised_disease_result, disease, rank_stats
        )

    def assess_disease_prioritisation(
        self, rank_stats: RankStats, rank_records: defaultdict
    ) -> None:
        """
        Look up every proband disease in the results and record its rank.

        Args:
            rank_stats: Statistics object; its total is incremented once per disease and
                ranks of recorded matches are added to it.
            rank_records: Mapping handed to the rank recorder to store the outcome.
        """
        for disease in self.proband_diseases:
            rank_stats.total += 1
            # Only the first result sharing the identifier or the name is considered.
            first_hit = next(
                (
                    candidate
                    for candidate in self.standardised_disease_results
                    if disease.disease_identifier == candidate.disease_identifier
                    or disease.disease_name == candidate.disease_name
                ),
                None,
            )
            outcome = (
                None
                if first_hit is None
                else self._record_matched_disease(disease, rank_stats, first_hit)
            )
            if outcome is None:
                # No hit, or the hit failed the threshold: record a result without a rank.
                outcome = DiseasePrioritisationResult(self.phenopacket_path, disease)
            recorder = PrioritisationRankRecorder(
                rank_stats.total, self.results_dir, outcome, rank_records
            )
            recorder.record_rank()


def _obtain_causative_diseases(phenopacket_path: Path) -> [ProbandDisease]:
    """Read the phenopacket at ``phenopacket_path`` and return its diagnoses."""
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnoses()


def assess_phenopacket_disease_prioritisation(
    standardised_disease_result: Path,
    score_order: str,
    results_dir_and_input: TrackInputOutputDirectories,
    threshold: float,
    disease_rank_stats: RankStats,
    disease_rank_comparison: defaultdict,
) -> None:
    """
    Assess disease prioritisation for the phenopacket matching a standardised result file.

    Args:
        standardised_disease_result: Path of the standardised disease result file.
        score_order: Score ordering forwarded to the assessment.
        results_dir_and_input: Phenopacket directory and results directory of the run.
        threshold: Score threshold forwarded to the assessment.
        disease_rank_stats: Rank statistics updated by the assessment.
        disease_rank_comparison: Mapping in which the ranks are recorded.
    """
    candidate_phenopackets = all_files(results_dir_and_input.phenopacket_dir)
    phenopacket_path = obtain_closest_file_name(
        standardised_disease_result, candidate_phenopackets
    )
    raw_results = read_standardised_result(standardised_disease_result)
    known_diseases = _obtain_causative_diseases(phenopacket_path)
    disease_results_dir = results_dir_and_input.results_dir.joinpath("pheval_disease_results/")
    ranked_results = parse_pheval_result(RankedPhEvalDiseaseResult, raw_results)
    assessor = AssessDiseasePrioritisation(
        phenopacket_path=phenopacket_path,
        results_dir=disease_results_dir,
        standardised_disease_results=ranked_results,
        threshold=threshold,
        score_order=score_order,
        proband_diseases=known_diseases,
    )
    assessor.assess_disease_prioritisation(disease_rank_stats, disease_rank_comparison)
135 changes: 135 additions & 0 deletions src/pheval/analyse/gene_prioritisation_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
from collections import defaultdict
from pathlib import Path

import pandas as pd

from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
from pheval.analyse.rank_stats import RankStats
from pheval.analyse.run_data_parser import TrackInputOutputDirectories
from pheval.post_processing.post_processing import RankedPhEvalGeneResult
from pheval.utils.file_utils import all_files, obtain_closest_file_name
from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader


class AssessGenePrioritisation:
    """Assess gene prioritisation."""

    def __init__(
        self,
        phenopacket_path: Path,
        results_dir: Path,
        standardised_gene_results: [RankedPhEvalGeneResult],
        threshold: float,
        score_order: str,
        proband_causative_genes: [ProbandCausativeGene],
    ):
        """
        Store the inputs needed to assess gene prioritisation.

        Args:
            phenopacket_path: Path of the phenopacket being assessed.
            results_dir: Results directory handed on to the rank recorder.
            standardised_gene_results: Ranked gene results to search.
            threshold: Score threshold; a value of 0.0 disables threshold filtering.
            score_order: With "ascending" a match is recorded only when its score is
                below the threshold; with any other value only when it is above.
            proband_causative_genes: Causative genes to look up in the results.
        """
        self.phenopacket_path = phenopacket_path
        self.results_dir = results_dir
        self.standardised_gene_results = standardised_gene_results
        self.threshold = threshold
        self.score_order = score_order
        self.proband_causative_genes = proband_causative_genes

    def _record_gene_prioritisation_match(
        self,
        gene: ProbandCausativeGene,
        result_entry: RankedPhEvalGeneResult,
        rank_stats: RankStats,
    ) -> GenePrioritisationResult:
        """Record the gene prioritisation rank if found within results."""
        rank = result_entry.rank
        rank_stats.add_rank(rank)
        return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank)

    def _assess_gene_with_threshold_ascending_order(
        self,
        result_entry: RankedPhEvalGeneResult,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
    ) -> GenePrioritisationResult:
        """
        Record the gene prioritisation rank if it meets the ascending order threshold.

        Returns:
            The recorded result when the score is strictly below the threshold,
            otherwise None.
        """
        if float(self.threshold) > float(result_entry.score):
            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
        return None

    def _assess_gene_with_threshold(
        self,
        result_entry: RankedPhEvalGeneResult,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
    ) -> GenePrioritisationResult:
        """
        Record the gene prioritisation rank if it meets the score threshold.

        Returns:
            The recorded result when the score is strictly above the threshold,
            otherwise None.
        """
        if float(self.threshold) < float(result_entry.score):
            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
        return None

    def _record_matched_gene(
        self,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
        standardised_gene_result: RankedPhEvalGeneResult,
    ) -> GenePrioritisationResult:
        """
        Return the gene rank result - dealing with the specification of a threshold.

        Returns:
            The recorded result, or None when a threshold is set and the score of the
            matched entry does not meet it.
        """
        # A threshold of exactly 0.0 means "no threshold": always record the match.
        if float(self.threshold) == 0.0:
            return self._record_gene_prioritisation_match(
                gene, standardised_gene_result, rank_stats
            )
        if self.score_order != "ascending":
            return self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats)
        return self._assess_gene_with_threshold_ascending_order(
            standardised_gene_result, gene, rank_stats
        )

    def assess_gene_prioritisation(self, rank_stats: RankStats, rank_records: defaultdict) -> None:
        """
        Assess gene prioritisation.

        Args:
            rank_stats: Statistics object; its total is incremented once per gene and
                ranks of recorded matches are added to it.
            rank_records: Mapping handed to the rank recorder to store the outcome.
        """
        for gene in self.proband_causative_genes:
            rank_stats.total += 1
            gene_match = None
            for standardised_gene_result in self.standardised_gene_results:
                if (
                    gene.gene_identifier == standardised_gene_result.gene_identifier
                    or gene.gene_symbol == standardised_gene_result.gene_symbol
                ):
                    gene_match = self._record_matched_gene(
                        gene, rank_stats, standardised_gene_result
                    )
                    # Only the first matching result entry is considered.
                    break
            if gene_match is None:
                # No entry matched, or the match failed the threshold: record a
                # result without a rank.
                gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
            PrioritisationRankRecorder(
                rank_stats.total,
                self.results_dir,
                gene_match,
                rank_records,
            ).record_rank()


def _obtain_causative_genes(phenopacket_path: Path) -> [ProbandCausativeGene]:
    """Read the phenopacket at ``phenopacket_path`` and return its diagnosed genes."""
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_genes()


def assess_phenopacket_gene_prioritisation(
    standardised_gene_result: Path,
    score_order: str,
    results_dir_and_input: TrackInputOutputDirectories,
    threshold: float,
    gene_rank_stats: RankStats,
    gene_rank_comparison: defaultdict,
) -> None:
    """
    Assess gene prioritisation for the phenopacket matching a standardised result file.

    Args:
        standardised_gene_result: Path of the standardised gene result file.
        score_order: Score ordering forwarded to the assessment.
        results_dir_and_input: Phenopacket directory and results directory of the run.
        threshold: Score threshold forwarded to the assessment.
        gene_rank_stats: Rank statistics updated by the assessment.
        gene_rank_comparison: Mapping in which the ranks are recorded.
    """
    candidate_phenopackets = all_files(results_dir_and_input.phenopacket_dir)
    phenopacket_path = obtain_closest_file_name(standardised_gene_result, candidate_phenopackets)
    raw_results = read_standardised_result(standardised_gene_result)
    causative_genes = _obtain_causative_genes(phenopacket_path)
    gene_results_dir = results_dir_and_input.results_dir.joinpath("pheval_gene_results/")
    ranked_results = parse_pheval_result(RankedPhEvalGeneResult, raw_results)
    assessor = AssessGenePrioritisation(
        phenopacket_path=phenopacket_path,
        results_dir=gene_results_dir,
        standardised_gene_results=ranked_results,
        threshold=threshold,
        score_order=score_order,
        proband_causative_genes=causative_genes,
    )
    assessor.assess_gene_prioritisation(gene_rank_stats, gene_rank_comparison)
52 changes: 52 additions & 0 deletions src/pheval/analyse/prioritisation_rank_recorder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Union

from pheval.analyse.prioritisation_result_types import (
DiseasePrioritisationResult,
GenePrioritisationResult,
VariantPrioritisationResult,
)


@dataclass
class PrioritisationRankRecorder:
    """
    Record the rank of a prioritisation result so that different runs can be compared.

    Attributes:
        index: Key of the entry in ``run_comparison`` that this result is written to.
        directory: Results directory; used as the key under which the rank is stored.
        prioritisation_result: Gene, variant or disease prioritisation result to record.
        run_comparison: Mapping from index to a mapping of the recorded values.
    """

    index: int
    directory: Path
    prioritisation_result: Union[
        GenePrioritisationResult, VariantPrioritisationResult, DiseasePrioritisationResult
    ]
    run_comparison: defaultdict

    def _record_gene_rank(self) -> None:
        """Record the gene of the result under the "Gene" key."""
        self.run_comparison[self.index]["Gene"] = self.prioritisation_result.gene

    def _record_variant_rank(self) -> None:
        """Record the variant of the result under the "Variant" key as chrom_pos_ref_alt."""
        variant = self.prioritisation_result.variant
        self.run_comparison[self.index]["Variant"] = "_".join(
            [variant.chrom, str(variant.pos), variant.ref, variant.alt]
        )

    def _record_disease_rank(self) -> None:
        """Record the disease identifier of the result under the "Disease" key."""
        self.run_comparison[self.index][
            "Disease"
        ] = self.prioritisation_result.disease.disease_identifier

    def record_rank(self) -> None:
        """Record the phenopacket name, the prioritised entity and its rank for this run."""
        self.run_comparison[self.index][
            "Phenopacket"
        ] = self.prioritisation_result.phenopacket_path.name
        # isinstance (rather than an exact type comparison) so that subclasses of the
        # result types are recorded as well instead of being silently skipped.
        if isinstance(self.prioritisation_result, GenePrioritisationResult):
            self._record_gene_rank()
        elif isinstance(self.prioritisation_result, VariantPrioritisationResult):
            self._record_variant_rank()
        elif isinstance(self.prioritisation_result, DiseasePrioritisationResult):
            self._record_disease_rank()
        # The rank is stored under the results directory, one key per run.
        self.run_comparison[self.index][self.directory] = self.prioritisation_result.rank
31 changes: 31 additions & 0 deletions src/pheval/analyse/prioritisation_result_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from dataclasses import dataclass
from pathlib import Path

from pheval.utils.phenopacket_utils import GenomicVariant, ProbandDisease


@dataclass
class GenePrioritisationResult:
    """Rank data recorded for one causative gene of a phenopacket."""

    # Path of the phenopacket the gene belongs to.
    phenopacket_path: Path
    # Gene label stored with the result.
    gene: str
    # Rank of the gene; stays 0 unless a rank is supplied.
    rank: int = 0


@dataclass
class VariantPrioritisationResult:
    """Rank data recorded for one causative variant of a phenopacket."""

    # Path of the phenopacket the variant belongs to.
    phenopacket_path: Path
    # Causative variant the rank refers to.
    variant: GenomicVariant
    # Rank of the variant; stays 0 unless a rank is supplied.
    rank: int = 0


@dataclass
class DiseasePrioritisationResult:
    """Rank data recorded for one known disease of a phenopacket."""

    # Path of the phenopacket the disease belongs to.
    phenopacket_path: Path
    # Known disease the rank refers to.
    disease: ProbandDisease
    # Rank of the disease; stays 0 unless a rank is supplied.
    rank: int = 0
23 changes: 23 additions & 0 deletions src/pheval/analyse/run_data_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from dataclasses import dataclass
from pathlib import Path

import pandas as pd


@dataclass
class TrackInputOutputDirectories:
    """Pair an input phenopacket directory with its corresponding pheval output directory."""

    # Directory containing the input phenopackets (testdata).
    phenopacket_dir: Path
    # Directory containing the pheval output for that input.
    results_dir: Path


def parse_run_data_text_file(run_data_path: Path) -> [TrackInputOutputDirectories]:
    """
    Parse a run data .txt file into input/output directory pairs.

    The file is read as tab-delimited text without a header row; the first column is
    taken as the phenopacket directory and the second as the results directory.

    Args:
        run_data_path: Path of the run data file.

    Returns:
        One TrackInputOutputDirectories per row, in file order.
    """
    table = pd.read_csv(run_data_path, delimiter="\t", header=None)
    return [
        TrackInputOutputDirectories(phenopacket_dir=Path(row[0]), results_dir=Path(row[1]))
        for _index, row in table.iterrows()
    ]
Loading