From a4691d764a26a46db4674c48f56786ac0b5b0dec Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:10:17 +0100 Subject: [PATCH 1/9] removing prioritisation specific methods and classes --- src/pheval/analyse/analysis.py | 489 +-------------------------------- 1 file changed, 8 insertions(+), 481 deletions(-) diff --git a/src/pheval/analyse/analysis.py b/src/pheval/analyse/analysis.py index 88c13cbbf..197a99062 100644 --- a/src/pheval/analyse/analysis.py +++ b/src/pheval/analyse/analysis.py @@ -1,10 +1,10 @@ from collections import defaultdict -from dataclasses import dataclass from pathlib import Path import click -import pandas as pd +from pheval.analyse.disease_prioritisation_analysis import assess_phenopacket_disease_prioritisation +from pheval.analyse.gene_prioritisation_analysis import assess_phenopacket_gene_prioritisation from pheval.analyse.generate_plots import AnalysisResults, TrackRunPrioritisation from pheval.analyse.generate_summary_outputs import ( RankStatsWriter, @@ -15,484 +15,11 @@ generate_benchmark_gene_output, generate_benchmark_variant_output, ) -from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result from pheval.analyse.rank_stats import RankStats -from pheval.post_processing.post_processing import ( - RankedPhEvalDiseaseResult, - RankedPhEvalGeneResult, - RankedPhEvalVariantResult, -) +from pheval.analyse.run_data_parser import TrackInputOutputDirectories, _parse_run_data_text_file +from pheval.analyse.variant_prioritisation_analysis import assess_phenopacket_variant_prioritisation from pheval.prepare.custom_exceptions import InputError -from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name -from pheval.utils.phenopacket_utils import ( - GenomicVariant, - PhenopacketUtil, - ProbandCausativeGene, - ProbandDisease, - phenopacket_reader, -) - - -@dataclass -class GenePrioritisationResult: - """Store rank data for causative genes.""" - - phenopacket_path: Path - gene: str - rank: int = 0 - - -@dataclass -class VariantPrioritisationResult: - """Store rank data for causative variants.""" - - phenopacket_path: Path - variant: GenomicVariant - rank: int = 0 - - -@dataclass -class DiseasePrioritisationResult: - """Store rank data for known diseases.""" - - phenopacket_path: Path - disease: ProbandDisease - rank: int = 0 - - -@dataclass -class PrioritisationRankRecorder: - """Compare the ranks of different runs.""" - - index: int - directory: Path - prioritisation_result: VariantPrioritisationResult or GenePrioritisationResult - run_comparison: defaultdict - - def _record_gene_rank(self) -> None: - """Record gene prioritisation rank.""" - self.run_comparison[self.index]["Gene"] = self.prioritisation_result.gene - - def _record_variant_rank(self) -> None: - """Record variant prioritisation rank.""" - variant = self.prioritisation_result.variant - self.run_comparison[self.index]["Variant"] = "_".join( - [variant.chrom, str(variant.pos), variant.ref, variant.alt] - ) - - def _record_disease_rank(self) -> None: - """Record disease prioritisation rank.""" - self.run_comparison[self.index][ - "Disease" - ] = self.prioritisation_result.disease.disease_identifier - - def record_rank(self) -> None: - """Records the rank for different runs.""" - self.run_comparison[self.index][ - "Phenopacket" - ] = self.prioritisation_result.phenopacket_path.name - if type(self.prioritisation_result) is GenePrioritisationResult: - self._record_gene_rank() - elif type(self.prioritisation_result) is VariantPrioritisationResult: - self._record_variant_rank() - elif type(self.prioritisation_result) is DiseasePrioritisationResult: - self._record_disease_rank() - self.run_comparison[self.index][self.directory] = self.prioritisation_result.rank - - -@dataclass -class TrackInputOutputDirectories: - """Track the input testdata for a corresponding pheval output directory""" - - phenopacket_dir: Path - results_dir: Path - - -def _parse_run_data_text_file(run_data_path: Path) -> [TrackInputOutputDirectories]: - """Parse run data .txt file returning a list of input testdata and corresponding output directories.""" - run_data = pd.read_csv(run_data_path, delimiter="\t", header=None) - run_data_list = [] - for _index, row in run_data.iterrows(): - run_data_list.append( - TrackInputOutputDirectories(phenopacket_dir=Path(row[0]), results_dir=Path(row[1])) - ) - return run_data_list - - -class AssessGenePrioritisation: - """Assess gene prioritisation.""" - - def __init__( - self, - phenopacket_path: Path, - results_dir: Path, - standardised_gene_results: [RankedPhEvalGeneResult], - threshold: float, - score_order: str, - proband_causative_genes: [ProbandCausativeGene], - ): - self.phenopacket_path = phenopacket_path - self.results_dir = results_dir - self.standardised_gene_results = standardised_gene_results - self.threshold = threshold - self.score_order = score_order - self.proband_causative_genes = proband_causative_genes - - def _record_gene_prioritisation_match( - self, - gene: ProbandCausativeGene, - result_entry: RankedPhEvalGeneResult, - rank_stats: RankStats, - ) -> GenePrioritisationResult: - """Record the gene prioritisation rank if found within results.""" - rank = result_entry.rank - rank_stats.add_rank(rank) - return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank) - - def _assess_gene_with_threshold_ascending_order( - self, - result_entry: RankedPhEvalGeneResult, - gene: ProbandCausativeGene, - rank_stats: RankStats, - ) -> GenePrioritisationResult: - """Record the gene prioritisation rank if it meets the ascending order threshold.""" - if float(self.threshold) > float(result_entry.score): - return self._record_gene_prioritisation_match(gene, result_entry, rank_stats) - - def _assess_gene_with_threshold( - self, - result_entry: RankedPhEvalGeneResult, - gene: ProbandCausativeGene, - rank_stats: RankStats, - ) -> GenePrioritisationResult: - """Record the gene prioritisation rank if it meets the score threshold.""" - if float(self.threshold) < float(result_entry.score): - return self._record_gene_prioritisation_match(gene, result_entry, rank_stats) - - def _record_matched_gene( - self, gene: ProbandCausativeGene, rank_stats: RankStats, standardised_gene_result: pd.Series - ) -> GenePrioritisationResult: - """Return the gene rank result - dealing with the specification of a threshold.""" - if float(self.threshold) == 0.0: - return self._record_gene_prioritisation_match( - gene, standardised_gene_result, rank_stats - ) - else: - return ( - self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats) - if self.score_order != "ascending" - else self._assess_gene_with_threshold_ascending_order( - standardised_gene_result, gene, rank_stats - ) - ) - - def assess_gene_prioritisation(self, rank_stats: RankStats, rank_records: defaultdict) -> None: - """Assess gene prioritisation.""" - for gene in self.proband_causative_genes: - rank_stats.total += 1 - gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol) - for standardised_gene_result in self.standardised_gene_results: - if ( - gene.gene_identifier == standardised_gene_result.gene_identifier - or gene.gene_symbol == standardised_gene_result.gene_symbol - ): - gene_match = self._record_matched_gene( - gene, rank_stats, standardised_gene_result - ) - break - PrioritisationRankRecorder( - rank_stats.total, - self.results_dir, - GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol) - if gene_match is None - else gene_match, - rank_records, - ).record_rank() - - -class AssessVariantPrioritisation: - """Assess variant prioritisation.""" - - def __init__( - self, - phenopacket_path: Path, - results_dir: Path, - standardised_variant_results: [RankedPhEvalVariantResult], - threshold: float, - score_order: str, - proband_causative_variants: [GenomicVariant], - ): - self.phenopacket_path = phenopacket_path - self.results_dir = results_dir - self.standardised_variant_results = standardised_variant_results - self.threshold = threshold - self.score_order = score_order - self.proband_causative_variants = proband_causative_variants - - def _record_variant_prioritisation_match( - self, - result_entry: RankedPhEvalVariantResult, - rank_stats: RankStats, - ) -> VariantPrioritisationResult: - """Record the variant prioritisation rank if found within results.""" - rank = result_entry.rank - rank_stats.add_rank(rank) - return VariantPrioritisationResult( - self.phenopacket_path, - GenomicVariant( - chrom=result_entry.chromosome, - pos=result_entry.start, - ref=result_entry.ref, - alt=result_entry.alt, - ), - rank, - ) - - def _assess_variant_with_threshold_ascending_order( - self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats - ) -> VariantPrioritisationResult: - """Record the variant prioritisation rank if it meets the ascending order threshold.""" - if float(self.threshold) > float(result_entry.score): - return self._record_variant_prioritisation_match(result_entry, rank_stats) - - def _assess_variant_with_threshold( - self, result_entry: pd.Series, rank_stats: RankStats - ) -> VariantPrioritisationResult: - """Record the variant prioritisation rank if it meets the score threshold.""" - if float(self.threshold) < float(result_entry.score): - return self._record_variant_prioritisation_match(result_entry, rank_stats) - - def _record_matched_variant( - self, rank_stats: RankStats, standardised_variant_result: pd.Series - ) -> VariantPrioritisationResult: - """Return the variant rank result - dealing with the specification of a threshold.""" - if float(self.threshold) == 0.0: - return self._record_variant_prioritisation_match( - standardised_variant_result, rank_stats - ) - else: - return ( - self._assess_variant_with_threshold(standardised_variant_result, rank_stats) - if self.score_order != "ascending" - else self._assess_variant_with_threshold_ascending_order( - standardised_variant_result, rank_stats - ) - ) - - def assess_variant_prioritisation( - self, rank_stats: RankStats, rank_records: defaultdict - ) -> None: - """Assess variant prioritisation.""" - for variant in self.proband_causative_variants: - rank_stats.total += 1 - variant_match = VariantPrioritisationResult(self.phenopacket_path, variant) - for result in self.standardised_variant_results: - result_variant = GenomicVariant( - chrom=result.chromosome, - pos=result.start, - ref=result.ref, - alt=result.alt, - ) - if variant == result_variant: - variant_match = self._record_matched_variant(rank_stats, result) - break - PrioritisationRankRecorder( - rank_stats.total, - self.results_dir, - VariantPrioritisationResult(self.phenopacket_path, variant) - if variant_match is None - else variant_match, - rank_records, - ).record_rank() - - -class AssessDiseasePrioritisation: - def __init__( - self, - phenopacket_path: Path, - results_dir: Path, - standardised_disease_results: [RankedPhEvalDiseaseResult], - threshold: float, - score_order: str, - proband_diseases: [ProbandDisease], - ): - self.phenopacket_path = phenopacket_path - self.results_dir = results_dir - self.standardised_disease_results = standardised_disease_results - self.threshold = threshold - self.score_order = score_order - self.proband_diseases = proband_diseases - - def _record_disease_prioritisation_match( - self, - disease: ProbandDisease, - result_entry: RankedPhEvalDiseaseResult, - rank_stats: RankStats, - ) -> DiseasePrioritisationResult: - """Record the disease prioritisation rank if found within results.""" - rank = result_entry.rank - rank_stats.add_rank(rank) - return DiseasePrioritisationResult(self.phenopacket_path, disease, rank) - - def _assess_disease_with_threshold_ascending_order( - self, - result_entry: RankedPhEvalDiseaseResult, - disease: ProbandDisease, - rank_stats: RankStats, - ) -> DiseasePrioritisationResult: - """Record the disease prioritisation rank if it meets the ascending order threshold.""" - if float(self.threshold) > float(result_entry.score): - return self._record_disease_prioritisation_match(disease, result_entry, rank_stats) - - def _assess_disease_with_threshold( - self, - result_entry: RankedPhEvalDiseaseResult, - disease: ProbandDisease, - rank_stats: RankStats, - ) -> DiseasePrioritisationResult: - """Record the disease prioritisation rank if it meets the score threshold.""" - if float(self.threshold) < float(result_entry.score): - return self._record_disease_prioritisation_match(disease, result_entry, rank_stats) - - def _record_matched_disease( - self, - disease: ProbandDisease, - rank_stats: RankStats, - standardised_disease_result: RankedPhEvalDiseaseResult, - ) -> DiseasePrioritisationResult: - """Return the gene rank result - dealing with the specification of a threshold.""" - if float(self.threshold) == 0.0: - return self._record_disease_prioritisation_match( - disease, standardised_disease_result, rank_stats - ) - else: - return ( - self._assess_disease_with_threshold( - standardised_disease_result, disease, rank_stats - ) - if self.score_order != "ascending" - else self._assess_disease_with_threshold_ascending_order( - standardised_disease_result, disease, rank_stats - ) - ) - - def assess_disease_prioritisation( - self, rank_stats: RankStats, rank_records: defaultdict - ) -> None: - """Assess disease prioritisation.""" - for disease in self.proband_diseases: - rank_stats.total += 1 - disease_match = DiseasePrioritisationResult(self.phenopacket_path, disease) - for standardised_disease_result in self.standardised_disease_results: - if ( - disease.disease_identifier == standardised_disease_result.disease_identifier - or disease.disease_name == standardised_disease_result.disease_name - ): - disease_match = self._record_matched_disease( - disease, rank_stats, standardised_disease_result - ) - break - PrioritisationRankRecorder( - rank_stats.total, - self.results_dir, - DiseasePrioritisationResult(self.phenopacket_path, disease) - if disease_match is None - else disease_match, - rank_records, - ).record_rank() - - -def _obtain_causative_genes(phenopacket_path: Path) -> [ProbandCausativeGene]: - """Obtain causative genes from a phenopacket.""" - phenopacket = phenopacket_reader(phenopacket_path) - phenopacket_util = PhenopacketUtil(phenopacket) - return phenopacket_util.diagnosed_genes() - - -def _obtain_causative_variants(phenopacket_path: Path) -> [GenomicVariant]: - """Obtain causative variants from a phenopacket.""" - phenopacket = phenopacket_reader(phenopacket_path) - phenopacket_util = PhenopacketUtil(phenopacket) - return phenopacket_util.diagnosed_variants() - - -def _obtain_causative_diseases(phenopacket_path: Path) -> [ProbandDisease]: - """Obtain known diseases from a phenopacket.""" - phenopacket = phenopacket_reader(phenopacket_path) - phenopacket_util = PhenopacketUtil(phenopacket) - return phenopacket_util.diagnoses() - - -def _assess_phenopacket_gene_prioritisation( - standardised_gene_result: Path, - score_order: str, - results_dir_and_input: TrackInputOutputDirectories, - threshold: float, - gene_rank_stats: RankStats, - gene_rank_comparison: defaultdict, -) -> None: - """Assess gene prioritisation for a phenopacket.""" - phenopacket_path = obtain_closest_file_name( - standardised_gene_result, all_files(results_dir_and_input.phenopacket_dir) - ) - pheval_gene_result = read_standardised_result(standardised_gene_result) - proband_causative_genes = _obtain_causative_genes(phenopacket_path) - AssessGenePrioritisation( - phenopacket_path, - results_dir_and_input.results_dir.joinpath("pheval_gene_results/"), - parse_pheval_result(RankedPhEvalGeneResult, pheval_gene_result), - threshold, - score_order, - proband_causative_genes, - ).assess_gene_prioritisation(gene_rank_stats, gene_rank_comparison) - - -def _assess_phenopacket_variant_prioritisation( - standardised_variant_result: Path, - score_order: str, - results_dir_and_input: TrackInputOutputDirectories, - threshold: float, - variant_rank_stats: RankStats, - variant_rank_comparison: defaultdict, -) -> None: - """Assess variant prioritisation for a phenopacket""" - phenopacket_path = obtain_closest_file_name( - standardised_variant_result, all_files(results_dir_and_input.phenopacket_dir) - ) - proband_causative_variants = _obtain_causative_variants(phenopacket_path) - pheval_variant_result = read_standardised_result(standardised_variant_result) - AssessVariantPrioritisation( - phenopacket_path, - results_dir_and_input.results_dir.joinpath("pheval_variant_results/"), - parse_pheval_result(RankedPhEvalVariantResult, pheval_variant_result), - threshold, - score_order, - proband_causative_variants, - ).assess_variant_prioritisation(variant_rank_stats, variant_rank_comparison) - - -def _assess_phenopacket_disease_prioritisation( - standardised_disease_result: Path, - score_order: str, - results_dir_and_input: TrackInputOutputDirectories, - threshold: float, - disease_rank_stats: RankStats, - disease_rank_comparison: defaultdict, -) -> None: - """Assess gene prioritisation for a phenopacket.""" - phenopacket_path = obtain_closest_file_name( - standardised_disease_result, all_files(results_dir_and_input.phenopacket_dir) - ) - pheval_disease_result = read_standardised_result(standardised_disease_result) - proband_diseases = _obtain_causative_diseases(phenopacket_path) - AssessDiseasePrioritisation( - phenopacket_path, - results_dir_and_input.results_dir.joinpath("pheval_disease_results/"), - parse_pheval_result(RankedPhEvalDiseaseResult, pheval_disease_result), - threshold, - score_order, - proband_diseases, - ).assess_disease_prioritisation(disease_rank_stats, disease_rank_comparison) +from pheval.utils.file_utils import files_with_suffix def _assess_prioritisation_for_results_directory( @@ -515,7 +42,7 @@ def _assess_prioritisation_for_results_directory( for standardised_result in files_with_suffix( results_directory_and_input.results_dir.joinpath("pheval_gene_results/"), ".tsv" ): - _assess_phenopacket_gene_prioritisation( + assess_phenopacket_gene_prioritisation( standardised_result, score_order, results_directory_and_input, @@ -529,7 +56,7 @@ def _assess_prioritisation_for_results_directory( results_directory_and_input.results_dir.joinpath("pheval_variant_results/"), ".tsv", ): - _assess_phenopacket_variant_prioritisation( + assess_phenopacket_variant_prioritisation( standardised_result, score_order, results_directory_and_input, @@ -543,7 +70,7 @@ def _assess_prioritisation_for_results_directory( results_directory_and_input.results_dir.joinpath("pheval_disease_results/"), ".tsv", ): - _assess_phenopacket_disease_prioritisation( + assess_phenopacket_disease_prioritisation( standardised_result, score_order, results_directory_and_input, From d05541ae8862e123be6c701cd61750b5cd616a7e Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:34:59 +0100 Subject: [PATCH 2/9] adding disease analysis specific methods --- .../disease_prioritisation_analysis.py | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 src/pheval/analyse/disease_prioritisation_analysis.py diff --git a/src/pheval/analyse/disease_prioritisation_analysis.py b/src/pheval/analyse/disease_prioritisation_analysis.py new file mode 100644 index 000000000..30320f013 --- /dev/null +++ b/src/pheval/analyse/disease_prioritisation_analysis.py @@ -0,0 +1,138 @@ +from collections import defaultdict +from pathlib import Path + +from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result +from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder +from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResult +from pheval.analyse.rank_stats import RankStats +from pheval.analyse.run_data_parser import TrackInputOutputDirectories +from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult +from pheval.utils.file_utils import all_files, obtain_closest_file_name +from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader + + +class AssessDiseasePrioritisation: + def __init__( + self, + phenopacket_path: Path, + results_dir: Path, + standardised_disease_results: [RankedPhEvalDiseaseResult], + threshold: float, + score_order: str, + proband_diseases: [ProbandDisease], + ): + self.phenopacket_path = phenopacket_path + self.results_dir = results_dir + self.standardised_disease_results = standardised_disease_results + self.threshold = threshold + self.score_order = score_order + self.proband_diseases = proband_diseases + + def _record_disease_prioritisation_match( + self, + disease: ProbandDisease, + result_entry: RankedPhEvalDiseaseResult, + rank_stats: RankStats, + ) -> DiseasePrioritisationResult: + """Record the disease prioritisation rank if found within results.""" + rank = result_entry.rank + rank_stats.add_rank(rank) + return DiseasePrioritisationResult(self.phenopacket_path, disease, rank) + + def _assess_disease_with_threshold_ascending_order( + self, + result_entry: RankedPhEvalDiseaseResult, + disease: ProbandDisease, + rank_stats: RankStats, + ) -> DiseasePrioritisationResult: + """Record the disease prioritisation rank if it meets the ascending order threshold.""" + if float(self.threshold) > float(result_entry.score): + return self._record_disease_prioritisation_match(disease, result_entry, rank_stats) + + def _assess_disease_with_threshold( + self, + result_entry: RankedPhEvalDiseaseResult, + disease: ProbandDisease, + rank_stats: RankStats, + ) -> DiseasePrioritisationResult: + """Record the disease prioritisation rank if it meets the score threshold.""" + if float(self.threshold) < float(result_entry.score): + return self._record_disease_prioritisation_match(disease, result_entry, rank_stats) + + def _record_matched_disease( + self, + disease: ProbandDisease, + rank_stats: RankStats, + standardised_disease_result: RankedPhEvalDiseaseResult, + ) -> DiseasePrioritisationResult: + """Return the disease rank result - dealing with the specification of a threshold.""" + if float(self.threshold) == 0.0: + return self._record_disease_prioritisation_match( + disease, standardised_disease_result, rank_stats + ) + else: + return ( + self._assess_disease_with_threshold( + standardised_disease_result, disease, rank_stats + ) + if self.score_order != "ascending" + else self._assess_disease_with_threshold_ascending_order( + standardised_disease_result, disease, rank_stats + ) + ) + + def assess_disease_prioritisation( + self, rank_stats: RankStats, rank_records: defaultdict + ) -> None: + """Assess disease prioritisation.""" + for disease in self.proband_diseases: + rank_stats.total += 1 + disease_match = DiseasePrioritisationResult(self.phenopacket_path, disease) + for standardised_disease_result in self.standardised_disease_results: + if ( + disease.disease_identifier == standardised_disease_result.disease_identifier + or disease.disease_name == standardised_disease_result.disease_name + ): + disease_match = self._record_matched_disease( + disease, rank_stats, standardised_disease_result + ) + break + PrioritisationRankRecorder( + rank_stats.total, + self.results_dir, + DiseasePrioritisationResult(self.phenopacket_path, disease) + if disease_match is None + else disease_match, + rank_records, + ).record_rank() + + +def _obtain_causative_diseases(phenopacket_path: Path) -> [ProbandDisease]: + """Obtain known diseases from a phenopacket.""" + phenopacket = phenopacket_reader(phenopacket_path) + phenopacket_util = PhenopacketUtil(phenopacket) + return phenopacket_util.diagnoses() + + +def assess_phenopacket_disease_prioritisation( + standardised_disease_result: Path, + score_order: str, + results_dir_and_input: TrackInputOutputDirectories, + threshold: float, + disease_rank_stats: RankStats, + disease_rank_comparison: defaultdict, +) -> None: + """Assess disease prioritisation for a phenopacket.""" + phenopacket_path = obtain_closest_file_name( + standardised_disease_result, all_files(results_dir_and_input.phenopacket_dir) + ) + pheval_disease_result = read_standardised_result(standardised_disease_result) + proband_diseases = _obtain_causative_diseases(phenopacket_path) + AssessDiseasePrioritisation( + phenopacket_path, + results_dir_and_input.results_dir.joinpath("pheval_disease_results/"), + parse_pheval_result(RankedPhEvalDiseaseResult, pheval_disease_result), + threshold, + score_order, + proband_diseases, + ).assess_disease_prioritisation(disease_rank_stats, disease_rank_comparison) From 1021f882b6555a321d4e19774dcbbbeaf58f896e Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:35:04 +0100 Subject: [PATCH 3/9] adding gene analysis specific methods --- .../analyse/gene_prioritisation_analysis.py | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 src/pheval/analyse/gene_prioritisation_analysis.py diff --git a/src/pheval/analyse/gene_prioritisation_analysis.py b/src/pheval/analyse/gene_prioritisation_analysis.py new file mode 100644 index 000000000..43e9de65c --- /dev/null +++ b/src/pheval/analyse/gene_prioritisation_analysis.py @@ -0,0 +1,135 @@ +from collections import defaultdict +from pathlib import Path + +import pandas as pd + +from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result +from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder +from pheval.analyse.prioritisation_result_types import GenePrioritisationResult +from pheval.analyse.rank_stats import RankStats +from pheval.analyse.run_data_parser import TrackInputOutputDirectories +from pheval.post_processing.post_processing import RankedPhEvalGeneResult +from pheval.utils.file_utils import all_files, obtain_closest_file_name +from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader + + +class AssessGenePrioritisation: + """Assess gene prioritisation.""" + + def __init__( + self, + phenopacket_path: Path, + results_dir: Path, + standardised_gene_results: [RankedPhEvalGeneResult], + threshold: float, + score_order: str, + proband_causative_genes: [ProbandCausativeGene], + ): + self.phenopacket_path = phenopacket_path + self.results_dir = results_dir + self.standardised_gene_results = standardised_gene_results + self.threshold = threshold + self.score_order = score_order + self.proband_causative_genes = proband_causative_genes + + def _record_gene_prioritisation_match( + self, + gene: ProbandCausativeGene, + result_entry: RankedPhEvalGeneResult, + rank_stats: RankStats, + ) -> GenePrioritisationResult: + """Record the gene prioritisation rank if found within results.""" + rank = result_entry.rank + rank_stats.add_rank(rank) + return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank) + + def _assess_gene_with_threshold_ascending_order( + self, + result_entry: RankedPhEvalGeneResult, + gene: ProbandCausativeGene, + rank_stats: RankStats, + ) -> GenePrioritisationResult: + """Record the gene prioritisation rank if it meets the ascending order threshold.""" + if float(self.threshold) > float(result_entry.score): + return self._record_gene_prioritisation_match(gene, result_entry, rank_stats) + + def _assess_gene_with_threshold( + self, + result_entry: RankedPhEvalGeneResult, + gene: ProbandCausativeGene, + rank_stats: RankStats, + ) -> GenePrioritisationResult: + """Record the gene prioritisation rank if it meets the score threshold.""" + if float(self.threshold) < float(result_entry.score): + return self._record_gene_prioritisation_match(gene, result_entry, rank_stats) + + def _record_matched_gene( + self, gene: ProbandCausativeGene, rank_stats: RankStats, standardised_gene_result: pd.Series + ) -> GenePrioritisationResult: + """Return the gene rank result - dealing with the specification of a threshold.""" + if float(self.threshold) == 0.0: + return self._record_gene_prioritisation_match( + gene, standardised_gene_result, rank_stats + ) + else: + return ( + self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats) + if self.score_order != "ascending" + else self._assess_gene_with_threshold_ascending_order( + standardised_gene_result, gene, rank_stats + ) + ) + + def assess_gene_prioritisation(self, rank_stats: RankStats, rank_records: defaultdict) -> None: + """Assess gene prioritisation.""" + for gene in self.proband_causative_genes: + rank_stats.total += 1 + gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol) + for standardised_gene_result in self.standardised_gene_results: + if ( + gene.gene_identifier == standardised_gene_result.gene_identifier + or gene.gene_symbol == standardised_gene_result.gene_symbol + ): + gene_match = self._record_matched_gene( + gene, rank_stats, standardised_gene_result + ) + break + PrioritisationRankRecorder( + rank_stats.total, + self.results_dir, + GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol) + if gene_match is None + else gene_match, + rank_records, + ).record_rank() + + +def _obtain_causative_genes(phenopacket_path: Path) -> [ProbandCausativeGene]: + """Obtain causative genes from a phenopacket.""" + phenopacket = phenopacket_reader(phenopacket_path) + phenopacket_util = PhenopacketUtil(phenopacket) + return phenopacket_util.diagnosed_genes() + + +def assess_phenopacket_gene_prioritisation( + standardised_gene_result: Path, + score_order: str, + results_dir_and_input: TrackInputOutputDirectories, + threshold: float, + gene_rank_stats: RankStats, + gene_rank_comparison: defaultdict, +) -> None: + """Assess gene prioritisation for a phenopacket.""" + phenopacket_path = obtain_closest_file_name( + standardised_gene_result, all_files(results_dir_and_input.phenopacket_dir) + ) + pheval_gene_result = read_standardised_result(standardised_gene_result) + proband_causative_genes = _obtain_causative_genes(phenopacket_path) + AssessGenePrioritisation( + phenopacket_path, + results_dir_and_input.results_dir.joinpath("pheval_gene_results/"), + parse_pheval_result(RankedPhEvalGeneResult, pheval_gene_result), + threshold, + score_order, + proband_causative_genes, + ).assess_gene_prioritisation(gene_rank_stats, gene_rank_comparison) From 6405f162c6d8aca2d31e9af1082dd1f738c7d79b Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:35:09 +0100 Subject: [PATCH 4/9] adding variant analysis specific methods --- .../variant_prioritisation_analysis.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 src/pheval/analyse/variant_prioritisation_analysis.py diff --git a/src/pheval/analyse/variant_prioritisation_analysis.py b/src/pheval/analyse/variant_prioritisation_analysis.py new file mode 100644 index 000000000..05c0b1bf1 --- /dev/null +++ b/src/pheval/analyse/variant_prioritisation_analysis.py @@ -0,0 +1,140 @@ +from collections import defaultdict +from pathlib import Path + +import pandas as pd + +from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result +from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder +from pheval.analyse.prioritisation_result_types import VariantPrioritisationResult +from pheval.analyse.rank_stats import RankStats +from pheval.analyse.run_data_parser import TrackInputOutputDirectories +from pheval.post_processing.post_processing import RankedPhEvalVariantResult +from pheval.utils.file_utils import all_files, obtain_closest_file_name +from pheval.utils.phenopacket_utils import GenomicVariant, PhenopacketUtil, phenopacket_reader + + +class AssessVariantPrioritisation: + """Assess variant prioritisation.""" + + def __init__( + self, + phenopacket_path: Path, + results_dir: Path, + standardised_variant_results: [RankedPhEvalVariantResult], + threshold: float, + score_order: str, + proband_causative_variants: [GenomicVariant], + ): + self.phenopacket_path = phenopacket_path + self.results_dir = results_dir + self.standardised_variant_results = standardised_variant_results + self.threshold = threshold + self.score_order = score_order + self.proband_causative_variants = proband_causative_variants + + def _record_variant_prioritisation_match( + self, + result_entry: RankedPhEvalVariantResult, + rank_stats: RankStats, + ) -> VariantPrioritisationResult: + """Record the variant prioritisation rank if found within results.""" + rank = result_entry.rank + rank_stats.add_rank(rank) + return VariantPrioritisationResult( + self.phenopacket_path, + GenomicVariant( + chrom=result_entry.chromosome, + pos=result_entry.start, + ref=result_entry.ref, + alt=result_entry.alt, + ), + rank, + ) + + def _assess_variant_with_threshold_ascending_order( + self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats + ) -> VariantPrioritisationResult: + """Record the variant prioritisation rank if it meets the ascending order threshold.""" + if float(self.threshold) > float(result_entry.score): + return self._record_variant_prioritisation_match(result_entry, rank_stats) + + def _assess_variant_with_threshold( + self, result_entry: pd.Series, rank_stats: RankStats + ) -> VariantPrioritisationResult: + """Record the variant prioritisation rank if it meets the score threshold.""" + if float(self.threshold) < float(result_entry.score): + return self._record_variant_prioritisation_match(result_entry, rank_stats) + + def _record_matched_variant( + self, rank_stats: RankStats, standardised_variant_result: pd.Series + ) -> VariantPrioritisationResult: + """Return the variant rank result - dealing with the specification of a threshold.""" + if float(self.threshold) == 0.0: + return self._record_variant_prioritisation_match( + standardised_variant_result, rank_stats + ) + else: + return ( + self._assess_variant_with_threshold(standardised_variant_result, rank_stats) + if self.score_order != "ascending" + else self._assess_variant_with_threshold_ascending_order( + standardised_variant_result, rank_stats + ) + ) + + def assess_variant_prioritisation( + self, rank_stats: RankStats, rank_records: defaultdict + ) -> None: + """Assess variant prioritisation.""" + for variant in self.proband_causative_variants: + rank_stats.total += 1 + variant_match = VariantPrioritisationResult(self.phenopacket_path, variant) + for result in self.standardised_variant_results: + result_variant = GenomicVariant( + chrom=result.chromosome, + pos=result.start, + ref=result.ref, + alt=result.alt, + ) + if variant == result_variant: + variant_match = self._record_matched_variant(rank_stats, result) + break + PrioritisationRankRecorder( + rank_stats.total, + self.results_dir, + VariantPrioritisationResult(self.phenopacket_path, variant) + if variant_match is None + else variant_match, + rank_records, + ).record_rank() + + +def _obtain_causative_variants(phenopacket_path: Path) -> [GenomicVariant]: + """Obtain causative variants from a phenopacket.""" + phenopacket = phenopacket_reader(phenopacket_path) + phenopacket_util = PhenopacketUtil(phenopacket) + return phenopacket_util.diagnosed_variants() + + +def assess_phenopacket_variant_prioritisation( + standardised_variant_result: Path, + score_order: str, + results_dir_and_input: TrackInputOutputDirectories, + threshold: float, + variant_rank_stats: RankStats, + variant_rank_comparison: defaultdict, +) -> None: + """Assess variant prioritisation for a phenopacket""" + phenopacket_path = obtain_closest_file_name( + standardised_variant_result, all_files(results_dir_and_input.phenopacket_dir) + ) + proband_causative_variants = _obtain_causative_variants(phenopacket_path) + pheval_variant_result = read_standardised_result(standardised_variant_result) + AssessVariantPrioritisation( + phenopacket_path, + results_dir_and_input.results_dir.joinpath("pheval_variant_results/"), + parse_pheval_result(RankedPhEvalVariantResult, pheval_variant_result), + threshold, + score_order, + proband_causative_variants, + ).assess_variant_prioritisation(variant_rank_stats, variant_rank_comparison) From 5e75d7fb9b253e989c18866bfaf98def687ece13 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:35:25 +0100 Subject: [PATCH 5/9] adding dataclasses for prioritisation result types --- .../analyse/prioritisation_result_types.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 src/pheval/analyse/prioritisation_result_types.py diff --git a/src/pheval/analyse/prioritisation_result_types.py b/src/pheval/analyse/prioritisation_result_types.py new file mode 100644 index 000000000..7bdb584d8 --- /dev/null +++ b/src/pheval/analyse/prioritisation_result_types.py @@ -0,0 +1,31 @@ +from dataclasses import dataclass +from pathlib import Path + +from pheval.utils.phenopacket_utils import GenomicVariant, ProbandDisease + + +@dataclass +class GenePrioritisationResult: + """Store rank data for causative genes.""" + + phenopacket_path: Path + gene: str + rank: int = 0 + + +@dataclass +class VariantPrioritisationResult: + """Store rank data for causative variants.""" + + phenopacket_path: Path + variant: GenomicVariant + rank: int = 0 + + +@dataclass +class DiseasePrioritisationResult: + """Store rank data for known diseases.""" + + phenopacket_path: Path + disease: ProbandDisease + rank: int = 0 From 5de4a72e79305e9b1762d270d6266fb9859a0afb Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:46:02 +0100 Subject: [PATCH 6/9] adding methods for recording ranks in prioritisation results --- .../analyse/prioritisation_rank_recorder.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 src/pheval/analyse/prioritisation_rank_recorder.py diff --git a/src/pheval/analyse/prioritisation_rank_recorder.py b/src/pheval/analyse/prioritisation_rank_recorder.py new file mode 100644 index 000000000..268d1a0f2 --- /dev/null +++ b/src/pheval/analyse/prioritisation_rank_recorder.py @@ -0,0 +1,52 @@ +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Union + +from pheval.analyse.prioritisation_result_types import ( + DiseasePrioritisationResult, + GenePrioritisationResult, + VariantPrioritisationResult, +) + + +@dataclass +class PrioritisationRankRecorder: + """Compare the ranks of different runs.""" + + index: int + directory: Path + prioritisation_result: Union[ + GenePrioritisationResult, VariantPrioritisationResult, DiseasePrioritisationResult + ] + run_comparison: defaultdict + + def _record_gene_rank(self) -> None: + """Record gene prioritisation rank.""" + self.run_comparison[self.index]["Gene"] = self.prioritisation_result.gene + + def _record_variant_rank(self) -> None: + """Record variant prioritisation rank.""" + variant = self.prioritisation_result.variant + self.run_comparison[self.index]["Variant"] = "_".join( + [variant.chrom, str(variant.pos), variant.ref, variant.alt] + ) + + def _record_disease_rank(self) -> None: + """Record disease prioritisation rank.""" + self.run_comparison[self.index][ + "Disease" + ] = self.prioritisation_result.disease.disease_identifier + + def record_rank(self) -> None: + """Records the rank for different runs.""" + self.run_comparison[self.index][ + "Phenopacket" + ] = self.prioritisation_result.phenopacket_path.name + if type(self.prioritisation_result) is GenePrioritisationResult: + self._record_gene_rank() + elif type(self.prioritisation_result) is VariantPrioritisationResult: + self._record_variant_rank() + elif type(self.prioritisation_result) is DiseasePrioritisationResult: + self._record_disease_rank() + self.run_comparison[self.index][self.directory] = self.prioritisation_result.rank From 3479715805d8d96eae3fdce09017e714e4bcc479 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:46:16 +0100 Subject: [PATCH 7/9] adding methods for parsing run data txt --- src/pheval/analyse/run_data_parser.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/pheval/analyse/run_data_parser.py diff --git a/src/pheval/analyse/run_data_parser.py b/src/pheval/analyse/run_data_parser.py new file mode 100644 index 000000000..3a87be3eb --- /dev/null +++ b/src/pheval/analyse/run_data_parser.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass +from pathlib import Path + +import pandas as pd + + +@dataclass +class TrackInputOutputDirectories: + """Track the input testdata for a corresponding pheval output directory""" + + phenopacket_dir: Path + results_dir: Path + + +def _parse_run_data_text_file(run_data_path: Path) -> [TrackInputOutputDirectories]: + """Parse run data .txt file returning a list of input testdata and corresponding output directories.""" + run_data = pd.read_csv(run_data_path, delimiter="\t", header=None) + run_data_list = [] + for _index, row in run_data.iterrows(): + run_data_list.append( + TrackInputOutputDirectories(phenopacket_dir=Path(row[0]), results_dir=Path(row[1])) + ) + return run_data_list From 6cf163e33fed7a8e3464d0886c949a147934a975 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 11:46:23 +0100 Subject: [PATCH 8/9] refactoring imports --- tests/test_analysis.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index b1ee92e60..9241be5aa 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -3,16 +3,16 @@ from copy import copy from pathlib import Path, PosixPath -from pheval.analyse.analysis import ( - AssessDiseasePrioritisation, - AssessGenePrioritisation, - AssessVariantPrioritisation, +from pheval.analyse.disease_prioritisation_analysis import AssessDiseasePrioritisation +from pheval.analyse.gene_prioritisation_analysis import AssessGenePrioritisation +from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder +from pheval.analyse.prioritisation_result_types import ( DiseasePrioritisationResult, GenePrioritisationResult, - PrioritisationRankRecorder, VariantPrioritisationResult, ) from pheval.analyse.rank_stats import RankStats +from pheval.analyse.variant_prioritisation_analysis import AssessVariantPrioritisation from pheval.post_processing.post_processing import ( RankedPhEvalDiseaseResult, RankedPhEvalGeneResult, From 557d4e9f4e571fa188b7f6f81db78b09945f0f51 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Thu, 14 Sep 2023 16:03:31 +0100 Subject: [PATCH 9/9] reverting try and except --- src/pheval/analyse/parse_pheval_result.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pheval/analyse/parse_pheval_result.py b/src/pheval/analyse/parse_pheval_result.py index 5eafb32cb..7638ab789 100644 --- a/src/pheval/analyse/parse_pheval_result.py +++ b/src/pheval/analyse/parse_pheval_result.py @@ -7,10 +7,7 @@ def read_standardised_result(standardised_result_path: Path) -> dict: """Read the standardised result output and return a list of dictionaries.""" - try: - return pd.read_csv(standardised_result_path, delimiter="\t").to_dict("records") - except pd.errors.EmptyDataError: - return pd.DataFrame() + return pd.read_csv(standardised_result_path, delimiter="\t").to_dict("records") def parse_pheval_result(data_class_type: PhEvalResult, pheval_result: [dict]) -> [PhEvalResult]: