Skip to content

Commit

Permalink
Merge pull request #215 from monarch-initiative/214-new-method-for-se…
Browse files Browse the repository at this point in the history
…msim-ingestion

New method for semsim ingestion
  • Loading branch information
souzadevinicius authored Jan 23, 2024
2 parents 461bf6f + d0f3096 commit d17074e
Show file tree
Hide file tree
Showing 12 changed files with 371 additions and 168 deletions.
Binary file added lib/h2-1.4.199.jar
Binary file not shown.
39 changes: 38 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ plotly = "^5.13.0"
seaborn = "^0.12.2"
matplotlib = "^3.7.0"
pyserde = "^0.9.8"
polars = "^0.19.15"

[tool.poetry.dev-dependencies]
pytest = "^7.2.0"
Expand Down
4 changes: 2 additions & 2 deletions src/pheval/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
create_spiked_vcfs_command,
generate_stats_plot,
scramble_phenopackets_command,
semsim_convert_command,
semsim_scramble_command,
semsim_to_exomiserdb_command,
update_phenopackets_command,
)

Expand Down Expand Up @@ -52,12 +52,12 @@ def pheval_utils():


# Attach each command object to pheval_utils through add_command, one call per command.
pheval_utils.add_command(semsim_scramble_command)
pheval_utils.add_command(semsim_convert_command)
pheval_utils.add_command(scramble_phenopackets_command)
pheval_utils.add_command(update_phenopackets_command)
pheval_utils.add_command(create_spiked_vcfs_command)
pheval_utils.add_command(benchmark)
pheval_utils.add_command(benchmark_comparison)
pheval_utils.add_command(semsim_to_exomiserdb_command)
pheval_utils.add_command(generate_stats_plot)

if __name__ == "__main__":
Expand Down
98 changes: 49 additions & 49 deletions src/pheval/cli_pheval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
from pheval.prepare.create_spiked_vcf import spike_vcfs
from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
from pheval.prepare.update_phenopacket import update_phenopackets
from pheval.utils.exomiser import semsim_to_exomiserdb
from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot
from pheval.utils.utils import semsim_convert, semsim_scramble
from pheval.utils.utils import semsim_scramble


@click.command("semsim-scramble")
Expand Down Expand Up @@ -292,54 +293,6 @@ def create_spiked_vcfs_command(
spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, template_vcf_path, vcf_dir)


# CLI wrapper around semsim_convert. The metavar values below are what click
# prints as the placeholder next to each option in --help output.
@click.command("semsim-convert")
@click.option(
    "--input",
    "-i",
    required=True,
    metavar="FILE",
    help="Path to the semsim file.",
    type=Path,
)
@click.option(
    "--output",
    "-o",
    required=True,
    metavar="FILE",
    help="Path where converted semsim will be written.",
    type=Path,
)
@click.option(
    "--subject-prefix",
    "-s",
    required=True,
    # A prefix is a plain string, not a file path.
    metavar="PREFIX",
    help="Subject Prefix that will be mapped to the database",
    type=str,
)
@click.option(
    "--object-prefix",
    "-b",
    required=True,
    # A prefix is a plain string, not a file path.
    metavar="PREFIX",
    help="Object Prefix that will be mapped to the database.",
    type=str,
)
@click.option(
    "--output-format",
    "-O",
    required=True,
    # metavar must be a display string; the type object `str` was passed before.
    metavar="FORMAT",
    help="Output file format. Available formats: (exomiserdb)",
    type=click.Choice(["exomiserdb"], case_sensitive=False),
)
def semsim_convert_command(
    input: Path, output: Path, subject_prefix: str, object_prefix: str, output_format: str
):
    """convert semsim profile to an exomiser database file"""
    # All five option values are forwarded unchanged, in declaration order.
    semsim_convert(input, output, subject_prefix, object_prefix, output_format)


@click.command()
@click.option(
"--directory",
Expand Down Expand Up @@ -535,6 +488,53 @@ def benchmark_comparison(
)


@click.command("semsim-to-exomiserdb")
@click.option(
    "--input",
    "-i",
    # Explicit Python parameter name. Without it click derives "input" from
    # the long flag and calls the function with input=..., which does not
    # match the `input_file` parameter below and fails with a TypeError.
    "input_file",
    required=True,
    metavar="FILE",
    help="Semsim input file.",
    type=Path,
)
@click.option(
    "--object-prefix",
    required=True,
    metavar="object-prefix",
    help="Object Prefix. e.g. MP",
    type=str,
)
@click.option(
    "--subject-prefix",
    required=True,
    metavar="subject-prefix",
    help="Subject Prefix. e.g. HP",
    type=str,
)
@click.option(
    "--db-path",
    "-d",
    required=True,
    metavar="db-path",
    help="""Exomiser Phenotypic Database Folder Path.
(e.g. /exomiser_folder/2209_phenotype/2209_phenotype/).
This is the path where the phenotypic database folder will be written out.""",
    type=Path,
)
def semsim_to_exomiserdb_command(
    input_file: Path, object_prefix: str, subject_prefix: str, db_path: Path
):
    """ingests semsim file into exomiser phenotypic database
    Args:
        input_file (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv
        object_prefix (str): object prefix. e.g. MP
        subject_prefix (str): subject prefix e.g HP
        db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/)
    """
    # Positional order matches semsim_to_exomiserdb(input_path, object_prefix,
    # subject_prefix, db_path) as defined in pheval.utils.exomiser.
    semsim_to_exomiserdb(input_file, object_prefix, subject_prefix, db_path)


@click.command()
@click.option(
"--benchmarking-tsv",
Expand Down
147 changes: 147 additions & 0 deletions src/pheval/infra/exomiserdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# -*- coding: cp936 -*-
import logging as log
import os
from pathlib import Path

import jaydebeapi
import polars as pl
from tqdm import tqdm

# Two named loggers ("info" and "debug"); neither is referenced again in the
# part of this module that is visible here.
info_log = log.getLogger("info")
info_debug = log.getLogger("debug")


class DBConnector:
    """Context manager that opens a jaydebeapi connection on entry and closes it on exit.

    The JDBC URL is built by concatenating ``server`` and ``database``.

    Args:
        jar: location of the JDBC driver jar (a ``Path`` or a plain string).
        driver: Java class name of the JDBC driver.
        server: first part of the JDBC URL.
        database: second part of the JDBC URL, appended to ``server`` as-is.
        user: database user name.
        password: database password.
    """

    def __init__(
        self, jar: Path, driver: str, server: str, database: str, user: str, password: str
    ):
        self.jar = jar
        self.driver = driver
        self.server = server
        self.database = database
        self.user = user
        self.password = password
        # Set by __enter__, cleared again by __exit__.
        self.dbconn = None

    def create_connection(self) -> jaydebeapi.Connection:
        """creates h2 database connection"""
        return jaydebeapi.connect(
            self.driver,
            f"{self.server}{self.database}",
            [self.user, self.password],
            # `jar` is annotated as Path, but jaydebeapi documents `jars` as a
            # string (or list of strings); hand it a plain string either way.
            str(self.jar),
        )

    def __enter__(self) -> jaydebeapi.Connection:
        self.dbconn = self.create_connection()
        return self.dbconn

    def __exit__(self, *other):
        # Nothing to close if __enter__ never stored a connection.
        if self.dbconn is None:
            return
        try:
            self.dbconn.close()
        finally:
            # Drop the handle so a closed connection is not kept around.
            self.dbconn = None


class DBConnection:
    """Keeps one connection object in a class attribute shared by all instances.

    Creating an instance overwrites the stored connection; nothing is opened here.
    """

    # The connection handed to the most recent DBConnection(...) call.
    connection = None

    def __init__(self, connection):
        DBConnection.connection = connection

    @classmethod
    def get_connection(cls) -> jaydebeapi.Connection:
        """Return the connection currently stored on DBConnection (None if never set)."""
        shared = DBConnection.connection
        return shared

    def close(self):
        """Close the stored connection and return the result of its close() call."""
        outcome = self.connection.close()
        return outcome

    @classmethod
    def get_cursor(cls) -> jaydebeapi.Cursor:
        """Return a new cursor obtained from the stored connection."""
        return cls.get_connection().cursor()


class ExomiserDB:
    """Writes semsim profiles into an Exomiser H2 phenotype database.

    Args:
        db_path (Path): value appended to ``jdbc:h2:`` to form the JDBC URL.
    """

    def __init__(self, db_path: Path):
        # No try/except here: swallowing a failure would leave the instance
        # without `connector` and only move the error to the first use.
        self.connector = DBConnector(  # noqa
            jar=os.path.join(os.path.dirname(__file__), "../../../lib/h2-1.4.199.jar"),
            driver="org.h2.Driver",
            server=f"jdbc:h2:{db_path}",
            user="sa",
            password="",
            database="",
        )

    def import_from_semsim_file(self, input_file: Path, subject_prefix: str, object_prefix: str):
        """imports semsim tsv profile into exomiser phenotype database

        The file is read in batches; each round of batches is turned into one
        SQL script by _semsim2h2 and executed. MAPPING_ID values start at 1 and
        advance by the number of rows already written.

        Args:
            input_file (Path): semsim profile
            subject_prefix (str): Subject Prefix. e.g HP
            object_prefix (str): Object Prefix. e.g MP
        """
        batch_length = 5
        with self.connector as cnn:
            conn = DBConnection(cnn)
            reader = pl.read_csv_batched(input_file, separator="\t")
            batches = reader.next_batches(batch_length)
            cursor = conn.get_cursor()
            # Count lines for the progress bar. Binary mode avoids decoding the
            # whole file (and any decode error) just to count line breaks.
            with open(input_file, "rb") as f:
                total = sum(1 for _ in f)
            mapping_id = 1
            try:
                # One line is the header, hence total - 1 data rows.
                with tqdm(total=total - 1) as pbar:
                    while batches:
                        input_data = pl.concat(batches)
                        # NOTE(review): _semsim2h2 is declared as
                        # (input_data, subject_prefix, object_prefix) but is
                        # given the prefixes in the opposite order here.
                        # semsim_to_exomiserdb in pheval/utils/exomiser.py also
                        # passes them swapped into this method, so the two
                        # swaps cancel out. Change both call sites together.
                        sql = _semsim2h2(
                            input_data, object_prefix, subject_prefix, mapping_id=mapping_id
                        )
                        cursor.execute(sql)
                        len_input_data = len(input_data)
                        mapping_id += len_input_data
                        pbar.update(len_input_data)

                        batches = reader.next_batches(batch_length)
            finally:
                # Release the cursor even when a statement fails.
                cursor.close()


def _sql_quote(value) -> str:
    """Render value as a SQL string literal; None becomes the keyword NULL.

    Embedded single quotes are doubled, which is the standard SQL escape, so
    the value can neither end the literal early nor lose characters.
    """
    if value is None:
        return "NULL"
    return "'" + str(value).replace("'", "''") + "'"


def _format_row(mapping_id, data):
    """format row in a exomiser database way

    Builds one parenthesised VALUES tuple in the column order MAPPING_ID,
    subject id, subject label, object id, object label, jaccard similarity,
    ancestor information content, phenodigm score, first ancestor id,
    ancestor label.

    Args:
        mapping_id: row sequential id
        data: mapping with the keys subject_id, subject_label, object_id,
            object_label, jaccard_similarity, ancestor_information_content,
            phenodigm_score, ancestor_id and ancestor_label

    Returns:
        str: the tuple text, e.g. ``(1, 'HP:1', 'a', 'MP:2', 'b', 0.5, 2.0, 1.0, 'HP:3', 'c')``
    """
    # TODO: replace string building with a parametrised query once the caller
    # executes rows instead of one pre-built SQL script.
    ancestor_id = data["ancestor_id"]
    if ancestor_id is not None:
        # Only the first id of a comma-separated ancestor list is stored.
        ancestor_id = ancestor_id.split(",")[0]
    fields = (
        str(mapping_id),
        _sql_quote(data["subject_id"]),
        _sql_quote(data["subject_label"]),
        _sql_quote(data["object_id"]),
        _sql_quote(data["object_label"]),
        str(data["jaccard_similarity"]),
        str(data["ancestor_information_content"]),
        str(data["phenodigm_score"]),
        _sql_quote(ancestor_id),
        _sql_quote(data["ancestor_label"]),
    )
    return "(" + ", ".join(fields) + ")"


def _semsim2h2(
    input_data: pl.DataFrame, subject_prefix: str, object_prefix: str, mapping_id=1
) -> str:
    """Build the SQL script that inserts one chunk of semsim rows.

    The target table is ``EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS``.
    When ``mapping_id`` is 1 the script starts with a TRUNCATE of that table,
    so the first chunk of an import replaces the existing content.

    Args:
        input_data (pl.DataFrame): input data. (e.g. semantic similarity profile file)
        subject_prefix (str): subject prefix. (e.g HP)
        object_prefix (str): object prefix. (e.g MP)
        mapping_id (int, optional): MAPPING_ID given to the first row; later
            rows count up from it.

    Returns:
        str: the SQL text (optional TRUNCATE followed by one multi-row INSERT).
    """
    sql = ""
    if mapping_id == 1:
        sql += f"TRUNCATE TABLE EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS;\n"

    # When both prefixes are equal the object columns use the *_HIT names so
    # they do not collide with the subject columns.
    same_prefix = subject_prefix == object_prefix
    object_id = f"{object_prefix}_ID_HIT" if same_prefix else f"{object_prefix}_ID"
    object_term = f"{object_prefix}_HIT_TERM" if same_prefix else f"{object_prefix}_TERM"
    sql += f"""INSERT INTO EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS
(MAPPING_ID, {subject_prefix}_ID, {subject_prefix}_TERM, {object_id}, {object_term}, SIMJ, IC, SCORE, LCS_ID, LCS_TERM)
VALUES"""
    rows = [
        _format_row(data=frame, mapping_id=mapping_id + jdx)
        for jdx, frame in enumerate(input_data.iter_rows(named=True))
    ]
    sql += ",\n".join(rows) + ";"
    return sql
16 changes: 16 additions & 0 deletions src/pheval/utils/exomiser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pathlib import Path

from pheval.infra.exomiserdb import ExomiserDB


def semsim_to_exomiserdb(input_path: Path, object_prefix: str, subject_prefix: str, db_path: Path):
    """Ingest a semsim file into an Exomiser phenotypic database.

    Creates an ExomiserDB for ``db_path`` and calls its
    ``import_from_semsim_file`` method with the input file and both prefixes.

    Args:
        input_path (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv
        object_prefix (str): object prefix. e.g. MP
        subject_prefix (str): subject prefix e.g HP
        db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/)
    """
    # NOTE(review): import_from_semsim_file is declared as
    # (input_file, subject_prefix, object_prefix), yet the prefixes are passed
    # here as object first, subject second. That method hands them on to
    # _semsim2h2 swapped once more, so the table name still comes out right.
    # Do not reorder this call without changing that one as well.
    ExomiserDB(db_path).import_from_semsim_file(input_path, object_prefix, subject_prefix)
Loading

0 comments on commit d17074e

Please sign in to comment.