Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New method for semsim ingestion #215

Merged
merged 6 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added lib/h2-1.4.199.jar
Binary file not shown.
39 changes: 38 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ plotly = "^5.13.0"
seaborn = "^0.12.2"
matplotlib = "^3.7.0"
pyserde = "^0.9.8"
polars = "^0.19.15"

[tool.poetry.dev-dependencies]
pytest = "^7.2.0"
Expand Down
4 changes: 2 additions & 2 deletions src/pheval/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
create_spiked_vcfs_command,
generate_stats_plot,
scramble_phenopackets_command,
semsim_convert_command,
semsim_scramble_command,
semsim_to_exomiserdb_command,
update_phenopackets_command,
)

Expand Down Expand Up @@ -52,12 +52,12 @@ def pheval_utils():


pheval_utils.add_command(semsim_scramble_command)
pheval_utils.add_command(semsim_convert_command)
pheval_utils.add_command(scramble_phenopackets_command)
pheval_utils.add_command(update_phenopackets_command)
pheval_utils.add_command(create_spiked_vcfs_command)
pheval_utils.add_command(benchmark)
pheval_utils.add_command(benchmark_comparison)
pheval_utils.add_command(semsim_to_exomiserdb_command)
pheval_utils.add_command(generate_stats_plot)

if __name__ == "__main__":
Expand Down
98 changes: 49 additions & 49 deletions src/pheval/cli_pheval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
from pheval.prepare.create_spiked_vcf import spike_vcfs
from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
from pheval.prepare.update_phenopacket import update_phenopackets
from pheval.utils.exomiser import semsim_to_exomiserdb
from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot
from pheval.utils.utils import semsim_convert, semsim_scramble
from pheval.utils.utils import semsim_scramble


@click.command("semsim-scramble")
Expand Down Expand Up @@ -292,54 +293,6 @@ def create_spiked_vcfs_command(
spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, template_vcf_path, vcf_dir)


# CLI wrapper around `semsim_convert`: every option below is required and is
# forwarded unchanged, in declaration order, to that function.
@click.command("semsim-convert")
@click.option(
    "--input",
    "-i",
    required=True,
    metavar="FILE",
    help="Path to the semsim file.",
    type=Path,
)
@click.option(
    "--output",
    "-o",
    required=True,
    metavar="FILE",
    help="Path where converted semsim will be written.",
    type=Path,
)
@click.option(
    "--subject-prefix",
    "-s",
    required=True,
    # A prefix is a plain string, not a file, so label it accordingly in --help.
    metavar="PREFIX",
    help="Subject Prefix that will be mapped to the database",
    type=str,
)
@click.option(
    "--object-prefix",
    "-b",
    required=True,
    metavar="PREFIX",
    help="Object Prefix that will be mapped to the database.",
    type=str,
)
@click.option(
    "--output-format",
    "-O",
    required=True,
    # metavar must be a display string; passing the `str` type rendered as "<class 'str'>".
    metavar="FORMAT",
    help="Output file format. Available formats: (exomiserdb)",
    type=click.Choice(["exomiserdb"], case_sensitive=False),
)
def semsim_convert_command(
    input: Path, output: Path, subject_prefix: str, object_prefix: str, output_format: str
):
    """convert semsim profile to an exomiser database file"""
    # The parameter is named `input` because click derives it from the `--input` flag.
    semsim_convert(input, output, subject_prefix, object_prefix, output_format)


@click.command()
@click.option(
"--directory",
Expand Down Expand Up @@ -535,6 +488,53 @@ def benchmark_comparison(
)


# CLI wrapper around `semsim_to_exomiserdb`: all four options are required and
# are forwarded unchanged to that function.
@click.command("semsim-to-exomiserdb")
@click.option(
    "--input",
    "-i",
    # Explicit parameter name: without it click would pass the value as `input`,
    # which the callback does not accept (it declares `input_file`).
    "input_file",
    required=True,
    metavar="FILE",
    help="Semsim input file.",
    type=Path,
)
@click.option(
    "--object-prefix",
    required=True,
    metavar="object-prefix",
    help="Object Prefix. e.g. MP",
    type=str,
)
@click.option(
    "--subject-prefix",
    required=True,
    metavar="subject-prefix",
    help="Subject Prefix. e.g. HP",
    type=str,
)
@click.option(
    "--db-path",
    "-d",
    required=True,
    metavar="db-path",
    help="""Exomiser Phenotypic Database Folder Path.
        (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/).
        This is the path where the phenotypic database folder will be written out.""",
    type=Path,
)
def semsim_to_exomiserdb_command(
    input_file: Path, object_prefix: str, subject_prefix: str, db_path: Path
):
    """ingests semsim file into exomiser phenotypic database

    Args:
        input_file (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv
        object_prefix (str): object prefix. e.g. MP
        subject_prefix (str): subject prefix e.g HP
        db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/)
    """
    semsim_to_exomiserdb(input_file, object_prefix, subject_prefix, db_path)


@click.command()
@click.option(
"--benchmarking-tsv",
Expand Down
147 changes: 147 additions & 0 deletions src/pheval/infra/exomiserdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# -*- coding: cp936 -*-
import logging as log
import os
from pathlib import Path

import jaydebeapi
import polars as pl
from tqdm import tqdm

info_log = log.getLogger("info")
info_debug = log.getLogger("debug")


class DBConnector:
    """Context manager that opens a jaydebeapi connection on entry and closes it on exit.

    The constructor only stores the connection settings; nothing is opened until
    ``create_connection`` (or ``with connector as conn:``) is called.

    Args:
        jar: Location of the JDBC driver jar handed to ``jaydebeapi.connect``.
        driver: Java class name of the JDBC driver.
        server: First part of the JDBC URL.
        database: Second part of the JDBC URL, appended directly to ``server``.
        user: Database user name.
        password: Database password.
    """

    def __init__(
        self, jar: Path, driver: str, server: str, database: str, user: str, password: str
    ):
        self.jar = jar
        self.driver = driver
        self.server = server
        self.database = database
        self.user = user
        self.password = password
        # Connection opened by __enter__; None while no connection is open.
        self.dbconn = None

    def create_connection(self) -> "jaydebeapi.Connection":
        """creates h2 database connection"""
        # jaydebeapi expects the jar as a string (or list of strings), so a
        # path-like object is converted; anything else is passed through as given.
        jar = str(self.jar) if isinstance(self.jar, os.PathLike) else self.jar
        return jaydebeapi.connect(
            self.driver,
            f"{self.server}{self.database}",
            [self.user, self.password],
            jar,
        )

    def __enter__(self) -> "jaydebeapi.Connection":
        self.dbconn = self.create_connection()
        return self.dbconn

    def __exit__(self, *other):
        # Nothing to release if no connection was ever opened.
        if self.dbconn is None:
            return
        try:
            self.dbconn.close()
        finally:
            # Drop the handle so a closed connection is never reused by accident.
            self.dbconn = None


class DBConnection:
    """Holder that keeps one database connection on the class itself.

    Constructing an instance overwrites the connection stored on
    ``DBConnection``, so the most recently wrapped connection is the one every
    accessor returns.
    """

    # Shared slot for the wrapped connection; None until an instance is created.
    connection = None

    def __init__(self, connection):
        # Stored on the class (not the instance) so the classmethods can reach it.
        DBConnection.connection = connection

    def close(self):
        """Close the stored connection and return whatever its ``close()`` returns."""
        return self.connection.close()

    @classmethod
    def get_connection(cls) -> jaydebeapi.Connection:
        """Return the connection currently stored on ``DBConnection``."""
        return DBConnection.connection

    @classmethod
    def get_cursor(cls) -> jaydebeapi.Cursor:
        """Return a new cursor obtained from the stored connection."""
        return cls.get_connection().cursor()


class ExomiserDB:
    """Loads semantic-similarity profiles into an Exomiser phenotype database.

    The database is reached through the ``org.h2.Driver`` JDBC driver using the
    h2 jar referenced relative to this module.
    """

    def __init__(self, db_path: Path):
        """Prepare (but do not open) the connection to the database at ``db_path``.

        Args:
            db_path (Path): appended to ``jdbc:h2:`` to form the JDBC URL.
        """
        # Only settings are stored here. Errors are no longer caught and printed:
        # doing so left the instance without a `connector` attribute.
        self.connector = DBConnector(  # noqa
            jar=os.path.join(os.path.dirname(__file__), "../../../lib/h2-1.4.199.jar"),
            driver="org.h2.Driver",
            server=f"jdbc:h2:{db_path}",
            user="sa",
            password="",
            database="",
        )

    def import_from_semsim_file(
        self, input_file: Path, subject_prefix: str, object_prefix: str, batch_length: int = 5
    ):
        """imports semsim tsv profile into exomiser phenotype database

        Args:
            input_file (Path): semsim profile
            subject_prefix (str): Subject Prefix. e.g HP
            object_prefix (str): Object Prefix. e.g MP
            batch_length (int, optional): number of reader batches fetched and
                inserted per SQL statement. Defaults to 5.
        """
        with self.connector as cnn:
            conn = DBConnection(cnn)
            reader = pl.read_csv_batched(input_file, separator="\t")
            # TODO: Refactor this
            # Count the lines up front so the progress bar has a total; the
            # first line is excluded below.
            with open(input_file, "r") as f:
                total = sum(1 for _ in f)
            cursor = conn.get_cursor()
            try:
                with tqdm(total=total - 1) as pbar:
                    # Running MAPPING_ID of the next row; a value of 1 also makes
                    # _semsim2h2 emit the TRUNCATE statement.
                    mapping_id = 1
                    batches = reader.next_batches(batch_length)
                    while batches:
                        input_data = pl.concat(batches)
                        # NOTE(review): the prefixes are passed as (object, subject)
                        # while _semsim2h2 declares (subject, object).
                        # semsim_to_exomiserdb passes them to this method swapped
                        # too, so the two swaps cancel. Kept as-is on purpose.
                        sql = _semsim2h2(
                            input_data, object_prefix, subject_prefix, mapping_id=mapping_id
                        )
                        cursor.execute(sql)
                        len_input_data = len(input_data)
                        mapping_id += len_input_data
                        pbar.update(len_input_data)
                        batches = reader.next_batches(batch_length)
            finally:
                # Release the cursor even when an insert fails.
                cursor.close()


def _format_row(mapping_id, data):
souzadevinicius marked this conversation as resolved.
Show resolved Hide resolved
"""format row in a exomiser database way

Args:
mapping_id (_type_): row sequencial id
data (_type_): row data
"""
# TODO:Improve string escaping. Replace this code with parametrised query
return f"""({mapping_id}, '{data['subject_id']}', '{data['subject_label'].replace("'", "")}', '{data['object_id']}', '{data['object_label'].replace("'", "")}', {data['jaccard_similarity']}, {data['ancestor_information_content']}, {data['phenodigm_score']}, '{data['ancestor_id'].split(",")[0]}', '{data['ancestor_label'].replace("'", "")}')""" # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be changed so that the code looks a bit neater? It is a bit confusing to read.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be changed so that the code looks a bit neater, this is a bit confusing to read

Yes, I'd like to use an ORM for escaping SQL, but it will be implemented in a future PR…



def _semsim2h2(
    input_data: pl.DataFrame, subject_prefix: str, object_prefix: str, mapping_id=1
) -> str:
    """Build the SQL that inserts one batch of semsim profile rows.

    Args:
        input_data (pl.DataFrame): input data. (e.g. semantic similarity profile file)
        subject_prefix (str): subject prefix. (e.g HP)
        object_prefix (str): object prefix. (e.g MP)
        mapping_id (int, optional): MAPPING_ID given to the first row of this
            batch; following rows count up from it.

    Returns:
        str: a multi-row INSERT statement, preceded by a TRUNCATE of the target
        table when ``mapping_id`` is 1.
    """
    table = f"EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS"
    sql = ""
    # mapping_id == 1 marks the first batch: empty the table before loading.
    if mapping_id == 1:
        sql += f"TRUNCATE TABLE {table};\n"

    # When both prefixes are equal, the object columns use the *_HIT names.
    same_prefix = subject_prefix == object_prefix
    object_id = f"{object_prefix}_ID_HIT" if same_prefix else f"{object_prefix}_ID"
    object_term = f"{object_prefix}_HIT_TERM" if same_prefix else f"{object_prefix}_TERM"

    sql += (
        f"INSERT INTO {table}\n"
        f"(MAPPING_ID, {subject_prefix}_ID, {subject_prefix}_TERM, {object_id}, {object_term}, "
        "SIMJ, IC, SCORE, LCS_ID, LCS_TERM)\n"
        "VALUES"
    )
    rows = [
        _format_row(data=frame, mapping_id=mapping_id + jdx)
        for jdx, frame in enumerate(input_data.iter_rows(named=True))
    ]
    sql += ",\n".join(rows) + ";"
    return sql
16 changes: 16 additions & 0 deletions src/pheval/utils/exomiser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pathlib import Path

from pheval.infra.exomiserdb import ExomiserDB


def semsim_to_exomiserdb(input_path: Path, object_prefix: str, subject_prefix: str, db_path: Path):
    """Load a semsim file into the Exomiser phenotypic database at ``db_path``.

    Args:
        input_path (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv
        object_prefix (str): object prefix. e.g. MP
        subject_prefix (str): subject prefix e.g HP
        db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/)
    """
    # NOTE(review): import_from_semsim_file declares (input_file, subject_prefix,
    # object_prefix) but receives the prefixes here in (object, subject) order.
    # That method swaps them again when calling _semsim2h2, so the final SQL
    # uses the intended prefixes. Confirm before reordering either call.
    ExomiserDB(db_path).import_from_semsim_file(input_path, object_prefix, subject_prefix)
Loading