From a1dd4bf90eacf5342260ccf5a4ee483ba728991f Mon Sep 17 00:00:00 2001 From: AliceeLe Date: Sat, 7 Dec 2024 14:09:20 -0500 Subject: [PATCH 1/5] Set up changes for HTMLParser --- src/harmony/parsing/html_parser.py | 59 ++++++++++++++++++++++ src/harmony/parsing/util/tika_wrapper.py | 4 +- src/harmony/parsing/wrapper_all_parsers.py | 3 ++ src/harmony/util/file_helper.py | 9 ++-- 4 files changed, 70 insertions(+), 5 deletions(-) create mode 100644 src/harmony/parsing/html_parser.py diff --git a/src/harmony/parsing/html_parser.py b/src/harmony/parsing/html_parser.py new file mode 100644 index 0000000..b8f5695 --- /dev/null +++ b/src/harmony/parsing/html_parser.py @@ -0,0 +1,59 @@ +''' +MIT License + +Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk). +Project: Harmony (https://harmonydata.ac.uk) +Maintainer: Thomas Wood (https://fastdatascience.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +''' + +import base64 +import uuid +import pdfkit +from harmony.schemas.requests.text import RawFile + +# Convert an HTML file (local or URL) into a PDF and process it using the existing PDF parser. +def convert_html_to_instruments(file: RawFile) -> None: + if file.file_type != "html": + raise ValueError("Input file must have a file_type of 'html'.") + + try: + # Convert HTML to PDF + if file.content.startswith("http"): + pdf_content = pdfkit.from_url(file.content, False) # False returns byte stream + else: + # Decode base64 content if the HTML file is provided as base64 + html_content = base64.urlsafe_b64decode(file.content).decode("utf-8") + pdf_content = pdfkit.from_string(html_content, False) + + # Encode PDF as base64 for the RawFile + file_as_base64 = base64.urlsafe_b64encode(pdf_content).decode("ascii") + + # Update the file to represent the converted PDF + file.file_type = "pdf" + file.content = "," + file_as_base64 + file.file_name = file.file_name.replace(".html", ".pdf") + + # Call the existing PDF parser + convert_pdf_to_instruments(file) + + except Exception as e: + print(f"Error during HTML conversion and parsing: {e}") diff --git a/src/harmony/parsing/util/tika_wrapper.py b/src/harmony/parsing/util/tika_wrapper.py index c0cc989..fa708ae 100644 --- a/src/harmony/parsing/util/tika_wrapper.py +++ b/src/harmony/parsing/util/tika_wrapper.py @@ -47,11 +47,11 @@ def parse_pdf_to_plain_text(contents: str) -> str: """ print("Preparing data for Tika") content_type, content_string = contents.split(",") - file_in_bytes = base64.b64decode(content_string) + file_in_bytes = base64.urlsafe_b64decode(content_string) file = io.BytesIO(file_in_bytes) print("Calling Tika") - parsed = parser.from_buffer(file, xmlContent=True, requestOptions={'timeout': 300}) + parsed = parser.from_buffer(file, xmlContent=True, requestOptions={"timeout": 300}) print("Got response from Tika") parsed_xml = parsed["content"] diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py index b691448..8b23222 100644 --- a/src/harmony/parsing/wrapper_all_parsers.py +++ b/src/harmony/parsing/wrapper_all_parsers.py @@ -30,6 +30,7 @@ from harmony.parsing.excel_parser import convert_excel_to_instruments from harmony.parsing.pdf_parser import convert_pdf_to_instruments from harmony.parsing.text_parser import convert_text_to_instruments +from harmony.parsing.html_parser import convert_html_to_instruments from harmony.schemas.enums.file_types import FileType from harmony.schemas.requests.text import RawFile, Instrument @@ -41,6 +42,8 @@ def _get_instruments_from_file(file): instruments_from_this_file = convert_text_to_instruments(file) elif file.file_type == FileType.xlsx: instruments_from_this_file = convert_excel_to_instruments(file) + elif file.file_type == FileType.html: + instruments_from_this_file = convert_html_to_instruments(file) else: instruments_from_this_file = [] return instruments_from_this_file diff --git a/src/harmony/util/file_helper.py b/src/harmony/util/file_helper.py index 49895d9..5a98f22 100644 --- a/src/harmony/util/file_helper.py +++ b/src/harmony/util/file_helper.py @@ -46,18 +46,21 @@ def load_instruments_from_local_file(file_name: str) -> List[Instrument]: file_type = "xlsx" elif file_name.lower().endswith("docx"): file_type = "docx" + elif file_name.lower().endswith("html"): + file_type = "html" else: file_type = "txt" - if file_type == "pdf" or file_type == "xlsx" or file_type == "docx": + if file_type in ["pdf", "xlsx", "docx", "html"]: with open( file_name, "rb") as f: file_as_bytes = f.read() - file_as_base64 = base64.urlsafe_b64encode(file_as_bytes).decode('ascii') + file_as_base64 = base64.b64encode(file_as_bytes).decode('ascii') + print(f"File as Base64 (first 100 characters): {file_as_base64[:100]}") - harmony_file = RawFile(file_type=file_type, content="," + file_as_base64, file_id=uuid.uuid4().hex, + harmony_file = RawFile(file_type=file_type, content= file_as_base64, file_id=uuid.uuid4().hex, file_name=file_name) else: with open( From 69c73b95832465feccb0644c27c10554dfd00b7d Mon Sep 17 00:00:00 2001 From: AliceeLe Date: Sun, 8 Dec 2024 01:51:46 -0500 Subject: [PATCH 2/5] Add html converter to pdf for handling html inputs --- .gitignore | 4 ++++ README.md | 1 + pyproject.toml | 4 +++- src/harmony/parsing/wrapper_all_parsers.py | 6 ++--- src/harmony/util/file_helper.py | 27 ++++++++++++++++++++-- 5 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 65d346a..da5dab7 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,7 @@ dmypy.json .idea/ src/log.txt +*.html +*.pdf +pdm.lock +.pdm-python diff --git a/README.md b/README.md index b8c8297..5353d6d 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ You need a Windows, Linux or Mac system with * the requirements in [requirements.txt](./requirements.txt) * Java (if you want to extract items from PDFs) * [Apache Tika](https://tika.apache.org/download.html) (if you want to extract items from PDFs) +* Install wkhtmltopdf if you want to use HTML inputs [More instructions here](https://pypi.org/project/pdfkit/) ## 🖥 Installing Harmony Python package diff --git a/pyproject.toml b/pyproject.toml index 03ea82e..0d14910 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ maintainers = [ authors = [ { name = "Thomas Wood", email = "thomas@fastdatascience.com" }, ] -requires-python = ">=3.6,<=3.13.0" +requires-python = ">=3.11,<=3.13.0" classifiers=[ # see https://pypi.org/classifiers/ "Development Status :: 5 - Production/Stable", @@ -55,6 +55,8 @@ dependencies = [ "scikit-learn; python_version <= '3.13'", "scipy==1.14.1; python_version <= '3.13'", "huggingface-hub==0.25.0; python_version <= '3.13'", + "ipykernel>=6.29.5", + "pdfkit>=1.0.0", ] [project.optional-dependencies] diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py index 8b23222..894c639 100644 --- a/src/harmony/parsing/wrapper_all_parsers.py +++ b/src/harmony/parsing/wrapper_all_parsers.py @@ -30,7 +30,7 @@ from harmony.parsing.excel_parser import convert_excel_to_instruments from harmony.parsing.pdf_parser import convert_pdf_to_instruments from harmony.parsing.text_parser import convert_text_to_instruments -from harmony.parsing.html_parser import convert_html_to_instruments +# from harmony.parsing.html_parser import convert_html_to_instruments from harmony.schemas.enums.file_types import FileType from harmony.schemas.requests.text import RawFile, Instrument @@ -42,8 +42,8 @@ def _get_instruments_from_file(file): instruments_from_this_file = convert_text_to_instruments(file) elif file.file_type == FileType.xlsx: instruments_from_this_file = convert_excel_to_instruments(file) - elif file.file_type == FileType.html: - instruments_from_this_file = convert_html_to_instruments(file) + # elif file.file_type == FileType.html: + # instruments_from_this_file = convert_html_to_instruments(file) else: instruments_from_this_file = [] return instruments_from_this_file diff --git a/src/harmony/util/file_helper.py b/src/harmony/util/file_helper.py index 5a98f22..6c7e82e 100644 --- a/src/harmony/util/file_helper.py +++ b/src/harmony/util/file_helper.py @@ -28,12 +28,33 @@ import base64 import uuid from typing import List +import pdfkit + from harmony.parsing.wrapper_all_parsers import convert_files_to_instruments from harmony.schemas.requests.text import Instrument from harmony.schemas.requests.text import RawFile +def convert_html_to_pdf(file_name: str) -> RawFile: + try: + # Convert HTML to PDF + if file_name.startswith("http"): + pdf_content = pdfkit.from_url(file_name, False) + else: + pdf_content = pdfkit.from_file(file_name, False) + + file_as_base64 = base64.urlsafe_b64encode(pdf_content).decode("ascii") + + return RawFile( + file_type="pdf", + content="," + file_as_base64, + file_id=uuid.uuid4().hex, + file_name=file_name, + ) + except Exception as e: + print(f"Error during HTML conversion and parsing: {e}") + def load_instruments_from_local_file(file_name: str) -> List[Instrument]: """ Open a local file (PDF, Excel, Word or TXT format) and parse it into a list of Instrument objects. @@ -51,7 +72,7 @@ def load_instruments_from_local_file(file_name: str) -> List[Instrument]: else: file_type = "txt" - if file_type in ["pdf", "xlsx", "docx", "html"]: + if file_type in ["pdf", "xlsx", "docx"]: with open( file_name, "rb") as f: @@ -60,8 +81,10 @@ def load_instruments_from_local_file(file_name: str) -> List[Instrument]: file_as_base64 = base64.b64encode(file_as_bytes).decode('ascii') print(f"File as Base64 (first 100 characters): {file_as_base64[:100]}") - harmony_file = RawFile(file_type=file_type, content= file_as_base64, file_id=uuid.uuid4().hex, + harmony_file = RawFile(file_type=file_type, content= "," + file_as_base64, file_id=uuid.uuid4().hex, file_name=file_name) + elif file_type == "html": + harmony_file = convert_html_to_pdf(file_name) else: with open( file_name, From 12e1f9520024f6d920624e04c49039c0ff702654 Mon Sep 17 00:00:00 2001 From: AliceeLe Date: Sun, 8 Dec 2024 01:55:10 -0500 Subject: [PATCH 3/5] Revert back unnecessary changes --- src/harmony/parsing/html_parser.py | 59 ---------------------- src/harmony/parsing/wrapper_all_parsers.py | 3 -- 2 files changed, 62 deletions(-) delete mode 100644 src/harmony/parsing/html_parser.py diff --git a/src/harmony/parsing/html_parser.py b/src/harmony/parsing/html_parser.py deleted file mode 100644 index b8f5695..0000000 --- a/src/harmony/parsing/html_parser.py +++ /dev/null @@ -1,59 +0,0 @@ -''' -MIT License - -Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk). -Project: Harmony (https://harmonydata.ac.uk) -Maintainer: Thomas Wood (https://fastdatascience.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -''' - -import base64 -import uuid -import pdfkit -from harmony.schemas.requests.text import RawFile - -# Convert an HTML file (local or URL) into a PDF and process it using the existing PDF parser. -def convert_html_to_instruments(file: RawFile) -> None: - if file.file_type != "html": - raise ValueError("Input file must have a file_type of 'html'.") - - try: - # Convert HTML to PDF - if file.content.startswith("http"): - pdf_content = pdfkit.from_url(file.content, False) # False returns byte stream - else: - # Decode base64 content if the HTML file is provided as base64 - html_content = base64.urlsafe_b64decode(file.content).decode("utf-8") - pdf_content = pdfkit.from_string(html_content, False) - - # Encode PDF as base64 for the RawFile - file_as_base64 = base64.urlsafe_b64encode(pdf_content).decode("ascii") - - # Update the file to represent the converted PDF - file.file_type = "pdf" - file.content = "," + file_as_base64 - file.file_name = file.file_name.replace(".html", ".pdf") - - # Call the existing PDF parser - convert_pdf_to_instruments(file) - - except Exception as e: - print(f"Error during HTML conversion and parsing: {e}") diff --git a/src/harmony/parsing/wrapper_all_parsers.py b/src/harmony/parsing/wrapper_all_parsers.py index 894c639..b691448 100644 --- a/src/harmony/parsing/wrapper_all_parsers.py +++ b/src/harmony/parsing/wrapper_all_parsers.py @@ -30,7 +30,6 @@ from harmony.parsing.excel_parser import convert_excel_to_instruments from harmony.parsing.pdf_parser import convert_pdf_to_instruments from harmony.parsing.text_parser import convert_text_to_instruments -# from harmony.parsing.html_parser import convert_html_to_instruments from harmony.schemas.enums.file_types import FileType from harmony.schemas.requests.text import RawFile, Instrument @@ -42,8 +41,6 @@ def _get_instruments_from_file(file): instruments_from_this_file = convert_text_to_instruments(file) elif file.file_type == FileType.xlsx: instruments_from_this_file = convert_excel_to_instruments(file) - # elif file.file_type == FileType.html: - # instruments_from_this_file = convert_html_to_instruments(file) else: instruments_from_this_file = [] return instruments_from_this_file From 90829851a9566a0062644414cc0c11403344fac5 Mon Sep 17 00:00:00 2001 From: AliceeLe Date: Sun, 8 Dec 2024 18:10:50 -0500 Subject: [PATCH 4/5] Leave comments and include test --- src/harmony/util/file_helper.py | 4 + test_read_pdf.ipynb | 136 ++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 test_read_pdf.ipynb diff --git a/src/harmony/util/file_helper.py b/src/harmony/util/file_helper.py index 6c7e82e..b02eb4d 100644 --- a/src/harmony/util/file_helper.py +++ b/src/harmony/util/file_helper.py @@ -35,7 +35,11 @@ from harmony.schemas.requests.text import Instrument from harmony.schemas.requests.text import RawFile + def convert_html_to_pdf(file_name: str) -> RawFile: + """ + Convert html to pdf, + """ try: # Convert HTML to PDF if file_name.startswith("http"): diff --git a/test_read_pdf.ipynb b/test_read_pdf.ipynb new file mode 100644 index 0000000..0a840ac --- /dev/null +++ b/test_read_pdf.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/chauanhle/Library/Mobile Documents/com~apple~CloudDocs/Documents/CMU/17-313/harmonyWrapper/harmonyAPI/harmonyapi/harmony/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: transformers not available. To use transformers, run pip install sentence-transformers\n" + ] + } + ], + "source": [ + "from harmony import convert_pdf_to_instruments\n", + "from harmony.schemas.requests.text import RawFile\n", + "from harmony.util.file_helper import load_instruments_from_local_file\n", + "import base64" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"gad-7.pdf\", \"rb\") as file: \n", + " byte_data = file.read() \n", + "\n", + "encoded = \",\" + base64.urlsafe_b64encode(byte_data).decode(\"ascii\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_file = RawFile.model_validate(RawFile.model_validate({\n", + " \"file_id\": \"d39f31718513413fbfc620c6b6135d0c\",\n", + " \"file_name\": \"GAD-7.pdf\",\n", + " \"file_type\": \"pdf\",\n", + " \"content\": encoded}))\n", + "convert_pdf_to_instruments(raw_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If this previous cells work, then this should work with no error \n", + "load_instruments_from_local_file(\"gad-7.pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use PDFKIT to convert html to pdf, then handle the content as if it comes from a pdf file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pdfkit\n", + "import uuid " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='c9f7de7532ab4fc18fc3600ab4d88d17', instrument_id='87046904c0bc419599994aaa13437331', instrument_name='GAD-7.html', file_name='GAD-7.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Over the last 2 weeks, how often have you been bothered by the following problems?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Several days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='More than half the days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Nearly every day', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Feeling afraid as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GAD-7.html\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a5128b2ddf8b16e13e0f4ff5833f2ba3f89a8a5f Mon Sep 17 00:00:00 2001 From: AliceeLe Date: Sun, 8 Dec 2024 20:20:03 -0500 Subject: [PATCH 5/5] Add test cases --- pyproject.toml | 7 +- src/harmony/util/file_helper.py | 8 + test_read_html.ipynb | 269 ++++++++++++++++++++++++++++++++ test_read_pdf.ipynb | 136 ---------------- tests/test_convert_html.py | 124 +++++++++++++++ 5 files changed, 407 insertions(+), 137 deletions(-) create mode 100644 test_read_html.ipynb delete mode 100644 test_read_pdf.ipynb create mode 100644 tests/test_convert_html.py diff --git a/pyproject.toml b/pyproject.toml index 0d14910..bf2dade 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,12 +57,17 @@ dependencies = [ "huggingface-hub==0.25.0; python_version <= '3.13'", "ipykernel>=6.29.5", "pdfkit>=1.0.0", + "transformers>=4.47.0", + "bs4>=0.0.2", ] [project.optional-dependencies] # dev - the developer dependency set, for contributors to harmony -dev = ["check-manifest", "pytest"] +dev = [ + "check-manifest", + "pytest>=8.3.4", +] [project.urls] "Documentation" = "https://harmonydata.ac.uk/" diff --git a/src/harmony/util/file_helper.py b/src/harmony/util/file_helper.py index b02eb4d..37fb34e 100644 --- a/src/harmony/util/file_helper.py +++ b/src/harmony/util/file_helper.py @@ -29,12 +29,18 @@ import uuid from typing import List import pdfkit +from bs4 import BeautifulSoup from harmony.parsing.wrapper_all_parsers import convert_files_to_instruments from harmony.schemas.requests.text import Instrument from harmony.schemas.requests.text import RawFile +def extract_html_title(file_path): + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + soup = BeautifulSoup(content, "html.parser") + return soup.title.string.strip() if soup.title else "Untitled" def convert_html_to_pdf(file_name: str) -> RawFile: """ @@ -89,6 +95,8 @@ def load_instruments_from_local_file(file_name: str) -> List[Instrument]: file_name=file_name) elif file_type == "html": harmony_file = convert_html_to_pdf(file_name) + title = extract_html_title(file_name) + harmony_file.file_name = title else: with open( file_name, diff --git a/test_read_html.ipynb b/test_read_html.ipynb new file mode 100644 index 0000000..786b0ca --- /dev/null +++ b/test_read_html.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: transformers not available. To use transformers, run pip install sentence-transformers\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/chauanhle/Library/Mobile Documents/com~apple~CloudDocs/Documents/CMU/17-313/harmonyWrapper/harmonyAPI/harmonyapi/harmony/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from harmony import convert_pdf_to_instruments\n", + "from harmony.schemas.requests.text import RawFile\n", + "from harmony.util.file_helper import load_instruments_from_local_file\n", + "import base64" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"gad-7.pdf\", \"rb\") as file: \n", + " byte_data = file.read() \n", + "\n", + "encoded = \",\" + base64.urlsafe_b64encode(byte_data).decode(\"ascii\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='da362802746b4310851eca57aca74746', instrument_id='ca11045cd0724ba19fc0fdbe93e7a890', instrument_name='GAD-7.pdf', file_name='GAD-7.pdf', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Feeling afraid, as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='If you checked any problems, how difficult have they made it for you to do your work, take care of things at home, or get along with other people?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_file = RawFile.model_validate(RawFile.model_validate({\n", + " \"file_id\": \"d39f31718513413fbfc620c6b6135d0c\",\n", + " \"file_name\": \"GAD-7.pdf\",\n", + " \"file_type\": \"pdf\",\n", + " \"content\": encoded}))\n", + "convert_pdf_to_instruments(raw_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File as Base64 (first 100 characters): JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0NhdGFsb2cvUGFnZXMgMiAwIFIvTGFuZyhlbi1VUykgL1N0cnVjdFRy\n", + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='f671203dfe0d492d81a13f7dbf39421a', instrument_id='0df8db70628a4ae6aae5b28218d816c7', instrument_name='gad-7.pdf', file_name='gad-7.pdf', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Feeling afraid, as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='If you checked any problems, how difficult have they made it for you to do your work, take care of things at home, or get along with other people?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If this previous cells work, then this should work with no error \n", + "load_instruments_from_local_file(\"gad-7.pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use PDFKIT to convert html to pdf, then handle the content as if it comes from a pdf file" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pdfkit\n", + "import uuid " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='cbdabbb5f909450b846cc81bb0890133', instrument_id='4fbc45efa4144a2e8004ad89ed6c97db', instrument_name='GAD-7 Chinese.html', file_name='GAD-7 Chinese.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='⼏天', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='⼀半以上天数', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='⼏乎每天', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GAD-7 Chinese.html\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='2c3bfada19a149b08c0e4646309bc715', instrument_id='9638fa85bb2c41fe91d99e066573029d', instrument_name='GAD-7 Espanol.html', file_name='GAD-7 Espanol.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Durante las últimas 2 semanas, ¿con qué frecuencia ha sentido molestias por los siguientes problemas?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Varios días', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Más de la mitad de los días', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Casi todos los días', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Sentirse nervioso/a, intranquilo/a o con los nervios de punta', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='No poder dejar de preocuparse o no poder controlar la preocupación', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Preocuparse demasiado por diferentes cosas', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Dificultad para relajarse', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='Estar tan inquieto/a que es difícil permanecer sentado/a tranquilamente', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Molestarse o ponerse irritable fácilmente', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Sentir miedo como si algo terrible pudiera pasar', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GAD-7 Espanol.html\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='b7ef1503287843c2ae688676b26d8f31', instrument_id='d3e1849c25da4662963dc4d175387582', instrument_name='GHQ-12.html', file_name='GHQ-12.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text=\"The next questions are about how you have been feeling over the last few weeks. Have you recently been able to concentrate on whatever you're doing?\", options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Better than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Less than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Much less than usual Have you recently lost much sleep over worry?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Much more than usual Have you recently felt that you were playing a useful part in things?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='12', question_intro=None, question_text='Much less than usual Have you recently felt capable of making decisions about things?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='13', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='14', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='15', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='16', question_intro=None, question_text='Much less capable Have you recently felt constantly under strain?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='17', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='18', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='19', question_intro=None, question_text=\"Much more than usual Have you recently felt you couldn't overcome your difficulties?\", options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='20', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='21', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='22', question_intro=None, question_text='Much more than usual Have you recently been able to enjoy your normal day-to-day activities?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='23', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='24', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='25', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='26', question_intro=None, question_text='Much less than usual Have you recently been able to face up to problems?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='27', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='28', question_intro=None, question_text='Same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='29', question_intro=None, question_text='Less able than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='30', question_intro=None, question_text='Much less able Have you recently been feeling unhappy or depressed?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='31', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='32', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='33', question_intro=None, question_text='Much more than usual Have you recently been losing confidence in yourself?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='34', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='35', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='36', question_intro=None, question_text='Much more than usual Have you recently been thinking of yourself as a worthless person?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='37', question_intro=None, question_text='No more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='38', question_intro=None, question_text='Rather more than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='39', question_intro=None, question_text='Much more than usual Have you recently been feeling reasonably happy, all things considered?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='40', question_intro=None, question_text='More so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='41', question_intro=None, question_text='About the same as usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='42', question_intro=None, question_text='Less so than usual', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GHQ-12.html\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data for Tika\n", + "Calling Tika\n", + "Got response from Tika\n", + "Parsed response from Tika\n" + ] + }, + { + "data": { + "text/plain": [ + "[Instrument(file_id='b4a36b224d1949d7b3b76fd48ccfaaa9', instrument_id='3f937da7fb08483db9187828bcf633bb', instrument_name='GAD-7.html', file_name='GAD-7.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Over the last 2 weeks, how often have you been bothered by the following problems?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Several days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='More than half the days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Nearly every day', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Feeling afraid as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_instruments_from_local_file(\"GAD-7.html\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test_read_pdf.ipynb b/test_read_pdf.ipynb deleted file mode 100644 index 0a840ac..0000000 --- a/test_read_pdf.ipynb +++ /dev/null @@ -1,136 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/chauanhle/Library/Mobile Documents/com~apple~CloudDocs/Documents/CMU/17-313/harmonyWrapper/harmonyAPI/harmonyapi/harmony/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning: transformers not available. To use transformers, run pip install sentence-transformers\n" - ] - } - ], - "source": [ - "from harmony import convert_pdf_to_instruments\n", - "from harmony.schemas.requests.text import RawFile\n", - "from harmony.util.file_helper import load_instruments_from_local_file\n", - "import base64" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"gad-7.pdf\", \"rb\") as file: \n", - " byte_data = file.read() \n", - "\n", - "encoded = \",\" + base64.urlsafe_b64encode(byte_data).decode(\"ascii\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_file = RawFile.model_validate(RawFile.model_validate({\n", - " \"file_id\": \"d39f31718513413fbfc620c6b6135d0c\",\n", - " \"file_name\": \"GAD-7.pdf\",\n", - " \"file_type\": \"pdf\",\n", - " \"content\": encoded}))\n", - "convert_pdf_to_instruments(raw_file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If this previous cells work, then this should work with no error \n", - "load_instruments_from_local_file(\"gad-7.pdf\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use PDFKIT to convert html to pdf, then handle the content as if it comes from a pdf file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pdfkit\n", - "import uuid " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Preparing data for Tika\n", - "Calling Tika\n", - "Got response from Tika\n", - "Parsed response from Tika\n" - ] - }, - { - "data": { - "text/plain": [ - "[Instrument(file_id='c9f7de7532ab4fc18fc3600ab4d88d17', instrument_id='87046904c0bc419599994aaa13437331', instrument_name='GAD-7.html', file_name='GAD-7.html', file_type=None, file_section=None, study=None, sweep=None, metadata=None, language=, questions=[Question(question_no='1', question_intro=None, question_text='Over the last 2 weeks, how often have you been bothered by the following problems?', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='2', question_intro=None, question_text='Several days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='3', question_intro=None, question_text='More than half the days', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='4', question_intro=None, question_text='Nearly every day', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='5', question_intro=None, question_text='Feeling nervous, anxious, or on edge', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='6', question_intro=None, question_text='Not being able to stop or control worrying', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='7', question_intro=None, question_text='Worrying too much about different things', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='8', question_intro=None, question_text='Trouble relaxing', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='9', question_intro=None, question_text='Being so restless that it is hard to sit still', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='10', question_intro=None, question_text='Becoming easily annoyed or irritable', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None), Question(question_no='11', question_intro=None, question_text='Feeling afraid as if something awful might happen', options=[], source_page=0, instrument_id=None, instrument_name=None, topics_auto=None, topics_strengths=None, nearest_match_from_mhc_auto=None, closest_catalogue_question_match=None, seen_in_catalogue_instruments=None)], closest_catalogue_instrument_matches=None)]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_instruments_from_local_file(\"GAD-7.html\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/test_convert_html.py b/tests/test_convert_html.py new file mode 100644 index 0000000..602596b --- /dev/null +++ b/tests/test_convert_html.py @@ -0,0 +1,124 @@ +import unittest +from harmony import load_instruments_from_local_file +from harmony.schemas.requests.text import Instrument +import tempfile +import os + + +class TestLoadInstrumentsFromLocalFile(unittest.TestCase): + def setUp(self): + """Set up temporary files for testing.""" + # English HTML content + self.html_content_english = """ + + + + + GAD-7 Form + + +
GAD-7
+

Over the last 2 weeks, how often have you been bothered by the following problems?

+ + + + + + + + + + + + + + + + + + + +
QuestionNot at all
(0)
Several days
(1)
More than half the days
(2)
Nearly every day
(3)
1. Feeling nervous, anxious, or on edge
+ + + """ + # Chinese HTML content + self.html_content_chinese = """ + + + + + 广泛性焦虑症量表(GAD-7) + + +

广泛性焦虑症量表(GAD-7)

+

在过去两个星期,有多少时候您受到以下问题所困扰?

+ + + + + + + + + + + + + + + + + + + +
问题完全没有
(0)
几天
(1)
一半以上天数
(2)
几乎每天
(3)
1. 感觉紧张、焦虑或不安
+ + + """ + # Temporary files + self.temp_file_english = tempfile.NamedTemporaryFile(delete=False, suffix=".html") + self.temp_file_english.write(self.html_content_english.encode('utf-8')) + self.temp_file_english.close() + + self.temp_file_chinese = tempfile.NamedTemporaryFile(delete=False, suffix=".html") + self.temp_file_chinese.write(self.html_content_chinese.encode('utf-8')) + self.temp_file_chinese.close() + + def tearDown(self): + """Remove temporary files after tests.""" + os.unlink(self.temp_file_english.name) + os.unlink(self.temp_file_chinese.name) + + def test_load_instruments_english_html(self): + """Test loading instruments from an English HTML file.""" + instruments = load_instruments_from_local_file(self.temp_file_english.name) + + self.assertIsInstance(instruments, list) + + self.assertGreater(len(instruments), 0) + + instrument = instruments[0] + self.assertIsInstance(instrument, Instrument) + + self.assertEqual(instrument.instrument_name, "GAD-7 Form") + self.assertGreater(len(instrument.questions), 0) + + + def test_load_instruments_chinese_html(self): + """Test loading instruments from a Chinese HTML file.""" + instruments = load_instruments_from_local_file(self.temp_file_chinese.name) + + self.assertIsInstance(instruments, list) + + self.assertGreater(len(instruments), 0) + + instrument = instruments[0] + self.assertIsInstance(instrument, Instrument) + + self.assertEqual(instrument.instrument_name, "广泛性焦虑症量表(GAD-7)") + self.assertGreater(len(instrument.questions), 0) + + +if __name__ == "__main__": + unittest.main()